Files
MC_NDCC/MC_NDCC_python/MC_NDCC.py
2025-11-08 18:45:48 +01:00

99 lines
4.4 KiB
Python
Executable File

# TITLE: Multi-Class Normal Distribution Cubic Clusters Dataset Generator
# AUTHOR: Dr. Hossein Moosaei, Saeed Khosravi
# Date: 10/09/2020
# NORMALLY DISTRIBUTED CLUSTERS is a data generator.
# It generates a series of random centers for multivariate
# normal distributions. NDC randomly generates a fraction
# of data for each center, i.e. what fraction of data points
# will come from this center. NDC randomly generates a
# separating plane. Based on this plane, classes are
# chosen for each center. NDC then randomly generates the
# points from the distributions. NDC can increase
# inseparability by increasing variances of distributions.
# A measure of "true" separability is obtained by looking
# at how many points end up on the wrong side of the
# separating plane. All values are taken as integers
# for simplicity.
import numpy as np
import pandas as pd
class MC_NDCC:
    """Multi-class dataset generator built from normally distributed clusters.

    Cluster centers are placed on every coordinate axis at plus/minus each
    value of ``centers_list``.  The centers are randomly partitioned among
    the classes, every sample is randomly assigned to one (class, center)
    pair, and the sample's features are drawn from a normal distribution
    (standard deviation 5) around that center.

    The finished dataset is available through ``get_matrix`` / ``get_csv``:
    one row per sample, the feature columns first and the class label
    (1 .. n_classes) in the last column, all cast to ``int32``.

    Parameters
    ----------
    n_samples, n_features, n_classes : int, optional
        Size of the dataset.  Any value left as ``None`` is asked for
        interactively on standard input (the historical behaviour).

    Raises
    ------
    ValueError
        If ``n_classes`` is smaller than 1, or if samples are requested but
        there are more classes than available centers
        (``2 * len(centers_list) * n_features``).
    """

    def __init__(self, n_samples=None, n_features=None, n_classes=None):
        # Prompt only for what the caller did not supply; the prompt order
        # (samples, features, classes) matches the original interactive flow.
        if n_samples is None:
            n_samples = input("Enter number of samples: \n")
        if n_features is None:
            n_features = input("Enter number of features: \n")
        if n_classes is None:
            n_classes = input("Enter number of classes: \n")
        self.n_samples = int(n_samples)
        self.n_features = int(n_features)
        self.n_classes = int(n_classes)

        self.centers_list = [100, 300, 500]
        self.center_points = self.centers_matrix(self.centers_list, self.n_features)
        self.n_centers = 2 * len(self.centers_list) * self.n_features
        self.class_locations = self.class_center_locations(self.n_classes, self.n_centers)
        self.ss = self.sample_spliter(self.n_samples, self.n_classes, self.n_centers)

        # Collect the per-cluster pieces and concatenate once at the end
        # instead of re-copying the growing arrays on every iteration.
        # The zero-row seeds keep the result well-formed when there are no
        # clusters at all.
        feature_parts = [np.zeros((0, self.n_features))]
        label_parts = [np.zeros((0, 1))]
        for class_idx, (center_ids, counts) in enumerate(zip(self.class_locations, self.ss)):
            for center_id, count in zip(center_ids, counts):
                size = int(count)
                points = np.random.normal(
                    loc=self.center_points[int(center_id)],
                    scale=5,
                    size=(size, self.n_features),
                )
                feature_parts.append(points)
                # Class labels are 1-based.
                label_parts.append(np.ones((size, 1)) * (class_idx + 1))

        self.l = np.concatenate(label_parts, axis=0)
        features = np.concatenate(feature_parts, axis=0)
        # The int32 cast truncates the sampled coordinates toward zero.
        self.M = np.concatenate((features, self.l), axis=1).astype('int32')
        # Shuffle rows in place so samples are not grouped by class.
        np.random.shuffle(self.M)

    @staticmethod
    def _centers_per_class(n_classes, n_centers):
        """Return how many centers each class receives (integer division)."""
        if n_classes < 1:
            raise ValueError(f"n_classes must be at least 1, got {n_classes}")
        return n_centers // n_classes

    def sample_spliter(self, n_samples, n_classes, n_centers):
        """Randomly distribute ``n_samples`` over the (class, center) cells.

        Every sample is assigned independently and uniformly to one class
        and to one of that class's centers.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(n_classes, n_centers // n_classes)``
            holding the sample count of each cell; the entries sum to
            ``n_samples`` (or to 0 if ``n_samples`` is not positive).

        Raises
        ------
        ValueError
            If ``n_classes`` is smaller than 1, or if samples are requested
            while no center is available per class.
        """
        per_class = MC_NDCC._centers_per_class(n_classes, n_centers)
        if n_samples > 0 and per_class < 1:
            raise ValueError(
                f"cannot split {n_samples} samples: {n_classes} classes "
                f"but only {n_centers} centers (need at least one center per class)"
            )
        counts = np.zeros((n_classes, per_class))
        assigned = 0
        # One (class, center) draw pair per sample, in this order, so results
        # under a fixed np.random seed stay the same as before.
        while n_samples > assigned:
            class_idx = np.random.randint(n_classes)
            center_idx = np.random.randint(per_class)
            counts[class_idx, center_idx] += 1
            assigned += 1
        return counts

    def class_center_locations(self, n_classes, n_centers):
        """Randomly assign distinct center indices to the classes.

        The indices ``0 .. n_centers - 1`` are shuffled and dealt out row by
        row.  When ``n_centers`` is not a multiple of ``n_classes`` the
        leftover centers are not assigned to any class.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(n_classes, n_centers // n_classes)``;
            row ``i`` lists the center indices belonging to class ``i``.

        Raises
        ------
        ValueError
            If ``n_classes`` is smaller than 1.
        """
        per_class = MC_NDCC._centers_per_class(n_classes, n_centers)
        rng = np.random.default_rng()
        shuffled = rng.permutation(n_centers)
        used = n_classes * per_class
        # Kept as float to match the array type callers have always received.
        return shuffled[:used].reshape(n_classes, per_class).astype(float)

    def centers_matrix(self, centers_list, n_features):
        """Build the coordinates of all cluster centers.

        For every magnitude in ``centers_list`` and every feature axis two
        centers are created: one at ``+magnitude`` and one at ``-magnitude``
        on that axis, with all other coordinates zero.

        Returns
        -------
        numpy.ndarray
            Float array of shape
            ``(2 * len(centers_list) * n_features, n_features)``.
        """
        n_centers = 2 * len(centers_list) * n_features
        centers = np.zeros((n_centers, n_features))
        for level, magnitude in enumerate(centers_list):
            base = level * 2 * n_features
            for axis in range(n_features):
                row = base + 2 * axis
                centers[row, axis] = magnitude
                centers[row + 1, axis] = -magnitude
        return centers

    def get_matrix(self):
        """Return the dataset as a numpy matrix (features first, label last)."""
        return self.M

    def get_csv(self, filename):
        """Write the dataset to ``filename`` as CSV without header or index."""
        df = pd.DataFrame(self.M)
        df.to_csv(filename, header=False, index=False)
        print(f'Dataset saved as {filename} in current directory. ')