# TITLE: Multi-Class Normal Distribution Cubic Clusters Dataset Generator # AUTHOR: Dr. Hossein Moosaei, Saeed Khosravi # Date: 10/09/2020 # NORMALLY DISTRIBUTED CLUSTERS is a data generator. # It generates a series of random centers for multivariate #normal distributions. NDC randomly generates a fraction # of data for each center, i.e. what fraction of data points # will come from this center. NDC randomly generates a # separating plane. Based on this plane, classes for are # chosen for each center. NDC then randomly generates the # points from the distributions. NDC can increase # inseparability by increasng variances of distributions. # A measure of "true" separability is obtained by looking # at how many points end up on the wrong side of the # separating plane. All values are taken as integers # for simplicity. import numpy as np import pandas as pd class MC_NDCC: def __init__(self,n_centers, n_samples, n_features, n_classes): # self.n_samples = int(input("Enter number of samples: \n")) # self.n_features = int(input("Enter number of features: \n")) # self.n_classes = int(input("Enter number of classes: \n")) centers = [100, 300, 500, 700] self.centers_list = centers[0:n_centers] self.n_samples = n_samples self.n_features = n_features self.n_classes = n_classes self.center_points = self.centers_matrix(self.centers_list, self.n_features) self.n_centers = 2*len(self.centers_list)*self.n_features self.class_locations = self.class_center_locations(self.n_classes, self.n_centers) self.ss = self.sample_spliter(self.n_samples, self.n_classes, self.n_centers) r, c = self.class_locations.shape self.M = np.zeros((0, self.n_features)) self.l = np.zeros((0, 1)) for i in range(r): for j in range(c): self.temp = np.random.normal(loc = self.center_points[int(self.class_locations[i, j])],size = (int(self.ss[i,j]), self.n_features),scale = 5) self.label_temp = np.ones((int(self.ss[i,j]), 1))*(i+1) self.l = np.concatenate((self.l, self.label_temp), axis = 0) self.M = np.concatenate((self.M, self.temp) , axis = 0) self.M = np.concatenate((self.M, self.l), axis = 1).astype('int32') np.random.shuffle(self.M) def sample_spliter(self, n_samples, n_classes, n_centers): # This function generates the number of samples belongs to each class # Centers approximately have n_centers/n_classes samples with a small variance count = 0 n_cen_fe_cls = int(np.floor(n_centers/n_classes)) n_each_c = np.zeros((n_classes, n_cen_fe_cls)) while(n_samples > count): r = np.random.randint(n_classes) r2 = np.random.randint(n_cen_fe_cls) n_each_c[r, r2] += 1 count += 1 return n_each_c def class_center_locations(self, n_classes, n_centers): # This function specifies which center # points belong to which classes # It returns a matrix in size of n_classess by # n_centers_for_each_class that means a row for each class rng = np.random.default_rng() # Generate list of random non-repeatative numbers from 1 to n_center locs = rng.choice(n_centers, n_centers, replace=False) # number of centers for each class n_cen_fe_cls = int(np.floor(n_centers/n_classes)) cls_locs = np.zeros((n_classes,n_cen_fe_cls)) k = 0 for i in range(n_classes): for j in range(n_cen_fe_cls): cls_locs[i,j] = locs[k] k += 1 return cls_locs def centers_matrix(self, centers_list, n_features): # This function returns the matrix of center locations # based on centers_list in n_features space n_centers = 2*len(centers_list)*n_features centers_matrix = np.zeros((n_centers, n_features)) for i in range(len(centers_list)): for j in range(n_features): centers_matrix[i*2*n_features + 2*j , j] = centers_list[i] centers_matrix[i*2*n_features + 2*j+1, j] = -centers_list[i] return centers_matrix def get_matrix(self): # Get the dataset as a numpy matrix return self.M def get_csv(self, filename): # Save the dataset as csv file df = pd.DataFrame(self.M) df.to_csv(filename, header = False, index = False) print(f'Dataset saved as {filename} in current directory. ')