init commit

2025-11-08 18:45:48 +01:00
parent 6d1d845c4e
commit 2f82041a21
9 changed files with 1393 additions and 0 deletions
--- a/MC_NDCC.py
+++ b/MC_NDCC.py
@@ -0,0 +1,98 @@
+# TITLE: Multi-Class Normal Distribution Cubic Clusters Dataset Generator
+# AUTHOR: Dr. Hossein Moosaei, Saeed Khosravi
+# Date: 10/09/2020
+
+# NORMALLY DISTRIBUTED CLUSTERS is a data generator. 
+# It generates a series of random centers for multivariate
+# normal distributions. NDC randomly generates a fraction
+# of data for each center, i.e. what fraction of data points
+# will come from this center. NDC randomly generates a 
+# separating plane. Based on this plane, classes are 
+# chosen for each center. NDC then randomly generates the 
+# points from the distributions. NDC can increase 
+# inseparability by increasing variances of distributions.
+# A measure of "true" separability is obtained by looking 
+# at how many points end up on the wrong side of the 
+# separating plane. All values are taken as integers 
+# for simplicity.
+
+import numpy as np
+import pandas as pd
+class MC_NDCC:
+    # Prompts for the dataset dimensions when constructed, then builds and stores the dataset in self.M
+    def __init__(self):
+        self.n_samples  = int(input("Enter number of samples: \n"))
+        self.n_features = int(input("Enter number of features: \n"))
+        self.n_classes  = int(input("Enter number of classes: \n"))
+        # Centers sit at +/-100, +/-300 and +/-500 on each feature axis: 2 * 3 * n_features centers in total
+        self.centers_list    = [100, 300, 500]
+        self.center_points   = self.centers_matrix(self.centers_list, self.n_features)
+        self.n_centers       = 2*len(self.centers_list)*self.n_features
+        self.class_locations = self.class_center_locations(self.n_classes, self.n_centers)
+        self.ss = self.sample_spliter(self.n_samples, self.n_classes, self.n_centers)
+        r, c    = self.class_locations.shape
+        self.M  = np.zeros((0, self.n_features))
+        self.l  = np.zeros((0, 1))
+        for i in range(r):
+            for j in range(c):
+                self.temp = np.random.normal(loc = self.center_points[int(self.class_locations[i, j])],size = (int(self.ss[i,j]), self.n_features),scale = 5)
+                self.label_temp = np.ones((int(self.ss[i,j]), 1))*(i+1)  # class labels are 1-based
+                self.l = np.concatenate((self.l, self.label_temp), axis = 0)
+                self.M = np.concatenate((self.M, self.temp) , axis = 0)
+        self.M = np.concatenate((self.M, self.l), axis = 1).astype('int32')  # label becomes the last column; everything is cast to int32
+        np.random.shuffle(self.M)  # shuffle the rows in place
+    def sample_spliter(self, n_samples, n_classes, n_centers):
+        # Returns an (n_classes, floor(n_centers/n_classes)) array of sample counts, one per (class, center) pair.
+        # Each of the n_samples samples is added to a uniformly random entry, so all entries have the same expected count.
+        count = 0
+        n_cen_fe_cls = int(np.floor(n_centers/n_classes))
+        n_each_c = np.zeros((n_classes, n_cen_fe_cls))
+        while(n_samples > count):
+            r = np.random.randint(n_classes)
+            r2 = np.random.randint(n_cen_fe_cls)
+            n_each_c[r, r2] += 1
+            count += 1
+        return n_each_c
+
+    def class_center_locations(self, n_classes, n_centers):
+        # Randomly assigns distinct center indices to classes.
+        # Each class receives floor(n_centers / n_classes) centers;
+        # any centers left over after that are not assigned to a class.
+    
+        # It returns a float array of shape (n_classes, centers_per_class):
+        # row i holds the indices of the centers that belong to class i.
+        
+        rng = np.random.default_rng()
+        # Random permutation of the center indices 0 .. n_centers-1 (no repeats)
+        locs = rng.choice(n_centers, n_centers, replace=False)
+        # Number of centers for each class (rounded down)
+        n_cen_fe_cls = int(np.floor(n_centers/n_classes))
+        cls_locs = np.zeros((n_classes,n_cen_fe_cls))
+        k = 0
+        for i in range(n_classes):
+            for j in range(n_cen_fe_cls):
+                cls_locs[i,j] = locs[k]
+                k += 1 
+        return cls_locs
+
+    def centers_matrix(self, centers_list, n_features):
+        # Returns the (2*len(centers_list)*n_features, n_features) matrix of center coordinates.
+        # Every row is zero except for one feature, which holds +value or -value taken from centers_list.
+        n_centers = 2*len(centers_list)*n_features
+        centers_matrix = np.zeros((n_centers, n_features))
+        for i in range(len(centers_list)):
+            for j in range(n_features):
+                centers_matrix[i*2*n_features + 2*j  , j] =  centers_list[i]
+                centers_matrix[i*2*n_features + 2*j+1, j] = -centers_list[i]
+        return centers_matrix
+    
+        
+    def get_matrix(self):
+        # Return the dataset built in __init__: feature columns followed by the label column
+        return self.M
+    
+    def get_csv(self, filename):
+        # Write the dataset to `filename` as CSV without a header row or an index column
+        df = pd.DataFrame(self.M)
+        df.to_csv(filename, header = False, index = False)
+        print(f'Dataset saved as {filename} in current directory. ')
--- a/MC_NDCC_matlab/NDCC.m
+++ b/MC_NDCC_matlab/NDCC.m
@@ -0,0 +1,39 @@
+%{
+NORMALLY DISTRIBUTED CLUSTERS is a data generator. 
+It generates a series of random centers for multivariate
+normal distributions. NDC randomly generates a fraction
+of data for each center, i.e. what fraction of data points
+will come from this center. NDC randomly generates a 
+separating plane. Based on this plane, classes are 
+chosen for each center. NDC then randomly generates the 
+points from the distributions. NDC can increase 
+inseparability by increasing variances of distributions.
+A measure of "true" separability is obtained by looking 
+at how many points end up on the wrong side of the 
+separating plane. All values are taken as integers 
+for simplicity.
+%}
+
+
+centers_list = [100, 300, 500];
+n_samples  = input('Enter the number of samples:\n');
+n_features = input('Enter the number of features:\n');
+n_classes  = input('Enter the number of classes:\n');
+
+
+% Generate the center matrix from centers_list and the number of features
+centers_matrix = get_centers_mat(centers_list, n_features);
+n_centers = 2*length(centers_list)*n_features;
+
+% Each class is assigned the same number of randomly chosen centers
+class_locations = class_center_locations(n_classes, n_centers);
+
+% Randomly decide how many samples are drawn for each entry of class_locations
+ss = sample_spliter(n_samples, n_classes, n_centers);
+
+% Generate the dataset (features followed by the class label column)
+ds = generate_dataset(centers_matrix, ss,class_locations, n_features);
+
+% Save the dataset as dataset.csv in the current directory
+writematrix(ds, 'dataset.csv');
+    
--- a/MC_NDCC_matlab/class_center_locations.m
+++ b/MC_NDCC_matlab/class_center_locations.m
@@ -0,0 +1,27 @@
+function cls_locs = class_center_locations(n_classes, n_centers);
+    %{
+        
+        *** This function decides which center points belong 
+        to which class
+    
+        *** It returns an n_classes-by-floor(n_centers/n_classes) matrix of
+        center indices: row i lists the centers assigned to class i
+
+    %}
+
+    % Draw the indices 1..n_centers in random order, without repetition
+    locs = datasample(1:n_centers,n_centers,'Replace',false);
+    
+    % Number of centers for each class (rounded down)
+    n_cen_fe_cls = int32(floor(n_centers/n_classes));
+    
+    cls_locs = zeros(n_classes,n_cen_fe_cls);
+    k = 1;
+    for i = 1:n_classes
+        for j = 1:n_cen_fe_cls
+            cls_locs(i,j) = locs(k);
+            k = k+1;
+        end
+    end
+
+end
--- a/MC_NDCC_matlab/dataset.csv
+++ b/MC_NDCC_matlab/dataset.csv
--- a/MC_NDCC_matlab/generate_dataset.m
+++ b/MC_NDCC_matlab/generate_dataset.m
@@ -0,0 +1,30 @@
+function M = generate_dataset(centers_matrix, ss,class_locations, n_features)
+    %{
+        *** Returns the generated dataset: one row per sample, n_features
+        feature columns followed by one class-label column (labels 1..r)
+
+        *** Samples of a center are drawn from a normal distribution with
+        mu = that center's full row of centers_matrix and sigma = 5
+
+        *** ss(i,j) is the number of samples drawn around the center whose
+        row index is class_locations(i,j); those samples get label i
+    %}
+
+    [r, c] = size(class_locations);
+    % Start from empty matrices and append one cluster at a time
+    M = zeros(0, n_features);
+    l = zeros(0, 1);
+    for i = 1:r
+        for j = 1:c
+            n_ij = double(ss(i,j));
+            % Index with (row, :) to get the whole center vector; a single
+            % linear index would return one scalar and be used as the mean
+            % of every feature, collapsing the cluster onto the diagonal
+            mu = centers_matrix(double(class_locations(i,j)), :);
+            tmp = normrnd(repmat(mu, n_ij, 1), 5, [n_ij, n_features]);
+            l = [l; ones(n_ij, 1)*(i)];
+            M = [M; tmp];
+        end
+    end
+    M = [M, l];
+end
--- a/MC_NDCC_matlab/get_centers_mat.m
+++ b/MC_NDCC_matlab/get_centers_mat.m
@@ -0,0 +1,17 @@
+function [centers_matrix] = get_centers_mat(centers_list, n_features)
+    %{
+        *** Returns the n_centers-by-n_features matrix of center locations,
+        where n_centers = 2 * length(centers_list) * n_features
+        *** Each row is zero except in one feature column, which holds
+        +centers_list(i) (odd rows) or -centers_list(i) (even rows)
+    %}
+
+    n_centers = 2*length(centers_list) * n_features;
+    centers_matrix = zeros(n_centers, n_features);
+    for i = 1:length(centers_list)
+        for j = 1:n_features
+            centers_matrix((i-1)*2*n_features + (2*j)-1, j)   =  centers_list(i);
+            centers_matrix((i-1)*2*n_features + (2*j), j) = -centers_list(i);
+        end
+    end
+end
--- a/MC_NDCC_matlab/sample_spliter.m
+++ b/MC_NDCC_matlab/sample_spliter.m
@@ -0,0 +1,19 @@
+function n_each_c = sample_spliter(n_samples, n_classes, n_centers)
+    %{
+        *** Returns an n_classes-by-floor(n_centers/n_classes) matrix of
+        sample counts, one entry per (class, center of that class) pair
+
+        *** Each sample is added to a uniformly random entry, so every
+        entry has the same expected count
+ 
+    %} 
+    count = 0;
+    n_cen_fe_cls = int32(floor(n_centers/n_classes));
+    n_each_c = zeros(n_classes, n_cen_fe_cls);
+    while n_samples > count
+        r = randi(n_classes);
+        r2 = randi(n_cen_fe_cls);
+        n_each_c(r, r2) = n_each_c(r, r2) + 1;
+        count = count + 1;
+    end
+end
--- a/MC_NDCC_python/MC_NDCC.py
+++ b/MC_NDCC_python/MC_NDCC.py
@@ -0,0 +1,98 @@
+# TITLE: Multi-Class Normal Distribution Cubic Clusters Dataset Generator
+# AUTHOR: Dr. Hossein Moosaei, Saeed Khosravi
+# Date: 10/09/2020
+
+# NORMALLY DISTRIBUTED CLUSTERS is a data generator. 
+# It generates a series of random centers for multivariate
+# normal distributions. NDC randomly generates a fraction
+# of data for each center, i.e. what fraction of data points
+# will come from this center. NDC randomly generates a 
+# separating plane. Based on this plane, classes are 
+# chosen for each center. NDC then randomly generates the 
+# points from the distributions. NDC can increase 
+# inseparability by increasing variances of distributions.
+# A measure of "true" separability is obtained by looking 
+# at how many points end up on the wrong side of the 
+# separating plane. All values are taken as integers 
+# for simplicity.
+
+import numpy as np
+import pandas as pd
+class MC_NDCC:
+    # Prompts for the dataset dimensions when constructed, then builds and stores the dataset in self.M
+    def __init__(self):
+        self.n_samples  = int(input("Enter number of samples: \n"))
+        self.n_features = int(input("Enter number of features: \n"))
+        self.n_classes  = int(input("Enter number of classes: \n"))
+        # Centers sit at +/-100, +/-300 and +/-500 on each feature axis: 2 * 3 * n_features centers in total
+        self.centers_list    = [100, 300, 500]
+        self.center_points   = self.centers_matrix(self.centers_list, self.n_features)
+        self.n_centers       = 2*len(self.centers_list)*self.n_features
+        self.class_locations = self.class_center_locations(self.n_classes, self.n_centers)
+        self.ss = self.sample_spliter(self.n_samples, self.n_classes, self.n_centers)
+        r, c    = self.class_locations.shape
+        self.M  = np.zeros((0, self.n_features))
+        self.l  = np.zeros((0, 1))
+        for i in range(r):
+            for j in range(c):
+                self.temp = np.random.normal(loc = self.center_points[int(self.class_locations[i, j])],size = (int(self.ss[i,j]), self.n_features),scale = 5)
+                self.label_temp = np.ones((int(self.ss[i,j]), 1))*(i+1)  # class labels are 1-based
+                self.l = np.concatenate((self.l, self.label_temp), axis = 0)
+                self.M = np.concatenate((self.M, self.temp) , axis = 0)
+        self.M = np.concatenate((self.M, self.l), axis = 1).astype('int32')  # label becomes the last column; everything is cast to int32
+        np.random.shuffle(self.M)  # shuffle the rows in place
+    def sample_spliter(self, n_samples, n_classes, n_centers):
+        # Returns an (n_classes, floor(n_centers/n_classes)) array of sample counts, one per (class, center) pair.
+        # Each of the n_samples samples is added to a uniformly random entry, so all entries have the same expected count.
+        count = 0
+        n_cen_fe_cls = int(np.floor(n_centers/n_classes))
+        n_each_c = np.zeros((n_classes, n_cen_fe_cls))
+        while(n_samples > count):
+            r = np.random.randint(n_classes)
+            r2 = np.random.randint(n_cen_fe_cls)
+            n_each_c[r, r2] += 1
+            count += 1
+        return n_each_c
+
+    def class_center_locations(self, n_classes, n_centers):
+        # Randomly assigns distinct center indices to classes.
+        # Each class receives floor(n_centers / n_classes) centers;
+        # any centers left over after that are not assigned to a class.
+    
+        # It returns a float array of shape (n_classes, centers_per_class):
+        # row i holds the indices of the centers that belong to class i.
+        
+        rng = np.random.default_rng()
+        # Random permutation of the center indices 0 .. n_centers-1 (no repeats)
+        locs = rng.choice(n_centers, n_centers, replace=False)
+        # Number of centers for each class (rounded down)
+        n_cen_fe_cls = int(np.floor(n_centers/n_classes))
+        cls_locs = np.zeros((n_classes,n_cen_fe_cls))
+        k = 0
+        for i in range(n_classes):
+            for j in range(n_cen_fe_cls):
+                cls_locs[i,j] = locs[k]
+                k += 1 
+        return cls_locs
+
+    def centers_matrix(self, centers_list, n_features):
+        # Returns the (2*len(centers_list)*n_features, n_features) matrix of center coordinates.
+        # Every row is zero except for one feature, which holds +value or -value taken from centers_list.
+        n_centers = 2*len(centers_list)*n_features
+        centers_matrix = np.zeros((n_centers, n_features))
+        for i in range(len(centers_list)):
+            for j in range(n_features):
+                centers_matrix[i*2*n_features + 2*j  , j] =  centers_list[i]
+                centers_matrix[i*2*n_features + 2*j+1, j] = -centers_list[i]
+        return centers_matrix
+    
+        
+    def get_matrix(self):
+        # Return the dataset built in __init__: feature columns followed by the label column
+        return self.M
+    
+    def get_csv(self, filename):
+        # Write the dataset to `filename` as CSV without a header row or an index column
+        df = pd.DataFrame(self.M)
+        df.to_csv(filename, header = False, index = False)
+        print(f'Dataset saved as {filename} in current directory. ')
--- a/README.md
+++ b/README.md
@@ -0,0 +1,65 @@
+# MC_NDCC
+
+Multi-Class Normal Distribution Cubic Clusters Dataset Generator
+
+## Description
+
+MC_NDCC is a data generator that creates synthetic datasets with multiple classes using normally distributed clusters. It generates random centers for multivariate normal distributions, assigns class labels based on separating planes, and randomly generates data points from these distributions.
+
+## Features
+
+- Generate multi-class datasets with customizable number of samples, features, and classes
+- Random center generation for multivariate normal distributions
+- Automatic class assignment based on separating planes
+- Support for Python and MATLAB implementations
+
+## Requirements
+
+### Python
+- numpy
+- pandas
+
+### MATLAB
+- MATLAB R2019b or later
+
+## Usage
+
+### Python
+
+```python
+from MC_NDCC import MC_NDCC
+
+# Initialize an instance (will prompt for inputs)
+ndcc = MC_NDCC()
+
+# Get the dataset as a numpy matrix
+dataset = ndcc.get_matrix()
+
+# Save the dataset as a CSV file
+ndcc.get_csv('dataset.csv')
+```
+
+### MATLAB
+
+1. Run `MC_NDCC_matlab/NDCC.m`
+2. Enter the number of samples, features, and classes when prompted
+3. The generated dataset will be saved as `dataset.csv` in the current directory
+
+## Project Structure
+
+```
+MC_NDCC/
+├── MC_NDCC.py              # Python implementation (root)
+├── MC_NDCC_python/         # Python package directory
+├── MC_NDCC_matlab/         # MATLAB implementation
+└── README.md               # This file
+```
+
+## Authors
+
+Dr. Hossein Moosaei, Saeed Khosravi
+
+## Date
+
+10/09/2020
+