init commit

This commit is contained in:
2025-11-08 18:45:48 +01:00
parent 6d1d845c4e
commit 2f82041a21
9 changed files with 1393 additions and 0 deletions

98
MC_NDCC.py Executable file
View File

@@ -0,0 +1,98 @@
# TITLE: Multi-Class Normal Distribution Cubic Clusters Dataset Generator
# AUTHOR: Dr. Hossein Moosaei, Saeed Khosravi
# Date: 10/09/2020
# NORMALLY DISTRIBUTED CLUSTERS is a data generator.
# It generates a series of random centers for multivariate
# normal distributions. NDC randomly generates a fraction
# of data for each center, i.e. what fraction of data points
# will come from this center. NDC randomly generates a
# separating plane. Based on this plane, classes are
# chosen for each center. NDC then randomly generates the
# points from the distributions. NDC can increase
# inseparability by increasing variances of distributions.
# A measure of "true" separability is obtained by looking
# at how many points end up on the wrong side of the
# separating plane. All values are taken as integers
# for simplicity.
import numpy as np
import pandas as pd
class MC_NDCC:
    """Multi-class generator of normally distributed clusters.

    For every value ``c`` in ``centers_list`` a cluster center is placed at
    ``+c`` and at ``-c`` on each feature axis, which yields
    ``2 * len(centers_list) * n_features`` centers. The centers are randomly
    partitioned among the classes (the same number for each class), every
    sample is assigned to a random (class, center) cell, and its coordinates
    are drawn from a normal distribution around that center.

    Attributes set by ``__init__``:
        n_samples, n_features, n_classes: the requested dimensions.
        centers_list: magnitudes used to place centers on each axis.
        center_points: ``(n_centers, n_features)`` center coordinates.
        n_centers: total number of candidate centers.
        class_locations: ``(n_classes, centers_per_class)`` center indices.
        ss: ``(n_classes, centers_per_class)`` sample counts per cell.
        l: ``(n_samples, 1)`` class labels (1-based) before shuffling.
        M: ``(n_samples, n_features + 1)`` int32 dataset, label in the last
            column, rows shuffled.
    """

    def __init__(self, n_samples=None, n_features=None, n_classes=None,
                 centers_list=None, scale=5):
        """Generate the dataset.

        Args:
            n_samples: number of rows to generate. Prompted for when None.
            n_features: number of feature columns. Prompted for when None.
            n_classes: number of classes. Prompted for when None.
            centers_list: axis magnitudes for the centers; defaults to
                ``[100, 300, 500]``.
            scale: standard deviation of every cluster; defaults to 5.

        Raises:
            ValueError: if a prompted value is not an integer, or if the
                dimensions cannot produce at least one center per class.
        """
        # Keep the interactive behaviour for callers that pass nothing, and
        # ask in the original order: samples, features, classes.
        if n_samples is None:
            n_samples = int(input("Enter number of samples: \n"))
        if n_features is None:
            n_features = int(input("Enter number of features: \n"))
        if n_classes is None:
            n_classes = int(input("Enter number of classes: \n"))

        self.n_samples = int(n_samples)
        self.n_features = int(n_features)
        self.n_classes = int(n_classes)
        if centers_list is None:
            self.centers_list = [100, 300, 500]
        else:
            self.centers_list = list(centers_list)
        self.n_centers = 2 * len(self.centers_list) * self.n_features

        # Fail early with a clear message instead of an opaque RNG error.
        if self.n_samples < 0:
            raise ValueError("n_samples must not be negative")
        if self.n_features < 1:
            raise ValueError("n_features must be at least 1")
        if not 1 <= self.n_classes <= self.n_centers:
            raise ValueError(
                f"n_classes must be between 1 and the number of centers "
                f"({self.n_centers}), got {self.n_classes}"
            )

        self.center_points = self.centers_matrix(self.centers_list,
                                                 self.n_features)
        self.class_locations = self.class_center_locations(self.n_classes,
                                                           self.n_centers)
        self.ss = self.sample_spliter(self.n_samples, self.n_classes,
                                      self.n_centers)

        # Collect the per-cell chunks and concatenate once; the leading empty
        # arrays keep the result well-shaped when nothing is generated.
        feature_chunks = [np.zeros((0, self.n_features))]
        label_chunks = [np.zeros((0, 1))]
        for class_idx, (center_ids, counts) in enumerate(
                zip(self.class_locations, self.ss)):
            for center_id, count in zip(center_ids, counts):
                count = int(count)
                feature_chunks.append(np.random.normal(
                    loc=self.center_points[int(center_id)],
                    scale=scale,
                    size=(count, self.n_features),
                ))
                # Class labels are 1-based.
                label_chunks.append(np.full((count, 1), class_idx + 1.0))

        self.l = np.concatenate(label_chunks, axis=0)
        features = np.concatenate(feature_chunks, axis=0)
        # All values (coordinates and labels) are stored as integers.
        self.M = np.concatenate((features, self.l), axis=1).astype('int32')
        # Shuffle rows in place so classes are interleaved.
        np.random.shuffle(self.M)

    def sample_spliter(self, n_samples, n_classes, n_centers):
        """Randomly split ``n_samples`` among the (class, center) cells.

        Each sample independently picks a cell uniformly at random, so every
        cell receives about ``n_samples / n_cells`` samples.

        Returns:
            Float array of shape ``(n_classes, n_centers // n_classes)``
            holding the number of samples per cell.

        Raises:
            ValueError: if samples are requested but there is no cell to
                put them in (``n_classes > n_centers``).
        """
        n_cen_fe_cls = int(np.floor(n_centers / n_classes))
        n_each_c = np.zeros((n_classes, n_cen_fe_cls))
        if n_samples <= 0:
            return n_each_c
        if n_each_c.size == 0:
            raise ValueError("n_classes must not exceed n_centers")
        # One vectorised draw of flat cell indices replaces two Python-level
        # RNG calls per sample; a uniform flat index is equivalent to a
        # uniform class followed by a uniform center.
        n_draws = int(np.ceil(n_samples))
        cells = np.random.randint(n_each_c.size, size=n_draws)
        counts = np.bincount(cells, minlength=n_each_c.size)
        return counts.reshape(n_each_c.shape).astype(float)

    def class_center_locations(self, n_classes, n_centers):
        """Randomly assign center indices to classes.

        Returns:
            Float array of shape ``(n_classes, n_centers // n_classes)``;
            row ``i`` holds the distinct center indices (0-based, in
            ``0 .. n_centers - 1``) owned by class ``i``. When ``n_centers``
            is not a multiple of ``n_classes`` the leftover centers are not
            assigned to any class.
        """
        rng = np.random.default_rng()
        # Random permutation of all center indices (no repeats).
        locs = rng.choice(n_centers, n_centers, replace=False)
        n_cen_fe_cls = int(np.floor(n_centers / n_classes))
        n_used = n_classes * n_cen_fe_cls
        # Row-major reshape hands consecutive runs of the permutation to
        # consecutive classes.
        return locs[:n_used].reshape(n_classes, n_cen_fe_cls).astype(float)

    def centers_matrix(self, centers_list, n_features):
        """Return the ``(n_centers, n_features)`` matrix of center points.

        For the ``i``-th magnitude ``c`` and feature ``j`` two rows are
        written: one with ``+c`` in column ``j`` and the next with ``-c`` in
        column ``j``; all other coordinates are zero.
        """
        n_centers = 2 * len(centers_list) * n_features
        centers = np.zeros((n_centers, n_features))
        for i, magnitude in enumerate(centers_list):
            for j in range(n_features):
                row = i * 2 * n_features + 2 * j
                centers[row, j] = magnitude
                centers[row + 1, j] = -magnitude
        return centers

    def get_matrix(self):
        """Return the dataset as a numpy matrix (label in the last column)."""
        return self.M

    def get_csv(self, filename):
        """Save the dataset to ``filename`` as CSV, without header or index."""
        df = pd.DataFrame(self.M)
        df.to_csv(filename, header=False, index=False)
        print(f'Dataset saved as {filename} in current directory. ')

39
MC_NDCC_matlab/NDCC.m Executable file
View File

@@ -0,0 +1,39 @@
%{
NORMALLY DISTRIBUTED CLUSTERS is a data generator.
It generates a series of random centers for multivariate
normal distributions. NDC randomly generates a fraction
of data for each center, i.e. what fraction of data points
will come from this center. NDC randomly generates a
separating plane. Based on this plane, classes are
chosen for each center. NDC then randomly generates the
points from the distributions. NDC can increase
inseparability by increasing variances of distributions.
A measure of "true" separability is obtained by looking
at how many points end up on the wrong side of the
separating plane. All values are taken as integers
for simplicity.
%}
% Magnitudes at which cluster centers are placed on every feature axis
centers_list = [100, 300, 500];

% Ask the user for the dataset dimensions
n_samples = input('Enter the number of samples:\n');
n_features = input('Enter the number of features:\n');
n_classes = input('Enter the number of classes:\n');

% Center coordinates: one row per center, one column per feature.
% get_centers_mat allocates 2*length(centers_list)*n_features rows.
centers_matrix = get_centers_mat(centers_list, n_features);
n_centers = size(centers_matrix, 1);

% Give every class the same number of randomly chosen centers
class_locations = class_center_locations(n_classes, n_centers);

% Randomly decide how many samples go to each (class, center) cell
ss = sample_spliter(n_samples, n_classes, n_centers);

% Draw the samples and append the class label as the last column
ds = generate_dataset(centers_matrix, ss, class_locations, n_features);

% Save the dataset as a csv file in the current directory
writematrix(ds, 'dataset.csv');

View File

@@ -0,0 +1,27 @@
function cls_locs = class_center_locations(n_classes, n_centers)
% CLASS_CENTER_LOCATIONS  Randomly assign center indices to classes.
%   cls_locs = class_center_locations(n_classes, n_centers) returns an
%   n_classes-by-floor(n_centers/n_classes) matrix. Row i holds the
%   distinct center indices (1 .. n_centers) that belong to class i.
%   When n_centers is not a multiple of n_classes, the leftover centers
%   are not assigned to any class.

    % Random permutation of 1..n_centers (no repeats). randperm is base
    % MATLAB, so no toolbox is needed for this step.
    locs = randperm(n_centers);

    % Number of centers for each class
    n_cen_fe_cls = floor(n_centers / n_classes);
    n_used = n_classes * n_cen_fe_cls;

    % reshape fills column by column, so build the transpose and flip it:
    % class i then receives locs((i-1)*n_cen_fe_cls + 1 : i*n_cen_fe_cls).
    cls_locs = reshape(locs(1:n_used), n_cen_fe_cls, n_classes).';
end

1000
MC_NDCC_matlab/dataset.csv Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,30 @@
function M = generate_dataset(centers_matrix, ss, class_locations, n_features)
% GENERATE_DATASET  Draw the samples and attach their class labels.
%   M = generate_dataset(centers_matrix, ss, class_locations, n_features)
%
%   centers_matrix  : n_centers-by-n_features, one center point per row
%   ss              : ss(i,j) is the number of samples to draw for class i
%                     from its j-th center (output of sample_spliter)
%   class_locations : class_locations(i,j) is the row of centers_matrix
%                     used as the j-th center of class i
%   n_features      : number of feature columns
%
%   Returns M, sum(ss(:))-by-(n_features+1): the features followed by the
%   class label (1-based row index of class_locations) in the last column.
%   Every sample is normally distributed around its center with sigma = 5.

    sigma = 5;
    [r, c] = size(class_locations);

    % Allocate the result once instead of growing it inside the loops
    total = sum(ss(:));
    M = zeros(total, n_features + 1);
    last_row = 0;

    for i = 1:r
        for j = 1:c
            n_pts = ss(i, j);
            if n_pts == 0
                continue;
            end
            % Take the WHOLE row of the center. A single subscript such as
            % centers_matrix(k) is linear indexing and returns one scalar,
            % which would give every feature the same mean.
            mu = centers_matrix(class_locations(i, j), :);
            rows = last_row + 1 : last_row + n_pts;
            % mu + sigma * N(0,1), with mu replicated for each sample
            M(rows, 1:n_features) = repmat(mu, n_pts, 1) ...
                + sigma * randn(n_pts, n_features);
            M(rows, end) = i;
            last_row = last_row + n_pts;
        end
    end
end

View File

@@ -0,0 +1,17 @@
function [centers_matrix] = get_centers_mat(centers_list, n_features)
% GET_CENTERS_MAT  Build the matrix of center points.
%   Every value in centers_list contributes 2*n_features consecutive rows:
%   for feature j, one row with +value in column j followed by one row
%   with -value in column j. All other coordinates are zero.

    n_levels = length(centers_list);
    rows_per_level = 2 * n_features;
    centers_matrix = zeros(rows_per_level * n_levels, n_features);

    for level = 1:n_levels
        % First row (minus one) of the group owned by this value
        offset = (level - 1) * rows_per_level;
        for dim = 1:n_features
            pos_row = offset + 2 * dim - 1;
            neg_row = pos_row + 1;
            centers_matrix(pos_row, dim) = centers_list(level);
            centers_matrix(neg_row, dim) = -centers_list(level);
        end
    end
end

19
MC_NDCC_matlab/sample_spliter.m Executable file
View File

@@ -0,0 +1,19 @@
function n_each_c = sample_spliter(n_samples, n_classes, n_centers)
% SAMPLE_SPLITER  Randomly split the samples among (class, center) cells.
%   Returns an n_classes-by-floor(n_centers/n_classes) matrix of counts.
%   Each sample picks a class and then one of that class's centers
%   uniformly at random, so the cells end up with roughly equal counts.

    n_cen_fe_cls = int32(floor(n_centers / n_classes));
    n_each_c = zeros(n_classes, n_cen_fe_cls);

    % One iteration per sample; ceil keeps the iteration count equal to
    % that of a "while n_samples > count" loop starting from count = 0.
    for k = 1:ceil(n_samples)
        cls = randi(n_classes);
        cen = randi(n_cen_fe_cls);
        n_each_c(cls, cen) = n_each_c(cls, cen) + 1;
    end
end

98
MC_NDCC_python/MC_NDCC.py Executable file
View File

@@ -0,0 +1,98 @@
# TITLE: Multi-Class Normal Distribution Cubic Clusters Dataset Generator
# AUTHOR: Dr. Hossein Moosaei, Saeed Khosravi
# Date: 10/09/2020
# NORMALLY DISTRIBUTED CLUSTERS is a data generator.
# It generates a series of random centers for multivariate
# normal distributions. NDC randomly generates a fraction
# of data for each center, i.e. what fraction of data points
# will come from this center. NDC randomly generates a
# separating plane. Based on this plane, classes are
# chosen for each center. NDC then randomly generates the
# points from the distributions. NDC can increase
# inseparability by increasing variances of distributions.
# A measure of "true" separability is obtained by looking
# at how many points end up on the wrong side of the
# separating plane. All values are taken as integers
# for simplicity.
import numpy as np
import pandas as pd
class MC_NDCC:
    """Multi-class generator of normally distributed clusters.

    For every value ``c`` in ``centers_list`` a cluster center is placed at
    ``+c`` and at ``-c`` on each feature axis, which yields
    ``2 * len(centers_list) * n_features`` centers. The centers are randomly
    partitioned among the classes (the same number for each class), every
    sample is assigned to a random (class, center) cell, and its coordinates
    are drawn from a normal distribution around that center.

    Attributes set by ``__init__``:
        n_samples, n_features, n_classes: the requested dimensions.
        centers_list: magnitudes used to place centers on each axis.
        center_points: ``(n_centers, n_features)`` center coordinates.
        n_centers: total number of candidate centers.
        class_locations: ``(n_classes, centers_per_class)`` center indices.
        ss: ``(n_classes, centers_per_class)`` sample counts per cell.
        l: ``(n_samples, 1)`` class labels (1-based) before shuffling.
        M: ``(n_samples, n_features + 1)`` int32 dataset, label in the last
            column, rows shuffled.
    """

    def __init__(self, n_samples=None, n_features=None, n_classes=None,
                 centers_list=None, scale=5):
        """Generate the dataset.

        Args:
            n_samples: number of rows to generate. Prompted for when None.
            n_features: number of feature columns. Prompted for when None.
            n_classes: number of classes. Prompted for when None.
            centers_list: axis magnitudes for the centers; defaults to
                ``[100, 300, 500]``.
            scale: standard deviation of every cluster; defaults to 5.

        Raises:
            ValueError: if a prompted value is not an integer, or if the
                dimensions cannot produce at least one center per class.
        """
        # Keep the interactive behaviour for callers that pass nothing, and
        # ask in the original order: samples, features, classes.
        if n_samples is None:
            n_samples = int(input("Enter number of samples: \n"))
        if n_features is None:
            n_features = int(input("Enter number of features: \n"))
        if n_classes is None:
            n_classes = int(input("Enter number of classes: \n"))

        self.n_samples = int(n_samples)
        self.n_features = int(n_features)
        self.n_classes = int(n_classes)
        if centers_list is None:
            self.centers_list = [100, 300, 500]
        else:
            self.centers_list = list(centers_list)
        self.n_centers = 2 * len(self.centers_list) * self.n_features

        # Fail early with a clear message instead of an opaque RNG error.
        if self.n_samples < 0:
            raise ValueError("n_samples must not be negative")
        if self.n_features < 1:
            raise ValueError("n_features must be at least 1")
        if not 1 <= self.n_classes <= self.n_centers:
            raise ValueError(
                f"n_classes must be between 1 and the number of centers "
                f"({self.n_centers}), got {self.n_classes}"
            )

        self.center_points = self.centers_matrix(self.centers_list,
                                                 self.n_features)
        self.class_locations = self.class_center_locations(self.n_classes,
                                                           self.n_centers)
        self.ss = self.sample_spliter(self.n_samples, self.n_classes,
                                      self.n_centers)

        # Collect the per-cell chunks and concatenate once; the leading empty
        # arrays keep the result well-shaped when nothing is generated.
        feature_chunks = [np.zeros((0, self.n_features))]
        label_chunks = [np.zeros((0, 1))]
        for class_idx, (center_ids, counts) in enumerate(
                zip(self.class_locations, self.ss)):
            for center_id, count in zip(center_ids, counts):
                count = int(count)
                feature_chunks.append(np.random.normal(
                    loc=self.center_points[int(center_id)],
                    scale=scale,
                    size=(count, self.n_features),
                ))
                # Class labels are 1-based.
                label_chunks.append(np.full((count, 1), class_idx + 1.0))

        self.l = np.concatenate(label_chunks, axis=0)
        features = np.concatenate(feature_chunks, axis=0)
        # All values (coordinates and labels) are stored as integers.
        self.M = np.concatenate((features, self.l), axis=1).astype('int32')
        # Shuffle rows in place so classes are interleaved.
        np.random.shuffle(self.M)

    def sample_spliter(self, n_samples, n_classes, n_centers):
        """Randomly split ``n_samples`` among the (class, center) cells.

        Each sample independently picks a cell uniformly at random, so every
        cell receives about ``n_samples / n_cells`` samples.

        Returns:
            Float array of shape ``(n_classes, n_centers // n_classes)``
            holding the number of samples per cell.

        Raises:
            ValueError: if samples are requested but there is no cell to
                put them in (``n_classes > n_centers``).
        """
        n_cen_fe_cls = int(np.floor(n_centers / n_classes))
        n_each_c = np.zeros((n_classes, n_cen_fe_cls))
        if n_samples <= 0:
            return n_each_c
        if n_each_c.size == 0:
            raise ValueError("n_classes must not exceed n_centers")
        # One vectorised draw of flat cell indices replaces two Python-level
        # RNG calls per sample; a uniform flat index is equivalent to a
        # uniform class followed by a uniform center.
        n_draws = int(np.ceil(n_samples))
        cells = np.random.randint(n_each_c.size, size=n_draws)
        counts = np.bincount(cells, minlength=n_each_c.size)
        return counts.reshape(n_each_c.shape).astype(float)

    def class_center_locations(self, n_classes, n_centers):
        """Randomly assign center indices to classes.

        Returns:
            Float array of shape ``(n_classes, n_centers // n_classes)``;
            row ``i`` holds the distinct center indices (0-based, in
            ``0 .. n_centers - 1``) owned by class ``i``. When ``n_centers``
            is not a multiple of ``n_classes`` the leftover centers are not
            assigned to any class.
        """
        rng = np.random.default_rng()
        # Random permutation of all center indices (no repeats).
        locs = rng.choice(n_centers, n_centers, replace=False)
        n_cen_fe_cls = int(np.floor(n_centers / n_classes))
        n_used = n_classes * n_cen_fe_cls
        # Row-major reshape hands consecutive runs of the permutation to
        # consecutive classes.
        return locs[:n_used].reshape(n_classes, n_cen_fe_cls).astype(float)

    def centers_matrix(self, centers_list, n_features):
        """Return the ``(n_centers, n_features)`` matrix of center points.

        For the ``i``-th magnitude ``c`` and feature ``j`` two rows are
        written: one with ``+c`` in column ``j`` and the next with ``-c`` in
        column ``j``; all other coordinates are zero.
        """
        n_centers = 2 * len(centers_list) * n_features
        centers = np.zeros((n_centers, n_features))
        for i, magnitude in enumerate(centers_list):
            for j in range(n_features):
                row = i * 2 * n_features + 2 * j
                centers[row, j] = magnitude
                centers[row + 1, j] = -magnitude
        return centers

    def get_matrix(self):
        """Return the dataset as a numpy matrix (label in the last column)."""
        return self.M

    def get_csv(self, filename):
        """Save the dataset to ``filename`` as CSV, without header or index."""
        df = pd.DataFrame(self.M)
        df.to_csv(filename, header=False, index=False)
        print(f'Dataset saved as {filename} in current directory. ')

View File

@@ -0,0 +1,65 @@
# MC_NDCC
Multi-Class Normal Distribution Cubic Clusters Dataset Generator
## Description
MC_NDCC is a data generator that creates synthetic datasets with multiple classes using normally distributed clusters. It generates random centers for multivariate normal distributions, assigns class labels based on separating planes, and randomly generates data points from these distributions.
## Features
- Generate multi-class datasets with customizable number of samples, features, and classes
- Random center generation for multivariate normal distributions
- Automatic class assignment based on separating planes
- Support for Python and MATLAB implementations
## Requirements
### Python
- numpy
- pandas
### MATLAB
- MATLAB R2019b or later
## Usage
### Python
```python
from MC_NDCC import MC_NDCC
# Initialize an instance (will prompt for inputs)
ndcc = MC_NDCC()
# Get the dataset as a numpy matrix
dataset = ndcc.get_matrix()
# Save the dataset as a CSV file
ndcc.get_csv('dataset.csv')
```
### MATLAB
1. Run `MC_NDCC_matlab/NDCC.m`
2. Enter the number of samples, features, and classes when prompted
3. The generated dataset will be saved as `dataset.csv` in the current directory
## Project Structure
```
MC_NDCC/
├── MC_NDCC.py # Python implementation (root)
├── MC_NDCC_python/ # Python implementation (copy of MC_NDCC.py)
├── MC_NDCC_matlab/ # MATLAB implementation
└── README.md # This file
```
## Authors
Dr. Hossein Moosaei, Saeed Khosravi
## Date
10/09/2020