From ff7986e7f4f7b0aac80cefba7975f1a45c744205 Mon Sep 17 00:00:00 2001
From: saeedkhosravi94
Date: Sun, 30 Nov 2025 23:28:08 +0100
Subject: [PATCH] editing readme.md

---
 README.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 data.py   |  6 -----
 utils.py  |  5 ----
 3 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 61344d7..8323e4d 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,80 @@
 # Electrocardiogram
+We are dealing with an extremely imbalanced dataset of electrocardiogram signals that contains binary class labels: good and bad signals.
+
+### STEP 1: Fill missing values
+
+ All the columns in our data contain missing values, ranging from 25 to 70 per column. We fill them using `from sklearn.impute import KNNImputer`:
+
+ ```
+ imputer = KNNImputer(n_neighbors=5)
+ data_imputed = imputer.fit_transform(data_frame)
+ data_frame_imputed = pandas.DataFrame(data_imputed, columns=columns)
+
+ missing_value_counts = data_frame_imputed.isna().sum()
+ write_textfile(f"{data_directory}/no_missing.txt", missing_value_counts)
+ return data_frame_imputed
+ ```
+
+### STEP 2: Scaling
+
+ We used `from sklearn.preprocessing import RobustScaler` to handle scaling.
+
+ ```
+ scaler = RobustScaler()
+ x = data_frame.drop("label", axis=1)
+ x_scale = scaler.fit_transform(x)
+ data_frame_scaled = pandas.DataFrame(x_scale, columns=x.columns)
+ data_frame_scaled["label"] = labels.values
+ ```
+
+### STEP 3: k-fold cross validation + stratify classes + balancing training data
+
+ First of all, we split the dataset into two parts: train (85%) and test (15%). To make sure that the majority and minority classes are
+ distributed fairly, we passed `stratify=y`:
+
+ ```
+ x_train, x_test, y_train, y_test = train_test_split(
+     X,
+     y,
+     test_size=0.15,
+     stratify=y,
+     random_state=42,
+ )
+ ```
+ Then, for the train dataset, we used `from sklearn.model_selection import StratifiedKFold` so that this class distribution also applies to the train and
+ validation data.
+
+ ```
+ skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+ for fold_num, (train_idx, val_idx) in enumerate(
+     tqdm.tqdm(skf.split(X, y), total=skf.n_splits, desc="Training Folds"), start=1
+ ):
+     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
+     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
+ ```
+ Finally, we use one of these balancing methods, `from imblearn.over_sampling import ADASYN, SMOTE, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE`, to augment samples for the training data only:
+
+ ```
+ if smote:
+     if smote_method.lower() == "kmeans":
+         sampler = KMeansSMOTE(
+             k_neighbors=5,
+             cluster_balance_threshold=0.1,
+             random_state=random_state,
+         )
+     elif smote_method.lower() == "smote":
+         sampler = SMOTE(k_neighbors=5, random_state=random_state)
+     elif smote_method.lower() == "svmsmote":
+         sampler = SVMSMOTE(k_neighbors=5, random_state=random_state)
+     elif smote_method.lower() == "borderline":
+         sampler = BorderlineSMOTE(k_neighbors=5, random_state=random_state)
+     elif smote_method.lower() == "adasyn":
+         sampler = ADASYN(n_neighbors=5, random_state=random_state)
+     else:
+         raise ValueError(f"Unknown smote_method: {smote_method}")
+
+     X_train, y_train = sampler.fit_resample(X_train, y_train)
+
+ model.fit(X_train, y_train)
+ ```
diff --git a/data.py b/data.py
index a9cb4ac..0f0a2cd 100644
--- a/data.py
+++ b/data.py
@@ -1,12 +1,6 @@
-"""
-Saeed Khosravi - 26 Nov 2025
-"""
-
 import os
-
 import pandas
-from train import test_model, train_model_with_kfold
 from utils import missing_value_handler, scaling_handler
 
 # STEP 1: handle missing values + remove id column + robust scaling
diff --git a/utils.py b/utils.py
index b94545d..6f9dea6 100644
--- a/utils.py
+++ b/utils.py
@@ -1,8 +1,3 @@
-"""
-Saeed Khosravi - 27 Nov 2025
-"""
-
-
 def split_path(full_path):
     import os