editing readme.md
README.md
# Electrocardiogram

We are dealing with an extremely imbalanced dataset of electrocardiogram signals with binary class labels: good and bad signals.

### STEP 1: Fill missing values

Every column in our data contains missing values, with counts ranging from 25 to 70. We fill them using `from sklearn.impute import KNNImputer`:

```
# Impute each missing value from its 5 nearest neighbours
imputer = KNNImputer(n_neighbors=5)
data_imputed = imputer.fit_transform(data_frame)
data_frame_imputed = pandas.DataFrame(data_imputed, columns=columns)  # columns: the original column names

# Confirm that nothing is missing any more and log the per-column counts
missing_value_counts = data_frame_imputed.isna().sum()
write_textfile(f"{data_directory}/no_missing.txt", missing_value_counts)
return data_frame_imputed
```
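
For context, here is a minimal sketch (not part of the repository) of how the per-column missing-value counts quoted above could be inspected before imputation, assuming `data_frame` is the raw pandas DataFrame:

```
# Per-column missing-value counts on the raw data; according to the text
# above they fall roughly between 25 and 70.
missing_before = data_frame.isna().sum()
print(missing_before.min(), missing_before.max())
```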
### STEP 2: Scaling

We use `from sklearn.preprocessing import RobustScaler` to handle scaling; it centers and scales with the median and interquartile range, so it is less sensitive to outliers than standard scaling.

```
scaler = RobustScaler()

# Scale the features only; the label column is kept aside
x = data_frame.drop("label", axis=1)
x_scale = scaler.fit_transform(x)

# Rebuild a DataFrame and reattach the labels
data_frame_scaled = pandas.DataFrame(x_scale, columns=x.columns)
data_frame_scaled["label"] = labels.values  # labels: the original label column, extracted before scaling
```
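
If the scaler is fitted on the training features only, the same fitted instance should be reused on held-out data with `transform` rather than `fit_transform`, so no statistics leak from the test set. A minimal sketch, where `x_test_raw` is an assumed name for unscaled test features:

```
# Hypothetical reuse of the fitted RobustScaler on unseen data;
# x_test_raw is an assumed variable name, not part of the repository code.
x_test_scaled = scaler.transform(x_test_raw)
test_frame_scaled = pandas.DataFrame(x_test_scaled, columns=x.columns)
```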
### STEP 3: k-fold cross validation + stratify classes + balancing training data

First of all, we split the dataset into two parts: train (85%) and test (15%). To make sure the majority and minority classes are distributed fairly across both parts, we pass `stratify=y`:

```
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,      # 15% held out as the final test set
    stratify=y,          # preserve the class ratio in both splits
    random_state=42,
)
```
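
As a quick sanity check (not part of the repository), the class proportions of the full data and of both splits can be compared to confirm that stratification preserved them:

```
# With stratify=y these three distributions should be almost identical.
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))
```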

Then, on the train dataset, we use `from sklearn.model_selection import StratifiedKFold` so that the same class distribution also holds for the train and validation folds:

```
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

for fold_num, (train_idx, val_idx) in enumerate(
    tqdm.tqdm(skf.split(X, y), total=skf.n_splits, desc="Training Folds"), start=1
):
    # Stratified indices: each fold keeps the original class ratio
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
```

Finally, we use one of the balancing methods from `from imblearn.over_sampling import ADASYN, SMOTE, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE` to augment samples for the training data only, so the validation fold stays untouched:

```
if smote:
    # Pick an oversampler; all of them synthesise new minority-class samples
    if smote_method.lower() == "kmeans":
        sampler = KMeansSMOTE(
            k_neighbors=5,
            cluster_balance_threshold=0.1,
            random_state=random_state,
        )
    elif smote_method.lower() == "smote":
        sampler = SMOTE(k_neighbors=5, random_state=random_state)
    elif smote_method.lower() == "svmsmote":
        sampler = SVMSMOTE(k_neighbors=5, random_state=random_state)
    elif smote_method.lower() == "borderline":
        sampler = BorderlineSMOTE(k_neighbors=5, random_state=random_state)
    elif smote_method.lower() == "adasyn":
        sampler = ADASYN(n_neighbors=5, random_state=random_state)
    else:
        raise ValueError(f"Unknown smote_method: {smote_method}")

    # Resample only the training fold; validation data is left as-is
    X_train, y_train = sampler.fit_resample(X_train, y_train)

model.fit(X_train, y_train)
```
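
To verify that oversampling actually balanced the fold, the label counts can be compared right before and after the `fit_resample` call above. A small sketch, not part of the repository:

```
from collections import Counter

counts_before = Counter(y_train)                            # imbalanced fold counts
X_train, y_train = sampler.fit_resample(X_train, y_train)
counts_after = Counter(y_train)                             # should now be roughly balanced
print("before:", counts_before, "after:", counts_after)
```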

data.py

```
"""
Saeed Khosravi - 26 Nov 2025
"""

import os

import pandas

from train import test_model, train_model_with_kfold
from utils import missing_value_handler, scaling_handler

# STEP 1: handle missing values + remove id column + robust scaling
```
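
The rest of data.py is not shown in this diff. The following is only a hypothetical sketch of how the imported helpers might be wired together; the file name, data path, and function signatures are assumptions, not the repository's actual code:

```
# Hypothetical wiring of the helpers imported above; the real signatures in
# utils.py and train.py may differ.
data_directory = "data"                                                 # assumed data location
data_frame = pandas.read_csv(os.path.join(data_directory, "ecg.csv"))  # assumed file name

data_frame = missing_value_handler(data_frame)   # STEP 1: impute missing values
data_frame = scaling_handler(data_frame)         # STEP 2: robust scaling

model = train_model_with_kfold(data_frame)       # STEP 3: stratified k-fold + balancing
test_model(model, data_frame)                    # evaluate on the held-out test split
```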