def split_path(full_path):
    """Split *full_path* into (directory, basename-without-extension).

    e.g. "/data/train.csv" -> ("/data", "train").
    """
    import os

    directory = os.path.dirname(full_path)
    filename = os.path.splitext(os.path.basename(full_path))[0]
    return directory, filename


def write_textfile(path, data_list):
    """Write each item of *data_list* to *path*, one item per line.

    The record format "value space newline" is kept exactly as the original
    produced it, so any downstream consumer of these report files still works.
    """
    with open(path, "w") as file:
        for data in data_list:
            file.write(f"{data} \n")


def missing_value_handler(data_path):
    """Load the CSV at *data_path*, report and KNN-impute missing values.

    Side effects: writes "columns.txt", "missing.txt" and "no_missing.txt"
    into the CSV's directory.

    Returns the imputed DataFrame.
    """
    import pandas
    from sklearn.impute import KNNImputer

    data_directory, _ = split_path(data_path)
    data_frame = pandas.read_csv(data_path)

    # Drop a surrogate-key "id" column if present — it carries no signal
    # and would distort neighbour distances during imputation.
    if "id" in data_frame.columns:
        data_frame = data_frame.drop("id", axis="columns")
    columns = list(data_frame.columns)
    write_textfile(f"{data_directory}/columns.txt", columns)

    # Per-column missing-value counts before imputation.
    # NOTE(review): iterating a Series yields values only, so missing.txt
    # contains counts without their column names — kept as-is for output
    # compatibility; confirm whether named counts are wanted.
    missing_value_counts = data_frame.isna().sum()
    write_textfile(f"{data_directory}/missing.txt", missing_value_counts)

    # Fill missing values via 5-nearest-neighbour imputation, then verify
    # (and record) that nothing is left missing.
    imputer = KNNImputer(n_neighbors=5)
    data_imputed = imputer.fit_transform(data_frame)
    data_frame_imputed = pandas.DataFrame(data_imputed, columns=columns)
    missing_value_counts = data_frame_imputed.isna().sum()
    write_textfile(f"{data_directory}/no_missing.txt", missing_value_counts)
    return data_frame_imputed


def scaling_handler(data_frame, method="robust_scaling"):
    """Scale every feature column of *data_frame*, leaving "label" untouched.

    Currently only method="robust_scaling" (sklearn RobustScaler) is
    implemented.

    Raises:
        ValueError: if *method* is not a supported scaling method.
    """
    # BUG FIX: the original fell through and silently returned None for any
    # unknown method; fail loudly instead.
    if method != "robust_scaling":
        raise ValueError(f"unsupported scaling method: {method!r}")

    import pandas
    from sklearn.preprocessing import RobustScaler

    labels = data_frame["label"]
    x = data_frame.drop("label", axis=1)
    x_scale = RobustScaler().fit_transform(x)
    data_frame_scaled = pandas.DataFrame(x_scale, columns=x.columns)
    # Re-attach labels positionally (.values) since the scaled frame has a
    # fresh RangeIndex.
    data_frame_scaled["label"] = labels.values
    return data_frame_scaled