|
""" |
|
Preliminary preprocessing on the data, such as: |
|
- correcting column names |
|
- encoding the target column |
|
""" |
|
|
|
import pandas as pd |
|
from sklearn import preprocessing |
|
|
|
|
|
# --- Column handling -------------------------------------------------------

# Name of the label column; its values are cleaned up and label-encoded below.
TARGET_COLUMN = "prognosis"

# Columns removed from both data sets before any other processing.
# NOTE(review): "Unnamed: N" is the name pandas gives a header-less column;
# presumably an artifact of the raw CSV layout - confirm against the data.
DROP_COLUMNS = ["Unnamed: 133"]

# Old column name -> corrected column name (mostly stray spaces removed).
RENAME_COLUMNS = {
    "scurring": "scurving",
    "dischromic _patches": "dischromic_patches",
    "spotting_ urination": "spotting_urination",
    "foul_smell_of urine": "foul_smell_of_urine",
}

# Old target label -> normalized target label (applied before title-casing).
RENAME_VALUES = {
    "(vertigo) Paroymsal Positional Vertigo": "Paroymsal Positional Vertigo",
    "Dimorphic hemmorhoids(piles)": "Dimorphic hemmorhoids (piles)",
    "Peptic ulcer diseae": "Peptic Ulcer",
}

# --- Input files -----------------------------------------------------------

# Raw CSV files, given relative to the current working directory.
TRAINING_FILE_NAME = "./data/Training.csv"
TESTING_FILE_NAME = "./data/Testing.csv"
|
|
|
def _clean_frame(
    df: pd.DataFrame,
    drop_columns: list,
    rename_columns: dict,
    rename_values: dict,
    target_column: str,
) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Steps, in order:

    1. drop every column listed in *drop_columns* (missing ones are ignored),
    2. rename columns according to *rename_columns*,
    3. replace target labels according to *rename_values*,
    4. title-case every target label with ``str.title``.

    Args:
        df: Raw data set containing *target_column*.
        drop_columns: Column names to remove.
        rename_columns: Mapping of old column name -> new column name.
        rename_values: Mapping of old target label -> new target label.
        target_column: Name of the label column.

    Returns:
        A new DataFrame with the corrections applied.
    """
    cleaned = df.drop(columns=drop_columns, errors="ignore").rename(
        columns=rename_columns
    )
    # Assign the result back rather than calling ``replace(inplace=True)`` on
    # the column selection: that selection is a temporary object, so with
    # pandas Copy-on-Write the in-place update never reaches the frame.
    cleaned[target_column] = (
        cleaned[target_column].replace(rename_values).apply(str.title)
    )
    return cleaned


def main() -> None:
    """Preprocess the raw training/testing CSV files and write the results.

    Reads ``TRAINING_FILE_NAME`` and ``TESTING_FILE_NAME``, cleans both
    frames identically, adds a ``<TARGET_COLUMN>_encoded`` column produced by
    a ``LabelEncoder`` fitted on the training labels, casts every column
    except the raw target to float32 and writes both frames to
    ``./data/*_preprocessed.csv``.
    """
    # Read and clean both data sets with exactly the same rules.
    df_train = _clean_frame(
        pd.read_csv(TRAINING_FILE_NAME),
        drop_columns=DROP_COLUMNS,
        rename_columns=RENAME_COLUMNS,
        rename_values=RENAME_VALUES,
        target_column=TARGET_COLUMN,
    )
    df_test = _clean_frame(
        pd.read_csv(TESTING_FILE_NAME),
        drop_columns=DROP_COLUMNS,
        rename_columns=RENAME_COLUMNS,
        rename_values=RENAME_VALUES,
        target_column=TARGET_COLUMN,
    )

    # Fit the encoder on the training labels only, then apply the same
    # label -> integer mapping to both data sets.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(df_train[TARGET_COLUMN].to_numpy())

    encoded_column = f"{TARGET_COLUMN}_encoded"
    df_train[encoded_column] = label_encoder.transform(
        df_train[TARGET_COLUMN].to_numpy()
    )
    df_test[encoded_column] = label_encoder.transform(
        df_test[TARGET_COLUMN].to_numpy()
    )

    # Everything except the raw (string) target is cast to float32; this list
    # is computed after the encoded column was added, so it is cast as well.
    float_columns = df_train.columns.drop([TARGET_COLUMN])
    df_train[float_columns] = df_train[float_columns].astype("float32")
    df_test[float_columns] = df_test[float_columns].astype("float32")

    # Persist the preprocessed data sets without the index column.
    df_train.to_csv(path_or_buf="./data/Training_preprocessed.csv", index=False)
    df_test.to_csv(path_or_buf="./data/Testing_preprocessed.csv", index=False)


if __name__ == "__main__":
    main()
|
|