File size: 2,434 Bytes
13fb76e 58df7f1 13fb76e 58df7f1 13fb76e 58df7f1 13fb76e 58df7f1 13fb76e 58df7f1 13fb76e 58df7f1 13fb76e 58df7f1 13fb76e 58df7f1 13fb76e 58df7f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
"""
Preliminary preprocessing on the data, such as:
- correcting column names
- encoding the target column
"""
import pandas as pd
from sklearn import preprocessing
# --- Input files -----------------------------------------------------------
# Relative paths of the raw CSV files read by the script.
TRAINING_FILE_NAME = "./data/Training.csv"
TESTING_FILE_NAME = "./data/Testing.csv"

# --- Column handling -------------------------------------------------------
# Name of the label column; every other column is treated as a feature.
TARGET_COLUMN = "prognosis"

# Columns removed from both frames when present.
# NOTE(review): "Unnamed: 133" looks like the name pandas gives a header-less
# column (e.g. from a trailing delimiter) -- confirm against the raw CSV.
DROP_COLUMNS = ["Unnamed: 133"]

# Raw column header -> corrected header (typos and stray spaces).
RENAME_COLUMNS = {
    "scurring": "scurving",
    "dischromic _patches": "dischromic_patches",
    "spotting_ urination": "spotting_urination",
    "foul_smell_of urine": "foul_smell_of_urine",
}

# Raw target label -> normalized label.
RENAME_VALUES = {
    "(vertigo) Paroymsal Positional Vertigo": "Paroymsal Positional Vertigo",
    "Dimorphic hemmorhoids(piles)": "Dimorphic hemmorhoids (piles)",
    "Peptic ulcer diseae": "Peptic Ulcer",
}
def clean_frame(
    df: pd.DataFrame,
    drop_columns: list,
    rename_columns: dict,
    target_column: str,
    rename_values: dict,
) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Args:
        df: Raw frame as loaded from CSV.
        drop_columns: Column names to remove; names absent from *df* are
            ignored.
        rename_columns: Mapping of raw column name -> corrected column name.
        target_column: Name of the label column.
        rename_values: Mapping of raw label -> normalized label, applied to
            the target column before it is title-cased.

    Returns:
        A new DataFrame with the columns dropped/renamed and the target
        column normalized and title-cased.

    Raises:
        KeyError: If *target_column* is not present after the renaming.
    """
    cleaned = df.drop(columns=drop_columns, errors="ignore").rename(
        columns=rename_columns
    )
    # Assign the result back instead of `cleaned[col].replace(..., inplace=True)`:
    # an in-place call on a column selection is a chained assignment, which
    # warns on recent pandas and leaves the frame unchanged under Copy-on-Write.
    cleaned[target_column] = (
        cleaned[target_column].replace(rename_values).apply(str.title)
    )
    return cleaned


def main() -> None:
    """Load the raw CSV files, preprocess them and write the results.

    Reads `TRAINING_FILE_NAME` and `TESTING_FILE_NAME`, cleans both frames the
    same way, adds a numeric `<TARGET_COLUMN>_encoded` column and writes the
    two `*_preprocessed.csv` files under `./data/`.
    """
    # Load data
    df_train = pd.read_csv(TRAINING_FILE_NAME)
    df_test = pd.read_csv(TESTING_FILE_NAME)

    # Remove useless columns, fix typos in column names and target labels
    df_train = clean_frame(
        df_train, DROP_COLUMNS, RENAME_COLUMNS, TARGET_COLUMN, RENAME_VALUES
    )
    df_test = clean_frame(
        df_test, DROP_COLUMNS, RENAME_COLUMNS, TARGET_COLUMN, RENAME_VALUES
    )

    # Convert the `TARGET_COLUMN` to a numeric label. The encoder is fitted on
    # the training labels only and then reused for the testing labels.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(df_train[TARGET_COLUMN].to_numpy())
    encoded_column = f"{TARGET_COLUMN}_encoded"
    df_train[encoded_column] = label_encoder.transform(
        df_train[TARGET_COLUMN].to_numpy()
    )
    df_test[encoded_column] = label_encoder.transform(
        df_test[TARGET_COLUMN].to_numpy()
    )

    # Cast every column except the textual target to float32. The column list
    # is taken from the training frame and applied to both frames.
    # NOTE(review): this list also contains the `*_encoded` label column added
    # just above, so the encoded label is written as float32 too -- confirm
    # that downstream consumers expect that.
    float_columns = df_train.columns.drop([TARGET_COLUMN])
    float32_dtypes = dict.fromkeys(float_columns, "float32")
    df_train = df_train.astype(float32_dtypes)
    df_test = df_test.astype(float32_dtypes)

    # Save preprocessed data
    df_train.to_csv(path_or_buf="./data/Training_preprocessed.csv", index=False)
    df_test.to_csv(path_or_buf="./data/Testing_preprocessed.csv", index=False)


if __name__ == "__main__":
    main()
|