Spaces:

prthgo
/

Tabular-Data-Analysis-and-Auto-ML

Sleeping

App Files Files Community

prthgo committed on Oct 30, 2023

Commit

6335d24

•

1 Parent(s): e04b6b4

Upload 2 files

Browse files

Files changed (2) hide show

app.py +1018 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,1018 @@

+import streamlit as st
+import numpy as np
+import pandas as pd
+import io
+import matplotlib.pyplot as plt
+from matplotlib.ticker import PercentFormatter
+import seaborn as sns
+from sklearn.preprocessing import (
+    OneHotEncoder,
+    OrdinalEncoder,
+    StandardScaler,
+    MinMaxScaler,
+)
+from sklearn.model_selection import train_test_split
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import RandomOverSampler, SMOTE
+from sklearn.linear_model import Ridge, Lasso, LogisticRegression
+from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from sklearn.svm import SVR, SVC
+from sklearn.naive_bayes import MultinomialNB
+from xgboost import XGBRFRegressor, XGBRFClassifier
+from lightgbm import LGBMRegressor, LGBMClassifier
+from sklearn.metrics import (
+    mean_absolute_error,
+    mean_squared_error,
+    mean_squared_error,
+    r2_score,
+)
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    roc_auc_score,
+    confusion_matrix,
+)
+import pickle
+# Page metadata for the Streamlit app: title and icon passed to set_page_config.
+st.set_page_config(page_title="Tabular Data Analysis and Auto ML", page_icon="🤖")
+# Global seaborn appearance: "white" style, "poster" context with fonts scaled to 0.7.
+sns.set_style("white")
+sns.set_context("poster", font_scale=0.7)
+# Hex colour list. It is not referenced in the visible part of this file;
+# presumably consumed by the plotting helpers defined further down (TODO confirm).
+palette = [
+    "#1d7874",
+    "#679289",
+    "#f4c095",
+    "#ee2e31",
+    "#ffb563",
+    "#918450",
+    "#f85e00",
+    "#a41623",
+    "#9a031e",
+    "#d6d6d6",
+    "#ffee32",
+    "#ffd100",
+    "#333533",
+    "#202020",
+]
+def main():
+    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
+    process = st.sidebar.button("Process")
+    option = st.sidebar.radio(
+        "Select an Option: ",
+        (
+            "Basic EDA",
+            "Univariate Analysis",
+            "Bivariate Analysis",
+            "Preprocess",
+            "Training and Evaluation",
+        ),
+    )
+    placeholder = st.empty()
+    placeholder.markdown(
+    "<h1 style='text-align: center;'>Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
+    unsafe_allow_html=True
+)
+    if file is not None and process:
+        data = load_csv(file)
+        st.session_state["data"] = data
+    if "data" in st.session_state:
+        data = st.session_state["data"]
+        placeholder.empty()
+        if option == "Basic EDA":
+            st.markdown(
+                "<h1 style='text-align: center;'>Basic EDA</h1>", unsafe_allow_html=True
+            )
+            st.subheader("Data Overview")
+            st.write(data_overview(data))
+            st.write(duplicate(data))
+            st.dataframe(data.head())
+            st.subheader("Data Types and Unique Value Counts")
+            display_data_info(data)
+            st.subheader("Missing Data")
+            missing_data(data)
+            st.subheader("Value Counts")
+            value_counts(data)
+            st.subheader("Descriptive Statistics")
+            st.write(data.describe().T)
+        if option == "Univariate Analysis":
+            st.markdown(
+                "<h1 style='text-align: center;'>Univariate Analysis</h1>",
+                unsafe_allow_html=True,
+            )
+            plot = st.radio(
+                "Select a chart: ",
+                ("Count Plot", "Pie Chart", "Histogram", "Violin Plot", "Scatter Plot"),
+            )
+            if plot == "Count Plot":
+                column = st.selectbox(
+                    "Select a column", [""] + list(data.select_dtypes("O"))
+                )
+                if column:
+                    countplot(data, column)
+            if plot == "Pie Chart":
+                column = st.selectbox(
+                    "Select a column", [""] + list(data.select_dtypes("O"))
+                )
+                if column:
+                    piechart(data, column)
+            if plot == "Histogram":
+                column = st.selectbox(
+                    "Select a column",
+                    [""] + list(data.select_dtypes(include=["int", "float"])),
+                )
+                if column:
+                    histogram(data, column)
+            if plot == "Violin Plot":
+                column = st.selectbox(
+                    "Select a column",
+                    [""] + list(data.select_dtypes(include=["int", "float"])),
+                )
+                if column:
+                    violinplot(data, column)
+            if plot == "Scatter Plot":
+                column = st.selectbox(
+                    "Select a column",
+                    [""] + list(data.select_dtypes(include=["int", "float"])),
+                )
+                if column:
+                    scatterplot(data, column)
+        if option == "Bivariate Analysis":
+            st.markdown(
+                "<h1 style='text-align: center;'>Bivariate Analysis</h1>",
+                unsafe_allow_html=True,
+            )
+            plot = st.radio(
+                "Select a chart: ",
+                ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
+            )
+            if plot == "Scatter Plot":
+                columns = st.multiselect(
+                    "Select two columns",
+                    [""] + list(data.select_dtypes(include=["int", "float"])),
+                )
+                if columns:
+                    biscatterplot(data, columns)
+            if plot == "Bar Plot":
+                columns = st.multiselect("Select two columns", list(data.columns))
+                if columns:
+                    bibarplot(data, columns)
+            if plot == "Box Plot":
+                columns = st.multiselect("Select two columns", list(data.columns))
+                if columns:
+                    biboxplot(data, columns)
+            if plot == "Pareto Chart":
+                column = st.selectbox(
+                    "Select a columns",
+                    [""] + list(data.select_dtypes(include="object")),
+                )
+                if column:
+                    paretoplot(data, column)
+        if option == "Preprocess":
+            st.markdown(
+                "<h1 style='text-align: center;'>Data Preprocessing</h1>",
+                unsafe_allow_html=True,
+            )
+            operation = st.radio(
+                "Select preprocessing step: ",
+                (
+                    "Drop Columns",
+                    "Handling Missing Values",
+                    "Encode Categorical Features",
+                ),
+            )
+            if operation == "Drop Columns":
+                columns = st.multiselect("Select Columns to drop: ", (data.columns))
+                drop_columns = st.button("Drop Columns")
+                if drop_columns:
+                    data.drop(columns, axis=1, inplace=True)
+                    st.success("Dropped selected columns✅✅✅")
+            elif operation == "Handling Missing Values":
+                num_missing = st.selectbox(
+                    "Select a Approach (Numerical columns only): ",
+                    ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
+                ).lower()
+                cat_missing = st.selectbox(
+                    "Select a Approach (Categorical columns only): ",
+                    ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
+                ).lower()
+                hmv = st.button("Handle Missing Values")
+                if hmv:
+                    if num_missing:
+                        num_data = data.select_dtypes(include=["int64", "float64"])
+                        if num_missing == "drop":
+                            data = data.dropna(subset=num_data.columns)
+                        elif num_missing in [
+                            "mean",
+                            "median",
+                            "backward fill",
+                            "forward fill",
+                        ]:
+                            if num_missing == "mean":
+                                fill_values = num_data.mean()
+                            elif num_missing == "median":
+                                fill_values = num_data.median()
+                            elif num_missing == "backward fill":
+                                fill_values = num_data.bfill()
+                            elif num_missing == "forward fill":
+                                fill_values = num_data.ffill()
+                            data.fillna(value=fill_values, inplace=True)
+                            st.success(
+                                "Imputed missing values in numerical columns with selected approach."
+                            )
+                    if cat_missing:
+                        cat_data = data.select_dtypes(exclude=["int", "float"])
+                        if cat_missing == "drop":
+                            data = data.dropna(subset=cat_data.columns)
+                        elif cat_missing == "most frequent values":
+                            mode_values = data[cat_data.columns].mode().iloc[0]
+                            data[cat_data.columns] = data[cat_data.columns].fillna(
+                                mode_values
+                            )
+                        elif cat_missing == "replace with 'unknown'":
+                            data[cat_data.columns] = data[cat_data.columns].fillna(
+                                "Unknown"
+                            )
+                        st.success(
+                            "Imputed missing values in categorical columns with selected approach."
+                        )
+            elif operation == "Encode Categorical Features":
+                oe_columns = st.multiselect(
+                    "Choose Columns for Ordinal Encoding",
+                    [""] + list(data.select_dtypes(include="object")),
+                )
+                st.info("Other columns will be One Hot Encoded.")
+                encode_columns = st.button("Encode Columns")
+                if encode_columns:
+                    bool_columns = data.select_dtypes(include=bool).columns
+                    data[bool_columns] = data[bool_columns].astype(int)
+                    if oe_columns:
+                        oe = OrdinalEncoder()
+                        data[oe_columns] = oe.fit_transform(
+                            data[oe_columns].astype("str")
+                        )
+                    try:
+                        remaining_cat_cols = [
+                        col
+                        for col in data.select_dtypes(include="object")
+                        if col not in oe_columns
+                    ]
+                    except:
+                        pass
+                    if len(remaining_cat_cols) > 0:
+                        data = pd.get_dummies(
+                            data, columns=remaining_cat_cols, drop_first=False
+                        )
+                        bool_columns = data.select_dtypes(include=bool).columns
+                        data[bool_columns] = data[bool_columns].astype(int)
+                    st.success("Encoded categorical columns")
+            preprocessed_data_csv = data.to_csv(index=False)
+            # Create a StringIO object to handle the data
+            preprocessed_data_buffer = io.StringIO()
+            preprocessed_data_buffer.write(preprocessed_data_csv)
+            preprocessed_data_bytes = preprocessed_data_buffer.getvalue()
+            # Now you can add a download button for the preprocessed data
+            if st.download_button(
+                label="Download Preprocessed Data",
+                key="preprocessed_data",
+                on_click=None,
+                data=preprocessed_data_bytes.encode(),
+                file_name="preprocessed_data.csv",
+                mime="text/csv",
+            ):
+                pass
+        if option == "Training and Evaluation":
+            st.markdown(
+                "<h1 style='text-align: center;'>Training and Evaluation</h1>",
+                unsafe_allow_html=True,
+            )
+            algo = st.selectbox("Choose Algorithm Type:", ("", "Regression", "Classification"))
+            if algo == "Regression":
+                target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
+                try:
+                    X = data.drop(target, axis=1)
+                    Y = data[target]
+                except Exception as e:
+                    st.write(str(e))
+                st.write(
+                    "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
+                )
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X, Y, test_size=0.2, random_state=42
+                )
+                scale = st.selectbox(
+                    "Choose how do you want to scale features:",
+                    ("", "Standard Scaler", "Min Max Scaler"),
+                )
+                if scale == "Standard Scaler":
+                    scaler = StandardScaler()
+                    X_train = scaler.fit_transform(X_train)
+                    X_test = scaler.transform(X_test)
+                elif scale == "Min Max Scaler":
+                    scaler = MinMaxScaler()
+                    X_train = scaler.fit_transform(X_train)
+                    X_test = scaler.transform(X_test)
+                model = st.selectbox(
+                    "Choose Regression Model for training: ",
+                    (
+                        "",
+                        "Ridge Regression",
+                        "Decision Tree Regressor",
+                        "Random Forest Regressor",
+                        "SVR",
+                        "XGBRF Regressor",
+                        "LGBM Regressor",
+                    ),
+                )
+                if model == "Ridge Regression":
+                    reg = Ridge(alpha=1.0)
+                    reg.fit(X_train, y_train)
+                    pred = reg.predict(X_test)
+                    st.write(
+                        "Mean Absolute Error (MAE): {:.4f}".format(
+                            mean_absolute_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Mean Squared Error (MSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Root Mean Squared Error (RMSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test, squared=False)
+                        )
+                    )
+                    st.write("R-squared (R²): {:.4f}".format(r2_score(pred, y_test)))
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(reg),
+                        file_name="ridge_regression_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("ridge_regression_model.pkl", "wb") as model_file:
+                            pickle.dump(reg, model_file)
+                elif model == "Decision Tree Regressor":
+                    reg = DecisionTreeRegressor(max_depth=10)
+                    reg.fit(X_train, y_train)
+                    pred = reg.predict(X_test)
+                    st.write(
+                        "Mean Absolute Error (MAE): {:.4f}".format(
+                            mean_absolute_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Mean Squared Error (MSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Root Mean Squared Error (RMSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test, squared=False)
+                        )
+                    )
+                    st.write("R-squared (R²): {:.4f}".format(r2_score(pred, y_test)))
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(reg),
+                        file_name="decision_tree_regression_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open(
+                            "decision_tree_regression_model.pkl", "wb"
+                        ) as model_file:
+                            pickle.dump(reg, model_file)
+                elif model == "Random Forest Regressor":
+                    reg = RandomForestRegressor(max_depth=10, n_estimators=100)
+                    reg.fit(X_train, y_train)
+                    pred = reg.predict(X_test)
+                    st.write(
+                        "Mean Absolute Error (MAE): {:.4f}".format(
+                            mean_absolute_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Mean Squared Error (MSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Root Mean Squared Error (RMSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test, squared=False)
+                        )
+                    )
+                    st.write("R-squared (R²): {:.4f}".format(r2_score(pred, y_test)))
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(reg),
+                        file_name="random_forest_regression_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open(
+                            "random_forest_regression_model.pkl", "wb"
+                        ) as model_file:
+                            pickle.dump(reg, model_file)
+                elif model == "SVR":
+                    reg = SVR(C=1.0, epsilon=0.2)
+                    reg.fit(X_train, y_train)
+                    pred = reg.predict(X_test)
+                    st.write(
+                        "Mean Absolute Error (MAE): {:.4f}".format(
+                            mean_absolute_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Mean Squared Error (MSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Root Mean Squared Error (RMSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test, squared=False)
+                        )
+                    )
+                    st.write("R-squared (R²): {:.4f}".format(r2_score(pred, y_test)))
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(reg),
+                        file_name="svr_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("svr_model.pkl", "wb") as model_file:
+                            pickle.dump(reg, model_file)
+                elif model == "XGBRF Regressor":
+                    reg = XGBRFRegressor(reg_lambda=1)
+                    reg.fit(X_train, y_train)
+                    pred = reg.predict(X_test)
+                    st.write(
+                        "Mean Absolute Error (MAE): {:.4f}".format(
+                            mean_absolute_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Mean Squared Error (MSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Root Mean Squared Error (RMSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test, squared=False)
+                        )
+                    )
+                    st.write("R-squared (R²): {:.4f}".format(r2_score(pred, y_test)))
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(reg),
+                        file_name="xgbrf_regression_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("xgbrf_regression_model.pkl", "wb") as model_file:
+                            pickle.dump(reg, model_file)
+                elif model == "LGBM Regressor":
+                    reg = LGBMRegressor(reg_lambda=1)
+                    reg.fit(X_train, y_train)
+                    pred = reg.predict(X_test)
+                    st.write(
+                        "Mean Absolute Error (MAE): {:.4f}".format(
+                            mean_absolute_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Mean Squared Error (MSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test)
+                        )
+                    )
+                    st.write(
+                        "Root Mean Squared Error (RMSE): {:.4f}".format(
+                            mean_squared_error(pred, y_test, squared=False)
+                        )
+                    )
+                    st.write("R-squared (R²): {:.4f}".format(r2_score(pred, y_test)))
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(reg),
+                        file_name="lgbm_regression_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("lgbm_regression_model.pkl", "wb") as model_file:
+                            pickle.dump(reg, model_file)
+            elif algo == "Classification":
+                target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
+                try:
+                    X = data.drop(target, axis=1)
+                    Y = data[target]
+                except Exception as e:
+                    st.write(str(e))
+                st.write(
+                    "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
+                )
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X, Y, test_size=0.2, random_state=42
+                )
+                balance = st.selectbox(
+                    "Do you want to balance dataset?", ("", "Yes", "No")
+                )
+                if balance == "Yes":
+                    piechart(data, target)
+                    sample = st.selectbox(
+                        "Which approach you want to use?",
+                        ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
+                    )
+                    if sample == "Random Under Sampling":
+                        rus = RandomUnderSampler(random_state=42)
+                        X_train, y_train = rus.fit_resample(X_train, y_train)
+                    elif sample == "Random Over Sampling":
+                        ros = RandomOverSampler(random_state=42)
+                        X_train, y_train = ros.fit_resample(X_train, y_train)
+                    elif sample == "SMOTE":
+                        smote = SMOTE(random_state=42)
+                        X_train, y_train = smote.fit_resample(X_train, y_train)
+                scale = st.selectbox(
+                    "Choose how do you want to scale features:",
+                    ("", "Standard Scaler", "Min Max Scaler"),
+                )
+                if scale == "Standard Scaler":
+                    scaler = StandardScaler()
+                    X_train = scaler.fit_transform(X_train)
+                    X_test = scaler.transform(X_test)
+                elif scale == "Min Max Scaler":
+                    scaler = MinMaxScaler()
+                    X_train = scaler.fit_transform(X_train)
+                    X_test = scaler.transform(X_test)
+                model = st.selectbox(
+                    "Choose Classification Model for training: ",
+                    (
+                        "",
+                        "Logistic Regression",
+                        "Decision Tree Classifier",
+                        "Random Forest Classifier",
+                        "SVC",
+                        "XGBRF Classifier",
+                        "LGBM Classifier",
+                    ),
+                )
+                if model == "Logistic Regression":
+                    clf = LogisticRegression(penalty="l2")
+                    clf.fit(X_train, y_train)
+                    pred = clf.predict(X_test)
+                    st.write(
+                        "Accuracy Score: {:.4f}".format(accuracy_score(pred, y_test))
+                    )
+                    try:
+                        st.write("F1 Score: {:.4f}".format(f1_score(pred, y_test)))
+                    except ValueError:
+                        st.write("Macro F1 Score: {:.4f}".format(f1_score(pred, y_test, average='macro')))
+                    plot_confusion_matrix(
+                        pred, y_test, "Logistic Regression Confusion Matrix "
+                    )
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(clf),
+                        file_name="logistic_regression_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("logistic_regression_model.pkl", "wb") as model_file:
+                            pickle.dump(clf, model_file)
+                if model == "Decision Tree Classifier":
+                    clf = DecisionTreeClassifier(max_depth=5)
+                    clf.fit(X_train, y_train)
+                    pred = clf.predict(X_test)
+                    st.write(
+                        "Accuracy Score: {:.4f}".format(accuracy_score(pred, y_test))
+                    )
+                    try:
+                        st.write("F1 Score: {:.4f}".format(f1_score(pred, y_test)))
+                    except ValueError:
+                        st.write("Macro F1 Score: {:.4f}".format(f1_score(pred, y_test, average='macro')))
+                    plot_confusion_matrix(
+                        pred, y_test, "DecisionTree Classifier Confusion Matrix "
+                    )
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(clf),
+                        file_name="decision_tree_classifier_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open(
+                            "decision_tree_classifier_model.pkl", "wb"
+                        ) as model_file:
+                            pickle.dump(clf, model_file)
+                if model == "Random Forest Classifier":
+                    clf = RandomForestClassifier(n_estimators=100, max_depth=5)
+                    clf.fit(X_train, y_train)
+                    pred = clf.predict(X_test)
+                    st.write(
+                        "Accuracy Score: {:.4f}".format(accuracy_score(pred, y_test))
+                    )
+                    try:
+                        st.write("F1 Score: {:.4f}".format(f1_score(pred, y_test)))
+                    except ValueError:
+                        st.write("Macro F1 Score: {:.4f}".format(f1_score(pred, y_test, average='macro')))
+                    plot_confusion_matrix(
+                        pred, y_test, "RandomForest Classifier Confusion Matrix "
+                    )
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(clf),
+                        file_name="random_forest_classifier_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open(
+                            "random_forest_classifier_model.pkl", "wb"
+                        ) as model_file:
+                            pickle.dump(clf, model_file)
+                if model == "SVC":
+                    clf = SVC(C=1.5)
+                    clf.fit(X_train, y_train)
+                    pred = clf.predict(X_test)
+                    st.write(
+                        "Accuracy Score: {:.4f}".format(accuracy_score(pred, y_test))
+                    )
+                    try:
+                        st.write("F1 Score: {:.4f}".format(f1_score(pred, y_test)))
+                    except ValueError:
+                        st.write("Macro F1 Score: {:.4f}".format(f1_score(pred, y_test, average='macro')))
+                    plot_confusion_matrix(pred, y_test, "SVC Confusion Matrix ")
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(clf),
+                        file_name="svc_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("svc_model.pkl", "wb") as model_file:
+                            pickle.dump(clf, model_file)
+                if model == "XGBRF Classifier":
+                    clf = XGBRFClassifier(reg_lambda=1.0)
+                    clf.fit(X_train, y_train)
+                    pred = clf.predict(X_test)
+                    st.write(
+                        "Accuracy Score: {:.4f}".format(accuracy_score(pred, y_test))
+                    )
+                    try:
+                        st.write("F1 Score: {:.4f}".format(f1_score(pred, y_test)))
+                    except ValueError:
+                        st.write("Macro F1 Score: {:.4f}".format(f1_score(pred, y_test, average='macro')))
+                    plot_confusion_matrix(
+                        pred, y_test, "XGBRF Classifier Confusion Matrix "
+                    )
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(clf),
+                        file_name="xgbrf_classifier_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("xgbrf_classifier_model.pkl", "wb") as model_file:
+                            pickle.dump(clf, model_file)
+                if model == "LGBM Classifier":
+                    clf = LGBMClassifier(reg_lambda=1.0)
+                    clf.fit(X_train, y_train)
+                    pred = clf.predict(X_test)
+                    st.write(
+                        "Accuracy Score: {:.4f}".format(accuracy_score(pred, y_test))
+                    )
+                    try:
+                        st.write("F1 Score: {:.4f}".format(f1_score(pred, y_test)))
+                    except ValueError:
+                        st.write("Macro F1 Score: {:.4f}".format(f1_score(pred, y_test, average='macro')))
+                    plot_confusion_matrix(
+                        pred, y_test, "LGBM Classifier Confusion Matrix "
+                    )
+                    if st.download_button(
+                        label="Download Trained Model",
+                        key="trained_model",
+                        on_click=None,
+                        data=pickle.dumps(clf),
+                        file_name="lgbm_classifier_model.pkl",
+                        mime="application/octet-stream",
+                    ):
+                        with open("lgbm_classifier_model.pkl", "wb") as model_file:
+                            pickle.dump(clf, model_file)
+def load_csv(file):
+    data = pd.read_csv(file)
+    return data
+def data_overview(data):
+    r, c = data.shape
+    st.write(f"Number of Rows: {r}")
+    return f"Number of Columns: {c}"
+def missing_data(data):
+    missing_values = data.isna().sum()
+    missing_values = missing_values[missing_values > 0]
+    missing_value_per = (missing_values / data.shape[0]) * 100
+    missing_value_per = missing_value_per.round(2).astype(str) + "%"
+    missing_df = pd.DataFrame(
+        {"Missing Values": missing_values, "Percentage": missing_value_per}
+    )
+    missing_df_html = missing_df.to_html(
+        classes="table table-striped", justify="center"
+    )
+    return st.markdown(missing_df_html, unsafe_allow_html=True)
+def display_data_info(data):
+    dtypes = pd.DataFrame(data.dtypes, columns=["Data Type"])
+    dtypes.reset_index(inplace=True)
+    nunique = pd.DataFrame(data.nunique(), columns=["Unique Counts"])
+    nunique.reset_index(inplace=True)
+    dtypes.columns = ["Column", "Data Type"]
+    nunique.columns = ["Column", "Unique Counts"]
+    combined_df = pd.merge(dtypes, nunique, on="Column")
+    combined_df_html = combined_df.to_html(
+        classes="table table-striped", justify="center"
+    )
+    return st.markdown(combined_df_html, unsafe_allow_html=True)
+def value_counts(data):
+    column = st.selectbox("Select a Column", [""] + list(data.columns))
+    if column:
+        st.write(data[column].value_counts())
+def duplicate(data):
+    if data.duplicated().any():
+        st.write(
+            f"There is/are {data.duplicated().sum()} duplicate rows in the DataFrame. Duplicated values will be dropped."
+        )
+        data.drop_duplicates(keep="first", inplace=True)
+        return ""
+    else:
+        return "There are no duplicate rows in the DataFrame."
+def countplot(data, col):
+    plt.figure(figsize=(10, 6))
+    sns.countplot(y=data[col], palette=palette[1:], edgecolor="#1c1c1c", linewidth=2)
+    plt.title(f"Countplot of {col} Column")
+    st.pyplot(plt)
+def piechart(data, col):
+    value_counts = data[col].value_counts()
+    plt.figure(figsize=(8, 6))
+    plt.pie(
+        value_counts,
+        labels=value_counts.index,
+        autopct="%1.1f%%",
+        colors=palette,
+        shadow=False,
+        wedgeprops=dict(edgecolor="#1c1c1c"),
+    )
+    plt.title(f"Pie Chart of {col} Column")
+    st.pyplot(plt)
+def histogram(data, col):
+    plt.figure(figsize=(10, 6))
+    sns.histplot(
+        data[col],
+        kde=True,
+        color=palette[4],
+        fill=True,
+        edgecolor="#1c1c1c",
+        linewidth=2,
+    )
+    plt.title(f"Histogram of {col} Column")
+    st.pyplot(plt)
+def violinplot(data, col):
+    plt.figure(figsize=(10, 6))
+    sns.violinplot(data[col], color=palette[8])
+    plt.title(f"Violin Plot of {col} Column")
+    st.pyplot(plt)
+def scatterplot(data, col):
+    plt.figure(figsize=(10, 8))
+    sns.scatterplot(data[col], color=palette[3])
+    plt.title(f"Scatter Plot of {col} Column")
+    st.pyplot(plt)
+def biscatterplot(data, cols):
+    try:
+        plt.figure(figsize=(10, 8))
+        sns.scatterplot(
+            data=data,
+            x=cols[0],
+            y=cols[1],
+            palette=palette[1:],
+            edgecolor="#1c1c1c",
+            linewidth=2,
+        )
+        plt.title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
+        st.pyplot(plt)
+    except Exception as e:
+        st.write(str(e))
+def bibarplot(data, cols):
+    try:
+        plt.figure(figsize=(10, 8))
+        sns.barplot(
+            data=data,
+            x=cols[0],
+            y=cols[1],
+            palette=palette[1:],
+            edgecolor="#1c1c1c",
+            linewidth=2,
+        )
+        plt.title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
+        st.pyplot(plt)
+    except Exception as e:
+        st.write(str(e))
+def biboxplot(data, cols):
+    try:
+        plt.figure(figsize=(10, 8))
+        sns.boxplot(data=data, x=cols[0], y=cols[1], palette=palette[1:], linewidth=2)
+        plt.title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
+        st.pyplot(plt)
+    except Exception as e:
+        st.write(str(e))
+def paretoplot(data, categorical_col):
+    try:
+        value_counts = data[categorical_col].value_counts()
+        cumulative_percentage = (value_counts / value_counts.sum()).cumsum()
+        pareto_df = pd.DataFrame(
+            {
+                "Categories": value_counts.index,
+                "Frequency": value_counts.values,
+                "Cumulative Percentage": cumulative_percentage.values * 100,
+            }
+        )
+        pareto_df = pareto_df.sort_values(by="Frequency", ascending=False)
+        fig, ax1 = plt.subplots(figsize=(10, 8))
+        ax1.bar(
+            pareto_df["Categories"],
+            pareto_df["Frequency"],
+            color=palette[1:],
+            edgecolor="#1c1c1c",
+            linewidth=2,
+        )
+        ax2 = ax1.twinx()
+        ax2.yaxis.set_major_formatter(PercentFormatter())
+        ax2.plot(
+            pareto_df["Categories"],
+            pareto_df["Cumulative Percentage"],
+            color=palette[3],
+            marker="D",
+            ms=10,
+        )
+        ax1.set_xlabel(categorical_col)
+        ax1.set_ylabel("Frequency", color=palette[0])
+        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
+        st.pyplot(fig)
+    except Exception as e:
+        pass
+def plot_confusion_matrix(y_true, y_pred, title):
+    cm = confusion_matrix(y_true, y_pred)
+    plt.figure(figsize=(6, 4))
+    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
+    plt.xlabel("Predicted Label")
+    plt.ylabel("True Label")
+    plt.title(title)
+    st.pyplot(plt)
+if __name__ == "__main__":  # Only start the app when this file is run as a script.
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+scikit-learn
+numpy
+pandas
+matplotlib
+seaborn
+imblearn
+xgboost
+lightgbm