Spaces:

scikit-learn
/

baseline-trainer

Runtime error

App Files Files Community

mervenoyan committed on Jul 5, 2022

Commit

ae1692d

•

1 Parent(s): 9541eae

misc improvements

Browse files

Files changed (2) hide show

app.py +37 -21
logs.txt +31 -0

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 import pandas as pd
-from huggingface_hub.hf_api import create_repo, upload_folder, upload_file
 from huggingface_hub.repository import Repository
 import subprocess
 import os
@@ -12,8 +12,9 @@ import dabl
 import re
-def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwise="off"):
     df = pd.read_csv(dataset.name)
     if column is not None:
         analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
     else:
@@ -21,7 +22,7 @@ def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwi
         analyze_report.show_html('index.html', open_browser=False)
     repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
-    upload_file(path_or_fileobj ="./index.html", path_in_repo = "index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
     readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
     with open("README.md", "w+") as f:
         f.write(readme)
@@ -40,30 +41,47 @@ def extract_estimator_config(model):
         table += f"| {hyperparameter} | {value} |\n"
     return table
-def train_baseline(dataset, username, dataset_name, token, column):
     df = pd.read_csv(dataset.name)
-    fc = dabl.SimpleClassifier(random_state=0)
-    df_clean = dabl.clean(df)
     X = df_clean.drop(column, axis = 1)
     y = df_clean[column]
     with tempfile.TemporaryDirectory() as tmpdirname:
         from contextlib import redirect_stdout
-        with open('logs.txt', 'w') as f:
-            with redirect_stdout(f):
-                print('Logging training')
-                fc.fit(X, y)
         repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
         readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
         readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
-        readme+="Metrics of the best model:\n\n"
         for elem in str(fc.current_best_).split("\n"):
             readme+= f"{elem}\n\n"
-        readme+= "\n\nSee model plot below:\n\n"
         readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
         with open(f"{tmpdirname}/README.md", "w+") as f:
             f.write(readme)
         with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
@@ -76,7 +94,7 @@ def train_baseline(dataset, username, dataset_name, token, column):
 with gr.Blocks() as demo:
     main_title = gr.Markdown("""# Baseline Trainer 🪄🌟✨""")
-    main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card.""")
     with gr.Tabs():
@@ -87,17 +105,16 @@ with gr.Blocks() as demo:
                     description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
                     dataset = gr.File(label = "Dataset")
                     column = gr.Text(label = "Enter target variable:")
                     dataset_name = gr.Text(label = "Enter dataset name:")
-                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                     token = gr.Textbox(label = "Your Hugging Face Token")
-                    username = gr.Textbox(label = "Your Hugging Face User Name")
                     inference_run = gr.Button("Train")
                     inference_progress = gr.StatusTracker(cover_container=True)
                 outcome = gr.outputs.Textbox(label = "Progress")
                 inference_run.click(
                     train_baseline,
-                    inputs=[dataset, username, dataset_name, token, column],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )
@@ -110,15 +127,14 @@ with gr.Blocks() as demo:
                     column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                     pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                     token = gr.Textbox(label = "Your Hugging Face Token")
-                    username = gr.Textbox(label = "Your Hugging Face User Name")
                     dataset_name = gr.Textbox(label = "Dataset Name")
-                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
                     inference_run = gr.Button("Infer")
                     inference_progress = gr.StatusTracker(cover_container=True)
                 outcome = gr.outputs.Textbox()
                 inference_run.click(
                     analyze_datasets,
-                    inputs=[dataset, dataset_name, username, token, column, pairwise],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )

 import gradio as gr
 import pandas as pd
+from huggingface_hub.hf_api import create_repo, upload_folder, upload_file, HfApi
 from huggingface_hub.repository import Repository
 import subprocess
 import os
 import re
+def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
     df = pd.read_csv(dataset.name)
+    username = HfApi().whoami(token=token)["name"]
     if column is not None:
         analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
     else:
         analyze_report.show_html('index.html', open_browser=False)
     repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
+    upload_file(path_or_fileobj ="./index.html", path_in_repo = "./index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
     readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
     with open("README.md", "w+") as f:
         f.write(readme)
         table += f"| {hyperparameter} | {value} |\n"
     return table
+def detect_training(df, column):
+    """Pick the dabl baseline estimator matching the target column's type.
+
+    Args:
+        df: DataFrame that contains the target column.
+        column: Name of the target column in ``df``.
+
+    Returns:
+        An unfitted ``dabl.SimpleRegressor`` when the target is flagged as
+        ``continuous`` or ``dirty_float``; an unfitted
+        ``dabl.SimpleClassifier`` when it is flagged as ``categorical``,
+        ``low_card_int`` or ``free_string``.
+
+    Raises:
+        ValueError: If the target column is flagged as none of the types
+            above, so neither a regressor nor a classifier can be chosen.
+    """
+    # Detect types once instead of re-running detection for every flag.
+    types = dabl.detect_types(df)
+
+    def flagged(*kinds):
+        # True when the target column carries any of the given type flags.
+        return any(types[kind][column] for kind in kinds)
+
+    if flagged("continuous", "dirty_float"):
+        return dabl.SimpleRegressor()
+    if flagged("categorical", "low_card_int", "free_string"):
+        return dabl.SimpleClassifier()
+    # The old code fell through to `return trainer` with `trainer` unbound,
+    # which surfaced as an opaque UnboundLocalError.
+    raise ValueError(
+        f"Cannot choose a baseline model for target column {column!r}: "
+        "its detected type is neither continuous nor categorical."
+    )
+def edit_types(df):
+    """Clean ``df`` with dabl after overriding two of its detected types.
+
+    Columns that ``dabl.detect_types`` flags as ``low_card_int`` are hinted
+    as "categorical" and columns flagged as ``dirty_float`` are hinted as
+    "continuous". The hints are passed to ``dabl.clean`` and its result is
+    returned.
+    """
+    detected = dabl.detect_types(df)
+    int_like = detected[detected["low_card_int"] == True].index.tolist()
+    float_like = detected[detected["dirty_float"] == True].index.tolist()
+    # Apply "continuous" second so it wins for a column carrying both flags.
+    hints = dict.fromkeys(int_like, "categorical")
+    hints.update(dict.fromkeys(float_like, "continuous"))
+    return dabl.clean(df, type_hints=hints)
+def train_baseline(dataset, dataset_name, token, column):
     df = pd.read_csv(dataset.name)
+    df_clean = edit_types(df)
+    fc = detect_training(df_clean, column)
     X = df_clean.drop(column, axis = 1)
     y = df_clean[column]
     with tempfile.TemporaryDirectory() as tmpdirname:
         from contextlib import redirect_stdout
+        fc.fit(X, y)
+        username = HfApi().whoami(token=token)["name"]
         repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
         readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
         readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
+        readme+="**Metrics of the best model:**\n\n"
         for elem in str(fc.current_best_).split("\n"):
             readme+= f"{elem}\n\n"
+        readme+= "\n\n**See model plot below:**\n\n"
         readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
+        readme+= "\n\nThis model is trained with dabl library as a baseline, for better results, use AutoTrain.\n\n"
         with open(f"{tmpdirname}/README.md", "w+") as f:
             f.write(readme)
         with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
 with gr.Blocks() as demo:
     main_title = gr.Markdown("""# Baseline Trainer 🪄🌟✨""")
+    main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card. For better results, use AutoTrain.""")
     with gr.Tabs():
                     description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
                     dataset = gr.File(label = "Dataset")
                     column = gr.Text(label = "Enter target variable:")
+                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                     dataset_name = gr.Text(label = "Enter dataset name:")
                     token = gr.Textbox(label = "Your Hugging Face Token")
                     inference_run = gr.Button("Train")
                     inference_progress = gr.StatusTracker(cover_container=True)
                 outcome = gr.outputs.Textbox(label = "Progress")
                 inference_run.click(
                     train_baseline,
+                    inputs=[dataset, dataset_name, token, column],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )
                     column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                     pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                     token = gr.Textbox(label = "Your Hugging Face Token")
                     dataset_name = gr.Textbox(label = "Dataset Name")
+                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
                     inference_run = gr.Button("Infer")
                     inference_progress = gr.StatusTracker(cover_container=True)
                 outcome = gr.outputs.Textbox()
                 inference_run.click(
                     analyze_datasets,
+                    inputs=[dataset, dataset_name, token, column, pairwise],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )

logs.txt ADDED Viewed

	@@ -0,0 +1,31 @@

+Logging training
+Running DummyClassifier()
+accuracy: 0.643 average_precision: 0.357 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.392
+=== new best DummyClassifier() (using recall_macro):
+accuracy: 0.643 average_precision: 0.357 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.392
+Running GaussianNB()
+accuracy: 0.623 average_precision: 0.505 roc_auc: 0.590 recall_macro: 0.560 f1_macro: 0.549
+=== new best GaussianNB() (using recall_macro):
+accuracy: 0.623 average_precision: 0.505 roc_auc: 0.590 recall_macro: 0.560 f1_macro: 0.549
+Running MultinomialNB()
+accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588
+=== new best MultinomialNB() (using recall_macro):
+accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588
+Running DecisionTreeClassifier(class_weight='balanced', max_depth=1)
+accuracy: 0.586 average_precision: 0.401 roc_auc: 0.568 recall_macro: 0.568 f1_macro: 0.558
+Running DecisionTreeClassifier(class_weight='balanced', max_depth=5)
+accuracy: 0.590 average_precision: 0.419 roc_auc: 0.564 recall_macro: 0.576 f1_macro: 0.560
+Running DecisionTreeClassifier(class_weight='balanced', min_impurity_decrease=0.01)
+accuracy: 0.582 average_precision: 0.393 roc_auc: 0.563 recall_macro: 0.567 f1_macro: 0.555
+Running LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000)
+accuracy: 0.574 average_precision: 0.487 roc_auc: 0.425 recall_macro: 0.548 f1_macro: 0.547
+Running LogisticRegression(class_weight='balanced', max_iter=1000)
+accuracy: 0.578 average_precision: 0.470 roc_auc: 0.437 recall_macro: 0.562 f1_macro: 0.557
+Best model:
+Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('multinomialnb', MultinomialNB())])
+Best Scores:
+accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588