Spaces:

scikit-learn
/

baseline-trainer

Runtime error

App Files Files Community

mervenoyan committed on Jul 5, 2022

Commit

d2a61f1

•

1 Parent(s): acaa4d9

simplified UI

Browse files

Files changed (1) hide show

app.py +26 -20

app.py CHANGED Viewed

@@ -12,22 +12,23 @@ import dabl
 import re
-def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
     df = pd.read_csv(dataset.name)
     username = HfApi().whoami(token=token)["name"]
     if column is not None:
         analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
     else:
         analyze_report = sv.analyze(df, pairwise_analysis=pairwise)
     analyze_report.show_html('./index.html', open_browser=False)
-    repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
-    upload_file(path_or_fileobj ="./index.html", path_in_repo = "./index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
     readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
     with open("README.md", "w+") as f:
         f.write(readme)
-    upload_file(path_or_fileobj ="./README.md", path_in_repo = "README.md", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
     return f"Your dataset report will be ready at {repo_url}"
@@ -45,9 +46,11 @@ def extract_estimator_config(model):
 def detect_training(df, column):
     if dabl.detect_types(df)["continuous"][column] or dabl.detect_types(df)["dirty_float"][column]:
         trainer = dabl.SimpleRegressor()
     elif dabl.detect_types(df)["categorical"][column] or dabl.detect_types(df)["low_card_int"][column] or dabl.detect_types(df)["free_string"][column]:
         trainer = dabl.SimpleClassifier()
-    return trainer
 def edit_types(df):
     types = dabl.detect_types(df)
@@ -61,10 +64,11 @@ def edit_types(df):
     df_clean = dabl.clean(df, type_hints=type_hints)
     return df_clean
-def train_baseline(dataset, dataset_name, token, column):
     df = pd.read_csv(dataset.name)
     df_clean = edit_types(df)
-    fc = detect_training(df_clean, column)
     X = df_clean.drop(column, axis = 1)
     y = df_clean[column]
@@ -76,21 +80,25 @@ def train_baseline(dataset, dataset_name, token, column):
                 print('Logging training')
                 fc.fit(X, y)
         username = HfApi().whoami(token=token)["name"]
-        repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
-        readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
-        readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
         readme+="**Metrics of the best model:**\n\n"
         for elem in str(fc.current_best_).split("\n"):
             readme+= f"{elem}\n\n"
         readme+= "\n\n**See model plot below:**\n\n"
         readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
-        readme+= "\n\nThis model is trained with dabl library as a baseline, for better results, use [AutoTrain](https://huggingface.co/autotrain).\n\n"
         with open(f"{tmpdirname}/README.md", "w+") as f:
             f.write(readme)
         with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
             pickle.dump(fc, file=f)
-        upload_folder(repo_id =f"{username}/{dataset_name}", folder_path=tmpdirname, repo_type = "model", token=token, path_in_repo="./")
     return f"Your model will be ready at {repo_url}"
@@ -107,10 +115,9 @@ with gr.Blocks() as demo:
                 with gr.Column():
                     title = gr.Markdown(""" ## Train a supervised baseline model""")
                     description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
-                    dataset = gr.File(label = "Dataset")
                     column = gr.Text(label = "Enter target variable:")
                     pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
-                    dataset_name = gr.Text(label = "Enter dataset name:")
                     token = gr.Textbox(label = "Your Hugging Face Token")
                     inference_run = gr.Button("Train")
                     inference_progress = gr.StatusTracker(cover_container=True)
@@ -118,7 +125,7 @@ with gr.Blocks() as demo:
                 outcome = gr.outputs.Textbox(label = "Progress")
                 inference_run.click(
                     train_baseline,
-                    inputs=[dataset, dataset_name, token, column],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )
@@ -127,18 +134,17 @@ with gr.Blocks() as demo:
                 with gr.Column():
                     title = gr.Markdown(""" ## Analyze Dataset """)
                     description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
-                    dataset = gr.File(label = "Dataset")
                     column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                     pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                     token = gr.Textbox(label = "Your Hugging Face Token")
-                    dataset_name = gr.Textbox(label = "Dataset Name")
                     pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
                     inference_run = gr.Button("Infer")
                     inference_progress = gr.StatusTracker(cover_container=True)
                 outcome = gr.outputs.Textbox()
                 inference_run.click(
                     analyze_datasets,
-                    inputs=[dataset, dataset_name, token, column, pairwise],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )

 import re
def analyze_datasets(dataset, token, column=None, pairwise="off"):
    """Build an HTML analysis report for an uploaded CSV and push it to a static Space.

    The report is produced by ``sv.analyze``, written to ``./index.html`` and
    uploaded together with a generated ``README.md`` to the Space
    ``<username>/<dataset file stem>-report``.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute (a path) is read.
        token: Hugging Face Hub token used to look up the user and to push files.
        column: Optional target column to compare the other features against.
            ``None`` or an empty string means "no target".
        pairwise: Forwarded unchanged as ``pairwise_analysis`` to ``sv.analyze``.

    Returns:
        A message containing the value returned by ``create_repo``.
    """
    # Local import: the file's top-level import block is not visible from here.
    from pathlib import Path

    df = pd.read_csv(dataset.name)
    username = HfApi().whoami(token=token)["name"]

    # NOTE(review): a blank text box presumably arrives as "" rather than None,
    # so any falsy value is treated as "no target column" -- confirm against the UI.
    if column:
        analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
    else:
        analyze_report = sv.analyze(df, pairwise_analysis=pairwise)

    # Path.stem removes only the final extension. str.strip(".csv") instead strips
    # any of the characters ".", "c", "s", "v" from both ends ("cars.csv" -> "ar").
    dataset_name = Path(dataset.name).stem
    repo_id = f"{username}/{dataset_name}-report"

    analyze_report.show_html('./index.html', open_browser=False)
    repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
    upload_file(path_or_fileobj="./index.html", path_in_repo="./index.html", repo_id=repo_id, repo_type="space", token=token)

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # Explicit UTF-8: the front matter contains a non-ASCII emoji, which fails to
    # encode when the platform's default encoding is not UTF-8.
    with open("README.md", "w", encoding="utf-8") as f:
        f.write(readme)
    upload_file(path_or_fileobj="./README.md", path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
 def detect_training(df, column):
     """Choose a dabl baseline estimator for ``column`` from its detected type.

     Args:
         df: DataFrame that contains ``column``.
         column: Name of the target column.

     Returns:
         A ``(trainer, task)`` tuple: ``dabl.SimpleRegressor()`` with
         ``"regression"`` for continuous / dirty-float targets, or
         ``dabl.SimpleClassifier()`` with ``"classification"`` for categorical,
         low-cardinality-int or free-string targets.

     Raises:
         ValueError: If ``column`` is detected as none of the types above.
     """
     # Run type detection once and reuse the result for every check.
     types = dabl.detect_types(df)

     if types["continuous"][column] or types["dirty_float"][column]:
         return dabl.SimpleRegressor(), "regression"

     if (
         types["categorical"][column]
         or types["low_card_int"][column]
         or types["free_string"][column]
     ):
         return dabl.SimpleClassifier(), "classification"

     # Without this, falling through would surface as an UnboundLocalError on
     # the return statement, which hides the real cause from the user.
     raise ValueError(
         f"Cannot choose a baseline model for column {column!r}: "
         "its detected type is neither a regression nor a classification target."
     )
 def edit_types(df):
     types = dabl.detect_types(df)
     df_clean = dabl.clean(df, type_hints=type_hints)
     return df_clean
+def train_baseline(dataset,  token, column):
     df = pd.read_csv(dataset.name)
+    dataset_name = dataset.name.split("/")[-1].strip(".csv")
     df_clean = edit_types(df)
+    fc, task = detect_training(df_clean, column)
     X = df_clean.drop(column, axis = 1)
     y = df_clean[column]
                 print('Logging training')
                 fc.fit(X, y)
         username = HfApi().whoami(token=token)["name"]
+        repo_url = create_repo(repo_id = f"{username}/{dataset_name}-{column}-{task}", token = token)
+        if task == "regression":
+            task_metadata = "tabular-regression"
+        else:
+            task_metadata = "tabular-classification"
+        readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\ntags:\n- {task_metadata}\n- baseline-trainer\n---\n\n"
+        readme += f"## Baseline Model trained on {dataset_name} to apply {task} on {column}\n\n"
         readme+="**Metrics of the best model:**\n\n"
         for elem in str(fc.current_best_).split("\n"):
             readme+= f"{elem}\n\n"
         readme+= "\n\n**See model plot below:**\n\n"
         readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
+        readme+= "\n\n**Disclaimer:** This model is trained with dabl library as a baseline, for better results, use [AutoTrain](https://huggingface.co/autotrain).\n\n"
+        readme+= "**Logs of training** including the models tried in the process can be found in logs.txt"
         with open(f"{tmpdirname}/README.md", "w+") as f:
             f.write(readme)
         with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
             pickle.dump(fc, file=f)
+        upload_folder(repo_id =f"{username}/{dataset_name}-{column}-{task}", folder_path=tmpdirname, repo_type = "model", token=token, path_in_repo="./")
     return f"Your model will be ready at {repo_url}"
                 with gr.Column():
                     title = gr.Markdown(""" ## Train a supervised baseline model""")
                     description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
+                    dataset = gr.File(label = "CSV Dataset")
                     column = gr.Text(label = "Enter target variable:")
                     pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                     token = gr.Textbox(label = "Your Hugging Face Token")
                     inference_run = gr.Button("Train")
                     inference_progress = gr.StatusTracker(cover_container=True)
                 outcome = gr.outputs.Textbox(label = "Progress")
                 inference_run.click(
                     train_baseline,
+                    inputs=[dataset, token, column],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )
                 with gr.Column():
                     title = gr.Markdown(""" ## Analyze Dataset """)
                     description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
+                    dataset = gr.File(label = "CSV Dataset")
                     column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                     pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                     token = gr.Textbox(label = "Your Hugging Face Token")
                     pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
                     inference_run = gr.Button("Infer")
                     inference_progress = gr.StatusTracker(cover_container=True)
                 outcome = gr.outputs.Textbox()
                 inference_run.click(
                     analyze_datasets,
+                    inputs=[dataset, token, column, pairwise],
                     outputs=outcome,
                     status_tracker=inference_progress,
                 )