Spaces:
Runtime error
Runtime error
mervenoyan
committed on
Commit
•
d2a61f1
1
Parent(s):
acaa4d9
simplified UI
Browse files
app.py
CHANGED
@@ -12,22 +12,23 @@ import dabl
|
|
12 |
import re
|
13 |
|
14 |
|
15 |
-
def analyze_datasets(dataset,
|
16 |
df = pd.read_csv(dataset.name)
|
17 |
username = HfApi().whoami(token=token)["name"]
|
18 |
if column is not None:
|
19 |
analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
|
20 |
else:
|
21 |
analyze_report = sv.analyze(df, pairwise_analysis=pairwise)
|
22 |
-
|
23 |
analyze_report.show_html('./index.html', open_browser=False)
|
24 |
-
repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
|
25 |
|
26 |
-
|
|
|
|
|
27 |
readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
|
28 |
with open("README.md", "w+") as f:
|
29 |
f.write(readme)
|
30 |
-
upload_file(path_or_fileobj ="./README.md", path_in_repo = "README.md", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
|
31 |
|
32 |
return f"Your dataset report will be ready at {repo_url}"
|
33 |
|
@@ -45,9 +46,11 @@ def extract_estimator_config(model):
|
|
45 |
def detect_training(df, column):
|
46 |
if dabl.detect_types(df)["continuous"][column] or dabl.detect_types(df)["dirty_float"][column]:
|
47 |
trainer = dabl.SimpleRegressor()
|
|
|
48 |
elif dabl.detect_types(df)["categorical"][column] or dabl.detect_types(df)["low_card_int"][column] or dabl.detect_types(df)["free_string"][column]:
|
49 |
trainer = dabl.SimpleClassifier()
|
50 |
-
|
|
|
51 |
|
52 |
def edit_types(df):
|
53 |
types = dabl.detect_types(df)
|
@@ -61,10 +64,11 @@ def edit_types(df):
|
|
61 |
df_clean = dabl.clean(df, type_hints=type_hints)
|
62 |
return df_clean
|
63 |
|
64 |
-
def train_baseline(dataset,
|
65 |
df = pd.read_csv(dataset.name)
|
|
|
66 |
df_clean = edit_types(df)
|
67 |
-
fc = detect_training(df_clean, column)
|
68 |
X = df_clean.drop(column, axis = 1)
|
69 |
y = df_clean[column]
|
70 |
|
@@ -76,21 +80,25 @@ def train_baseline(dataset, dataset_name, token, column):
|
|
76 |
print('Logging training')
|
77 |
fc.fit(X, y)
|
78 |
username = HfApi().whoami(token=token)["name"]
|
79 |
-
repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
83 |
readme+="**Metrics of the best model:**\n\n"
|
84 |
for elem in str(fc.current_best_).split("\n"):
|
85 |
readme+= f"{elem}\n\n"
|
86 |
readme+= "\n\n**See model plot below:**\n\n"
|
87 |
readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
|
88 |
-
readme+= "\n\
|
|
|
89 |
with open(f"{tmpdirname}/README.md", "w+") as f:
|
90 |
f.write(readme)
|
91 |
with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
|
92 |
pickle.dump(fc, file=f)
|
93 |
-
upload_folder(repo_id =f"{username}/{dataset_name}", folder_path=tmpdirname, repo_type = "model", token=token, path_in_repo="./")
|
94 |
|
95 |
return f"Your model will be ready at {repo_url}"
|
96 |
|
@@ -107,10 +115,9 @@ with gr.Blocks() as demo:
|
|
107 |
with gr.Column():
|
108 |
title = gr.Markdown(""" ## Train a supervised baseline model""")
|
109 |
description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
|
110 |
-
dataset = gr.File(label = "Dataset")
|
111 |
column = gr.Text(label = "Enter target variable:")
|
112 |
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
|
113 |
-
dataset_name = gr.Text(label = "Enter dataset name:")
|
114 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
115 |
inference_run = gr.Button("Train")
|
116 |
inference_progress = gr.StatusTracker(cover_container=True)
|
@@ -118,7 +125,7 @@ with gr.Blocks() as demo:
|
|
118 |
outcome = gr.outputs.Textbox(label = "Progress")
|
119 |
inference_run.click(
|
120 |
train_baseline,
|
121 |
-
inputs=[dataset,
|
122 |
outputs=outcome,
|
123 |
status_tracker=inference_progress,
|
124 |
)
|
@@ -127,18 +134,17 @@ with gr.Blocks() as demo:
|
|
127 |
with gr.Column():
|
128 |
title = gr.Markdown(""" ## Analyze Dataset """)
|
129 |
description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
|
130 |
-
dataset = gr.File(label = "Dataset")
|
131 |
column = gr.Text(label = "Compare dataset against a target variable (Optional)")
|
132 |
pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
|
133 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
134 |
-
dataset_name = gr.Textbox(label = "Dataset Name")
|
135 |
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
|
136 |
inference_run = gr.Button("Infer")
|
137 |
inference_progress = gr.StatusTracker(cover_container=True)
|
138 |
outcome = gr.outputs.Textbox()
|
139 |
inference_run.click(
|
140 |
analyze_datasets,
|
141 |
-
inputs=[dataset,
|
142 |
outputs=outcome,
|
143 |
status_tracker=inference_progress,
|
144 |
)
|
|
|
12 |
import re
|
13 |
|
14 |
|
15 |
+
def analyze_datasets(dataset, token, column=None, pairwise="off"):
    """Build a Sweetviz report for an uploaded CSV and publish it as a static Space.

    The CSV is loaded with pandas, analysed with ``sv.analyze`` and rendered to
    ``./index.html``. A Space named ``<username>/<dataset_name>-report`` is
    then created, and the report plus a generated ``README.md`` are uploaded
    to it.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute (the path
            of the CSV on disk) is read.
        token: Hugging Face Hub token used to look up the username, create the
            Space and upload the files.
        column: Optional target column to compare the other features against.
            ``None``, an empty string or a whitespace-only string all mean
            "no target".
        pairwise: Value forwarded unchanged as ``pairwise_analysis`` to
            ``sv.analyze``.

    Returns:
        A message string containing the URL returned by ``create_repo``.
    """
    # Local import: the file's top-level import block is not visible from here.
    import os

    df = pd.read_csv(dataset.name)
    username = HfApi().whoami(token=token)["name"]

    # NOTE(review): the UI text box presumably submits "" (not None) when left
    # blank -- confirm. Treat any blank value as "no target" so an empty string
    # is never forwarded as target_feat.
    target = column.strip() if isinstance(column, str) else column
    if target:
        analyze_report = sv.analyze(df, target_feat=target, pairwise_analysis=pairwise)
    else:
        analyze_report = sv.analyze(df, pairwise_analysis=pairwise)

    # str.strip(".csv") removes any of the characters '.', 'c', 's', 'v' from
    # both ends (e.g. "cars.csv" -> "ar"); take the file stem instead.
    dataset_name = os.path.splitext(os.path.basename(dataset.name))[0]
    repo_id = f"{username}/{dataset_name}-report"

    analyze_report.show_html('./index.html', open_browser=False)

    repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)

    upload_file(path_or_fileobj="./index.html", path_in_repo="./index.html", repo_id=repo_id, repo_type="space", token=token)

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # The front matter contains a non-ASCII emoji, so pin the encoding rather
    # than relying on the platform default.
    with open("README.md", "w", encoding="utf-8") as f:
        f.write(readme)
    upload_file(path_or_fileobj="./README.md", path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
|
34 |
|
|
|
46 |
def detect_training(df, column):
    """Choose a dabl baseline estimator for the given target column.

    The column's detected dabl type decides the task: ``continuous`` or
    ``dirty_float`` selects ``dabl.SimpleRegressor``; ``categorical``,
    ``low_card_int`` or ``free_string`` selects ``dabl.SimpleClassifier``.

    Args:
        df: DataFrame passed to ``dabl.detect_types``.
        column: Name of the target column to look up in the detected types.

    Returns:
        A ``(trainer, task)`` tuple where ``task`` is ``"regression"`` or
        ``"classification"``.

    Raises:
        ValueError: If the column's detected type matches none of the kinds
            above (previously this surfaced as an ``UnboundLocalError``).
    """
    # Run type detection once; the original repeated it for every lookup.
    types = dabl.detect_types(df)

    def has_type(*kinds):
        # True when the column is flagged with at least one of the given kinds.
        return any(types[kind][column] for kind in kinds)

    if has_type("continuous", "dirty_float"):
        return dabl.SimpleRegressor(), "regression"
    if has_type("categorical", "low_card_int", "free_string"):
        return dabl.SimpleClassifier(), "classification"

    raise ValueError(
        f"Cannot pick a baseline for column {column!r}: its detected type is "
        "neither continuous nor categorical."
    )
|
54 |
|
55 |
def edit_types(df):
|
56 |
types = dabl.detect_types(df)
|
|
|
64 |
df_clean = dabl.clean(df, type_hints=type_hints)
|
65 |
return df_clean
|
66 |
|
67 |
+
def train_baseline(dataset, token, column):
|
68 |
df = pd.read_csv(dataset.name)
|
69 |
+
dataset_name = dataset.name.split("/")[-1].strip(".csv")
|
70 |
df_clean = edit_types(df)
|
71 |
+
fc, task = detect_training(df_clean, column)
|
72 |
X = df_clean.drop(column, axis = 1)
|
73 |
y = df_clean[column]
|
74 |
|
|
|
80 |
print('Logging training')
|
81 |
fc.fit(X, y)
|
82 |
username = HfApi().whoami(token=token)["name"]
|
83 |
+
repo_url = create_repo(repo_id = f"{username}/{dataset_name}-{column}-{task}", token = token)
|
84 |
+
if task == "regression":
|
85 |
+
task_metadata = "tabular-regression"
|
86 |
+
else:
|
87 |
+
task_metadata = "tabular-classification"
|
88 |
+
readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\ntags:\n- {task_metadata}\n- baseline-trainer\n---\n\n"
|
89 |
+
readme += f"## Baseline Model trained on {dataset_name} to apply {task} on {column}\n\n"
|
90 |
readme+="**Metrics of the best model:**\n\n"
|
91 |
for elem in str(fc.current_best_).split("\n"):
|
92 |
readme+= f"{elem}\n\n"
|
93 |
readme+= "\n\n**See model plot below:**\n\n"
|
94 |
readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
|
95 |
+
readme+= "\n\n**Disclaimer:** This model is trained with dabl library as a baseline, for better results, use [AutoTrain](https://huggingface.co/autotrain).\n\n"
|
96 |
+
readme+= "**Logs of training** including the models tried in the process can be found in logs.txt"
|
97 |
with open(f"{tmpdirname}/README.md", "w+") as f:
|
98 |
f.write(readme)
|
99 |
with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
|
100 |
pickle.dump(fc, file=f)
|
101 |
+
upload_folder(repo_id =f"{username}/{dataset_name}-{column}-{task}", folder_path=tmpdirname, repo_type = "model", token=token, path_in_repo="./")
|
102 |
|
103 |
return f"Your model will be ready at {repo_url}"
|
104 |
|
|
|
115 |
with gr.Column():
|
116 |
title = gr.Markdown(""" ## Train a supervised baseline model""")
|
117 |
description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
|
118 |
+
dataset = gr.File(label = "CSV Dataset")
|
119 |
column = gr.Text(label = "Enter target variable:")
|
120 |
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
|
|
|
121 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
122 |
inference_run = gr.Button("Train")
|
123 |
inference_progress = gr.StatusTracker(cover_container=True)
|
|
|
125 |
outcome = gr.outputs.Textbox(label = "Progress")
|
126 |
inference_run.click(
|
127 |
train_baseline,
|
128 |
+
inputs=[dataset, token, column],
|
129 |
outputs=outcome,
|
130 |
status_tracker=inference_progress,
|
131 |
)
|
|
|
134 |
with gr.Column():
|
135 |
title = gr.Markdown(""" ## Analyze Dataset """)
|
136 |
description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
|
137 |
+
dataset = gr.File(label = "CSV Dataset")
|
138 |
column = gr.Text(label = "Compare dataset against a target variable (Optional)")
|
139 |
pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
|
140 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
|
|
141 |
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
|
142 |
inference_run = gr.Button("Infer")
|
143 |
inference_progress = gr.StatusTracker(cover_container=True)
|
144 |
outcome = gr.outputs.Textbox()
|
145 |
inference_run.click(
|
146 |
analyze_datasets,
|
147 |
+
inputs=[dataset, token, column, pairwise],
|
148 |
outputs=outcome,
|
149 |
status_tracker=inference_progress,
|
150 |
)
|