Commit: ab76bab
Author: Madhavan Iyengar
Parent: 465d7de

add submission capability

Files changed:
- app.py: +38 -43
- src/submission/evaluate.py: +111 -0
app.py
CHANGED
@@ -1,8 +1,11 @@
 import subprocess
 import gradio as gr
+import zipfile
+import os
+import shutil
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, Repository, HfFolder
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -26,13 +29,41 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.submission.evaluate import calculate_metrics
+
+def handle_new_eval_submission(model_name, model_zip, model_link=None):
+    extraction_path = EVAL_RESULTS_PATH_BACKEND
+
+    if not os.path.exists(extraction_path):
+        os.makedirs(extraction_path)
+
+    # define path for the zip file to be extracted to
+    extraction_path = os.path.join(extraction_path, model_name)
+
+    if model_zip is not None:
+        with zipfile.ZipFile(model_zip, 'r') as zip_ref:
+            zip_ref.extractall(extraction_path)
+        print("File unzipped successfully to:", extraction_path)
+
+    # Evaluate the model's performance
+    calculate_metrics(extraction_path, model_name)
+
+    # upload to results repo
+    API.upload_file(
+        path_or_fileobj=os.path.join(os.getcwd(), EVAL_RESULTS_PATH, '3d-pope', model_name, 'results.json'),
+        path_in_repo=os.path.join('3d-pope', model_name, 'results.json'),
+        repo_id=RESULTS_REPO,
+        repo_type="dataset",
+    )
+
+    restart_space()
+
+    return "Submission received and results are being processed. Please check the leaderboard for updates."
+
 
-def handle_new_eval_submission(model_name, model_zip, model_link):
-    # This is a placeholder for the actual submission logic
-    return "We are not accepting submissions at this time, please check back soon!"
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
@@ -241,49 +272,13 @@ with demo:
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-            # with gr.Column():
-            #     with gr.Accordion(
-            #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             finished_eval_table = gr.components.Dataframe(
-            #                 value=finished_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-            #     with gr.Accordion(
-            #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             running_eval_table = gr.components.Dataframe(
-            #                 value=running_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
-
-            #     with gr.Accordion(
-            #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            #         open=False,
-            #     ):
-            #         with gr.Row():
-            #             pending_eval_table = gr.components.Dataframe(
-            #                 value=pending_eval_queue_df,
-            #                 headers=EVAL_COLS,
-            #                 datatype=EVAL_TYPES,
-            #                 row_count=5,
-            #             )
                 with gr.Row():
                     gr.Markdown("# 📋 Submit your results here!", elem_classes="markdown-text")
 
                 with gr.Row():
                     model_name_textbox = gr.Textbox(label="Model name")
                     model_zip_file = gr.File(label="Upload model prediction result ZIP file")
-                    model_link_textbox = gr.Textbox(label="Link to model page")
+                    # model_link_textbox = gr.Textbox(label="Link to model page")
                 with gr.Row():
                     gr.Column()
                     with gr.Column(scale=2):
@@ -292,7 +287,7 @@ with demo:
 
                         submit_button.click(
                             handle_new_eval_submission,
-                            [model_name_textbox, model_zip_file, model_link_textbox],
+                            [model_name_textbox, model_zip_file],
                             submission_result
                         )
                     gr.Column()
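For reference, the new handle_new_eval_submission path only assumes that the uploaded ZIP expands into a directory of JSON prediction files whose names start with the question type ("random", "popular", or "adversarial"), because that prefix is all src/submission/evaluate.py keys on. A minimal sketch of preparing such a submission ZIP is below; the entry fields mirror what evaluate.py reads, while the file names after the prefix and the top-level "predictions" key are illustrative placeholders, not something this commit prescribes.

import io
import json
import zipfile

# Hypothetical example entries; the field names are the ones evaluate.py reads.
example_entries = [
    {"predicted_answer": "Yes, the chair is in the scene.", "ground_truth_answer": "yes"},
    {"predicted_answer": "No.", "ground_truth_answer": "no"},
]

# File names must start with the question type so aggregate_metrics can bucket them;
# the top-level key is arbitrary because evaluate.py only reads the first key it finds.
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w") as zf:
    for question_type in ("random", "popular", "adversarial"):
        zf.writestr(f"{question_type}_predictions.json",
                    json.dumps({"predictions": example_entries}))

with open("my_model_submission.zip", "wb") as f:
    f.write(buffer.getvalue())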
src/submission/evaluate.py
ADDED
@@ -0,0 +1,111 @@
+import json
+
+
+import os
+import json
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+import re
+
+from src.envs import EVAL_RESULTS_PATH
+
+def parse_first_word(answer):
+    # Extract the first word and check if it's 'yes' or 'no'
+    first_word = re.split(r'[\s,\.]', answer.lower())[0]
+    if first_word.startswith('yes'):
+        return 'yes'
+    elif first_word.startswith('no'):
+        return 'no'
+    else:
+        return None
+
+def compute_metrics(true_labels, predicted_labels):
+    # Filtering out invalid answers
+    valid_indices = [i for i, label in enumerate(predicted_labels) if label in ['yes', 'no']]
+    filtered_true_labels = [true_labels[i] for i in valid_indices]
+    filtered_predicted_labels = [predicted_labels[i] for i in valid_indices]
+
+    # Calculating metrics
+    accuracy = accuracy_score(filtered_true_labels, filtered_predicted_labels)
+    precision, recall, f1_score, _ = precision_recall_fscore_support(
+        filtered_true_labels, filtered_predicted_labels, average='binary', pos_label='yes')
+
+    yes_ratio = filtered_predicted_labels.count('yes') / len(filtered_predicted_labels) if filtered_predicted_labels else 0
+
+    return {
+        "Accuracy": accuracy,
+        "Precision": precision,
+        "Recall": recall,
+        "F1 Score": f1_score,
+        "Yes Ratio": yes_ratio
+    }
+
+def aggregate_metrics(directory_path):
+    metrics_data = {"random": {"true": [], "pred": [], "invalid": []},
+                    "popular": {"true": [], "pred": [], "invalid": []},
+                    "adversarial": {"true": [], "pred": [], "invalid": []}}
+
+    # Process each file in the directory
+    for filename in os.listdir(directory_path):
+        if filename.endswith(".json"):
+            file_path = os.path.join(directory_path, filename)
+            with open(file_path, 'r') as f:
+                data = json.load(f)
+
+            question_type = filename.split('_')[0]
+            if question_type in metrics_data:
+                for entry in data[next(iter(data))]:
+                    first_word = parse_first_word(entry['predicted_answer'])
+                    if first_word is None:
+                        metrics_data[question_type]["invalid"].append(entry['predicted_answer'])
+                    metrics_data[question_type]["true"].append(entry['ground_truth_answer'].lower())
+                    metrics_data[question_type]["pred"].append(first_word if first_word else entry['predicted_answer'].lower())
+
+    results = {}
+    for q_type, data in metrics_data.items():
+        result = compute_metrics(data["true"], data["pred"])
+        result["Non-Binary Responses Count"] = len(data["invalid"])
+        result["Non-Binary Responses"] = data["invalid"]
+        results[q_type] = result
+
+    return results
+
+def transform_format(data, model_name):
+    # Define the new format's base structure
+    transformed_data = {
+        "config": {
+            "model_name": model_name
+        },
+        "results": {}
+    }
+
+    # Mapping of old keys to new keys
+    key_mapping = {
+        "Accuracy": "accuracy",
+        "Precision": "precision",
+        "Recall": "recall",
+        "F1 Score": "f1_score",
+        "Yes Ratio": "yes_percentage"
+    }
+
+    # Iterate over each item in the original data
+    for model_type, metrics in data.items():
+        for old_key, new_suffix in key_mapping.items():
+            # Format the new key according to the required format 2 style
+            new_key = f"{model_type}_{new_suffix}"
+            # Assign the corresponding value to the new key in the results dictionary
+            transformed_data["results"][new_key] = {
+                new_key: round(metrics[old_key], 4) if isinstance(metrics[old_key], float) else metrics[old_key]
+            }
+
+    return transformed_data
+
+def calculate_metrics(json_output_directory, model_name):
+    final_metrics = aggregate_metrics(json_output_directory)
+    transformed_metrics = transform_format(final_metrics, model_name)
+    # write to a file
+    results_path = os.path.join(EVAL_RESULTS_PATH, '3d-pope', model_name)
+    if not os.path.exists(results_path):
+        os.makedirs(results_path)
+    with open(os.path.join(results_path, 'results.json'), 'w') as f:
+        json.dump(transformed_metrics, f, indent=4)
+    print(json.dumps(final_metrics, indent=4))