Update app.py
app.py
CHANGED
@@ -72,7 +72,7 @@ def has_nan_values(df, columns):
     return df[columns].isna().any(axis=1)
 
 
-def get_leaderboard_df():
+def get_leaderboard_df_1():
     if eval_results:
         print("Pulling evaluation results for the leaderboard.")
         eval_results.git_pull()
@@ -99,6 +99,22 @@ def get_leaderboard_df():
     print(type(df))
     return df
 
+def get_leaderboard_df():
+
+    data = {
+        'Datasets': ['SOTA(FT)', 'SOTA(ZS)', 'FLAN-T5', 'GPT-3', 'GPT-3.5v2', 'GPT-3.5v3', 'ChatGPT', 'GPT-4'],
+        'KQApro': [93.85, 94.20, 37.27, 38.28, 38.01, 40.35, 47.93, 57.20],
+        'LC-quad2': [33.10, '-', 30.14, 33.04, 33.77, 39.04, 42.76, 54.95],
+        'WQSP': [73.10, 62.98, 59.87, 67.68, 72.34, 79.60, 83.70, 90.45],
+        'CWQ': [72.20, '-', 46.69, 51.77, 53.96, 57.54, 64.02, 71.00],
+        'GrailQA': [76.31, '-', 29.02, 27.58, 30.50, 35.43, 46.77, 51.40],
+        'GraphQ': [41.30, '-', 32.27, 38.32, 40.85, 47.95, 53.10, 63.20],
+        'QALD-9': [67.82, '-', 30.17, 38.54, 44.96, 46.19, 45.71, 57.20],
+        'MKQA': [46.00, '-', 20.17, 26.97, 30.14, 39.05, 44.30, 59.20]
+    }
+
+    df = pd.DataFrame(data)
+    return df
 
 def get_evaluation_queue_df():
     if eval_queue:
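Note: the new `get_leaderboard_df()` above returns the benchmark scores as a hard-coded pandas DataFrame in which missing results are stored as the string `'-'`, so the score columns come out with `object` dtype. A minimal sketch of how those placeholders could be coerced to `NaN` if numeric sorting were ever needed; the helper `to_numeric_scores` is hypothetical and not part of this commit.

```python
import pandas as pd

def to_numeric_scores(df: pd.DataFrame, label_col: str = "Datasets") -> pd.DataFrame:
    """Return a copy where '-' placeholders become NaN and scores become floats."""
    out = df.copy()
    score_cols = [c for c in out.columns if c != label_col]
    # errors="coerce" turns the '-' strings into NaN instead of raising
    out[score_cols] = out[score_cols].apply(pd.to_numeric, errors="coerce")
    return out
```

With the table above, `to_numeric_scores(get_leaderboard_df())` would leave the `Datasets` column untouched and make the eight score columns sortable as floats.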
@@ -299,29 +315,8 @@ with demo:
     )
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark (lite)", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard_table_lite = gr.components.Dataframe(
-                value=leaderboard_df[COLS_LITE],
-                headers=COLS_LITE,
-                datatype=TYPES_LITE,
-                max_rows=None,
-                elem_id="leaderboard-table-lite",
-            )
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
-                value=original_df[COLS_LITE],
-                headers=COLS_LITE,
-                datatype=TYPES_LITE,
-                max_rows=None,
-                visible=False,
-            )
-            search_bar.submit(
-                search_table,
-                [hidden_leaderboard_table_for_search_lite, search_bar],
-                leaderboard_table_lite,
-            )
-
-        with gr.TabItem("📊 Extended view", elem_id="llm-benchmark-tab-table", id=1):
+
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df,
                 headers=COLS,
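Note: after this hunk the app keeps a single leaderboard tab backed by one `gr.components.Dataframe`. A self-contained sketch of that layout, using a stand-in `demo_df` (built from two rows of the table above) instead of the app's `leaderboard_df`/`COLS` globals; the tab label and data here are illustrative only.

```python
import gradio as gr
import pandas as pd

# Stand-in for the leaderboard_df produced by get_leaderboard_df() in app.py
demo_df = pd.DataFrame({
    "Datasets": ["KQApro", "WQSP"],
    "ChatGPT": [47.93, 83.70],
    "GPT-4": [57.20, 90.45],
})

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):
            leaderboard_table = gr.components.Dataframe(
                value=demo_df,
                headers=list(demo_df.columns),
            )

if __name__ == "__main__":
    demo.launch()
```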
@@ -346,107 +341,7 @@ with demo:
         with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-
-    with gr.Column():
-        with gr.Row():
-            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-        with gr.Column():
-            with gr.Accordion(f"✅ Finished Evaluations: {len(finished_eval_queue_df)}", open=False):
-                with gr.Row():
-                    finished_eval_table = gr.components.Dataframe(
-                        value=finished_eval_queue_df,
-                        headers=EVAL_COLS,
-                        datatype=EVAL_TYPES,
-                        max_rows=5,
-                    )
-            with gr.Accordion(f"🔄 Running Evaluation Queue: {len(running_eval_queue_df)}", open=False):
-                with gr.Row():
-                    running_eval_table = gr.components.Dataframe(
-                        value=running_eval_queue_df,
-                        headers=EVAL_COLS,
-                        datatype=EVAL_TYPES,
-                        max_rows=5,
-                    )
-
-            with gr.Accordion(f"⏳ Pending Evaluation Queue: {len(pending_eval_queue_df)}", open=False):
-                with gr.Row():
-                    pending_eval_table = gr.components.Dataframe(
-                        value=pending_eval_queue_df,
-                        headers=EVAL_COLS,
-                        datatype=EVAL_TYPES,
-                        max_rows=5,
-                    )
-    with gr.Row():
-        gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-    with gr.Row():
-        with gr.Column():
-            model_name_textbox = gr.Textbox(label="Model name")
-            revision_name_textbox = gr.Textbox(
-                label="revision", placeholder="main"
-            )
-            private = gr.Checkbox(
-                False, label="Private", visible=not IS_PUBLIC
-            )
-            model_type = gr.Dropdown(
-                choices=["pretrained", "fine-tuned", "with RL"],
-                label="Model type",
-                multiselect=False,
-                value="pretrained",
-                max_choices=1,
-                interactive=True,
-            )
-
-        with gr.Column():
-            precision = gr.Dropdown(
-                choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"],
-                label="Precision",
-                multiselect=False,
-                value="float16",
-                max_choices=1,
-                interactive=True,
-            )
-            weight_type = gr.Dropdown(
-                choices=["Original", "Delta", "Adapter"],
-                label="Weights type",
-                multiselect=False,
-                value="Original",
-                max_choices=1,
-                interactive=True,
-            )
-            base_model_name_textbox = gr.Textbox(
-                label="Base model (for delta or adapter weights)"
-            )
-
-    submit_button = gr.Button("Submit Eval")
-    submission_result = gr.Markdown()
-    submit_button.click(
-        add_new_eval,
-        [
-            model_name_textbox,
-            base_model_name_textbox,
-            revision_name_textbox,
-            precision,
-            private,
-            weight_type,
-            model_type
-        ],
-        submission_result,
-    )
-
-    with gr.Row():
-        refresh_button = gr.Button("Refresh")
-        refresh_button.click(
-            refresh,
-            inputs=[],
-            outputs=[
-                leaderboard_table,
-                finished_eval_table,
-                running_eval_table,
-                pending_eval_table,
-            ],
-        )
+
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
|