sync from github
- README.md +1 -1
- app.py +18 -17
- requirements.txt +1 -1
- src/leaderboard/read_evals.py +16 -9
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.36.1
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py
CHANGED
@@ -75,7 +75,7 @@ def restart_space():
 
 
 def init_space():
-    dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
+    # dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
 
     if socket.gethostname() not in {"neuromancer"}:
         # sync model_type with open-llm-leaderboard
@@ -90,7 +90,8 @@ def init_space():
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
         EVAL_REQUESTS_PATH, EVAL_COLS
     )
-    return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    # return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
 
 def add_benchmark_columns(shown_columns):
@@ -353,21 +354,21 @@ with demo:
            queue=True,
        )
 
-        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-            dataset_table = gr.components.Dataframe(
-                value=dataset_df,
-                headers=list(dataset_df.columns),
-                datatype=["str", "markdown", "str", "str", "str"],
-                elem_id="dataset-table",
-                interactive=False,
-                visible=True,
-                column_widths=["15%", "20%"],
-            )
-
-            gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
-            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
+        # with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        #     dataset_table = gr.components.Dataframe(
+        #         value=dataset_df,
+        #         headers=list(dataset_df.columns),
+        #         datatype=["str", "markdown", "str", "str", "str"],
+        #         elem_id="dataset-table",
+        #         interactive=False,
+        #         visible=True,
+        #         column_widths=["15%", "20%"],
+        #     )
+
+        #     gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
+        #     gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
 
        with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
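The app.py changes disable the dataset summary table: init_space() no longer reads blog/Hallucination-Leaderboard-Summary.csv and now returns None in its place, and the About tab that displayed it is commented out. Below is a minimal sketch of an alternative that guards the tab on the CSV's presence instead of commenting it out; the Blocks/Tabs scaffolding and the simplified Dataframe call are assumptions for illustration, not code from this commit.

# Sketch only: build the About tab conditionally instead of commenting it out.
# The path and component arguments mirror the commented-out block above; the
# surrounding Blocks scaffolding here is assumed, not taken from app.py.
import os

import gradio as gr
import pandas as pd

SUMMARY_CSV = "blog/Hallucination-Leaderboard-Summary.csv"

with gr.Blocks() as demo:
    if os.path.exists(SUMMARY_CSV):
        dataset_df = pd.read_csv(SUMMARY_CSV)
        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Dataframe(
                value=dataset_df,
                headers=list(dataset_df.columns),
                interactive=False,
            )

# demo.launch()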
requirements.txt
CHANGED
@@ -4,7 +4,7 @@ APScheduler
 black
 click
 datasets
-gradio
+gradio==4.36.1
 gradio_client
 huggingface-hub
 matplotlib
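Pinning gradio==4.36.1 keeps the installed package in step with the sdk_version declared in README.md. A quick, illustrative check (not part of the commit) that the running environment matches the pin:

# Illustrative sanity check: the installed gradio should match the pinned 4.36.1.
import gradio

assert gradio.__version__ == "4.36.1", f"unexpected gradio version: {gradio.__version__}"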
src/leaderboard/read_evals.py
CHANGED
@@ -277,15 +277,22 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
 
     eval_results = {}
     for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
-        eval_result.update_with_request_file(requests_path)
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        try:
+            # Creation of result
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
+            eval_result.update_with_request_file(requests_path)
+
+            # Store results of same eval together
+            eval_name = eval_result.eval_name
+            if eval_name in eval_results.keys():
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
+
+        except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
+            # Log the error and continue with the next file
+            print(f"Error processing file {model_result_filepath}: {e}")
+            continue
 
     results = []
     for v in eval_results.values():
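The read_evals.py hunk wraps each result file in a try/except so that one malformed or missing JSON file is logged and skipped rather than aborting the whole leaderboard refresh. Here is a self-contained sketch of the same pattern; the read_result_files name and the model_name/results keys are assumptions for illustration, not the repo's actual schema.

# Sketch of the per-file error-handling pattern introduced above: parse each
# result file independently, log failures, and keep going. Keys are illustrative.
import json
from pathlib import Path


def read_result_files(paths: list[str]) -> dict[str, dict]:
    eval_results: dict[str, dict] = {}
    for path in paths:
        try:
            payload = json.loads(Path(path).read_text())
            name = payload["model_name"]    # raises KeyError if the field is missing
            results = payload["results"]
            if name in eval_results:
                # Merge non-null metrics for repeated evals of the same model
                eval_results[name].update({k: v for k, v in results.items() if v is not None})
            else:
                eval_results[name] = results
        except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
            # json.JSONDecodeError is a ValueError subclass; listed here to mirror the commit
            print(f"Error processing file {path}: {e}")
            continue
    return eval_results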