comparator / app.py
albertvillanova's picture
Add description of Tasks
ca2b34f verified
raw
history blame
7.24 kB
from functools import partial
import gradio as gr
from src.constants import SUBTASKS, TASKS
from src.details import update_subtasks_component, update_load_details_component, load_details_dataframes, \
display_details, update_sample_idx_component, clear_details, update_task_description_component
from src.results import update_load_results_component, \
load_results_dataframes, display_results, update_tasks_component, clear_results, \
sort_result_paths_per_model, fetch_result_paths
# if __name__ == "__main__":
result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
load_results_dataframes = partial(load_results_dataframes, result_paths_per_model=result_paths_per_model)
with gr.Blocks(fill_height=True, fill_width=True) as demo:
gr.HTML("<h1 style='text-align: center;'>Compare Results of the πŸ€— Open LLM Leaderboard</h1>")
gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")
gr.HTML("<p style='text-align: center; color:orange;'>&#9888; This demo is a beta version and may contain bugs, performance issues, incomplete features, or unexpected behavior. We appreciate your understanding and welcome any feedback through the Community tab to help improve the final product.</p>")
gr.Markdown("Compare Results of the πŸ€— [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard-old/open_llm_leaderboard). "
"Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) πŸ“„ to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
)
with gr.Row():
with gr.Column():
model_id_1 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
dataframe_1 = gr.Dataframe(visible=False)
with gr.Column():
model_id_2 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
dataframe_2 = gr.Dataframe(visible=False)
with gr.Row():
with gr.Tab("Results"):
load_results_btn = gr.Button("Load", interactive=False)
clear_results_btn = gr.Button("Clear")
results_task = gr.Radio(
["All"] + list(TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
visible=False,
)
results_task_description = gr.Textbox(
label="Task Description",
lines=3,
visible=False,
)
results = gr.HTML()
with gr.Tab("Configs"):
load_configs_btn = gr.Button("Load", interactive=False)
clear_configs_btn = gr.Button("Clear")
configs_task = gr.Radio(
["All"] + list(TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
visible=False,
)
configs_task_description = gr.Textbox(
label="Task Description",
lines=3,
visible=False,
)
configs = gr.HTML()
with gr.Tab("Details"):
details_task = gr.Radio(
list(value for value in TASKS.values() if value[1] != "leaderboard_gpqa"),
label="Tasks",
info="Evaluation tasks to be loaded",
interactive=True,
)
details_task_description = gr.Textbox(
label="Task Description",
lines=3,
)
subtask = gr.Radio(
SUBTASKS.get(details_task.value),
label="Subtasks",
info="Evaluation subtasks to be loaded (choose one of the Tasks above)",
)
load_details_btn = gr.Button("Load Details", interactive=False)
clear_details_btn = gr.Button("Clear Details")
sample_idx = gr.Number(
label="Sample Index",
info="Index of the sample to be displayed",
value=0,
minimum=0,
visible=False
)
details = gr.HTML()
details_dataframe_1 = gr.Dataframe(visible=False)
details_dataframe_2 = gr.Dataframe(visible=False)
details_dataframe = gr.DataFrame(visible=False)
gr.on(
triggers=[model_id_1.input, model_id_2.input],
fn=update_load_results_component,
outputs=[load_results_btn, load_configs_btn],
)
gr.on(
triggers=[load_results_btn.click, load_configs_btn.click],
fn=load_results_dataframes,
inputs=[model_id_1, model_id_2],
outputs=[dataframe_1, dataframe_2],
).then(
fn=update_tasks_component,
outputs=[results_task, configs_task],
)
# Synchronize the results_task and configs_task radio buttons
results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)
# Update task descriptions
results_task.change(
fn=update_task_description_component,
inputs=results_task,
outputs=results_task_description,
).then(
fn=update_task_description_component,
inputs=results_task,
outputs=configs_task_description,
)
# Display results
gr.on(
triggers=[dataframe_1.change, dataframe_2.change, results_task.change],
fn=display_results,
inputs=[results_task, dataframe_1, dataframe_2],
outputs=[results, configs],
)
gr.on(
triggers=[clear_results_btn.click, clear_configs_btn.click],
fn=clear_results,
outputs=[model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task],
)
details_task.change(
fn=update_task_description_component,
inputs=details_task,
outputs=details_task_description,
).then(
fn=update_subtasks_component,
inputs=details_task,
outputs=subtask,
)
gr.on(
triggers=[model_id_1.input, model_id_2.input, subtask.input, details_task.input],
fn=update_load_details_component,
inputs=[model_id_1, model_id_2, subtask],
outputs=load_details_btn,
)
load_details_btn.click(
fn=load_details_dataframes,
inputs=[subtask, model_id_1, model_id_2],
outputs=[details_dataframe_1, details_dataframe_2],
).then(
fn=update_sample_idx_component,
inputs=[details_dataframe_1, details_dataframe_2],
outputs=sample_idx,
)
gr.on(
triggers=[details_dataframe_1.change, details_dataframe_2.change, sample_idx.change],
fn=display_details,
inputs=[sample_idx, details_dataframe_1, details_dataframe_2],
outputs=details,
)
clear_details_btn.click(
fn=clear_details,
outputs=[model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx],
)
demo.launch()