Spaces: Running on CPU Upgrade
feat: add metric selector
Browse files
- app.py +35 -5
- src/populate.py +8 -9
- tests/src/test_populate.py +3 -1
- utils.py +24 -1
app.py
CHANGED
@@ -17,10 +17,12 @@ from src.display.utils import (
|
|
17 |
)
|
18 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
19 |
from src.populate import get_leaderboard_df
|
20 |
-
from utils import update_table
|
21 |
from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, metric_list
|
22 |
|
23 |
|
|
|
|
|
24 |
def restart_space():
|
25 |
API.restart_space(repo_id=REPO_ID)
|
26 |
|
@@ -41,11 +43,21 @@ def restart_space():
|
|
41 |
# except Exception:
|
42 |
# restart_space()
|
43 |
|
44 |
-
|
45 |
-
|
|
|
46 |
print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')
|
47 |
leaderboard_df = original_df_qa.copy()
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
# (
|
50 |
# finished_eval_queue_df,
|
51 |
# running_eval_queue_df,
|
@@ -99,7 +111,7 @@ with demo:
|
|
99 |
with gr.Column(min_width=320):
|
100 |
selected_metric = gr.Dropdown(
|
101 |
choices=metric_list,
|
102 |
-
value=metric_list[
|
103 |
label="Select the metric",
|
104 |
interactive=True,
|
105 |
elem_id="metric-select",
|
@@ -117,11 +129,13 @@ with demo:
|
|
117 |
|
118 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
119 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
120 |
-
value=
|
121 |
# headers=COLS,
|
122 |
# datatype=TYPES,
|
123 |
visible=False,
|
124 |
)
|
|
|
|
|
125 |
search_bar.submit(
|
126 |
update_table,
|
127 |
[
|
@@ -133,6 +147,8 @@ with demo:
|
|
133 |
],
|
134 |
leaderboard_table,
|
135 |
)
|
|
|
|
|
136 |
for selector in [
|
137 |
selected_domains, selected_langs, selected_rerankings
|
138 |
]:
|
@@ -149,6 +165,20 @@ with demo:
|
|
149 |
queue=True,
|
150 |
)
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
153 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
154 |
|
|
|
17 |
)
|
18 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
19 |
from src.populate import get_leaderboard_df
|
20 |
+
from utils import update_table, update_metric
|
21 |
from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, metric_list
|
22 |
|
23 |
|
24 |
+
from functools import partial
|
25 |
+
|
26 |
def restart_space():
|
27 |
API.restart_space(repo_id=REPO_ID)
|
28 |
|
|
|
43 |
# except Exception:
|
44 |
# restart_space()
|
45 |
|
46 |
+
from src.leaderboard.read_evals import get_raw_eval_results
|
47 |
+
raw_data_qa = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
|
48 |
+
original_df_qa = get_leaderboard_df(raw_data_qa, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
|
49 |
print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')
|
50 |
leaderboard_df = original_df_qa.copy()
|
51 |
|
52 |
+
|
53 |
+
def update_metric_qa(
|
54 |
+
metric: str,
|
55 |
+
domains: list,
|
56 |
+
langs: list,
|
57 |
+
reranking_model: list,
|
58 |
+
query: str,
|
59 |
+
):
|
60 |
+
return update_metric(raw_data_qa, metric, domains, langs, reranking_model, query)
|
61 |
# (
|
62 |
# finished_eval_queue_df,
|
63 |
# running_eval_queue_df,
|
|
|
111 |
with gr.Column(min_width=320):
|
112 |
selected_metric = gr.Dropdown(
|
113 |
choices=metric_list,
|
114 |
+
value=metric_list[1],
|
115 |
label="Select the metric",
|
116 |
interactive=True,
|
117 |
elem_id="metric-select",
|
|
|
129 |
|
130 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
131 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
132 |
+
value=leaderboard_df,
|
133 |
# headers=COLS,
|
134 |
# datatype=TYPES,
|
135 |
visible=False,
|
136 |
)
|
137 |
+
|
138 |
+
# Set search_bar listener
|
139 |
search_bar.submit(
|
140 |
update_table,
|
141 |
[
|
|
|
147 |
],
|
148 |
leaderboard_table,
|
149 |
)
|
150 |
+
|
151 |
+
# Set column-wise listener
|
152 |
for selector in [
|
153 |
selected_domains, selected_langs, selected_rerankings
|
154 |
]:
|
|
|
165 |
queue=True,
|
166 |
)
|
167 |
|
168 |
+
# set metric listener
|
169 |
+
selected_metric.change(
|
170 |
+
update_metric_qa,
|
171 |
+
[
|
172 |
+
selected_metric,
|
173 |
+
selected_domains,
|
174 |
+
selected_langs,
|
175 |
+
selected_rerankings,
|
176 |
+
search_bar,
|
177 |
+
],
|
178 |
+
leaderboard_table,
|
179 |
+
queue=True
|
180 |
+
)
|
181 |
+
|
182 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
183 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
184 |
|
src/populate.py
CHANGED
@@ -5,31 +5,30 @@ import pandas as pd
|
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
from src.display.utils import AutoEvalColumnQA, EvalQueueColumn
|
8 |
-
from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
|
9 |
-
from typing import Tuple
|
10 |
|
11 |
|
12 |
-
def get_leaderboard_df(
|
13 |
"""Creates a dataframe from all the individual experiment results"""
|
14 |
-
raw_data = get_raw_eval_results(results_path, requests_path)
|
15 |
-
print(f"raw_data loaded: {len(raw_data)}")
|
16 |
all_data_json = []
|
17 |
for v in raw_data:
|
18 |
all_data_json += v.to_dict(task=task, metric=metric)
|
19 |
-
|
20 |
-
print(f'records loaded: {len(all_data_json)}')
|
21 |
df = pd.DataFrame.from_records(all_data_json)
|
22 |
print(f'dataframe created: {df.shape}')
|
|
|
|
|
23 |
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
|
24 |
-
df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
|
25 |
df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
|
26 |
df.reset_index(inplace=True)
|
|
|
27 |
_cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
|
28 |
df = df[_cols].round(decimals=2)
|
29 |
|
30 |
# filter out if any of the benchmarks have not been produced
|
31 |
df = df[has_no_nan_values(df, _benchmark_cols)]
|
32 |
-
return
|
33 |
|
34 |
|
35 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
from src.display.utils import AutoEvalColumnQA, EvalQueueColumn
|
8 |
+
from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, FullEvalResult
|
9 |
+
from typing import Tuple, List
|
10 |
|
11 |
|
12 |
+
def get_leaderboard_df(raw_data: List[FullEvalResult], cols: list, benchmark_cols: list, task: str, metric: str) -> pd.DataFrame:
|
13 |
"""Creates a dataframe from all the individual experiment results"""
|
|
|
|
|
14 |
all_data_json = []
|
15 |
for v in raw_data:
|
16 |
all_data_json += v.to_dict(task=task, metric=metric)
|
|
|
|
|
17 |
df = pd.DataFrame.from_records(all_data_json)
|
18 |
print(f'dataframe created: {df.shape}')
|
19 |
+
|
20 |
+
# calculate the average score for selected benchmarks
|
21 |
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
|
22 |
+
df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
|
23 |
df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
|
24 |
df.reset_index(inplace=True)
|
25 |
+
|
26 |
_cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
|
27 |
df = df[_cols].round(decimals=2)
|
28 |
|
29 |
# filter out if any of the benchmarks have not been produced
|
30 |
df = df[has_no_nan_values(df, _benchmark_cols)]
|
31 |
+
return df
|
32 |
|
33 |
|
34 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
tests/src/test_populate.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from src.populate import get_leaderboard_df
|
|
|
2 |
from pathlib import Path
|
3 |
|
4 |
cur_fp = Path(__file__)
|
@@ -9,7 +10,8 @@ def test_get_leaderboard_df():
|
|
9 |
results_path = cur_fp.parents[1] / "toydata" / "test_results"
|
10 |
cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
|
11 |
benchmark_cols = ['wiki_en', 'wiki_zh',]
|
12 |
-
raw_data
|
|
|
13 |
assert df.shape[0] == 2
|
14 |
# the results contain only one embedding model
|
15 |
for i in range(2):
|
|
|
1 |
from src.populate import get_leaderboard_df
|
2 |
+
from src.leaderboard.read_evals import get_raw_eval_results
|
3 |
from pathlib import Path
|
4 |
|
5 |
cur_fp = Path(__file__)
|
|
|
10 |
results_path = cur_fp.parents[1] / "toydata" / "test_results"
|
11 |
cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
|
12 |
benchmark_cols = ['wiki_en', 'wiki_zh',]
|
13 |
+
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
+
df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'qa', 'ndcg_at_1')
|
15 |
assert df.shape[0] == 2
|
16 |
# the results contain only one embedding model
|
17 |
for i in range(2):
|
utils.py
CHANGED
@@ -2,6 +2,10 @@ import pandas as pd
|
|
2 |
|
3 |
from src.display.utils import AutoEvalColumnQA, COLS
|
4 |
from src.benchmarks import BENCHMARK_COLS_QA, BenchmarksQA
|
|
|
|
|
|
|
|
|
5 |
|
6 |
|
7 |
def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame:
|
@@ -68,4 +72,23 @@ def update_table(
|
|
68 |
filtered_df = filter_models(hidden_df, reranking_query)
|
69 |
filtered_df = filter_queries(query, filtered_df)
|
70 |
df = select_columns(filtered_df, domains, langs)
|
71 |
-
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
from src.display.utils import AutoEvalColumnQA, COLS
|
4 |
from src.benchmarks import BENCHMARK_COLS_QA, BenchmarksQA
|
5 |
+
from src.leaderboard.read_evals import FullEvalResult
|
6 |
+
from typing import List
|
7 |
+
from src.populate import get_leaderboard_df
|
8 |
+
from src.display.utils import COLS, QA_BENCHMARK_COLS
|
9 |
|
10 |
|
11 |
def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame:
|
|
|
72 |
filtered_df = filter_models(hidden_df, reranking_query)
|
73 |
filtered_df = filter_queries(query, filtered_df)
|
74 |
df = select_columns(filtered_df, domains, langs)
|
75 |
+
return df
|
76 |
+
|
77 |
+
|
78 |
+
def update_metric(
|
79 |
+
raw_data: List[FullEvalResult],
|
80 |
+
metric: str,
|
81 |
+
domains: list,
|
82 |
+
langs: list,
|
83 |
+
reranking_model: list,
|
84 |
+
query: str,
|
85 |
+
) -> pd.DataFrame:
|
86 |
+
leaderboard_df = get_leaderboard_df(raw_data, COLS, QA_BENCHMARK_COLS, task='qa', metric=metric)
|
87 |
+
hidden_df = leaderboard_df
|
88 |
+
return update_table(
|
89 |
+
hidden_df,
|
90 |
+
domains,
|
91 |
+
langs,
|
92 |
+
reranking_model,
|
93 |
+
query
|
94 |
+
)
|