feat: refactor the data loading function
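Summary of the change: the per-version leaderboard state (raw results, the QA and long-doc DataFrames, and their column types) now lives on the LeaderboardDataStore dataclass, and a new load_eval_results(file_path) function builds one store per benchmark version into a dict keyed by version string. The Gradio blocks then read every DataFrame and types list from data["AIR-Bench_24.04"] instead of module-level variables such as types_qa.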
app.py CHANGED
@@ -77,41 +77,54 @@ def restart_space():
 from dataclasses import dataclass
 import pandas as pd
 from typing import Optional
+
+
 @dataclass
 class LeaderboardDataStore:
     raw_data: Optional[list]
-    … (removed line not shown in the rendered diff)
+    raw_qa_df: Optional[pd.DataFrame]
     original_df_long_doc: Optional[pd.DataFrame]
     leaderboard_df_qa: Optional[pd.DataFrame]
     leaderboard_df_long_doc: Optional[pd.DataFrame]
     reranking_models: Optional[list]
-    … (27 removed lines not shown in the rendered diff)
+    types_qa: Optional[list]
+    types_long_doc: Optional[list]
+
+
+def load_eval_results(file_path: str):
+    output = {}
+    versions = ("AIR-Bench_24.04",)
+    for version in versions:
+        output[version] = LeaderboardDataStore(None, None, None, None, None, None, None, None)
+        output[version].raw_data = get_raw_eval_results(f"{file_path}/{version}")
+        output[version].raw_qa_df = get_leaderboard_df(
+            output[version].raw_data, task='qa', metric=DEFAULT_METRIC_QA)
+        output[version].original_df_long_doc = get_leaderboard_df(
+            output[version].raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+        print(f'raw data: {len(output[version].raw_data)}')
+        print(f'QA data loaded: {output[version].raw_qa_df.shape}')
+        print(f'Long-Doc data loaded: {len(output[version].original_df_long_doc)}')
+
+        output[version].leaderboard_df_qa = output[version].raw_qa_df.copy()
+        # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
+        shown_columns_qa, types_qa = get_default_cols(
+            'qa', output[version].leaderboard_df_qa.columns, add_fix_cols=True)
+        output[version].types_qa = types_qa
+        output[version].leaderboard_df_qa = output[version].leaderboard_df_qa[~output[version].leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
+        output[version].leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+        output[version].leaderboard_df_long_doc = output[version].original_df_long_doc.copy()
+        shown_columns_long_doc, types_long_doc = get_default_cols(
+            'long-doc', output[version].leaderboard_df_long_doc.columns, add_fix_cols=True)
+        output[version].types_long_doc = types_long_doc
+        output[version].leaderboard_df_long_doc = output[version].leaderboard_df_long_doc[~output[version].leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
+        output[version].leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+        output[version].reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in output[version].raw_data])))
+    return output
+
+
+data = load_eval_results(EVAL_RESULTS_PATH)
 
 def update_metric_qa(
     metric: str,

@@ -173,9 +186,9 @@ with demo:
             # select reranking models
             with gr.Column():
                 selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
-            leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, types_qa)
+            leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, data["AIR-Bench_24.04"].types_qa)
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].
+            hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_qa_df, data["AIR-Bench_24.04"].types_qa, visible=False)
 
             set_listeners(
                 "qa",

@@ -212,11 +225,11 @@ with demo:
                 selected_noreranker = get_noreranking_dropdown()
             lb_df_retriever = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
             lb_df_retriever = reset_rank(lb_df_retriever)
-            lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
+            lb_table_retriever = get_leaderboard_table(lb_df_retriever, data["AIR-Bench_24.04"].types_qa)
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_lb_df_retriever = data["AIR-Bench_24.04"].
+            hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
             hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
-            hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
+            hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, data["AIR-Bench_24.04"].types_qa, visible=False)
 
             set_listeners(
                 "qa",

@@ -254,11 +267,11 @@ with demo:
                 selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
             with gr.Column(scale=1):
                 search_bar_reranker = gr.Textbox(show_label=False, visible=False)
-            lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
-            hidden_lb_df_reranker = data["AIR-Bench_24.04"].
+            lb_table_reranker = get_leaderboard_table(lb_df_reranker, data["AIR-Bench_24.04"].types_qa)
+            hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
             hidden_lb_table_reranker = get_leaderboard_table(
-                hidden_lb_df_reranker, types_qa, visible=False
+                hidden_lb_df_reranker, data["AIR-Bench_24.04"].types_qa, visible=False
             )
 
             set_listeners(

@@ -316,12 +329,12 @@ with demo:
             selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
 
             lb_table = get_leaderboard_table(
-                data["AIR-Bench_24.04"].leaderboard_df_long_doc, types_long_doc
+                data["AIR-Bench_24.04"].leaderboard_df_long_doc, data["AIR-Bench_24.04"].types_long_doc
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_lb_table_for_search = get_leaderboard_table(
-                data["AIR-Bench_24.04"].original_df_long_doc, types_long_doc, visible=False
+                data["AIR-Bench_24.04"].original_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
             )
 
             set_listeners(

@@ -366,9 +379,9 @@ with demo:
             ]
             hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
             lb_table_retriever_long_doc = get_leaderboard_table(
-                lb_df_retriever_long_doc, types_long_doc)
+                lb_df_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc)
             hidden_lb_table_retriever_long_doc = get_leaderboard_table(
-                hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
+                hidden_lb_db_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
             )
 
             set_listeners(

@@ -408,11 +421,11 @@ with demo:
                 selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
             with gr.Column(scale=1):
                 search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
-            lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
+            lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc)
             hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
             hidden_lb_table_reranker_ldoc = get_leaderboard_table(
-                hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
+                hidden_lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc, visible=False
             )
 
             set_listeners(
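For reviewers who want to see the shape of the new loading path in isolation, here is a minimal, self-contained sketch of the same pattern: one dataclass instance per benchmark version, populated by a single loader and returned in a dict keyed by version string. The helpers here (get_raw_eval_results, get_leaderboard_df, get_default_cols) are hypothetical stand-ins with toy data, not the implementations in this repo; only their call shapes mirror the diff.

from dataclasses import dataclass
from typing import Optional

import pandas as pd


@dataclass
class LeaderboardDataStore:
    # Subset of the fields the commit keeps/adds on the real dataclass.
    raw_data: Optional[list] = None
    raw_qa_df: Optional[pd.DataFrame] = None
    leaderboard_df_qa: Optional[pd.DataFrame] = None
    types_qa: Optional[list] = None


# Hypothetical stand-ins for the real helpers (toy data only).
def get_raw_eval_results(path: str) -> list:
    return [
        {"retrieval_model": "bge-m3", "reranking_model": "NoReranker", "score": 0.61},
        {"retrieval_model": "bge-m3", "reranking_model": "bge-reranker-v2-m3", "score": 0.68},
    ]


def get_leaderboard_df(raw_data: list) -> pd.DataFrame:
    return pd.DataFrame(raw_data)


def get_default_cols(columns) -> tuple:
    cols = list(columns)
    types = ["number" if c == "score" else "str" for c in cols]
    return cols, types


def load_eval_results(file_path: str) -> dict:
    output = {}
    for version in ("AIR-Bench_24.04",):  # one LeaderboardDataStore per version
        store = LeaderboardDataStore()
        store.raw_data = get_raw_eval_results(f"{file_path}/{version}")
        store.raw_qa_df = get_leaderboard_df(store.raw_data)
        # Keep the unfiltered frame around; the displayed frame is a derived copy.
        store.leaderboard_df_qa = store.raw_qa_df.copy()
        shown_cols, store.types_qa = get_default_cols(store.leaderboard_df_qa.columns)
        store.leaderboard_df_qa = store.leaderboard_df_qa[shown_cols]
        output[version] = store
    return output


if __name__ == "__main__":
    data = load_eval_results("/tmp/eval_results")
    store = data["AIR-Bench_24.04"]
    print(store.leaderboard_df_qa)
    print(store.types_qa)

Storing types_qa and types_long_doc on the store, rather than leaking them from the loader as module-level variables, is what lets every table constructor in the UI spell out data["AIR-Bench_24.04"].types_qa; with that in place, loading a second benchmark release should mostly be a matter of extending the versions tuple, though the "AIR-Bench_24.04" keys hard-coded in the UI blocks would still need the same treatment.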
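The repeated "Dummy leaderboard for handling the case when the user uses backspace key" comment refers to a common Gradio Blocks pattern: the visible table only ever shows filtered rows, so a second, hidden table holds the unfiltered frame and each search callback re-filters from it (otherwise deleting characters in the search box could never bring rows back). A minimal sketch of that pattern, independent of this app's set_listeners helper (component names here are illustrative):

import gradio as gr
import pandas as pd

full_df = pd.DataFrame({
    "Model": ["bge-m3", "e5-mistral", "jina-v2"],
    "Score": [0.61, 0.64, 0.58],
})


def filter_rows(query: str, hidden: pd.DataFrame) -> pd.DataFrame:
    # Always filter the hidden, unfiltered copy, never the visible table:
    # that is what makes backspacing in the search box restore rows.
    if not query:
        return hidden
    return hidden[hidden["Model"].str.contains(query, case=False)]


with gr.Blocks() as demo:
    search = gr.Textbox(show_label=False, placeholder="Search models")
    visible_table = gr.Dataframe(value=full_df)
    hidden_table = gr.Dataframe(value=full_df, visible=False)  # search source
    search.change(filter_rows, inputs=[search, hidden_table], outputs=visible_table)

if __name__ == "__main__":
    demo.launch()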