Spaces: Running on CPU Upgrade

Commit 257f64d by hanhainebula
Parent(s): 30f9433
update part code for v24.05

- add dependency air-benchmark>=0.0.4
- update benchmarks.py
- update about.py
- update layout of leaderboard

Files changed:
- app.py (+270, -266)
- requirements.txt (+1, -0)
- src/about.py (+3, -3)
- src/benchmarks.py (+2, -62)
app.py
CHANGED
@@ -131,303 +131,307 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("Results", elem_id="results-tab-table"):
             with gr.Row():
+                selected_version = get_version_dropdown(BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION)
+
+            with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
+                with gr.Row():
+                    with gr.Column(min_width=320):
+                        # select domain
+                        with gr.Row():
+                            selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
+                        # select language
+                        with gr.Row():
+                            selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
+
+                    with gr.Column():
+                        with gr.Row():
+                            selected_version = get_version_dropdown()
+                        # select the metric
+                        selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
+                        with gr.Row():
+                            show_anonymous = get_anonymous_checkbox()
+                        with gr.Row():
+                            show_revision_and_timestamp = get_revision_and_ts_checkbox()
+                with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
+                    with gr.TabItem("Retrieval + Reranking", id=10):
+                        with gr.Row():
+                            # search retrieval models
+                            with gr.Column():
+                                search_bar = get_search_bar()
+                            # select reranking models
+                            with gr.Column():
+                                selected_rerankings = get_reranking_dropdown(reranking_models)
+                        leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
+                        # Dummy leaderboard for handling the case when the user uses backspace key
+                        hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
+
+                        set_listeners(
+                            "qa",
+                            leaderboard_table,
+                            hidden_leaderboard_table_for_search,
+                            search_bar,
                             selected_domains,
                             selected_langs,
                             selected_rerankings,
                             show_anonymous,
                             show_revision_and_timestamp,
+                        )
+
+                        # set metric listener
+                        selected_metric.change(
+                            update_metric_qa,
+                            [
+                                selected_metric,
+                                selected_domains,
+                                selected_langs,
+                                selected_rerankings,
+                                search_bar,
+                                show_anonymous,
+                                show_revision_and_timestamp,
+                            ],
+                            leaderboard_table,
+                            queue=True
+                        )
+                    with gr.TabItem("Retrieval Only", id=11):
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                search_bar_retriever = get_search_bar()
+                            with gr.Column(scale=1):
+                                selected_noreranker = get_noreranking_dropdown()
+                        lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                        lb_df_retriever = reset_rank(lb_df_retriever)
+                        lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
+                        # Dummy leaderboard for handling the case when the user uses backspace key
+                        hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                        hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
+                        hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
+
+                        set_listeners(
+                            "qa",
+                            lb_table_retriever,
+                            hidden_lb_table_retriever,
+                            search_bar_retriever,
                             selected_domains,
                             selected_langs,
                             selected_noreranker,
                             show_anonymous,
                             show_revision_and_timestamp,
+                        )
+
+                        # set metric listener
+                        selected_metric.change(
+                            update_metric_qa,
+                            [
+                                selected_metric,
+                                selected_domains,
+                                selected_langs,
+                                selected_noreranker,
+                                search_bar_retriever,
+                                show_anonymous,
+                                show_revision_and_timestamp,
+                            ],
+                            lb_table_retriever,
+                            queue=True
+                        )
+                    with gr.TabItem("Reranking Only", id=12):
+                        lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                        lb_df_reranker = reset_rank(lb_df_reranker)
+                        reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
+                            with gr.Column(scale=1):
+                                search_bar_reranker = gr.Textbox(show_label=False, visible=False)
+                        lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
+                        hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                        hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
+                        hidden_lb_table_reranker = get_leaderboard_table(
+                            hidden_lb_df_reranker, types_qa, visible=False
+                        )
+
+                        set_listeners(
+                            "qa",
+                            lb_table_reranker,
+                            hidden_lb_table_reranker,
+                            search_bar_reranker,
                             selected_domains,
                             selected_langs,
                             selected_rerankings_reranker,
                             show_anonymous,
                             show_revision_and_timestamp,
                         )
+                        # set metric listener
+                        selected_metric.change(
+                            update_metric_qa,
+                            [
+                                selected_metric,
+                                selected_domains,
+                                selected_langs,
+                                selected_rerankings_reranker,
+                                search_bar_reranker,
+                                show_anonymous,
+                                show_revision_and_timestamp,
+                            ],
+                            lb_table_reranker,
+                            queue=True
+                        )
+            with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
+                with gr.Row():
+                    with gr.Column(min_width=320):
+                        # select domain
+                        with gr.Row():
+                            selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
+                        # select language
+                        with gr.Row():
+                            selected_langs = get_language_dropdown(
+                                LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
+                            )
+                    with gr.Column():
+                        with gr.Row():
+                            selected_version = get_version_dropdown()
+                        # select the metric
+                        with gr.Row():
+                            selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
+                        with gr.Row():
+                            show_anonymous = get_anonymous_checkbox()
+                        with gr.Row():
+                            show_revision_and_timestamp = get_revision_and_ts_checkbox()
+                with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
+                    with gr.TabItem("Retrieval + Reranking", id=20):
+                        with gr.Row():
+                            with gr.Column():
+                                search_bar = get_search_bar()
+                            # select reranking model
+                            with gr.Column():
+                                selected_rerankings = get_reranking_dropdown(reranking_models)
+
+                        lb_table = get_leaderboard_table(
+                            leaderboard_df_long_doc, types_long_doc
+                        )
+
+                        # Dummy leaderboard for handling the case when the user uses backspace key
+                        hidden_lb_table_for_search = get_leaderboard_table(
+                            original_df_long_doc, types_long_doc, visible=False
+                        )
+
+                        set_listeners(
+                            "long-doc",
+                            lb_table,
+                            hidden_lb_table_for_search,
+                            search_bar,
                             selected_domains,
                             selected_langs,
                             selected_rerankings,
                             show_anonymous,
+                            show_revision_and_timestamp,
+                        )
+
+                        # set metric listener
+                        selected_metric.change(
+                            update_metric_long_doc,
+                            [
+                                selected_metric,
+                                selected_domains,
+                                selected_langs,
+                                selected_rerankings,
+                                search_bar,
+                                show_anonymous,
+                                show_revision_and_timestamp
+                            ],
+                            lb_table,
+                            queue=True
+                        )
+                    with gr.TabItem("Retrieval Only", id=21):
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                search_bar_retriever = get_search_bar()
+                            with gr.Column(scale=1):
+                                selected_noreranker = get_noreranking_dropdown()
+                        lb_df_retriever_long_doc = leaderboard_df_long_doc[
+                            leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                        ]
+                        lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
+                        hidden_lb_db_retriever_long_doc = original_df_long_doc[
+                            original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                        ]
+                        hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
+                        lb_table_retriever_long_doc = get_leaderboard_table(
+                            lb_df_retriever_long_doc, types_long_doc)
+                        hidden_lb_table_retriever_long_doc = get_leaderboard_table(
+                            hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
+                        )
+
+                        set_listeners(
+                            "long-doc",
+                            lb_table_retriever_long_doc,
+                            hidden_lb_table_retriever_long_doc,
+                            search_bar_retriever,
                             selected_domains,
                             selected_langs,
                             selected_noreranker,
                             show_anonymous,
                             show_revision_and_timestamp,
+                        )
+
+                        selected_metric.change(
+                            update_metric_long_doc,
+                            [
+                                selected_metric,
+                                selected_domains,
+                                selected_langs,
+                                selected_noreranker,
+                                search_bar_retriever,
+                                show_anonymous,
+                                show_revision_and_timestamp,
+                            ],
+                            lb_table_retriever_long_doc,
+                            queue=True
+                        )
+                    with gr.TabItem("Reranking Only", id=22):
+                        lb_df_reranker_ldoc = leaderboard_df_long_doc[
+                            leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+                        ]
+                        lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
+                        reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
+                            with gr.Column(scale=1):
+                                search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
+                        lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
+                        hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                        hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
+                        hidden_lb_table_reranker_ldoc = get_leaderboard_table(
+                            hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
+                        )
+
+                        set_listeners(
+                            "long-doc",
+                            lb_table_reranker_ldoc,
+                            hidden_lb_table_reranker_ldoc,
+                            search_bar_reranker_ldoc,
                             selected_domains,
                             selected_langs,
                             selected_rerankings_reranker_ldoc,
                             show_anonymous,
                             show_revision_and_timestamp,
+                        )
+                        selected_metric.change(
+                            update_metric_long_doc,
+                            [
+                                selected_metric,
+                                selected_domains,
+                                selected_langs,
+                                selected_rerankings_reranker_ldoc,
+                                search_bar_reranker_ldoc,
+                                show_anonymous,
+                                show_revision_and_timestamp,
+                            ],
+                            lb_table_reranker_ldoc,
+                            queue=True
+                        )
 
     with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
         with gr.Column():
requirements.txt
CHANGED
@@ -12,3 +12,4 @@ requests>=2.31.0
 tqdm>=4.65.0
 accelerate>=0.24.1
 socksio>=1.0.0
+air-benchmark>=0.0.4
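
The new air-benchmark pin is what makes the `from air_benchmark.tasks.tasks import BenchmarkTable` import in src/benchmarks.py below resolve. A quick sanity check one could run in the Space, assuming the installed distribution name matches the requirement line exactly:

from importlib.metadata import PackageNotFoundError, version

try:
    # Distribution name as it appears in requirements.txt.
    print("air-benchmark version:", version("air-benchmark"))
except PackageNotFoundError:
    raise SystemExit("air-benchmark>=0.0.4 is required but is not installed")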
src/about.py
CHANGED
@@ -1,6 +1,6 @@
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark
-(v0.0.
+(v0.1.0.dev) </h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
@@ -17,14 +17,14 @@ BENCHMARKS_TEXT = f"""
 - A: Yes, we plan to release new datasets on regular basis. However, the update frequency is to be decided.
 
 - Q: As you are using models to do the quality control when generating the data, is it biased to the models that are used?
-- A: Yes, the results is biased to the chosen models. However, we believe the datasets labeled by human are also biased to the human's preference. The key point to verify is whether the model's bias is consistent with the human's. We use our approach to generate test data using the well established MSMARCO datasets. We benchmark different models' performances using the generated dataset and the human-label DEV dataset. Comparing the ranking of different models on these two datasets, we observe the spearman correlation between them is 0.8211 (p-value=5e-5). This indicates that the models' perference is well aligned with the human. Please refer to [here](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/
+- A: Yes, the results is biased to the chosen models. However, we believe the datasets labeled by human are also biased to the human's preference. The key point to verify is whether the model's bias is consistent with the human's. We use our approach to generate test data using the well established MSMARCO datasets. We benchmark different models' performances using the generated dataset and the human-label DEV dataset. Comparing the ranking of different models on these two datasets, we observe the spearman correlation between them is 0.8211 (p-value=5e-5). This indicates that the models' perference is well aligned with the human. Please refer to [here](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/available_analysis_results.md#consistency-with-human-labeled-data) for details.
 
 """
 
 EVALUATION_QUEUE_TEXT = """
 ## Check out the submission steps at [our GitHub repo](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/submit_to_leaderboard.md)
 
-## You can find the **STATUS of Your Submission** at the [Backend Space](https://huggingface.co/spaces/AIR-Bench/leaderboard_backend)
+## You can find the **STATUS of Your Submission** at the [Backend Space](https://huggingface.co/spaces/AIR-Bench/leaderboard_backend)
 
 - If the status is **✔️ Success**, then you can find your results at the [Leaderboard Space](https://huggingface.co/spaces/AIR-Bench/leaderboard) in no more than one hour.
 - If the status is **❌ Failed**, please check your submission steps and try again. If you have any questions, please feel free to open an issue [here](https://github.com/AIR-Bench/AIR-Bench/issues/new).
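
The consistency claim in the FAQ above is a rank correlation: models are ranked once on the generated test set and once on the human-labeled MS MARCO DEV set, and the two rankings are compared with Spearman's rho. A toy illustration with SciPy (the scores below are made up; the 0.8211 figure comes from the authors' full experiment, not from these numbers):

from scipy.stats import spearmanr

# Made-up retrieval scores for the same five models on the two datasets.
scores_generated = [0.71, 0.64, 0.58, 0.55, 0.49]  # generated test set
scores_human_dev = [0.69, 0.66, 0.55, 0.57, 0.46]  # human-labeled DEV set

rho, p_value = spearmanr(scores_generated, scores_human_dev)
print(f"Spearman rho = {rho:.4f} (p-value = {p_value:.2e})")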
src/benchmarks.py
CHANGED
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from enum import Enum
+from air_benchmark.tasks.tasks import BenchmarkTable
 
 
 def get_safe_name(name: str):
@@ -11,67 +12,6 @@ def get_safe_name(name: str):
         if (character.isalnum() or character == '_'))
 
 
-dataset_dict = {
-    "qa": {
-        "wiki": {
-            "en": ["wikipedia_20240101", ],
-            "zh": ["wikipedia_20240101", ]
-        },
-        "web": {
-            "en": ["mC4", ],
-            "zh": ["mC4", ]
-        },
-        "news": {
-            "en": ["CC-News", ],
-            "zh": ["CC-News", ]
-        },
-        "healthcare": {
-            "en": ["PubMedQA", ],
-            "zh": ["Huatuo-26M", ]
-        },
-        "law": {
-            "en": ["pile-of-law", ],
-            # "zh": ["flk_npc_gov_cn", ]
-        },
-        "finance": {
-            "en": ["Reuters-Financial", ],
-            "zh": ["FinCorpus", ]
-        },
-        "arxiv": {
-            "en": ["Arxiv", ]},
-        "msmarco": {
-            "en": ["MS MARCO", ]},
-    },
-    "long-doc": {
-        "arxiv": {
-            "en": ["gpt3", "llama2", "llm-survey", "gemini"],
-        },
-        "book": {
-            "en": [
-                "origin-of-species_darwin",
-                "a-brief-history-of-time_stephen-hawking"
-            ]
-        },
-        "healthcare": {
-            "en": [
-                "pubmed_100k-200k_1",
-                "pubmed_100k-200k_2",
-                "pubmed_100k-200k_3",
-                "pubmed_40k-50k_5-merged",
-                "pubmed_30k-40k_10-merged"
-            ]
-        },
-        "law": {
-            "en": [
-                "lex_files_300k-400k",
-                "lex_files_400k-500k",
-                "lex_files_500k-600k",
-                "lex_files_600k-700k"
-            ]
-        }
-    }
-}
-
 METRIC_LIST = [
     "ndcg_at_1",
     "ndcg_at_3",
@@ -118,7 +58,7 @@ class Benchmark:
 
 qa_benchmark_dict = {}
 long_doc_benchmark_dict = {}
-for task, domain_dict in dataset_dict.items():
+for task, domain_dict in BenchmarkTable['AIR-Bench_24.04'].items():
     for domain, lang_dict in domain_dict.items():
         for lang, dataset_list in lang_dict.items():
             if task == "qa":
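
With the hard-coded dataset_dict gone, the benchmark inventory now comes from the air_benchmark package, keyed by benchmark version. A short sketch that walks the table, relying only on the nesting the rewritten loop above already assumes (task -> domain -> language -> dataset list):

from air_benchmark.tasks.tasks import BenchmarkTable

# Same traversal shape as src/benchmarks.py; prints one summary line per leaf.
for task, domain_dict in BenchmarkTable["AIR-Bench_24.04"].items():
    for domain, lang_dict in domain_dict.items():
        for lang, dataset_list in lang_dict.items():
            print(task, domain, lang, len(dataset_list))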