Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
a3d4c8d
1 Parent(s): 98e75e7

refactor: reformat

Browse files
app.py CHANGED
@@ -6,6 +6,7 @@ from huggingface_hub import snapshot_download
6
 
7
  from src.about import BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
8
  from src.benchmarks import LongDocBenchmarks, QABenchmarks
 
9
  from src.components import (
10
  get_anonymous_checkbox,
11
  get_domain_dropdown,
@@ -31,7 +32,6 @@ from src.envs import (
31
  RESULTS_REPO,
32
  TOKEN,
33
  )
34
- from src.columns import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL
35
  from src.loaders import load_eval_results
36
  from src.models import TaskType, model_hyperlink
37
  from src.utils import remove_html, reset_rank, set_listeners, submit_results, update_metric, upload_file
@@ -81,7 +81,7 @@ def update_qa_metric(
81
  reranking_model,
82
  query,
83
  show_anonymous,
84
- show_revision_and_timestamp
85
  )
86
 
87
 
@@ -173,7 +173,9 @@ with demo:
173
  # shown_table
174
  qa_df_elem_ret_rerank = get_leaderboard_table(datastore.qa_fmt_df, datastore.qa_types)
175
  # Dummy leaderboard for handling the case when the user uses backspace key
176
- qa_df_elem_ret_rerank_hidden = get_leaderboard_table(datastore.qa_raw_df, datastore.qa_types, visible=False)
 
 
177
 
178
  version.change(
179
  update_qa_version,
@@ -214,14 +216,24 @@ with demo:
214
  qa_df_elem_ret = get_leaderboard_table(_qa_df_ret, datastore.qa_types)
215
 
216
  # Dummy leaderboard for handling the case when the user uses backspace key
217
- _qa_df_ret_hidden = datastore.qa_raw_df[datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
 
 
218
  _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
219
- qa_df_elem_ret_hidden = get_leaderboard_table(_qa_df_ret_hidden, datastore.qa_types, visible=False)
 
 
220
 
221
  version.change(
222
  update_qa_version,
223
  version,
224
- [domains, langs, models_ret, qa_df_elem_ret, qa_df_elem_ret_hidden, ],
 
 
 
 
 
 
225
  )
226
 
227
  set_listeners(
@@ -253,13 +265,9 @@ with demo:
253
  )
254
 
255
  with gr.TabItem("Reranking Only", id=12):
256
- _qa_df_rerank = datastore.qa_fmt_df[
257
- datastore.qa_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
258
- ]
259
  _qa_df_rerank = reset_rank(_qa_df_rerank)
260
- qa_rerank_models = (
261
- _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
262
- )
263
  with gr.Row():
264
  with gr.Column(scale=1):
265
  qa_models_rerank = get_reranking_dropdown(qa_rerank_models)
@@ -269,7 +277,7 @@ with demo:
269
 
270
  _qa_df_rerank_hidden = datastore.qa_raw_df[
271
  datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
272
- ]
273
  _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
274
  qa_df_elem_rerank_hidden = get_leaderboard_table(
275
  _qa_df_rerank_hidden, datastore.qa_types, visible=False
@@ -333,9 +341,7 @@ with demo:
333
  with gr.Column():
334
  models = get_reranking_dropdown(datastore.reranking_models)
335
 
336
- doc_df_elem_ret_rerank = get_leaderboard_table(
337
- datastore.doc_fmt_df, datastore.doc_types
338
- )
339
 
340
  # Dummy leaderboard for handling the case when the user uses backspace key
341
  doc_df_elem_ret_rerank_hidden = get_leaderboard_table(
@@ -385,15 +391,13 @@ with demo:
385
 
386
  _doc_df_ret = datastore.doc_fmt_df[
387
  datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
388
- ]
389
  _doc_df_ret = reset_rank(_doc_df_ret)
390
- doc_df_elem_ret = get_leaderboard_table(
391
- _doc_df_ret, datastore.doc_types
392
- )
393
 
394
  _doc_df_ret_hidden = datastore.doc_raw_df[
395
  datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
396
- ]
397
  _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
398
  doc_df_elem_ret_hidden = get_leaderboard_table(
399
  _doc_df_ret_hidden, datastore.doc_types, visible=False
@@ -435,22 +439,20 @@ with demo:
435
  with gr.TabItem("Reranking Only", id=22):
436
  _doc_df_rerank = datastore.doc_fmt_df[
437
  datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
438
- ]
439
  _doc_df_rerank = reset_rank(_doc_df_rerank)
440
  doc_rerank_models = (
441
  _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
442
  )
443
  with gr.Row():
444
  with gr.Column(scale=1):
445
- doc_models_rerank = get_reranking_dropdown(
446
- doc_rerank_models
447
- )
448
  with gr.Column(scale=1):
449
  doc_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
450
  doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
451
  _doc_df_rerank_hidden = datastore.doc_raw_df[
452
  datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
453
- ]
454
  _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
455
  doc_df_elem_rerank_hidden = get_leaderboard_table(
456
  _doc_df_rerank_hidden, datastore.doc_types, visible=False
 
6
 
7
  from src.about import BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
8
  from src.benchmarks import LongDocBenchmarks, QABenchmarks
9
+ from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
10
  from src.components import (
11
  get_anonymous_checkbox,
12
  get_domain_dropdown,
 
32
  RESULTS_REPO,
33
  TOKEN,
34
  )
 
35
  from src.loaders import load_eval_results
36
  from src.models import TaskType, model_hyperlink
37
  from src.utils import remove_html, reset_rank, set_listeners, submit_results, update_metric, upload_file
 
81
  reranking_model,
82
  query,
83
  show_anonymous,
84
+ show_revision_and_timestamp,
85
  )
86
 
87
 
 
173
  # shown_table
174
  qa_df_elem_ret_rerank = get_leaderboard_table(datastore.qa_fmt_df, datastore.qa_types)
175
  # Dummy leaderboard for handling the case when the user uses backspace key
176
+ qa_df_elem_ret_rerank_hidden = get_leaderboard_table(
177
+ datastore.qa_raw_df, datastore.qa_types, visible=False
178
+ )
179
 
180
  version.change(
181
  update_qa_version,
 
216
  qa_df_elem_ret = get_leaderboard_table(_qa_df_ret, datastore.qa_types)
217
 
218
  # Dummy leaderboard for handling the case when the user uses backspace key
219
+ _qa_df_ret_hidden = datastore.qa_raw_df[
220
+ datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
221
+ ]
222
  _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
223
+ qa_df_elem_ret_hidden = get_leaderboard_table(
224
+ _qa_df_ret_hidden, datastore.qa_types, visible=False
225
+ )
226
 
227
  version.change(
228
  update_qa_version,
229
  version,
230
+ [
231
+ domains,
232
+ langs,
233
+ models_ret,
234
+ qa_df_elem_ret,
235
+ qa_df_elem_ret_hidden,
236
+ ],
237
  )
238
 
239
  set_listeners(
 
265
  )
266
 
267
  with gr.TabItem("Reranking Only", id=12):
268
+ _qa_df_rerank = datastore.qa_fmt_df[datastore.qa_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
 
 
269
  _qa_df_rerank = reset_rank(_qa_df_rerank)
270
+ qa_rerank_models = _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
 
 
271
  with gr.Row():
272
  with gr.Column(scale=1):
273
  qa_models_rerank = get_reranking_dropdown(qa_rerank_models)
 
277
 
278
  _qa_df_rerank_hidden = datastore.qa_raw_df[
279
  datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
280
+ ]
281
  _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
282
  qa_df_elem_rerank_hidden = get_leaderboard_table(
283
  _qa_df_rerank_hidden, datastore.qa_types, visible=False
 
341
  with gr.Column():
342
  models = get_reranking_dropdown(datastore.reranking_models)
343
 
344
+ doc_df_elem_ret_rerank = get_leaderboard_table(datastore.doc_fmt_df, datastore.doc_types)
 
 
345
 
346
  # Dummy leaderboard for handling the case when the user uses backspace key
347
  doc_df_elem_ret_rerank_hidden = get_leaderboard_table(
 
391
 
392
  _doc_df_ret = datastore.doc_fmt_df[
393
  datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
394
+ ]
395
  _doc_df_ret = reset_rank(_doc_df_ret)
396
+ doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)
 
 
397
 
398
  _doc_df_ret_hidden = datastore.doc_raw_df[
399
  datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
400
+ ]
401
  _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
402
  doc_df_elem_ret_hidden = get_leaderboard_table(
403
  _doc_df_ret_hidden, datastore.doc_types, visible=False
 
439
  with gr.TabItem("Reranking Only", id=22):
440
  _doc_df_rerank = datastore.doc_fmt_df[
441
  datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
442
+ ]
443
  _doc_df_rerank = reset_rank(_doc_df_rerank)
444
  doc_rerank_models = (
445
  _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
446
  )
447
  with gr.Row():
448
  with gr.Column(scale=1):
449
+ doc_models_rerank = get_reranking_dropdown(doc_rerank_models)
 
 
450
  with gr.Column(scale=1):
451
  doc_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
452
  doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
453
  _doc_df_rerank_hidden = datastore.doc_raw_df[
454
  datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
455
+ ]
456
  _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
457
  doc_df_elem_rerank_hidden = get_leaderboard_table(
458
  _doc_df_rerank_hidden, datastore.doc_types, visible=False
src/loaders.py CHANGED
@@ -1,14 +1,10 @@
1
  import os.path
2
- from typing import List, Dict
3
 
4
  import pandas as pd
5
 
6
- from src.envs import (
7
- BENCHMARK_VERSION_LIST,
8
- DEFAULT_METRIC_LONG_DOC,
9
- DEFAULT_METRIC_QA,
10
- )
11
- from src.columns import COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
12
  from src.models import FullEvalResult, LeaderboardDataStore, TaskType
13
  from src.utils import get_default_cols, get_leaderboard_df
14
 
@@ -80,8 +76,9 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
80
  datastore.doc_fmt_df = datastore.doc_fmt_df[~datastore.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
81
  datastore.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
82
 
83
- datastore.reranking_models = \
84
- sorted(list(frozenset([eval_result.reranking_model for eval_result in datastore.raw_data])))
 
85
  return datastore
86
 
87
 
 
1
  import os.path
2
+ from typing import Dict, List
3
 
4
  import pandas as pd
5
 
6
+ from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
7
+ from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
 
 
 
 
8
  from src.models import FullEvalResult, LeaderboardDataStore, TaskType
9
  from src.utils import get_default_cols, get_leaderboard_df
10
 
 
76
  datastore.doc_fmt_df = datastore.doc_fmt_df[~datastore.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
77
  datastore.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
78
 
79
+ datastore.reranking_models = sorted(
80
+ list(frozenset([eval_result.reranking_model for eval_result in datastore.raw_data]))
81
+ )
82
  return datastore
83
 
84
 
src/models.py CHANGED
@@ -1,14 +1,20 @@
1
  import json
2
- from enum import Enum
3
-
4
  from collections import defaultdict
5
  from dataclasses import dataclass
 
6
  from typing import List, Optional
7
 
8
  import pandas as pd
9
 
10
- from src.columns import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
11
- COL_NAME_RERANKING_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 
 
 
 
 
 
 
12
 
13
 
14
  def get_safe_name(name: str):
@@ -16,6 +22,7 @@ def get_safe_name(name: str):
16
  name = name.replace("-", "_")
17
  return "".join(character.lower() for character in name if (character.isalnum() or character == "_"))
18
 
 
19
  @dataclass
20
  class EvalResult:
21
  """
 
1
  import json
 
 
2
  from collections import defaultdict
3
  from dataclasses import dataclass
4
+ from enum import Enum
5
  from typing import List, Optional
6
 
7
  import pandas as pd
8
 
9
+ from src.columns import (
10
+ COL_NAME_IS_ANONYMOUS,
11
+ COL_NAME_RERANKING_MODEL,
12
+ COL_NAME_RERANKING_MODEL_LINK,
13
+ COL_NAME_RETRIEVAL_MODEL,
14
+ COL_NAME_RETRIEVAL_MODEL_LINK,
15
+ COL_NAME_REVISION,
16
+ COL_NAME_TIMESTAMP,
17
+ )
18
 
19
 
20
  def get_safe_name(name: str):
 
22
  name = name.replace("-", "_")
23
  return "".join(character.lower() for character in name if (character.isalnum() or character == "_"))
24
 
25
+
26
  @dataclass
27
  class EvalResult:
28
  """
src/utils.py CHANGED
@@ -6,16 +6,20 @@ from pathlib import Path
6
 
7
  import pandas as pd
8
 
9
- from src.models import TaskType
10
  from src.benchmarks import LongDocBenchmarks, QABenchmarks
11
- from src.columns import get_default_col_names_and_types, get_fixed_col_names_and_types, COL_NAME_AVG, \
12
- COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
13
- COL_NAME_IS_ANONYMOUS
14
- from src.envs import (
15
- API,
16
- LATEST_BENCHMARK_VERSION,
17
- SEARCH_RESULTS_REPO,
 
 
 
18
  )
 
 
19
 
20
 
21
  def calculate_mean(row):
@@ -200,7 +204,7 @@ def update_metric(
200
  elif task == TaskType.long_doc:
201
  update_func = update_doc_df_elem
202
  else:
203
- raise NotImplemented
204
  df_elem = get_leaderboard_df(datastore, task=task, metric=metric)
205
  version = datastore.version
206
  return update_func(
 
6
 
7
  import pandas as pd
8
 
 
9
  from src.benchmarks import LongDocBenchmarks, QABenchmarks
10
+ from src.columns import (
11
+ COL_NAME_AVG,
12
+ COL_NAME_IS_ANONYMOUS,
13
+ COL_NAME_RANK,
14
+ COL_NAME_RERANKING_MODEL,
15
+ COL_NAME_RETRIEVAL_MODEL,
16
+ COL_NAME_REVISION,
17
+ COL_NAME_TIMESTAMP,
18
+ get_default_col_names_and_types,
19
+ get_fixed_col_names_and_types,
20
  )
21
+ from src.envs import API, LATEST_BENCHMARK_VERSION, SEARCH_RESULTS_REPO
22
+ from src.models import TaskType
23
 
24
 
25
  def calculate_mean(row):
 
204
  elif task == TaskType.long_doc:
205
  update_func = update_doc_df_elem
206
  else:
207
+ raise NotImplementedError
208
  df_elem = get_leaderboard_df(datastore, task=task, metric=metric)
209
  version = datastore.version
210
  return update_func(
tests/src/display/test_utils.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  from src.display.utils import (
3
  COLS_LONG_DOC,
4
  COLS_QA,
 
 
1
  from src.display.utils import (
2
  COLS_LONG_DOC,
3
  COLS_QA,
tests/test_utils.py CHANGED
@@ -2,8 +2,15 @@ import pandas as pd
2
  import pytest
3
 
4
  from app import update_table
5
- from src.columns import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
6
- COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 
 
 
 
 
 
 
7
  from src.utils import (
8
  filter_models,
9
  filter_queries,
 
2
  import pytest
3
 
4
  from app import update_table
5
+ from src.columns import (
6
+ COL_NAME_AVG,
7
+ COL_NAME_IS_ANONYMOUS,
8
+ COL_NAME_RANK,
9
+ COL_NAME_RERANKING_MODEL,
10
+ COL_NAME_RETRIEVAL_MODEL,
11
+ COL_NAME_REVISION,
12
+ COL_NAME_TIMESTAMP,
13
+ )
14
  from src.utils import (
15
  filter_models,
16
  filter_queries,