AIR-Bench Space — commit 1f17567 (1 parent: 34b8881), committed by nan

style: reformat the styles

Makefile CHANGED
@@ -3,14 +3,20 @@
 
 style:
 	python -m black --line-length 119 .
+	python -m black --line-length 119 src
 	python -m isort .
+	python -m isort src
 	ruff check --fix .
+	ruff check --fix src
 
 
 quality:
 	python -m black --check --line-length 119 .
+	python -m black --check --line-length 119 src
 	python -m isort --check-only .
+	python -m isort --check-only src
 	ruff check .
+	ruff check src
 
 
 test:
app.py CHANGED
@@ -63,13 +63,13 @@ datastore = ds_dict[LATEST_BENCHMARK_VERSION]
 
 
 def update_qa_metric(
-    metric: str,
-    domains: list,
-    langs: list,
-    reranking_model: list,
-    query: str,
-    show_anonymous: bool,
-    show_revision_and_timestamp: bool,
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool,
 ):
     global datastore
     return update_metric(
@@ -86,13 +86,13 @@ def update_qa_metric(
 
 
 def update_doc_metric(
-    metric: str,
-    domains: list,
-    langs: list,
-    reranking_model: list,
-    query: str,
-    show_anonymous: bool,
-    show_revision_and_timestamp,
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp,
 ):
     global datastore
     return update_metric(
@@ -218,7 +218,7 @@ with demo:
             # Dummy leaderboard for handling the case when the user uses backspace key
             _qa_df_ret_hidden = datastore.qa_raw_df[
                 datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
-            ]
+            ]
             _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
             qa_df_elem_ret_hidden = get_leaderboard_table(
                 _qa_df_ret_hidden, datastore.qa_types, visible=False
@@ -277,7 +277,7 @@ with demo:
 
             _qa_df_rerank_hidden = datastore.qa_raw_df[
                 datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-            ]
+            ]
             _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
             qa_df_elem_rerank_hidden = get_leaderboard_table(
                 _qa_df_rerank_hidden, datastore.qa_types, visible=False
@@ -391,13 +391,13 @@ with demo:
 
             _doc_df_ret = datastore.doc_fmt_df[
                 datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
-            ]
+            ]
             _doc_df_ret = reset_rank(_doc_df_ret)
             doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)
 
             _doc_df_ret_hidden = datastore.doc_raw_df[
                 datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
-            ]
+            ]
             _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
             doc_df_elem_ret_hidden = get_leaderboard_table(
                 _doc_df_ret_hidden, datastore.doc_types, visible=False
@@ -439,7 +439,7 @@ with demo:
         with gr.TabItem("Reranking Only", id=22):
             _doc_df_rerank = datastore.doc_fmt_df[
                 datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-            ]
+            ]
             _doc_df_rerank = reset_rank(_doc_df_rerank)
             doc_rerank_models = (
                 _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
@@ -452,7 +452,7 @@ with demo:
             doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
             _doc_df_rerank_hidden = datastore.doc_raw_df[
                 datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-            ]
+            ]
             _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
             doc_df_elem_rerank_hidden = get_leaderboard_table(
                 _doc_df_rerank_hidden, datastore.doc_types, visible=False
pyproject.toml CHANGED
@@ -1,9 +1,9 @@
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
+lint.select = ["E", "F"]
+lint.ignore = ["E501"] # line too long (black is taking care of this)
 line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 
 [tool.isort]
 profile = "black"
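
Not part of the commit, but a quick way to sanity-check a config move like this one: the rule lists now live under ruff's lint table while line-length stays at the top level. A minimal, hypothetical check using the standard-library tomllib (Python 3.11+):

    # Illustrative only: verify the relocated ruff settings shown in the diff above.
    import tomllib

    with open("pyproject.toml", "rb") as f:
        config = tomllib.load(f)

    ruff = config["tool"]["ruff"]
    assert ruff["line-length"] == 119            # shared with black via the Makefile targets
    assert ruff["lint"]["select"] == ["E", "F"]
    assert "E501" in ruff["lint"]["ignore"]      # long lines are left to black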
src/benchmarks.py CHANGED
@@ -30,9 +30,7 @@ def get_qa_benchmarks_dict(version: str):
     for metric in dataset_list:
         if "test" not in dataset_list[metric]["splits"]:
             continue
-        benchmark_dict[benchmark_name] = Benchmark(
-            benchmark_name, metric, col_name, domain, lang, task
-        )
+        benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
     return benchmark_dict
 
 
@@ -59,20 +57,14 @@ def get_doc_benchmarks_dict(version: str):
 _qa_benchmark_dict = {}
 for version in BENCHMARK_VERSION_LIST:
     safe_version_name = get_safe_name(version)
-    _qa_benchmark_dict[safe_version_name] = \
-        Enum(
-            f"QABenchmarks_{safe_version_name}",
-            get_qa_benchmarks_dict(version)
-        )
+    _qa_benchmark_dict[safe_version_name] = Enum(f"QABenchmarks_{safe_version_name}", get_qa_benchmarks_dict(version))
 
 _doc_benchmark_dict = {}
 for version in BENCHMARK_VERSION_LIST:
     safe_version_name = get_safe_name(version)
-    _doc_benchmark_dict[safe_version_name] = \
-        Enum(
-            f"LongDocBenchmarks_{safe_version_name}",
-            get_doc_benchmarks_dict(version)
-        )
+    _doc_benchmark_dict[safe_version_name] = Enum(
+        f"LongDocBenchmarks_{safe_version_name}", get_doc_benchmarks_dict(version)
+    )
 
 
 QABenchmarks = Enum("QABenchmarks", _qa_benchmark_dict)
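
As context for the reformatting above (not part of the commit): the per-version benchmark enums are built with the functional Enum API, which takes a class name plus a mapping of member names to values. A minimal sketch, with a hypothetical Benchmark shape inferred from the call shown in the diff:

    from dataclasses import dataclass
    from enum import Enum


    @dataclass(frozen=True)
    class Benchmark:
        # Field order mirrors Benchmark(benchmark_name, metric, col_name, domain, lang, task) above;
        # the real class lives in src/benchmarks.py.
        name: str
        metric: str
        col_name: str
        domain: str
        lang: str
        task: str


    # Functional Enum creation: member names come from the dict keys,
    # member values are the Benchmark instances.
    members = {"wiki_en": Benchmark("wiki_en", "ndcg_at_10", "wiki_en", "wiki", "en", "qa")}
    QABenchmarks_air_bench_2404 = Enum("QABenchmarks_air_bench_2404", members)

    print(QABenchmarks_air_bench_2404.wiki_en.value.metric)  # -> ndcg_at_10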
src/columns.py CHANGED
@@ -19,69 +19,28 @@ class ColumnContent:
 
 def get_default_auto_eval_column_dict():
     auto_eval_column_dict = []
-    auto_eval_column_dict.append(
-        [
-            "rank",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_RANK,
-                "number",
-                True
-            )
-        ]
-    )
+    auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent(COL_NAME_RANK, "number", True)])
     auto_eval_column_dict.append(
         [
             "retrieval_model",
             ColumnContent,
-            ColumnContent(
-                COL_NAME_RETRIEVAL_MODEL,
-                "markdown",
-                True,
-                never_hidden=True
-            )
+            ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, never_hidden=True),
         ]
     )
     auto_eval_column_dict.append(
         [
             "reranking_model",
             ColumnContent,
-            ColumnContent(
-                COL_NAME_RERANKING_MODEL,
-                "markdown",
-                True,
-                never_hidden=True
-            )
+            ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True),
         ]
     )
     auto_eval_column_dict.append(
-        [
-            "revision",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_REVISION,
-                "markdown",
-                True,
-                never_hidden=True
-            )
-        ]
+        ["revision", ColumnContent, ColumnContent(COL_NAME_REVISION, "markdown", True, never_hidden=True)]
     )
     auto_eval_column_dict.append(
-        [
-            "timestamp",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_TIMESTAMP, "date", True, never_hidden=True
-            )
-        ]
-    )
-    auto_eval_column_dict.append(
-        [
-            "average",
-            ColumnContent,
-            ColumnContent(COL_NAME_AVG, "number", True)
-        ]
+        ["timestamp", ColumnContent, ColumnContent(COL_NAME_TIMESTAMP, "date", True, never_hidden=True)]
     )
+    auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)])
     auto_eval_column_dict.append(
         [
             "retrieval_model_link",
@@ -91,7 +50,7 @@ def get_default_auto_eval_column_dict():
             "markdown",
             False,
             hidden=True,
-            )
+            ),
         ]
     )
     auto_eval_column_dict.append(
@@ -103,20 +62,11 @@ def get_default_auto_eval_column_dict():
             "markdown",
             False,
             hidden=True,
-            )
+            ),
         ]
     )
     auto_eval_column_dict.append(
-        [
-            "is_anonymous",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_IS_ANONYMOUS,
-                "bool",
-                False,
-                hidden=True
-            )
-        ]
+        ["is_anonymous", ColumnContent, ColumnContent(COL_NAME_IS_ANONYMOUS, "bool", False, hidden=True)]
     )
     return auto_eval_column_dict
 
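For orientation (illustrative, not from the commit): each appended entry is an [attribute_name, annotation, default ColumnContent] triple, later consumed by make_autoevalcolumn() to build the leaderboard column class. A minimal sketch, with hypothetical field names inferred from the positional and keyword arguments visible in the diff above:

    from dataclasses import dataclass


    @dataclass(frozen=True)
    class ColumnContent:
        # Assumed shape; the real definition lives in src/columns.py.
        name: str
        type: str                   # "number", "markdown", "date", "bool", ...
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False


    auto_eval_column_dict = []
    # Display names taken from the tests later in this commit ("Rank 🏆", "Anonymous Submission").
    auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank 🏆", "number", True)])
    auto_eval_column_dict.append(
        ["is_anonymous", ColumnContent, ColumnContent("Anonymous Submission", "bool", False, hidden=True)]
    )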
src/envs.py CHANGED
@@ -6,7 +6,9 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN", "") # A read/write token for your org
 
-OWNER = "AIR-Bench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = (
+    "AIR-Bench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
src/loaders.py CHANGED
@@ -1,7 +1,6 @@
 import os.path
 from pathlib import Path
-from typing import Union
-from typing import Dict, List
+from typing import Dict, List, Union
 
 import pandas as pd
 
src/models.py CHANGED
@@ -2,7 +2,7 @@ import json
 from collections import defaultdict
 from dataclasses import dataclass
 from enum import Enum
-from typing import List, Optional
+from typing import List
 
 import pandas as pd
 
src/utils.py CHANGED
@@ -118,39 +118,36 @@ def get_selected_cols(task, version_slug, domains, languages):
 
 
 def select_columns(
-    df: pd.DataFrame,
-    domains: list,
-    languages: list,
-    task: TaskType = TaskType.qa,
-    reset_ranking: bool = True,
-    version_slug: str = None,
+    df: pd.DataFrame,
+    domains: list,
+    languages: list,
+    task: TaskType = TaskType.qa,
+    reset_ranking: bool = True,
+    version_slug: str = None,
 ) -> pd.DataFrame:
-    selected_cols = get_selected_cols(
-        task, version_slug, domains, languages)
+    selected_cols = get_selected_cols(task, version_slug, domains, languages)
     fixed_cols, _ = get_fixed_col_names_and_types()
     filtered_df = df[fixed_cols + selected_cols]
     filtered_df.replace({"": pd.NA}, inplace=True)
     if reset_ranking:
-        filtered_df[COL_NAME_AVG] = \
-            filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
-        filtered_df.sort_values(
-            by=[COL_NAME_AVG], ascending=False, inplace=True)
+        filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
+        filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
         filtered_df.reset_index(inplace=True, drop=True)
         filtered_df = reset_rank(filtered_df)
     return filtered_df
 
 
 def _update_df_elem(
-    task: TaskType,
-    version: str,
-    source_df: pd.DataFrame,
-    domains: list,
-    langs: list,
-    reranking_query: list,
-    query: str,
-    show_anonymous: bool,
-    reset_ranking: bool = True,
-    show_revision_and_timestamp: bool = False,
+    task: TaskType,
+    version: str,
+    source_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    reset_ranking: bool = True,
+    show_revision_and_timestamp: bool = False,
 ):
     filtered_df = source_df.copy()
     if not show_anonymous:
@@ -164,15 +161,15 @@ def _update_df_elem(
 
 
 def update_doc_df_elem(
-    version: str,
-    hidden_df: pd.DataFrame,
-    domains: list,
-    langs: list,
-    reranking_query: list,
-    query: str,
-    show_anonymous: bool,
-    show_revision_and_timestamp: bool = False,
-    reset_ranking: bool = True,
+    version: str,
+    hidden_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool = False,
+    reset_ranking: bool = True,
 ):
     return _update_df_elem(
         TaskType.long_doc,
@@ -189,15 +186,15 @@ def update_doc_df_elem(
 
 
 def update_metric(
-    datastore,
-    task: TaskType,
-    metric: str,
-    domains: list,
-    langs: list,
-    reranking_model: list,
-    query: str,
-    show_anonymous: bool = False,
-    show_revision_and_timestamp: bool = False,
+    datastore,
+    task: TaskType,
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool = False,
+    show_revision_and_timestamp: bool = False,
 ) -> pd.DataFrame:
     if task == TaskType.qa:
         update_func = update_qa_df_elem
@@ -253,13 +250,13 @@ def calculate_file_md5(file_path):
 
 
 def submit_results(
-    filepath: str,
-    model: str,
-    model_url: str,
-    reranking_model: str = "",
-    reranking_model_url: str = "",
-    version: str = LATEST_BENCHMARK_VERSION,
-    is_anonymous=False,
+    filepath: str,
+    model: str,
+    model_url: str,
+    reranking_model: str = "",
+    reranking_model_url: str = "",
+    version: str = LATEST_BENCHMARK_VERSION,
+    is_anonymous=False,
 ):
     if not filepath.endswith(".zip"):
         return styled_error(f"file uploading aborted. wrong file type: {filepath}")
@@ -355,11 +352,7 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
         benchmark_cols.append(t.value.col_name)
 
     # filter out the columns that are not in the data
-    df[COL_NAME_AVG] = (
-        df[list(benchmark_cols)]
-        .apply(calculate_mean, axis=1)
-        .round(decimals=2)
-    )
+    df[COL_NAME_AVG] = df[list(benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
     df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     df.reset_index(inplace=True, drop=True)
 
@@ -381,16 +374,16 @@
 
 
 def set_listeners(
-    task: TaskType,
-    target_df,
-    source_df,
-    search_bar,
-    version,
-    selected_domains,
-    selected_langs,
-    selected_rerankings,
-    show_anonymous,
-    show_revision_and_timestamp,
+    task: TaskType,
+    target_df,
+    source_df,
+    search_bar,
+    version,
+    selected_domains,
+    selected_langs,
+    selected_rerankings,
+    show_anonymous,
+    show_revision_and_timestamp,
 ):
     if task == TaskType.qa:
         update_table_func = update_qa_df_elem
@@ -400,15 +393,15 @@ def set_listeners(
         raise NotImplementedError
     selector_list = [selected_domains, selected_langs, selected_rerankings, search_bar, show_anonymous]
     search_bar_args = [
-        source_df,
-        version,
-    ] + selector_list
+        source_df,
+        version,
+    ] + selector_list
     selector_args = (
-        [version, source_df]
-        + selector_list
-        + [
-            show_revision_and_timestamp,
-        ]
+        [version, source_df]
+        + selector_list
+        + [
+            show_revision_and_timestamp,
+        ]
     )
     # Set search_bar listener
     search_bar.submit(update_table_func, search_bar_args, target_df)
@@ -424,15 +417,15 @@
 
 
 def update_qa_df_elem(
-    version: str,
-    hidden_df: pd.DataFrame,
-    domains: list,
-    langs: list,
-    reranking_query: list,
-    query: str,
-    show_anonymous: bool,
-    show_revision_and_timestamp: bool = False,
-    reset_ranking: bool = True,
+    version: str,
+    hidden_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool = False,
+    reset_ranking: bool = True,
 ):
     return _update_df_elem(
         TaskType.qa,
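
A minimal sketch of what the reformatted select_columns / reset-ranking path does, using stand-in helpers (illustrative only; calculate_mean, reset_rank and the real column constants live in this repo):

    import pandas as pd

    COL_NAME_AVG = "Average ⬆️"  # display name taken from the tests in this commit


    def calculate_mean(row: pd.Series) -> float:
        # Stand-in for the project's calculate_mean: average the available scores.
        return row.dropna().mean()


    def reset_rank(df: pd.DataFrame) -> pd.DataFrame:
        # Stand-in: number rows 1..n after they have been sorted by the average.
        df = df.copy()
        df["Rank 🏆"] = range(1, len(df) + 1)
        return df


    def select_columns_sketch(df: pd.DataFrame, fixed_cols: list, selected_cols: list) -> pd.DataFrame:
        filtered = df[fixed_cols + selected_cols].replace({"": pd.NA})
        filtered[COL_NAME_AVG] = filtered[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
        filtered = filtered.sort_values(by=[COL_NAME_AVG], ascending=False).reset_index(drop=True)
        return reset_rank(filtered)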
tests/src/test_benchmarks.py CHANGED
@@ -3,7 +3,6 @@ import pytest
 from src.benchmarks import LongDocBenchmarks, QABenchmarks
 from src.envs import BENCHMARK_VERSION_LIST
 
-
 # Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
 # 24.05
 # | Task | dev | test |
@@ -17,15 +16,8 @@ from src.envs import BENCHMARK_VERSION_LIST
 # | Long-Doc | 15 |
 # | QA | 13 |
 
-@pytest.mark.parametrize(
-    "num_datasets_dict",
-    [
-        {
-            "air_bench_2404": 13,
-            "air_bench_2405": 53
-        }
-    ]
-)
+
+@pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 13, "air_bench_2405": 53}])
 def test_qa_benchmarks(num_datasets_dict):
     assert len(QABenchmarks) == len(BENCHMARK_VERSION_LIST)
     for benchmark_list in list(QABenchmarks):
@@ -33,15 +25,7 @@ def test_qa_benchmarks(num_datasets_dict):
         assert num_datasets_dict[version_slug] == len(benchmark_list.value)
 
 
-@pytest.mark.parametrize(
-    "num_datasets_dict",
-    [
-        {
-            "air_bench_2404": 15,
-            "air_bench_2405": 11
-        }
-    ]
-)
+@pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 15, "air_bench_2405": 11}])
 def test_doc_benchmarks(num_datasets_dict):
     assert len(LongDocBenchmarks) == len(BENCHMARK_VERSION_LIST)
     for benchmark_list in list(LongDocBenchmarks):
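
An aside on the pattern being collapsed onto one line above (illustrative, not from the commit): pytest.mark.parametrize accepts a single dict as the parameter value, so each test run receives the whole per-version mapping at once:

    import pytest


    @pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 13, "air_bench_2405": 53}])
    def test_example(num_datasets_dict):
        # The dict arrives intact; tests index it by the version slug.
        assert num_datasets_dict["air_bench_2404"] == 13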
tests/src/test_columns.py CHANGED
@@ -1,12 +1,18 @@
 import pytest
 
-from src.benchmarks import QABenchmarks, LongDocBenchmarks
-from src.columns import get_default_auto_eval_column_dict, \
-    get_fixed_col_names_and_types, get_default_col_names_and_types, make_autoevalcolumn, COL_NAME_RANK, \
-    COL_NAME_RETRIEVAL_MODEL, \
-    COL_NAME_RERANKING_MODEL, COL_NAME_REVISION, \
-    COL_NAME_TIMESTAMP, COL_NAME_AVG
-
+from src.benchmarks import LongDocBenchmarks, QABenchmarks
+from src.columns import (
+    COL_NAME_AVG,
+    COL_NAME_RANK,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+    get_default_auto_eval_column_dict,
+    get_default_col_names_and_types,
+    get_fixed_col_names_and_types,
+    make_autoevalcolumn,
+)
 
 # Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
 # 24.05
@@ -21,6 +27,7 @@ from src.columns import get_default_auto_eval_column_dict, \
 # | Long-Doc | 15 |
 # | QA | 13 |
 
+
 @pytest.fixture()
 def expected_col_names():
     return [
@@ -45,8 +52,7 @@ def expected_hidden_col_names():
     ]
 
 
-def test_get_default_auto_eval_column_dict(
-    expected_col_names, expected_hidden_col_names):
+def test_get_default_auto_eval_column_dict(expected_col_names, expected_hidden_col_names):
     col_list = get_default_auto_eval_column_dict()
     assert len(col_list) == 9
     hidden_cols = []
@@ -76,14 +82,13 @@ def test_get_fixed_col_names_and_types():
 
 
 @pytest.mark.parametrize(
-    'benchmarks, expected_benchmark_len',
+    "benchmarks, expected_benchmark_len",
     [
         (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
-        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
-    ]
+        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
+    ],
 )
-def test_make_autoevalcolumn(
-    benchmarks, expected_benchmark_len, expected_col_names):
+def test_make_autoevalcolumn(benchmarks, expected_benchmark_len, expected_col_names):
     expected_default_attrs = frozenset(expected_col_names)
     for benchmark in benchmarks:
         TestEvalColumn = make_autoevalcolumn("TestEvalColumn", benchmark)
@@ -98,17 +103,15 @@ def test_make_autoevalcolumn(
 
 
 @pytest.mark.parametrize(
-    'benchmarks, expected_benchmark_len',
+    "benchmarks, expected_benchmark_len",
    [
         (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
-        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
-    ]
+        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
+    ],
 )
 def test_get_default_col_names_and_types(
-    benchmarks,
-    expected_benchmark_len,
-    expected_col_names,
-    expected_hidden_col_names):
+    benchmarks, expected_benchmark_len, expected_col_names, expected_hidden_col_names
+):
     default_col_len = len(expected_col_names)
     hidden_col_len = len(expected_hidden_col_names)
     for benchmark in benchmarks:
tests/src/test_envs.py CHANGED
@@ -1,13 +1,12 @@
 from air_benchmark.tasks import BenchmarkTable
 
-from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST
+from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA, METRIC_LIST
 
 
 def test_benchmark_version_list():
     leaderboard_versions = frozenset(BENCHMARK_VERSION_LIST)
     available_versions = frozenset([k for k in BenchmarkTable.keys()])
-    assert leaderboard_versions.issubset(
-        available_versions)
+    assert leaderboard_versions.issubset(available_versions)
 
 
 def test_default_metrics():
tests/src/test_loaders.py CHANGED
@@ -1,41 +1,34 @@
+from pathlib import Path
+
 import pandas as pd
 import pytest
-from pathlib import Path
 
-from src.loaders import load_raw_eval_results, load_leaderboard_datastore, load_eval_results
+from src.loaders import load_eval_results, load_leaderboard_datastore, load_raw_eval_results
 
 cur_fp = Path(__file__)
 
 
-@pytest.mark.parametrize(
-    "version",
-    ["AIR-Bench_24.04", "AIR-Bench_24.05"]
-)
+@pytest.mark.parametrize("version", ["AIR-Bench_24.04", "AIR-Bench_24.05"])
 def test_load_raw_eval_results(version):
-    raw_data = load_raw_eval_results(
-        cur_fp.parents[1] / f"toydata/eval_results/{version}"
-    )
+    raw_data = load_raw_eval_results(cur_fp.parents[1] / f"toydata/eval_results/{version}")
     assert len(raw_data) == 1
     full_eval_result = raw_data[0]
     expected_attr = [
-        'eval_name',
-        'retrieval_model',
-        'reranking_model',
-        'retrieval_model_link',
-        'reranking_model_link',
-        'results',
-        'timestamp',
-        'revision',
-        'is_anonymous'
+        "eval_name",
+        "retrieval_model",
+        "reranking_model",
+        "retrieval_model_link",
+        "reranking_model_link",
+        "results",
+        "timestamp",
+        "revision",
+        "is_anonymous",
     ]
     result_attr = [k for k in full_eval_result.__dict__.keys() if k[:2] != "__" and k[-2:] != "__"]
     assert sorted(expected_attr) == sorted(result_attr)
 
 
-@pytest.mark.parametrize(
-    "version",
-    ["AIR-Bench_24.04", "AIR-Bench_24.05"]
-)
+@pytest.mark.parametrize("version", ["AIR-Bench_24.04", "AIR-Bench_24.05"])
 def test_load_leaderboard_datastore(version):
     file_path = cur_fp.parents[1] / f"toydata/eval_results/{version}"
     datastore = load_leaderboard_datastore(file_path, version)
@@ -51,4 +44,3 @@ def test_load_eval_results():
     file_path = cur_fp.parents[1] / "toydata/eval_results/"
     datastore_dict = load_eval_results(file_path)
     assert len(datastore_dict) == 2
-
tests/src/test_models.py CHANGED
@@ -1,6 +1,7 @@
-import pytest
 from pathlib import Path
 
+import pytest
+
 from src.models import EvalResult, FullEvalResult
 
 cur_fp = Path(__file__)
@@ -23,19 +24,13 @@ NUM_DOC_BENCHMARKS_24_05 = 11
 NUM_QA_BENCHMARKS_24_04 = 13
 NUM_DOC_BENCHMARKS_24_04 = 15
 
+
 def test_eval_result():
-    eval_result = EvalResult(
+    EvalResult(
         eval_name="eval_name",
         retrieval_model="bge-m3",
         reranking_model="NoReranking",
-        results=[
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.45723
-            }
-        ],
+        results=[{"domain": "law", "lang": "en", "dataset": "lex_files_500K-600K", "value": 0.45723}],
        task="qa",
         metric="ndcg_at_3",
         timestamp="2024-05-14T03:09:08Z",
@@ -45,11 +40,12 @@
 
 
 @pytest.mark.parametrize(
-    'file_path',
+    "file_path",
     [
         "AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
-        "AIR-Bench_24.05/bge-m3/NoReranker/results.json"
-    ])
+        "AIR-Bench_24.05/bge-m3/NoReranker/results.json",
+    ],
+)
 def test_full_eval_result_init_from_json_file(file_path):
     json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
@@ -59,20 +55,35 @@ def test_full_eval_result_init_from_json_file(file_path):
 
 
 @pytest.mark.parametrize(
-    'file_path, task, expected_num_results',
+    "file_path, task, expected_num_results",
     [
         ("AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json", "qa", NUM_QA_BENCHMARKS_24_04),
-        ("AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json", "long-doc", NUM_DOC_BENCHMARKS_24_04),
+        (
+            "AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
+            "long-doc",
+            NUM_DOC_BENCHMARKS_24_04,
+        ),
         ("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "qa", NUM_QA_BENCHMARKS_24_05),
         ("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "long-doc", NUM_DOC_BENCHMARKS_24_05),
-    ])
+    ],
+)
 def test_full_eval_result_to_dict(file_path, task, expected_num_results):
     json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
     result_dict_list = full_eval_result.to_dict(task)
     assert len(result_dict_list) == 1
     result = result_dict_list[0]
-    attr_list = frozenset([
-        'eval_name', 'Retrieval Method', 'Reranking Model', 'Retrieval Model LINK', 'Reranking Model LINK', 'Revision', 'Submission Date', 'Anonymous Submission'])
+    attr_list = frozenset(
+        [
+            "eval_name",
+            "Retrieval Method",
+            "Reranking Model",
+            "Retrieval Model LINK",
+            "Reranking Model LINK",
+            "Revision",
+            "Submission Date",
+            "Anonymous Submission",
+        ]
+    )
     result_cols = list(result.keys())
-    assert len(result_cols) == (expected_num_results + len(attr_list))
+    assert len(result_cols) == (expected_num_results + len(attr_list))
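
For orientation (illustrative, not part of the commit): the results.json fixtures exercised above follow the schema of the toy files removed later in this commit — a list of {config, results} blocks. A hypothetical flattening helper in that spirit:

    import json
    from pathlib import Path


    def flatten_results(json_fp: Path, task: str, metric: str) -> dict:
        """Sketch: collect one row of '<domain>_<lang>' -> score for a given task/metric."""
        with open(json_fp, encoding="utf-8") as f:
            blocks = json.load(f)
        row = {}
        for block in blocks:
            cfg = block["config"]
            if cfg["task"] != task or cfg["metric"] != metric:
                continue
            for entry in block["results"]:
                row[f"{entry['domain']}_{entry['lang']}"] = entry["value"]
        return row


    # e.g. flatten_results(Path("tests/toydata/test_data.json"), task="qa", metric="ndcg_at_1")
    # -> {"wiki_en": 0.69083, "wiki_zh": 0.78358}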
tests/src/test_read_evals.py DELETED
@@ -1,78 +0,0 @@
-from pathlib import Path
-
-from src.models import FullEvalResult
-from src.read_evals import load_raw_eval_results
-from src.utils import get_leaderboard_df
-
-cur_fp = Path(__file__)
-
-
-def test_init_from_json_file():
-    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
-    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-    num_different_task_domain_lang_metric_dataset_combination = 6
-    assert len(full_eval_result.results) == num_different_task_domain_lang_metric_dataset_combination
-    assert full_eval_result.retrieval_model == "bge-m3"
-    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"
-
-
-def test_to_dict():
-    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
-    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-    result_list = full_eval_result.to_dict(task="qa", metric="ndcg_at_1")
-    assert len(result_list) == 1
-    result_dict = result_list[0]
-    assert result_dict["Retrieval Model"] == "bge-m3"
-    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
-    assert result_dict["wiki_en"] is not None
-    assert result_dict["wiki_zh"] is not None
-
-
-def test_get_raw_eval_results():
-    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    results = load_raw_eval_results(results_path)
-    # only load the latest results
-    assert len(results) == 4
-    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
-    assert len(results[0].results) == 70
-    assert results[0].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
-    assert len(results[1].results) == 70
-
-
-def test_get_leaderboard_df():
-    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    raw_data = load_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, "qa", "ndcg_at_10")
-    assert df.shape[0] == 4
-    # the results contain only one embedding model
-    # for i in range(4):
-    #     assert df["Retrieval Model"][i] == "bge-m3"
-    # # the results contain only two reranking model
-    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    # assert df["Reranking Model"][1] == "NoReranker"
-    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
-
-
-def test_get_leaderboard_df_long_doc():
-    results_path = cur_fp.parents[2] / "toydata" / "test_results"
-    raw_data = load_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, "long-doc", "ndcg_at_1")
-    assert df.shape[0] == 2
-    # the results contain only one embedding model
-    for i in range(2):
-        assert df["Retrieval Model"][i] == "bge-m3"
-    # the results contains only two reranking model
-    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    assert df["Reranking Model"][1] == "NoReranker"
-    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    assert (
-        not df[
-            [
-                "Average ⬆️",
-                "law_en_lex_files_500k_600k",
-            ]
-        ]
-        .isnull()
-        .values.any()
-    )
tests/src/test_utils.py CHANGED
@@ -1,10 +1,21 @@
-import pytest
-import pandas as pd
 from pathlib import Path
 
-from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem, get_leaderboard_df
-from src.models import model_hyperlink, TaskType
+import pandas as pd
+import pytest
+
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
+from src.models import TaskType, model_hyperlink
+from src.utils import (
+    _update_df_elem,
+    calculate_mean,
+    filter_models,
+    filter_queries,
+    get_default_cols,
+    get_leaderboard_df,
+    get_selected_cols,
+    remove_html,
+    select_columns,
+)
 
 cur_fp = Path(__file__)
 
@@ -18,18 +29,8 @@ NUM_DOC_BENCHMARKS_24_04 = 15
 def toy_df():
     return pd.DataFrame(
         {
-            "Retrieval Method": [
-                "bge-m3",
-                "bge-m3",
-                "jina-embeddings-v2-base",
-                "jina-embeddings-v2-base"
-            ],
-            "Reranking Model": [
-                "bge-reranker-v2-m3",
-                "NoReranker",
-                "bge-reranker-v2-m3",
-                "NoReranker"
-            ],
+            "Retrieval Method": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
+            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
             "Rank 🏆": [1, 2, 3, 4],
             "Revision": ["123", "234", "345", "456"],
             "Submission Date": ["", "", "", ""],
@@ -45,8 +46,7 @@ def toy_df():
 
 def test_remove_html():
     model_name = "jina-embeddings-v3"
-    html_str = model_hyperlink(
-        "https://jina.ai", model_name)
+    html_str = model_hyperlink("https://jina.ai", model_name)
     output_str = remove_html(html_str)
     assert output_str == model_name
 
@@ -60,17 +60,29 @@ def test_calculate_mean():
     assert result[1] == -1
 
 
-@pytest.mark.parametrize("models, expected", [
-    (["model1", "model3"], 2),
-    (["model1", "model_missing"], 1),
-    (["model1", "model2", "model3"], 3),
-    (["model1", ], 1),
-    ([], 3),
-])
+@pytest.mark.parametrize(
+    "models, expected",
+    [
+        (["model1", "model3"], 2),
+        (["model1", "model_missing"], 1),
+        (["model1", "model2", "model3"], 3),
+        (
+            [
+                "model1",
+            ],
+            1,
+        ),
+        ([], 3),
+    ],
+)
 def test_filter_models(models, expected):
     df = pd.DataFrame(
         {
-            COL_NAME_RERANKING_MODEL: ["model1", "model2", "model3", ],
+            COL_NAME_RERANKING_MODEL: [
+                "model1",
+                "model2",
+                "model3",
+            ],
             "col2": [1, 2, 3],
         }
     )
@@ -78,18 +90,29 @@ def test_filter_models(models, expected):
     assert len(output_df) == expected
 
 
-@pytest.mark.parametrize("query, expected", [
-    ("model1;model3", 2),
-    ("model1;model4", 1),
-    ("model1;model2;model3", 3),
-    ("model1", 1),
-    ("", 3),
-])
+@pytest.mark.parametrize(
+    "query, expected",
+    [
+        ("model1;model3", 2),
+        ("model1;model4", 1),
+        ("model1;model2;model3", 3),
+        ("model1", 1),
+        ("", 3),
+    ],
+)
 def test_filter_queries(query, expected):
     df = pd.DataFrame(
         {
-            COL_NAME_RETRIEVAL_MODEL: ["model1", "model2", "model3", ],
-            COL_NAME_RERANKING_MODEL: ["model4", "model5", "model6", ],
+            COL_NAME_RETRIEVAL_MODEL: [
+                "model1",
+                "model2",
+                "model3",
+            ],
+            COL_NAME_RERANKING_MODEL: [
+                "model4",
+                "model5",
+                "model6",
+            ],
         }
     )
     output_df = filter_queries(query, df)
@@ -103,10 +126,10 @@ def test_filter_queries(query, expected):
         (TaskType.long_doc, "air_bench_2404", True, NUM_DOC_BENCHMARKS_24_04),
         (TaskType.qa, "air_bench_2405", False, NUM_QA_BENCHMARKS_24_05),
         (TaskType.long_doc, "air_bench_2405", False, NUM_DOC_BENCHMARKS_24_05),
-    ]
+    ],
 )
 def test_get_default_cols(task_type, slug, add_fix_cols, expected):
-    attr_cols = ['Rank 🏆', 'Retrieval Method', 'Reranking Model', 'Revision', 'Submission Date', 'Average ⬆️']
+    attr_cols = ["Rank 🏆", "Retrieval Method", "Reranking Model", "Revision", "Submission Date", "Average ⬆️"]
     cols, types = get_default_cols(task_type, slug)
     cols_set = frozenset(cols)
     attrs_set = frozenset(attr_cols)
@@ -119,44 +142,54 @@ def test_get_default_cols(task_type, slug, add_fix_cols, expected):
 @pytest.mark.parametrize(
     "task_type, domains, languages, expected",
     [
-        (TaskType.qa, ["wiki", "news"], ["zh",], ["wiki_zh", "news_zh"]),
-        (TaskType.qa, ["law",], ["zh", "en"], ["law_en"]),
+        (
+            TaskType.qa,
+            ["wiki", "news"],
+            [
+                "zh",
+            ],
+            ["wiki_zh", "news_zh"],
+        ),
+        (
+            TaskType.qa,
+            [
+                "law",
+            ],
+            ["zh", "en"],
+            ["law_en"],
+        ),
         (
             TaskType.long_doc,
             ["healthcare"],
             ["zh", "en"],
             [
-                'healthcare_en_pubmed_100k_200k_1',
-                'healthcare_en_pubmed_100k_200k_2',
-                'healthcare_en_pubmed_100k_200k_3',
-                'healthcare_en_pubmed_40k_50k_5_merged',
-                'healthcare_en_pubmed_30k_40k_10_merged'
-            ]
-        )
-    ]
+                "healthcare_en_pubmed_100k_200k_1",
+                "healthcare_en_pubmed_100k_200k_2",
+                "healthcare_en_pubmed_100k_200k_3",
+                "healthcare_en_pubmed_40k_50k_5_merged",
+                "healthcare_en_pubmed_30k_40k_10_merged",
+            ],
+        ),
+    ],
 )
 def test_get_selected_cols(task_type, domains, languages, expected):
     slug = "air_bench_2404"
     cols = get_selected_cols(task_type, slug, domains, languages)
     assert sorted(cols) == sorted(expected)
 
+
 @pytest.mark.parametrize("reset_rank", [False])
 def test_select_columns(toy_df, reset_rank):
     expected = [
-        'Rank 🏆',
-        'Retrieval Method',
-        'Reranking Model',
-        'Revision',
-        'Submission Date',
-        'Average ⬆️',
-        'news_zh']
-    df_result = select_columns(
-        toy_df,
-        ["news"],
-        ["zh"],
-        version_slug="air_bench_2404",
-        reset_ranking=reset_rank
-    )
+        "Rank 🏆",
+        "Retrieval Method",
+        "Reranking Model",
+        "Revision",
+        "Submission Date",
+        "Average ⬆️",
+        "news_zh",
+    ]
+    df_result = select_columns(toy_df, ["news"], ["zh"], version_slug="air_bench_2404", reset_ranking=reset_rank)
     assert len(df_result.columns) == len(expected)
     if reset_rank:
         assert df_result["Average ⬆️"].equals(df_result["news_zh"])
@@ -170,20 +203,10 @@ def test_select_columns(toy_df, reset_rank):
     (False, True),
     (True, True),
     (True, False),
-    ]
+    ],
 )
 def test__update_df_elem(toy_df, reset_rank, show_anony):
-    df = _update_df_elem(
-        TaskType.qa,
-        "AIR-Bench_24.04",
-        toy_df,
-        ["news"],
-        ["zh"],
-        [],
-        "",
-        show_anony,
-        reset_rank
-    )
+    df = _update_df_elem(TaskType.qa, "AIR-Bench_24.04", toy_df, ["news"], ["zh"], [], "", show_anony, reset_rank)
     if show_anony:
         assert df.shape[0] == 4
     else:
@@ -201,19 +224,14 @@ def test__update_df_elem(toy_df, reset_rank, show_anony):
     ("AIR-Bench_24.04", TaskType.qa),
     ("AIR-Bench_24.04", TaskType.long_doc),
     ("AIR-Bench_24.05", TaskType.qa),
-    ("AIR-Bench_24.05", TaskType.long_doc)
-    ]
+    ("AIR-Bench_24.05", TaskType.long_doc),
+    ],
 )
 def test_get_leaderboard_df(version, task_type):
     from src.loaders import load_raw_eval_results
     from src.models import LeaderboardDataStore, get_safe_name
-    raw_data = load_raw_eval_results(
-        cur_fp.parents[1] / f"toydata/eval_results/{version}"
-    )
+
+    raw_data = load_raw_eval_results(cur_fp.parents[1] / f"toydata/eval_results/{version}")
     ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=raw_data)
-    df = get_leaderboard_df(
-        ds,
-        task_type,
-        "ndcg_at_10"
-    )
-    assert df.shape[0] == 1
+    df = get_leaderboard_df(ds, task_type, "ndcg_at_10")
+    assert df.shape[0] == 1
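A side note on the behaviour these tests pin down (illustrative, with a stand-in implementation, not the project's code): the search query is a semicolon-separated list, and a row is kept when any term matches the retrieval or reranking model column.

    import pandas as pd


    def filter_queries_sketch(query: str, df: pd.DataFrame) -> pd.DataFrame:
        # Stand-in for src.utils.filter_queries, written only to match the expectations
        # in the parametrized tests above ("model1;model3" keeps 2 of 3 rows, "" keeps all).
        terms = [t for t in query.split(";") if t.strip()]
        if not terms:
            return df
        mask = pd.Series(False, index=df.index)
        for term in terms:
            for col in ("Retrieval Method", "Reranking Model"):
                if col in df.columns:
                    mask |= df[col].astype(str).str.contains(term, case=False, regex=False)
        return df[mask]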
tests/test_utils.py DELETED
@@ -1,136 +0,0 @@
-import pandas as pd
-import pytest
-
-from app import update_table
-from src.columns import (
-    COL_NAME_AVG,
-    COL_NAME_IS_ANONYMOUS,
-    COL_NAME_RANK,
-    COL_NAME_RERANKING_MODEL,
-    COL_NAME_RETRIEVAL_MODEL,
-    COL_NAME_REVISION,
-    COL_NAME_TIMESTAMP,
-)
-from src.utils import (
-    filter_models,
-    filter_queries,
-    get_default_cols,
-    get_iso_format_timestamp,
-    search_table,
-    select_columns,
-    update_doc_df_elem,
-)
-
-
-@pytest.fixture
-def toy_df():
-    return pd.DataFrame(
-        {
-            "Retrieval Model": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
-            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
-            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
-            "wiki_en": [0.8, 0.7, 0.2, 0.1],
-            "wiki_zh": [0.4, 0.1, 0.4, 0.3],
-            "news_en": [0.8, 0.7, 0.2, 0.1],
-            "news_zh": [0.4, 0.1, 0.4, 0.3],
-        }
-    )
-
-
-@pytest.fixture
-def toy_df_long_doc():
-    return pd.DataFrame(
-        {
-            "Retrieval Model": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
-            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
-            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
-            "law_en_lex_files_300k_400k": [0.4, 0.1, 0.4, 0.3],
-            "law_en_lex_files_400k_500k": [0.8, 0.7, 0.2, 0.1],
-            "law_en_lex_files_500k_600k": [0.8, 0.7, 0.2, 0.1],
-            "law_en_lex_files_600k_700k": [0.4, 0.1, 0.4, 0.3],
-        }
-    )
-
-
-def test_filter_models(toy_df):
-    df_result = filter_models(
-        toy_df,
-        [
-            "bge-reranker-v2-m3",
-        ],
-    )
-    assert len(df_result) == 2
-    assert df_result.iloc[0]["Reranking Model"] == "bge-reranker-v2-m3"
-
-
-def test_search_table(toy_df):
-    df_result = search_table(toy_df, "jina")
-    assert len(df_result) == 2
-    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
-
-
-def test_filter_queries(toy_df):
-    df_result = filter_queries("jina", toy_df)
-    assert len(df_result) == 2
-    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
-
-
-
-
-
-def test_update_table_long_doc(toy_df_long_doc):
-    df_result = update_doc_df_elem(
-        toy_df_long_doc,
-        [
-            "law",
-        ],
-        [
-            "en",
-        ],
-        [
-            "bge-reranker-v2-m3",
-        ],
-        "jina",
-    )
-    print(df_result)
-
-
-def test_get_iso_format_timestamp():
-    timestamp_config, timestamp_fn = get_iso_format_timestamp()
-    assert len(timestamp_fn) == 14
-    assert len(timestamp_config) == 20
-    assert timestamp_config[-1] == "Z"
-
-
-def test_get_default_cols():
-    cols, types = get_default_cols("qa")
-    for c, t in zip(cols, types):
-        print(f"type({c}): {t}")
-    assert len(frozenset(cols)) == len(cols)
-
-
-def test_update_table():
-    df = pd.DataFrame(
-        {
-            COL_NAME_IS_ANONYMOUS: [False, False, False],
-            COL_NAME_REVISION: ["a1", "a2", "a3"],
-            COL_NAME_TIMESTAMP: ["2024-05-12T12:24:02Z"] * 3,
-            COL_NAME_RERANKING_MODEL: ["NoReranker"] * 3,
-            COL_NAME_RETRIEVAL_MODEL: ["Foo"] * 3,
-            COL_NAME_RANK: [1, 2, 3],
-            COL_NAME_AVG: [0.1, 0.2, 0.3],  # unsorted values
-            "wiki_en": [0.1, 0.2, 0.3],
-        }
-    )
-    results = update_table(
-        df,
-        "wiki",
-        "en",
-        ["NoReranker"],
-        "",
-        show_anonymous=False,
-        reset_ranking=False,
-        show_revision_and_timestamp=False,
-    )
-    # keep the RANK as the same regardless of the unsorted averages
-    assert results[COL_NAME_RANK].to_list() == [1, 2, 3]
tests/toydata/test_data.json DELETED
@@ -1,98 +0,0 @@
-[
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "long_doc",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "law",
-        "lang": "en",
-        "dataset": "lex_files_500K-600K",
-        "value": 0.75723
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "long_doc",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "law",
-        "lang": "en",
-        "dataset": "lex_files_500K-600K",
-        "value": 0.69909
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "en",
-        "dataset": "unknown",
-        "value": 0.69083
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "en",
-        "dataset": "unknown",
-        "value": 0.73359
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "zh",
-        "dataset": "unknown",
-        "value": 0.78358
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "zh",
-        "dataset": "unknown",
-        "value": 0.78358
-      }
-    ]
-  }
-]
tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json DELETED
@@ -1,98 +0,0 @@
-[
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "NoReranker",
-      "task": "long_doc",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "law",
-        "lang": "en",
-        "dataset": "lex_files_500K-600K",
-        "value": 0.45723
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "NoReranker",
-      "task": "long_doc",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "law",
-        "lang": "en",
-        "dataset": "lex_files_500K-600K",
-        "value": 0.49909
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "NoReranker",
-      "task": "qa",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "en",
-        "dataset": "unknown",
-        "value": 0.49083
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "NoReranker",
-      "task": "qa",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "en",
-        "dataset": "unknown",
-        "value": 0.43359
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "NoReranker",
-      "task": "qa",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "zh",
-        "dataset": "unknown",
-        "value": 0.78358
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "NoReranker",
-      "task": "qa",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "zh",
-        "dataset": "unknown",
-        "value": 0.78358
-      }
-    ]
-  }
-]
tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/results_2023-11-21T18-10-08.json DELETED
@@ -1,98 +0,0 @@
-[
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "long_doc",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "law",
-        "lang": "en",
-        "dataset": "lex_files_500K-600K",
-        "value": 0.75723
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "long_doc",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "law",
-        "lang": "en",
-        "dataset": "lex_files_500K-600K",
-        "value": 0.69909
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "en",
-        "dataset": "unknown",
-        "value": 0.69083
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "en",
-        "dataset": "unknown",
-        "value": 0.73359
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_1"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "zh",
-        "dataset": "unknown",
-        "value": 0.78358
-      }
-    ]
-  },
-  {
-    "config": {
-      "retrieval_model": "bge-m3",
-      "reranking_model": "bge-reranker-v2-m3",
-      "task": "qa",
-      "metric": "ndcg_at_3"
-    },
-    "results": [
-      {
-        "domain": "wiki",
-        "lang": "zh",
-        "dataset": "unknown",
-        "value": 0.78358
-      }
-    ]
-  }
-]