style: reformat the styles
- Makefile +6 -0
- app.py +20 -20
- pyproject.toml +3 -3
- src/benchmarks.py +5 -13
- src/columns.py +9 -59
- src/envs.py +3 -1
- src/loaders.py +1 -2
- src/models.py +1 -1
- src/utils.py +72 -79
- tests/src/test_benchmarks.py +3 -19
- tests/src/test_columns.py +24 -21
- tests/src/test_envs.py +2 -3
- tests/src/test_loaders.py +15 -23
- tests/src/test_models.py +30 -19
- tests/src/test_read_evals.py +0 -78
- tests/src/test_utils.py +102 -84
- tests/test_utils.py +0 -136
- tests/toydata/test_data.json +0 -98
- tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json +0 -98
- tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/results_2023-11-21T18-10-08.json +0 -98
Makefile
CHANGED
@@ -3,14 +3,20 @@
 
 style:
 	python -m black --line-length 119 .
+	python -m black --line-length 119 src
 	python -m isort .
+	python -m isort src
 	ruff check --fix .
+	ruff check --fix src
 
 
 quality:
 	python -m black --check --line-length 119 .
+	python -m black --check --line-length 119 src
 	python -m isort --check-only .
+	python -m isort --check-only src
 	ruff check .
+	ruff check src
 
 
 test:
app.py
CHANGED
@@ -63,13 +63,13 @@ datastore = ds_dict[LATEST_BENCHMARK_VERSION]
 
 
 def update_qa_metric(
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool,
 ):
     global datastore
     return update_metric(
@@ -86,13 +86,13 @@ def update_qa_metric(
 
 
 def update_doc_metric(
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp,
 ):
     global datastore
     return update_metric(
@@ -218,7 +218,7 @@ with demo:
 # Dummy leaderboard for handling the case when the user uses backspace key
 _qa_df_ret_hidden = datastore.qa_raw_df[
     datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+]
 _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
 qa_df_elem_ret_hidden = get_leaderboard_table(
     _qa_df_ret_hidden, datastore.qa_types, visible=False
@@ -277,7 +277,7 @@ with demo:
 
 _qa_df_rerank_hidden = datastore.qa_raw_df[
     datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+]
 _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
 qa_df_elem_rerank_hidden = get_leaderboard_table(
     _qa_df_rerank_hidden, datastore.qa_types, visible=False
@@ -391,13 +391,13 @@ with demo:
 
 _doc_df_ret = datastore.doc_fmt_df[
     datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+]
 _doc_df_ret = reset_rank(_doc_df_ret)
 doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)
 
 _doc_df_ret_hidden = datastore.doc_raw_df[
     datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+]
 _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
 doc_df_elem_ret_hidden = get_leaderboard_table(
     _doc_df_ret_hidden, datastore.doc_types, visible=False
@@ -439,7 +439,7 @@ with demo:
 with gr.TabItem("Reranking Only", id=22):
     _doc_df_rerank = datastore.doc_fmt_df[
         datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+    ]
     _doc_df_rerank = reset_rank(_doc_df_rerank)
     doc_rerank_models = (
         _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
@@ -452,7 +452,7 @@ with demo:
 doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
 _doc_df_rerank_hidden = datastore.doc_raw_df[
     datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+]
 _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
 doc_df_elem_rerank_hidden = get_leaderboard_table(
     _doc_df_rerank_hidden, datastore.doc_types, visible=False
pyproject.toml
CHANGED
@@ -1,9 +1,9 @@
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"]  # line too long (black is taking care of this)
+lint.select = ["E", "F"]
+lint.ignore = ["E501"]  # line too long (black is taking care of this)
 line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 
 [tool.isort]
 profile = "black"
src/benchmarks.py
CHANGED
@@ -30,9 +30,7 @@ def get_qa_benchmarks_dict(version: str):
             for metric in dataset_list:
                 if "test" not in dataset_list[metric]["splits"]:
                     continue
-                benchmark_dict[benchmark_name] = Benchmark(
-                    benchmark_name, metric, col_name, domain, lang, task
-                )
+                benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
     return benchmark_dict
 
 
@@ -59,20 +57,14 @@ def get_doc_benchmarks_dict(version: str):
 _qa_benchmark_dict = {}
 for version in BENCHMARK_VERSION_LIST:
     safe_version_name = get_safe_name(version)
-    _qa_benchmark_dict[safe_version_name] = Enum(
-        f"QABenchmarks_{safe_version_name}",
-        get_qa_benchmarks_dict(version)
-    )
+    _qa_benchmark_dict[safe_version_name] = Enum(f"QABenchmarks_{safe_version_name}", get_qa_benchmarks_dict(version))
 
 _doc_benchmark_dict = {}
 for version in BENCHMARK_VERSION_LIST:
     safe_version_name = get_safe_name(version)
-    _doc_benchmark_dict[safe_version_name] = Enum(
-        f"LongDocBenchmarks_{safe_version_name}",
-        get_doc_benchmarks_dict(version)
-    )
+    _doc_benchmark_dict[safe_version_name] = Enum(
+        f"LongDocBenchmarks_{safe_version_name}", get_doc_benchmarks_dict(version)
+    )
 
 
 QABenchmarks = Enum("QABenchmarks", _qa_benchmark_dict)
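The reformatted assignments above rely on Python's functional Enum API, where Enum(name, mapping) builds one enum class per benchmark version. A minimal sketch of that pattern with placeholder member names (not the leaderboard's real benchmark entries):

from enum import Enum

# Placeholder mapping; the real code builds it with get_qa_benchmarks_dict(version).
qa_benchmarks = {"wiki_en": "wiki_en", "news_zh": "news_zh"}

# One dynamically named Enum per version, mirroring the reformatted one-liner above.
QABenchmarks_air_bench_2404 = Enum("QABenchmarks_air_bench_2404", qa_benchmarks)
print([member.name for member in QABenchmarks_air_bench_2404])  # ['wiki_en', 'news_zh']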
src/columns.py
CHANGED
@@ -19,69 +19,28 @@ class ColumnContent:
 
 def get_default_auto_eval_column_dict():
     auto_eval_column_dict = []
-    auto_eval_column_dict.append(
-        [
-            "rank",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_RANK,
-                "number",
-                True
-            )
-        ]
-    )
+    auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent(COL_NAME_RANK, "number", True)])
     auto_eval_column_dict.append(
         [
             "retrieval_model",
             ColumnContent,
-            ColumnContent(
-                COL_NAME_RETRIEVAL_MODEL,
-                "markdown",
-                True,
-                never_hidden=True
-            )
+            ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, never_hidden=True),
         ]
     )
     auto_eval_column_dict.append(
         [
             "reranking_model",
             ColumnContent,
-            ColumnContent(
-                COL_NAME_RERANKING_MODEL,
-                "markdown",
-                True,
-                never_hidden=True
-            )
+            ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True),
         ]
     )
     auto_eval_column_dict.append(
-        [
-            "revision",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_REVISION,
-                "markdown",
-                True,
-                never_hidden=True
-            )
-        ]
+        ["revision", ColumnContent, ColumnContent(COL_NAME_REVISION, "markdown", True, never_hidden=True)]
     )
     auto_eval_column_dict.append(
-        [
-            "timestamp",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_TIMESTAMP, "date", True, never_hidden=True
-            )
-        ]
-    )
-    auto_eval_column_dict.append(
-        [
-            "average",
-            ColumnContent,
-            ColumnContent(COL_NAME_AVG, "number", True)
-        ]
+        ["timestamp", ColumnContent, ColumnContent(COL_NAME_TIMESTAMP, "date", True, never_hidden=True)]
     )
+    auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)])
     auto_eval_column_dict.append(
         [
             "retrieval_model_link",
@@ -91,7 +50,7 @@ def get_default_auto_eval_column_dict():
             "markdown",
             False,
             hidden=True,
-            )
+            ),
         ]
     )
     auto_eval_column_dict.append(
@@ -103,20 +62,11 @@ def get_default_auto_eval_column_dict():
             "markdown",
             False,
             hidden=True,
-            )
+            ),
         ]
     )
     auto_eval_column_dict.append(
-        [
-            "is_anonymous",
-            ColumnContent,
-            ColumnContent(
-                COL_NAME_IS_ANONYMOUS,
-                "bool",
-                False,
-                hidden=True
-            )
-        ]
+        ["is_anonymous", ColumnContent, ColumnContent(COL_NAME_IS_ANONYMOUS, "bool", False, hidden=True)]
     )
     return auto_eval_column_dict
 
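Each appended entry above is a three-item list: an attribute name, the ColumnContent annotation, and a ColumnContent instance whose flags (hidden, never_hidden) control visibility in the leaderboard tables. The sketch below only illustrates how such entries can be walked to collect hidden columns; the field names used here (col_name, col_type, displayed, hidden, never_hidden) are assumptions, not necessarily the real ColumnContent definition:

from dataclasses import dataclass

@dataclass
class ColumnContentSketch:
    # Hypothetical stand-in for src.columns.ColumnContent; field names are assumed.
    col_name: str
    col_type: str
    displayed: bool
    hidden: bool = False
    never_hidden: bool = False

entries = [
    ["rank", ColumnContentSketch, ColumnContentSketch("Rank 🏆", "number", True)],
    ["is_anonymous", ColumnContentSketch, ColumnContentSketch("Anonymous Submission", "bool", False, hidden=True)],
]

# Walk the entries the same way the tests do, collecting the hidden ones.
hidden = [name for name, _, content in entries if content.hidden]
print(hidden)  # ['is_anonymous']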
src/envs.py
CHANGED
@@ -6,7 +6,9 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN", "")  # A read/write token for your org
 
-OWNER = "AIR-Bench"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = (
+    "AIR-Bench"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
src/loaders.py
CHANGED
@@ -1,7 +1,6 @@
 import os.path
 from pathlib import Path
-from typing import Union
-from typing import Dict, List
+from typing import Dict, List, Union
 
 import pandas as pd
 
src/models.py
CHANGED
@@ -2,7 +2,7 @@ import json
 from collections import defaultdict
 from dataclasses import dataclass
 from enum import Enum
-from typing import List
+from typing import List
 
 import pandas as pd
 
src/utils.py
CHANGED
@@ -118,39 +118,36 @@ def get_selected_cols(task, version_slug, domains, languages):
 
 
 def select_columns(
+    df: pd.DataFrame,
+    domains: list,
+    languages: list,
+    task: TaskType = TaskType.qa,
+    reset_ranking: bool = True,
+    version_slug: str = None,
 ) -> pd.DataFrame:
-    selected_cols = get_selected_cols(
-        task, version_slug, domains, languages)
+    selected_cols = get_selected_cols(task, version_slug, domains, languages)
     fixed_cols, _ = get_fixed_col_names_and_types()
     filtered_df = df[fixed_cols + selected_cols]
     filtered_df.replace({"": pd.NA}, inplace=True)
     if reset_ranking:
-        filtered_df.sort_values(
-            by=[COL_NAME_AVG], ascending=False, inplace=True)
+        filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
+        filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     filtered_df.reset_index(inplace=True, drop=True)
     filtered_df = reset_rank(filtered_df)
     return filtered_df
 
 
 def _update_df_elem(
+    task: TaskType,
+    version: str,
+    source_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    reset_ranking: bool = True,
+    show_revision_and_timestamp: bool = False,
 ):
     filtered_df = source_df.copy()
     if not show_anonymous:
@@ -164,15 +161,15 @@ def _update_df_elem(
 
 
 def update_doc_df_elem(
+    version: str,
+    hidden_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool = False,
+    reset_ranking: bool = True,
 ):
     return _update_df_elem(
         TaskType.long_doc,
@@ -189,15 +186,15 @@ def update_doc_df_elem(
 
 
 def update_metric(
+    datastore,
+    task: TaskType,
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+    show_anonymous: bool = False,
+    show_revision_and_timestamp: bool = False,
 ) -> pd.DataFrame:
     if task == TaskType.qa:
         update_func = update_qa_df_elem
@@ -253,13 +250,13 @@ def calculate_file_md5(file_path):
 
 
 def submit_results(
+    filepath: str,
+    model: str,
+    model_url: str,
+    reranking_model: str = "",
+    reranking_model_url: str = "",
+    version: str = LATEST_BENCHMARK_VERSION,
+    is_anonymous=False,
 ):
     if not filepath.endswith(".zip"):
         return styled_error(f"file uploading aborted. wrong file type: {filepath}")
@@ -355,11 +352,7 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
         benchmark_cols.append(t.value.col_name)
 
     # filter out the columns that are not in the data
-    df[COL_NAME_AVG] = (
-        df[list(benchmark_cols)]
-        .apply(calculate_mean, axis=1)
-        .round(decimals=2)
-    )
+    df[COL_NAME_AVG] = df[list(benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
     df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     df.reset_index(inplace=True, drop=True)
 
@@ -381,16 +374,16 @@
 
 
 def set_listeners(
+    task: TaskType,
+    target_df,
+    source_df,
+    search_bar,
+    version,
+    selected_domains,
+    selected_langs,
+    selected_rerankings,
+    show_anonymous,
+    show_revision_and_timestamp,
 ):
     if task == TaskType.qa:
         update_table_func = update_qa_df_elem
@@ -400,15 +393,15 @@ def set_listeners(
         raise NotImplementedError
     selector_list = [selected_domains, selected_langs, selected_rerankings, search_bar, show_anonymous]
     search_bar_args = [
+        source_df,
+        version,
+    ] + selector_list
     selector_args = (
+        [version, source_df]
+        + selector_list
+        + [
+            show_revision_and_timestamp,
+        ]
     )
     # Set search_bar listener
     search_bar.submit(update_table_func, search_bar_args, target_df)
@@ -424,15 +417,15 @@
 
 
 def update_qa_df_elem(
+    version: str,
+    hidden_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool = False,
+    reset_ranking: bool = True,
 ):
     return _update_df_elem(
         TaskType.qa,
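The one-line average recomputation in select_columns applies a row-wise mean over the selected benchmark columns and rounds to two decimals before sorting. A standalone pandas sketch of the same idea, with a trivial stand-in for calculate_mean (the real helper in src/utils.py may treat missing values differently):

import pandas as pd

def calculate_mean_sketch(row: pd.Series) -> float:
    # Simple stand-in; averages whatever values are present in the row.
    return row.mean()

df = pd.DataFrame({"wiki_zh": [0.4, 0.1], "news_zh": [0.8, 0.3]})
selected_cols = ["wiki_zh", "news_zh"]

# Mirrors: filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
df["Average ⬆️"] = df[selected_cols].apply(calculate_mean_sketch, axis=1).round(decimals=2)
df = df.sort_values(by=["Average ⬆️"], ascending=False).reset_index(drop=True)
print(df)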
tests/src/test_benchmarks.py
CHANGED
@@ -3,7 +3,6 @@ import pytest
 from src.benchmarks import LongDocBenchmarks, QABenchmarks
 from src.envs import BENCHMARK_VERSION_LIST
 
-
 # Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
 # 24.05
 # | Task | dev | test |
@@ -17,15 +16,8 @@ from src.envs import BENCHMARK_VERSION_LIST
 # | Long-Doc | 15 |
 # | QA | 13 |
 
-@pytest.mark.parametrize(
-    "num_datasets_dict",
-    [
-        {
-            "air_bench_2404": 13,
-            "air_bench_2405": 53
-        }
-    ]
-)
+
+@pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 13, "air_bench_2405": 53}])
 def test_qa_benchmarks(num_datasets_dict):
     assert len(QABenchmarks) == len(BENCHMARK_VERSION_LIST)
     for benchmark_list in list(QABenchmarks):
@@ -33,15 +25,7 @@ def test_qa_benchmarks(num_datasets_dict):
         assert num_datasets_dict[version_slug] == len(benchmark_list.value)
 
 
-@pytest.mark.parametrize(
-    "num_datasets_dict",
-    [
-        {
-            "air_bench_2404": 15,
-            "air_bench_2405": 11
-        }
-    ]
-)
+@pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 15, "air_bench_2405": 11}])
 def test_doc_benchmarks(num_datasets_dict):
     assert len(LongDocBenchmarks) == len(BENCHMARK_VERSION_LIST)
     for benchmark_list in list(LongDocBenchmarks):
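The test changes above mostly collapse multi-line @pytest.mark.parametrize decorators into single lines; pytest expands the list into one test invocation per entry either way. A small self-contained sketch of the flattened form, reusing the dataset counts from the diff with a trivial stand-in assertion:

import pytest

@pytest.mark.parametrize("num_datasets_dict", [{"air_bench_2404": 13, "air_bench_2405": 53}])
def test_num_datasets_dict_keys(num_datasets_dict):
    # Stand-in check; the real test compares the counts against QABenchmarks per version.
    assert set(num_datasets_dict) == {"air_bench_2404", "air_bench_2405"}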
tests/src/test_columns.py
CHANGED
@@ -1,12 +1,18 @@
 import pytest
 
-from src.benchmarks import
-from src.columns import
-    COL_NAME_RERANKING_MODEL,
+from src.benchmarks import LongDocBenchmarks, QABenchmarks
+from src.columns import (
+    COL_NAME_AVG,
+    COL_NAME_RANK,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+    get_default_auto_eval_column_dict,
+    get_default_col_names_and_types,
+    get_fixed_col_names_and_types,
+    make_autoevalcolumn,
+)
 
 # Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
 # 24.05
@@ -21,6 +27,7 @@ from src.columns import get_default_auto_eval_column_dict, \
 # | Long-Doc | 15 |
 # | QA | 13 |
 
+
 @pytest.fixture()
 def expected_col_names():
     return [
@@ -45,8 +52,7 @@ def expected_hidden_col_names():
     ]
 
 
-def test_get_default_auto_eval_column_dict(
-        expected_col_names, expected_hidden_col_names):
+def test_get_default_auto_eval_column_dict(expected_col_names, expected_hidden_col_names):
     col_list = get_default_auto_eval_column_dict()
     assert len(col_list) == 9
     hidden_cols = []
@@ -76,14 +82,13 @@ def test_get_fixed_col_names_and_types():
 
 
 @pytest.mark.parametrize(
+    "benchmarks, expected_benchmark_len",
     [
         (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
-        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
-    ]
+        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
+    ],
 )
-def test_make_autoevalcolumn(
-        benchmarks, expected_benchmark_len, expected_col_names):
+def test_make_autoevalcolumn(benchmarks, expected_benchmark_len, expected_col_names):
     expected_default_attrs = frozenset(expected_col_names)
     for benchmark in benchmarks:
         TestEvalColumn = make_autoevalcolumn("TestEvalColumn", benchmark)
@@ -98,17 +103,15 @@ def test_make_autoevalcolumn(
 
 
 @pytest.mark.parametrize(
+    "benchmarks, expected_benchmark_len",
     [
         (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
-        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
-    ]
+        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
+    ],
 )
 def test_get_default_col_names_and_types(
-        expected_col_names,
-        expected_hidden_col_names):
+    benchmarks, expected_benchmark_len, expected_col_names, expected_hidden_col_names
+):
     default_col_len = len(expected_col_names)
     hidden_col_len = len(expected_hidden_col_names)
     for benchmark in benchmarks:
tests/src/test_envs.py
CHANGED
@@ -1,13 +1,12 @@
 from air_benchmark.tasks import BenchmarkTable
 
-from src.envs import BENCHMARK_VERSION_LIST,
+from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA, METRIC_LIST
 
 
 def test_benchmark_version_list():
     leaderboard_versions = frozenset(BENCHMARK_VERSION_LIST)
     available_versions = frozenset([k for k in BenchmarkTable.keys()])
-    assert leaderboard_versions.issubset(
-        available_versions)
+    assert leaderboard_versions.issubset(available_versions)
 
 
 def test_default_metrics():
tests/src/test_loaders.py
CHANGED
@@ -1,41 +1,34 @@
+from pathlib import Path
+
 import pandas as pd
 import pytest
-from pathlib import Path
 
-from src.loaders import
+from src.loaders import load_eval_results, load_leaderboard_datastore, load_raw_eval_results
 
 cur_fp = Path(__file__)
 
 
-@pytest.mark.parametrize(
-    "version",
-    ["AIR-Bench_24.04", "AIR-Bench_24.05"]
-)
+@pytest.mark.parametrize("version", ["AIR-Bench_24.04", "AIR-Bench_24.05"])
 def test_load_raw_eval_results(version):
-    raw_data = load_raw_eval_results(
-        cur_fp.parents[1] / f"toydata/eval_results/{version}"
-    )
+    raw_data = load_raw_eval_results(cur_fp.parents[1] / f"toydata/eval_results/{version}")
     assert len(raw_data) == 1
     full_eval_result = raw_data[0]
     expected_attr = [
+        "eval_name",
+        "retrieval_model",
+        "reranking_model",
+        "retrieval_model_link",
+        "reranking_model_link",
+        "results",
+        "timestamp",
+        "revision",
+        "is_anonymous",
     ]
     result_attr = [k for k in full_eval_result.__dict__.keys() if k[:2] != "__" and k[-2:] != "__"]
     assert sorted(expected_attr) == sorted(result_attr)
 
 
-@pytest.mark.parametrize(
-    "version",
-    ["AIR-Bench_24.04", "AIR-Bench_24.05"]
-)
+@pytest.mark.parametrize("version", ["AIR-Bench_24.04", "AIR-Bench_24.05"])
 def test_load_leaderboard_datastore(version):
     file_path = cur_fp.parents[1] / f"toydata/eval_results/{version}"
     datastore = load_leaderboard_datastore(file_path, version)
@@ -51,4 +44,3 @@ def test_load_eval_results():
     file_path = cur_fp.parents[1] / "toydata/eval_results/"
     datastore_dict = load_eval_results(file_path)
     assert len(datastore_dict) == 2
-
tests/src/test_models.py
CHANGED
@@ -1,6 +1,7 @@
-import pytest
 from pathlib import Path
 
+import pytest
+
 from src.models import EvalResult, FullEvalResult
 
 cur_fp = Path(__file__)
@@ -23,19 +24,13 @@ NUM_DOC_BENCHMARKS_24_05 = 11
 NUM_QA_BENCHMARKS_24_04 = 13
 NUM_DOC_BENCHMARKS_24_04 = 15
 
+
 def test_eval_result():
+    EvalResult(
         eval_name="eval_name",
         retrieval_model="bge-m3",
         reranking_model="NoReranking",
-        results=[
-            {
-                "domain": "law",
-                "lang": "en",
-                "dataset": "lex_files_500K-600K",
-                "value": 0.45723
-            }
-        ],
+        results=[{"domain": "law", "lang": "en", "dataset": "lex_files_500K-600K", "value": 0.45723}],
         task="qa",
         metric="ndcg_at_3",
         timestamp="2024-05-14T03:09:08Z",
@@ -45,11 +40,12 @@ def test_eval_result():
 
 
 @pytest.mark.parametrize(
+    "file_path",
     [
         "AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
-        "AIR-Bench_24.05/bge-m3/NoReranker/results.json"
-    ]
+        "AIR-Bench_24.05/bge-m3/NoReranker/results.json",
+    ],
+)
 def test_full_eval_result_init_from_json_file(file_path):
     json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
@@ -59,20 +55,35 @@ def test_full_eval_result_init_from_json_file(file_path):
 
 
 @pytest.mark.parametrize(
+    "file_path, task, expected_num_results",
     [
         ("AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json", "qa", NUM_QA_BENCHMARKS_24_04),
+        (
+            "AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
+            "long-doc",
+            NUM_DOC_BENCHMARKS_24_04,
+        ),
         ("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "qa", NUM_QA_BENCHMARKS_24_05),
         ("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "long-doc", NUM_DOC_BENCHMARKS_24_05),
-    ]
+    ],
+)
 def test_full_eval_result_to_dict(file_path, task, expected_num_results):
     json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
     result_dict_list = full_eval_result.to_dict(task)
     assert len(result_dict_list) == 1
     result = result_dict_list[0]
-    attr_list = frozenset(
+    attr_list = frozenset(
+        [
+            "eval_name",
+            "Retrieval Method",
+            "Reranking Model",
+            "Retrieval Model LINK",
+            "Reranking Model LINK",
+            "Revision",
+            "Submission Date",
+            "Anonymous Submission",
+        ]
+    )
     result_cols = list(result.keys())
-    assert len(result_cols) == (expected_num_results + len(attr_list))
+    assert len(result_cols) == (expected_num_results + len(attr_list))
tests/src/test_read_evals.py
DELETED
@@ -1,78 +0,0 @@
-from pathlib import Path
-
-from src.models import FullEvalResult
-from src.read_evals import load_raw_eval_results
-from src.utils import get_leaderboard_df
-
-cur_fp = Path(__file__)
-
-
-def test_init_from_json_file():
-    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
-    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-    num_different_task_domain_lang_metric_dataset_combination = 6
-    assert len(full_eval_result.results) == num_different_task_domain_lang_metric_dataset_combination
-    assert full_eval_result.retrieval_model == "bge-m3"
-    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"
-
-
-def test_to_dict():
-    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
-    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-    result_list = full_eval_result.to_dict(task="qa", metric="ndcg_at_1")
-    assert len(result_list) == 1
-    result_dict = result_list[0]
-    assert result_dict["Retrieval Model"] == "bge-m3"
-    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
-    assert result_dict["wiki_en"] is not None
-    assert result_dict["wiki_zh"] is not None
-
-
-def test_get_raw_eval_results():
-    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    results = load_raw_eval_results(results_path)
-    # only load the latest results
-    assert len(results) == 4
-    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
-    assert len(results[0].results) == 70
-    assert results[0].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
-    assert len(results[1].results) == 70
-
-
-def test_get_leaderboard_df():
-    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    raw_data = load_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, "qa", "ndcg_at_10")
-    assert df.shape[0] == 4
-    # the results contain only one embedding model
-    # for i in range(4):
-    #     assert df["Retrieval Model"][i] == "bge-m3"
-    # # the results contain only two reranking model
-    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    # assert df["Reranking Model"][1] == "NoReranker"
-    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
-
-
-def test_get_leaderboard_df_long_doc():
-    results_path = cur_fp.parents[2] / "toydata" / "test_results"
-    raw_data = load_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, "long-doc", "ndcg_at_1")
-    assert df.shape[0] == 2
-    # the results contain only one embedding model
-    for i in range(2):
-        assert df["Retrieval Model"][i] == "bge-m3"
-    # the results contains only two reranking model
-    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    assert df["Reranking Model"][1] == "NoReranker"
-    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    assert (
-        not df[
-            [
-                "Average ⬆️",
-                "law_en_lex_files_500k_600k",
-            ]
-        ]
-        .isnull()
-        .values.any()
-    )
tests/src/test_utils.py
CHANGED
@@ -1,10 +1,21 @@
-import pytest
-import pandas as pd
 from pathlib import Path
 
+import pandas as pd
+import pytest
+
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
+from src.models import TaskType, model_hyperlink
+from src.utils import (
+    _update_df_elem,
+    calculate_mean,
+    filter_models,
+    filter_queries,
+    get_default_cols,
+    get_leaderboard_df,
+    get_selected_cols,
+    remove_html,
+    select_columns,
+)
 
 cur_fp = Path(__file__)
 
@@ -18,18 +29,8 @@ NUM_DOC_BENCHMARKS_24_04 = 15
 def toy_df():
     return pd.DataFrame(
         {
-            "Retrieval Method": [
-                "bge-m3",
-                "bge-m3",
-                "jina-embeddings-v2-base",
-                "jina-embeddings-v2-base"
-            ],
-            "Reranking Model": [
-                "bge-reranker-v2-m3",
-                "NoReranker",
-                "bge-reranker-v2-m3",
-                "NoReranker"
-            ],
+            "Retrieval Method": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
+            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
             "Rank 🏆": [1, 2, 3, 4],
             "Revision": ["123", "234", "345", "456"],
             "Submission Date": ["", "", "", ""],
@@ -45,8 +46,7 @@ def toy_df():
 
 def test_remove_html():
     model_name = "jina-embeddings-v3"
-    html_str = model_hyperlink(
-        "https://jina.ai", model_name)
+    html_str = model_hyperlink("https://jina.ai", model_name)
     output_str = remove_html(html_str)
     assert output_str == model_name
 
@@ -60,17 +60,29 @@ def test_calculate_mean():
     assert result[1] == -1
 
 
+@pytest.mark.parametrize(
+    "models, expected",
+    [
+        (["model1", "model3"], 2),
+        (["model1", "model_missing"], 1),
+        (["model1", "model2", "model3"], 3),
+        (
+            [
+                "model1",
+            ],
+            1,
+        ),
+        ([], 3),
+    ],
+)
 def test_filter_models(models, expected):
     df = pd.DataFrame(
         {
+            COL_NAME_RERANKING_MODEL: [
+                "model1",
+                "model2",
+                "model3",
+            ],
            "col2": [1, 2, 3],
        }
    )
@@ -78,18 +90,29 @@ def test_filter_models(models, expected):
     assert len(output_df) == expected
 
 
+@pytest.mark.parametrize(
+    "query, expected",
+    [
+        ("model1;model3", 2),
+        ("model1;model4", 1),
+        ("model1;model2;model3", 3),
+        ("model1", 1),
+        ("", 3),
+    ],
+)
 def test_filter_queries(query, expected):
     df = pd.DataFrame(
         {
+            COL_NAME_RETRIEVAL_MODEL: [
+                "model1",
+                "model2",
+                "model3",
+            ],
+            COL_NAME_RERANKING_MODEL: [
+                "model4",
+                "model5",
+                "model6",
+            ],
         }
     )
     output_df = filter_queries(query, df)
@@ -103,10 +126,10 @@ def test_filter_queries(query, expected):
         (TaskType.long_doc, "air_bench_2404", True, NUM_DOC_BENCHMARKS_24_04),
         (TaskType.qa, "air_bench_2405", False, NUM_QA_BENCHMARKS_24_05),
         (TaskType.long_doc, "air_bench_2405", False, NUM_DOC_BENCHMARKS_24_05),
-    ]
+    ],
 )
 def test_get_default_cols(task_type, slug, add_fix_cols, expected):
+    attr_cols = ["Rank 🏆", "Retrieval Method", "Reranking Model", "Revision", "Submission Date", "Average ⬆️"]
     cols, types = get_default_cols(task_type, slug)
     cols_set = frozenset(cols)
     attrs_set = frozenset(attr_cols)
@@ -119,44 +142,54 @@ def test_get_default_cols(task_type, slug, add_fix_cols, expected):
 @pytest.mark.parametrize(
     "task_type, domains, languages, expected",
     [
+        (
+            TaskType.qa,
+            ["wiki", "news"],
+            [
+                "zh",
+            ],
+            ["wiki_zh", "news_zh"],
+        ),
+        (
+            TaskType.qa,
+            [
+                "law",
+            ],
+            ["zh", "en"],
+            ["law_en"],
+        ),
         (
             TaskType.long_doc,
             ["healthcare"],
             ["zh", "en"],
             [
+                "healthcare_en_pubmed_100k_200k_1",
+                "healthcare_en_pubmed_100k_200k_2",
+                "healthcare_en_pubmed_100k_200k_3",
+                "healthcare_en_pubmed_40k_50k_5_merged",
+                "healthcare_en_pubmed_30k_40k_10_merged",
+            ],
+        ),
+    ],
 )
 def test_get_selected_cols(task_type, domains, languages, expected):
     slug = "air_bench_2404"
     cols = get_selected_cols(task_type, slug, domains, languages)
     assert sorted(cols) == sorted(expected)
 
+
 @pytest.mark.parametrize("reset_rank", [False])
 def test_select_columns(toy_df, reset_rank):
     expected = [
-        ["news"],
-        ["zh"],
-        version_slug="air_bench_2404",
-        reset_ranking=reset_rank
-    )
+        "Rank 🏆",
+        "Retrieval Method",
+        "Reranking Model",
+        "Revision",
+        "Submission Date",
+        "Average ⬆️",
+        "news_zh",
+    ]
+    df_result = select_columns(toy_df, ["news"], ["zh"], version_slug="air_bench_2404", reset_ranking=reset_rank)
     assert len(df_result.columns) == len(expected)
     if reset_rank:
         assert df_result["Average ⬆️"].equals(df_result["news_zh"])
@@ -170,20 +203,10 @@ def test_select_columns(toy_df, reset_rank):
         (False, True),
         (True, True),
         (True, False),
-    ]
+    ],
 )
 def test__update_df_elem(toy_df, reset_rank, show_anony):
-    df = _update_df_elem(
-        TaskType.qa,
-        "AIR-Bench_24.04",
-        toy_df,
-        ["news"],
-        ["zh"],
-        [],
-        "",
-        show_anony,
-        reset_rank
-    )
+    df = _update_df_elem(TaskType.qa, "AIR-Bench_24.04", toy_df, ["news"], ["zh"], [], "", show_anony, reset_rank)
     if show_anony:
         assert df.shape[0] == 4
     else:
@@ -201,19 +224,14 @@ def test__update_df_elem(toy_df, reset_rank, show_anony):
         ("AIR-Bench_24.04", TaskType.qa),
         ("AIR-Bench_24.04", TaskType.long_doc),
         ("AIR-Bench_24.05", TaskType.qa),
-        ("AIR-Bench_24.05", TaskType.long_doc)
-    ]
+        ("AIR-Bench_24.05", TaskType.long_doc),
+    ],
 )
 def test_get_leaderboard_df(version, task_type):
     from src.loaders import load_raw_eval_results
     from src.models import LeaderboardDataStore, get_safe_name
+
+    raw_data = load_raw_eval_results(cur_fp.parents[1] / f"toydata/eval_results/{version}")
     ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=raw_data)
-    df = get_leaderboard_df(
-        task_type,
-        "ndcg_at_10"
-    )
-    assert df.shape[0] == 1
+    df = get_leaderboard_df(ds, task_type, "ndcg_at_10")
+    assert df.shape[0] == 1
tests/test_utils.py
DELETED
@@ -1,136 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import pytest
|
3 |
-
|
4 |
-
from app import update_table
|
5 |
-
from src.columns import (
|
6 |
-
COL_NAME_AVG,
|
7 |
-
COL_NAME_IS_ANONYMOUS,
|
8 |
-
COL_NAME_RANK,
|
9 |
-
COL_NAME_RERANKING_MODEL,
|
10 |
-
COL_NAME_RETRIEVAL_MODEL,
|
11 |
-
COL_NAME_REVISION,
|
12 |
-
COL_NAME_TIMESTAMP,
|
13 |
-
)
|
14 |
-
from src.utils import (
|
15 |
-
filter_models,
|
16 |
-
filter_queries,
|
17 |
-
get_default_cols,
|
18 |
-
get_iso_format_timestamp,
|
19 |
-
search_table,
|
20 |
-
select_columns,
|
21 |
-
    update_doc_df_elem,
)


@pytest.fixture
def toy_df():
    return pd.DataFrame(
        {
            "Retrieval Model": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
            "wiki_en": [0.8, 0.7, 0.2, 0.1],
            "wiki_zh": [0.4, 0.1, 0.4, 0.3],
            "news_en": [0.8, 0.7, 0.2, 0.1],
            "news_zh": [0.4, 0.1, 0.4, 0.3],
        }
    )


@pytest.fixture
def toy_df_long_doc():
    return pd.DataFrame(
        {
            "Retrieval Model": ["bge-m3", "bge-m3", "jina-embeddings-v2-base", "jina-embeddings-v2-base"],
            "Reranking Model": ["bge-reranker-v2-m3", "NoReranker", "bge-reranker-v2-m3", "NoReranker"],
            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
            "law_en_lex_files_300k_400k": [0.4, 0.1, 0.4, 0.3],
            "law_en_lex_files_400k_500k": [0.8, 0.7, 0.2, 0.1],
            "law_en_lex_files_500k_600k": [0.8, 0.7, 0.2, 0.1],
            "law_en_lex_files_600k_700k": [0.4, 0.1, 0.4, 0.3],
        }
    )


def test_filter_models(toy_df):
    df_result = filter_models(
        toy_df,
        [
            "bge-reranker-v2-m3",
        ],
    )
    assert len(df_result) == 2
    assert df_result.iloc[0]["Reranking Model"] == "bge-reranker-v2-m3"


def test_search_table(toy_df):
    df_result = search_table(toy_df, "jina")
    assert len(df_result) == 2
    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"


def test_filter_queries(toy_df):
    df_result = filter_queries("jina", toy_df)
    assert len(df_result) == 2
    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"


def test_update_table_long_doc(toy_df_long_doc):
    df_result = update_doc_df_elem(
        toy_df_long_doc,
        [
            "law",
        ],
        [
            "en",
        ],
        [
            "bge-reranker-v2-m3",
        ],
        "jina",
    )
    print(df_result)


def test_get_iso_format_timestamp():
    timestamp_config, timestamp_fn = get_iso_format_timestamp()
    assert len(timestamp_fn) == 14
    assert len(timestamp_config) == 20
    assert timestamp_config[-1] == "Z"


def test_get_default_cols():
    cols, types = get_default_cols("qa")
    for c, t in zip(cols, types):
        print(f"type({c}): {t}")
    assert len(frozenset(cols)) == len(cols)


def test_update_table():
    df = pd.DataFrame(
        {
            COL_NAME_IS_ANONYMOUS: [False, False, False],
            COL_NAME_REVISION: ["a1", "a2", "a3"],
            COL_NAME_TIMESTAMP: ["2024-05-12T12:24:02Z"] * 3,
            COL_NAME_RERANKING_MODEL: ["NoReranker"] * 3,
            COL_NAME_RETRIEVAL_MODEL: ["Foo"] * 3,
            COL_NAME_RANK: [1, 2, 3],
            COL_NAME_AVG: [0.1, 0.2, 0.3],  # unsorted values
            "wiki_en": [0.1, 0.2, 0.3],
        }
    )
    results = update_table(
        df,
        "wiki",
        "en",
        ["NoReranker"],
        "",
        show_anonymous=False,
        reset_ranking=False,
        show_revision_and_timestamp=False,
    )
    # keep the RANK the same regardless of the unsorted averages
    assert results[COL_NAME_RANK].to_list() == [1, 2, 3]
tests/toydata/test_data.json
DELETED
@@ -1,98 +0,0 @@
[
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "long_doc",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "law",
                "lang": "en",
                "dataset": "lex_files_500K-600K",
                "value": 0.75723
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "long_doc",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "law",
                "lang": "en",
                "dataset": "lex_files_500K-600K",
                "value": 0.69909
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "en",
                "dataset": "unknown",
                "value": 0.69083
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "en",
                "dataset": "unknown",
                "value": 0.73359
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "zh",
                "dataset": "unknown",
                "value": 0.78358
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "zh",
                "dataset": "unknown",
                "value": 0.78358
            }
        ]
    }
]
tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json
DELETED
@@ -1,98 +0,0 @@
[
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "NoReranker",
            "task": "long_doc",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "law",
                "lang": "en",
                "dataset": "lex_files_500K-600K",
                "value": 0.45723
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "NoReranker",
            "task": "long_doc",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "law",
                "lang": "en",
                "dataset": "lex_files_500K-600K",
                "value": 0.49909
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "NoReranker",
            "task": "qa",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "en",
                "dataset": "unknown",
                "value": 0.49083
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "NoReranker",
            "task": "qa",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "en",
                "dataset": "unknown",
                "value": 0.43359
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "NoReranker",
            "task": "qa",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "zh",
                "dataset": "unknown",
                "value": 0.78358
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "NoReranker",
            "task": "qa",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "zh",
                "dataset": "unknown",
                "value": 0.78358
            }
        ]
    }
]
tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/results_2023-11-21T18-10-08.json
DELETED
@@ -1,98 +0,0 @@
[
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "long_doc",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "law",
                "lang": "en",
                "dataset": "lex_files_500K-600K",
                "value": 0.75723
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "long_doc",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "law",
                "lang": "en",
                "dataset": "lex_files_500K-600K",
                "value": 0.69909
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "en",
                "dataset": "unknown",
                "value": 0.69083
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "en",
                "dataset": "unknown",
                "value": 0.73359
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_1"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "zh",
                "dataset": "unknown",
                "value": 0.78358
            }
        ]
    },
    {
        "config": {
            "retrieval_model": "bge-m3",
            "reranking_model": "bge-reranker-v2-m3",
            "task": "qa",
            "metric": "ndcg_at_3"
        },
        "results": [
            {
                "domain": "wiki",
                "lang": "zh",
                "dataset": "unknown",
                "value": 0.78358
            }
        ]
    }
]