test: add unit tests for utils
- src/loaders.py +3 -1
- src/models.py +8 -8
- src/utils.py +1 -1
- tests/src/test_utils.py +25 -1
src/loaders.py
CHANGED
@@ -1,4 +1,6 @@
 import os.path
+from pathlib import Path
+from typing import Union
 from typing import Dict, List
 
 import pandas as pd
@@ -11,7 +13,7 @@ from src.utils import get_default_cols, get_leaderboard_df, reset_rank
 pd.options.mode.copy_on_write = True
 
 
-def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
+def load_raw_eval_results(results_path: Union[Path, str]) -> List[FullEvalResult]:
     """
     Load the evaluation results from a json file
     """
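With the widened annotation, load_raw_eval_results accepts either a pathlib.Path or a plain string. A minimal usage sketch under that assumption; the results directory name below is only an illustration, not a path taken from this change:

    from pathlib import Path

    from src.loaders import load_raw_eval_results

    # Both call styles satisfy the new Union[Path, str] signature.
    results_from_str = load_raw_eval_results("some_results/AIR-Bench_24.05")
    results_from_path = load_raw_eval_results(Path("some_results") / "AIR-Bench_24.05")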
src/models.py
CHANGED
@@ -141,14 +141,14 @@ class FullEvalResult:
 class LeaderboardDataStore:
     version: str
     slug: str
-    raw_data:
-    qa_raw_df:
-    doc_raw_df:
-    qa_fmt_df:
-    doc_fmt_df:
-    reranking_models:
-    qa_types:
-    doc_types:
+    raw_data: list = None
+    qa_raw_df: pd.DataFrame = pd.DataFrame()
+    doc_raw_df: pd.DataFrame = pd.DataFrame()
+    qa_fmt_df: pd.DataFrame = pd.DataFrame()
+    doc_fmt_df: pd.DataFrame = pd.DataFrame()
+    reranking_models: list = None
+    qa_types: list = None
+    doc_types: list = None
 
 
 # Define an enum class with the name `TaskType`. There are two types of tasks, `qa` and `long-doc`.
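Because every field now carries a default, a LeaderboardDataStore can be created from just the version and slug and populated later. A minimal sketch, assuming the class is a dataclass whose decorator sits above this hunk, and using get_safe_name for the slug as the new test below does:

    from src.models import LeaderboardDataStore, get_safe_name

    version = "AIR-Bench_24.05"
    ds = LeaderboardDataStore(version, get_safe_name(version))
    # Fields that are not passed fall back to their new defaults
    # (None for the list fields, an empty DataFrame for the frame fields).
    print(ds.raw_data)
    print(ds.qa_raw_df)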
src/utils.py
CHANGED
@@ -354,7 +354,7 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
             continue
         benchmark_cols.append(t.value.col_name)
 
-
+    # filter out the columns that are not in the data
     df[COL_NAME_AVG] = (
         df[list(benchmark_cols)]
         .apply(calculate_mean, axis=1)
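The block gaining this comment averages the benchmark columns row by row. A standalone sketch of the same pandas pattern on a toy frame; calculate_mean is defined elsewhere in src/utils.py and its exact missing-value handling is not shown in this diff, so a plain row mean stands in for it:

    import pandas as pd

    # Toy stand-in for the leaderboard frame: two models, three benchmark columns.
    df = pd.DataFrame(
        {"bench_a": [0.50, 0.70], "bench_b": [0.60, None], "bench_c": [0.40, 0.80]}
    )
    benchmark_cols = ["bench_a", "bench_b", "bench_c"]

    # Same shape as the snippet above: select the benchmark columns, aggregate
    # each row, and store the result in an average column.
    df["Average"] = df[list(benchmark_cols)].apply(lambda row: row.mean(), axis=1)
    print(df)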
tests/src/test_utils.py
CHANGED
@@ -1,10 +1,12 @@
 import pytest
 import pandas as pd
+from pathlib import Path
 
-from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem
+from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem, get_leaderboard_df
 from src.models import model_hyperlink, TaskType
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
 
+cur_fp = Path(__file__)
+
 NUM_QA_BENCHMARKS_24_05 = 53
 NUM_DOC_BENCHMARKS_24_05 = 11
@@ -193,3 +195,25 @@ def test__update_df_elem(toy_df, reset_rank, show_anony):
     assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
 
 
+@pytest.mark.parametrize(
+    "version, task_type",
+    [
+        ("AIR-Bench_24.04", TaskType.qa),
+        ("AIR-Bench_24.04", TaskType.long_doc),
+        ("AIR-Bench_24.05", TaskType.qa),
+        ("AIR-Bench_24.05", TaskType.long_doc)
+    ]
+)
+def test_get_leaderboard_df(version, task_type):
+    from src.loaders import load_raw_eval_results
+    from src.models import LeaderboardDataStore, get_safe_name
+    raw_data = load_raw_eval_results(
+        cur_fp.parents[1] / f"toydata/eval_results/{version}"
+    )
+    ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=raw_data)
+    df = get_leaderboard_df(
+        ds,
+        task_type,
+        "ndcg_at_10"
+    )
+    assert df.shape[0] == 1
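The new test resolves its fixtures relative to the test file: cur_fp.parents[1] points at tests/, so the data is expected under tests/toydata/eval_results/<version>. The parametrization expands to four cases; an illustrative way to collect only these cases, equivalent to running pytest tests/src/test_utils.py -k test_get_leaderboard_df from the repository root:

    import pytest

    # Collect only the new parametrized test; one case per (version, task_type) pair.
    pytest.main(["tests/src/test_utils.py", "-k", "test_get_leaderboard_df", "-v"])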
|