feat: fix the to_dict function
Changed files:
- src/leaderboard/read_evals.py (+12 -14)
- src/populate.py (+6 -3)
- tests/src/display/test_utils.py (+0 -1)
- tests/src/leaderboard/test_read_evals.py (+7 -2)
- tests/src/test_populate.py (+14 -6)
src/leaderboard/read_evals.py
CHANGED
@@ -1,4 +1,5 @@
 import glob
+from collections import defaultdict
 import json
 import os.path
 from dataclasses import dataclass
@@ -6,7 +7,7 @@ from typing import List
 
 import dateutil.parser._parser
 
-from src.display.utils import
+from src.display.utils import AutoEvalColumnQA
 from src.benchmarks import get_safe_name
 
 
@@ -61,20 +62,19 @@ class FullEvalResult:
             results=result_list
         )
 
-    def to_dict(self, task='qa', metric='ndcg_at_1'):
+    def to_dict(self, task='qa', metric='ndcg_at_1') -> List:
         """Convert FullEvalResult to a list of dict compatible with our dataframe UI
         """
-        results =
+        results = defaultdict(dict)
         for eval_result in self.results:
             if eval_result.metric != metric:
                 continue
             if eval_result.task != task:
                 continue
-
-
-
-
-            }
+            results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
+            results[eval_result.eval_name][AutoEvalColumnQA.retrieval_model.name] = self.retrieval_model
+            results[eval_result.eval_name][AutoEvalColumnQA.reranking_model.name] = self.reranking_model
+
             for result in eval_result.results:
                 # add result for each domain, language, and dataset
                 domain = result["domain"]
@@ -82,12 +82,11 @@ class FullEvalResult:
                 dataset = result["dataset"]
                 value = result["value"]
                 if task == 'qa':
-                    benchmark_name = f"{
+                    benchmark_name = f"{domain}_{lang}"
                 elif task == 'long_doc':
-                    benchmark_name = f"{
-
-
-        return results
+                    benchmark_name = f"{domain}_{lang}_{dataset}_{metric}"
+                results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
+        return [v for v in results.values()]
 
     def update_with_request_file(self, request_path):
         """
@@ -148,7 +147,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # create evaluation results
-        # TODO: fix the bug here, the running results should not be loaded
         eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
         # get the latest result that is finished
         eval_result.update_with_request_file(requests_path)
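For context, the reworked `to_dict` builds one row per `eval_name`: it filters the stored `EvalResult` entries to the requested task and metric, records the retrieval and reranking model names, and adds one column per domain/language (plus dataset and metric for `long_doc`) via `get_safe_name`, returning the rows as a list of dicts for the dataframe UI. The sketch below reproduces that aggregation pattern with simplified stand-ins (plain dicts instead of the repo's `EvalResult` objects, and a local `safe_name` in place of `src.benchmarks.get_safe_name`); it illustrates the pattern only and is not the project's code.

```python
# Minimal sketch of the per-eval_name aggregation used by the new to_dict().
# Record layout and safe_name() are simplified stand-ins, not the repo's actual classes.
from collections import defaultdict


def safe_name(name: str) -> str:
    # stand-in for src.benchmarks.get_safe_name
    return name.lower().replace("-", "_").replace(" ", "_")


def to_rows(eval_results, task="qa", metric="ndcg_at_1"):
    rows = defaultdict(dict)                     # one row per eval_name
    for er in eval_results:
        if er["metric"] != metric or er["task"] != task:
            continue                             # keep only the requested task/metric
        row = rows[er["eval_name"]]
        row["eval_name"] = er["eval_name"]
        row["Retrieval Model"] = er["retrieval_model"]
        row["Reranking Model"] = er["reranking_model"]
        for r in er["results"]:                  # one column per domain/language
            benchmark = f"{r['domain']}_{r['lang']}"   # the real method also handles 'long_doc'
            row[safe_name(benchmark)] = r["value"]
    return list(rows.values())                   # dataframe-ready records


# hypothetical input illustrating the expected shape
rows = to_rows([{
    "eval_name": "bge-m3_NoReranker",
    "task": "qa", "metric": "ndcg_at_1",
    "retrieval_model": "bge-m3", "reranking_model": "NoReranker",
    "results": [{"domain": "wiki", "lang": "en", "value": 0.71}],
}])
print(rows)  # [{'eval_name': ..., 'Retrieval Model': 'bge-m3', ..., 'wiki_en': 0.71}]
```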
src/populate.py
CHANGED
@@ -4,7 +4,7 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import
+from src.display.utils import AutoEvalColumnQA, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
 from typing import Tuple
 
@@ -12,10 +12,13 @@ from typing import Tuple
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [
+    all_data_json = []
+    for v in raw_data:
+        all_data_json += v.to_dict()
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.
+    df["Average ⬆️"] = df[benchmark_cols].mean(axis=1)
+    # df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
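The updated `get_leaderboard_df` flattens every `FullEvalResult` into records via `to_dict`, builds a DataFrame, derives the `Average ⬆️` column as the row-wise mean of the benchmark columns, and then selects and rounds the display columns. A minimal pandas sketch of that flow, using hypothetical records and toy scores in place of real evaluation output:

```python
# Sketch of the dataframe construction in get_leaderboard_df(),
# with hypothetical records standing in for FullEvalResult.to_dict() output.
import pandas as pd

all_data_json = [
    {"Retrieval Model": "bge-m3", "Reranking Model": "NoReranker",         "wiki_en": 0.71, "wiki_zh": 0.69},
    {"Retrieval Model": "bge-m3", "Reranking Model": "bge-reranker-v2-m3", "wiki_en": 0.78, "wiki_zh": 0.75},
]
benchmark_cols = ["wiki_en", "wiki_zh"]
cols = ["Retrieval Model", "Reranking Model", "Average ⬆️"] + benchmark_cols

df = pd.DataFrame.from_records(all_data_json)
df["Average ⬆️"] = df[benchmark_cols].mean(axis=1)  # row-wise mean over benchmark columns
df = df[cols].round(decimals=2)                     # keep display columns, round for the UI
print(df)
```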
tests/src/display/test_utils.py
CHANGED
@@ -2,7 +2,6 @@ import pytest
 from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS, COLS_LITE, TYPES, EVAL_COLS, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
 
 
-@pytest.mark.parametrize('auto_eval_column')
 def test_fields():
     for c in fields(AutoEvalColumnQA):
         print(c)
tests/src/leaderboard/test_read_evals.py
CHANGED
@@ -14,8 +14,13 @@ def test_init_from_json_file():
 def test_to_dict():
     json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
     full_eval_result = FullEvalResult.init_from_json_file(json_fp)
-
-    assert len(
+    result_list = full_eval_result.to_dict(task='qa', metric='ndcg_at_1')
+    assert len(result_list) == 1
+    result_dict = result_list[0]
+    assert result_dict["Retrieval Model"] == "bge-m3"
+    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
+    assert result_dict["qa_wiki_en"] is not None
+    assert result_dict["qa_wiki_zh"] is not None
 
 
 def test_get_request_file_for_model():
tests/src/test_populate.py
CHANGED
@@ -3,10 +3,18 @@ from pathlib import Path
 
 cur_fp = Path(__file__)
 
+
 def test_get_leaderboard_df():
-    requests_path = cur_fp.parents[
-    results_path = cur_fp.parents[
-    cols = []
-    benchmark_cols = []
-
-
+    requests_path = cur_fp.parents[1] / "toydata" / "test_requests"
+    results_path = cur_fp.parents[1] / "toydata" / "test_results"
+    cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
+    benchmark_cols = ['wiki_en', 'wiki_zh',]
+    raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
+    assert df.shape[0] == 2
+    assert df["Retrieval Model"][0] == "bge-m3"
+    assert df["Retrieval Model"][1] == "bge-m3"
+    assert df["Reranking Model"][0] == "NoReranker"
+    assert df["Reranking Model"][1] == "bge-reranker-v2-m3"
+    assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
+
+