Spaces: Running on CPU Upgrade
chore: clean up requests-related codes
Browse files
- src/leaderboard/read_evals.py +2 -24
- tests/src/leaderboard/test_read_evals.py +5 -15
- tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-11-21T18-10-08.json +0 -6
- tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-12-21T18-10-08.json +0 -6
- tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-11-21T18-10-08.json +0 -6
- tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-12-21T18-10-08.json +0 -6
- tests/toydata/test_results/bge-m3/NoReranker/results_2023-12-21T18-10-08.json +0 -50
src/leaderboard/read_evals.py
CHANGED
@@ -91,21 +91,6 @@ class FullEvalResult:
|
|
91 |
results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
|
92 |
return [v for v in results.values()]
|
93 |
|
94 |
-
def update_with_request_file(self, request_path):
|
95 |
-
"""
|
96 |
-
Update the request file
|
97 |
-
"""
|
98 |
-
request_file = get_request_file_for_model(
|
99 |
-
request_path, self.retrieval_model, self.reranking_model
|
100 |
-
)
|
101 |
-
|
102 |
-
try:
|
103 |
-
with open(request_file, "r") as f:
|
104 |
-
request = json.load(f)
|
105 |
-
self.date = request.get("submitted_time", "")
|
106 |
-
except Exception:
|
107 |
-
print(f"Failed to find request file for {self.retrieval_model}, {self.reranking_model}: {request_path}")
|
108 |
-
|
109 |
|
110 |
def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
|
111 |
"""
|
@@ -130,7 +115,7 @@ def get_request_file_for_model(requests_path, retrieval_model_name, reranking_mo
|
|
130 |
return request_file
|
131 |
|
132 |
|
133 |
-
def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEvalResult]:
|
134 |
"""
|
135 |
Load the evaluation results from a json file
|
136 |
"""
|
@@ -151,14 +136,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
|
|
151 |
for model_result_filepath in model_result_filepaths:
|
152 |
# create evaluation results
|
153 |
eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
|
154 |
-
|
155 |
-
eval_result.update_with_request_file(requests_path)
|
156 |
-
latest_date_str = eval_result.date.replace(":", "-")
|
157 |
-
model_result_date_str = model_result_filepath.split('/')[-1
|
158 |
-
].removeprefix("results_").removesuffix(".json")
|
159 |
-
if latest_date_str != model_result_date_str:
|
160 |
-
print(f'file skipped: {model_result_filepath}')
|
161 |
-
continue
|
162 |
print(f'file loaded: {model_result_filepath}')
|
163 |
eval_name = eval_result.eval_name
|
164 |
eval_results[eval_name] = eval_result
|
|
|
91 |
results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
|
92 |
return [v for v in results.values()]
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
|
96 |
"""
|
|
|
115 |
return request_file
|
116 |
|
117 |
|
118 |
+
def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
|
119 |
"""
|
120 |
Load the evaluation results from a json file
|
121 |
"""
|
|
|
136 |
for model_result_filepath in model_result_filepaths:
|
137 |
# create evaluation results
|
138 |
eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
|
139 |
+
model_result_date_str = model_result_filepath.split('/')[-1].removeprefix("results_").removesuffix(".json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
print(f'file loaded: {model_result_filepath}')
|
141 |
eval_name = eval_result.eval_name
|
142 |
eval_results[eval_name] = eval_result
|
tests/src/leaderboard/test_read_evals.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from pathlib import Path
|
2 |
|
3 |
-
from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results, get_request_file_for_model
|
4 |
|
5 |
cur_fp = Path(__file__)
|
6 |
|
@@ -19,26 +19,16 @@ def test_to_dict():
|
|
19 |
result_dict = result_list[0]
|
20 |
assert result_dict["Retrieval Model"] == "bge-m3"
|
21 |
assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
|
22 |
-
assert result_dict["
|
23 |
-
assert result_dict["
|
24 |
-
|
25 |
-
|
26 |
-
def test_get_request_file_for_model():
|
27 |
-
requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
|
28 |
-
request_file = get_request_file_for_model(requests_path, "bge-m3", "bge-reranker-v2-m3")
|
29 |
-
# only load the latest finished results
|
30 |
-
assert Path(request_file).name.removeprefix("eval_request_").removesuffix(".json") == "2023-11-21T18-10-08"
|
31 |
|
32 |
|
33 |
def test_get_raw_eval_results():
|
34 |
-
requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
|
35 |
results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
|
36 |
-
results = get_raw_eval_results(results_path, requests_path)
|
37 |
# only load the latest results
|
38 |
assert len(results) == 2
|
39 |
-
assert results[0].date == "2023-12-21T18:10:08"
|
40 |
assert results[0].eval_name == "bge-m3_NoReranker"
|
41 |
-
assert len(results[0].results) ==
|
42 |
assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
|
43 |
-
assert results[1].date == "2023-11-21T18:10:08"
|
44 |
assert len(results[1].results) == 6
|
|
|
1 |
from pathlib import Path
|
2 |
|
3 |
+
from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results
|
4 |
|
5 |
cur_fp = Path(__file__)
|
6 |
|
|
|
19 |
result_dict = result_list[0]
|
20 |
assert result_dict["Retrieval Model"] == "bge-m3"
|
21 |
assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
|
22 |
+
assert result_dict["wiki_en"] is not None
|
23 |
+
assert result_dict["wiki_zh"] is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
|
26 |
def test_get_raw_eval_results():
|
|
|
27 |
results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
|
28 |
+
results = get_raw_eval_results(results_path)
|
29 |
# only load the latest results
|
30 |
assert len(results) == 2
|
|
|
31 |
assert results[0].eval_name == "bge-m3_NoReranker"
|
32 |
+
assert len(results[0].results) == 6
|
33 |
assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
|
|
|
34 |
assert len(results[1].results) == 6
|
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-11-21T18-10-08.json
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"retrieval_model": "BAAI/bge-m3",
|
3 |
-
"reranking_model": "NoReranker",
|
4 |
-
"status": "FINISHED",
|
5 |
-
"submitted_time": "2023-11-21T18:10:08"
|
6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-12-21T18-10-08.json
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"retrieval_model": "BAAI/bge-m3",
|
3 |
-
"reranking_model": "NoReranker",
|
4 |
-
"status": "FINISHED",
|
5 |
-
"submitted_time": "2023-12-21T18:10:08"
|
6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-11-21T18-10-08.json
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"retrieval_model": "BAAI/bge-m3",
|
3 |
-
"reranking_model": "BAAI/bge-reranker-v2-m3",
|
4 |
-
"status": "FINISHED",
|
5 |
-
"submitted_time": "2023-11-21T18:10:08"
|
6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-12-21T18-10-08.json
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"retrieval_model": "BAAI/bge-m3",
|
3 |
-
"reranking_model": "BAAI/bge-reranker-v2-m3",
|
4 |
-
"status": "RUNNING",
|
5 |
-
"submitted_time": "2023-12-21T18:10:08"
|
6 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/toydata/test_results/bge-m3/NoReranker/results_2023-12-21T18-10-08.json
DELETED
@@ -1,50 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"config": {
|
4 |
-
"retrieval_model": "bge-m3",
|
5 |
-
"reranking_model": "NoReranker",
|
6 |
-
"task": "long_doc",
|
7 |
-
"metric": "ndcg_at_1"
|
8 |
-
},
|
9 |
-
"results": [
|
10 |
-
{
|
11 |
-
"domain": "law",
|
12 |
-
"lang": "en",
|
13 |
-
"dataset": "lex_files_500K-600K",
|
14 |
-
"value": 0.45723
|
15 |
-
}
|
16 |
-
]
|
17 |
-
},
|
18 |
-
{
|
19 |
-
"config": {
|
20 |
-
"retrieval_model": "bge-m3",
|
21 |
-
"reranking_model": "NoReranker",
|
22 |
-
"task": "qa",
|
23 |
-
"metric": "ndcg_at_1"
|
24 |
-
},
|
25 |
-
"results": [
|
26 |
-
{
|
27 |
-
"domain": "wiki",
|
28 |
-
"lang": "en",
|
29 |
-
"dataset": "unknown",
|
30 |
-
"value": 0.39083
|
31 |
-
}
|
32 |
-
]
|
33 |
-
},
|
34 |
-
{
|
35 |
-
"config": {
|
36 |
-
"retrieval_model": "bge-m3",
|
37 |
-
"reranking_model": "NoReranker",
|
38 |
-
"task": "qa",
|
39 |
-
"metric": "ndcg_at_1"
|
40 |
-
},
|
41 |
-
"results": [
|
42 |
-
{
|
43 |
-
"domain": "wiki",
|
44 |
-
"lang": "zh",
|
45 |
-
"dataset": "unknown",
|
46 |
-
"value": 0.78358
|
47 |
-
}
|
48 |
-
]
|
49 |
-
}
|
50 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|