Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
e5c7cad
1 Parent(s): 2c777fc

chore: clean up request-related code

Browse files
src/leaderboard/read_evals.py CHANGED
@@ -91,21 +91,6 @@ class FullEvalResult:
91
  results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
92
  return [v for v in results.values()]
93
 
94
- def update_with_request_file(self, request_path):
95
- """
96
- Update the request file
97
- """
98
- request_file = get_request_file_for_model(
99
- request_path, self.retrieval_model, self.reranking_model
100
- )
101
-
102
- try:
103
- with open(request_file, "r") as f:
104
- request = json.load(f)
105
- self.date = request.get("submitted_time", "")
106
- except Exception:
107
- print(f"Failed to find request file for {self.retrieval_model}, {self.reranking_model}: {request_path}")
108
-
109
 
110
  def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
111
  """
@@ -130,7 +115,7 @@ def get_request_file_for_model(requests_path, retrieval_model_name, reranking_mo
130
  return request_file
131
 
132
 
133
- def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEvalResult]:
134
  """
135
  Load the evaluation results from a json file
136
  """
@@ -151,14 +136,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
151
  for model_result_filepath in model_result_filepaths:
152
  # create evaluation results
153
  eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
154
- # get the latest result that is finished
155
- eval_result.update_with_request_file(requests_path)
156
- latest_date_str = eval_result.date.replace(":", "-")
157
- model_result_date_str = model_result_filepath.split('/')[-1
158
- ].removeprefix("results_").removesuffix(".json")
159
- if latest_date_str != model_result_date_str:
160
- print(f'file skipped: {model_result_filepath}')
161
- continue
162
  print(f'file loaded: {model_result_filepath}')
163
  eval_name = eval_result.eval_name
164
  eval_results[eval_name] = eval_result
 
91
  results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
92
  return [v for v in results.values()]
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
96
  """
 
115
  return request_file
116
 
117
 
118
+ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
119
  """
120
  Load the evaluation results from a json file
121
  """
 
136
  for model_result_filepath in model_result_filepaths:
137
  # create evaluation results
138
  eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
139
+ model_result_date_str = model_result_filepath.split('/')[-1].removeprefix("results_").removesuffix(".json")
 
 
 
 
 
 
 
140
  print(f'file loaded: {model_result_filepath}')
141
  eval_name = eval_result.eval_name
142
  eval_results[eval_name] = eval_result
tests/src/leaderboard/test_read_evals.py CHANGED
@@ -1,6 +1,6 @@
1
  from pathlib import Path
2
 
3
- from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results, get_request_file_for_model
4
 
5
  cur_fp = Path(__file__)
6
 
@@ -19,26 +19,16 @@ def test_to_dict():
19
  result_dict = result_list[0]
20
  assert result_dict["Retrieval Model"] == "bge-m3"
21
  assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
22
- assert result_dict["qa_wiki_en"] is not None
23
- assert result_dict["qa_wiki_zh"] is not None
24
-
25
-
26
- def test_get_request_file_for_model():
27
- requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
28
- request_file = get_request_file_for_model(requests_path, "bge-m3", "bge-reranker-v2-m3")
29
- # only load the latest finished results
30
- assert Path(request_file).name.removeprefix("eval_request_").removesuffix(".json") == "2023-11-21T18-10-08"
31
 
32
 
33
  def test_get_raw_eval_results():
34
- requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
35
  results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
36
- results = get_raw_eval_results(results_path, requests_path)
37
  # only load the latest results
38
  assert len(results) == 2
39
- assert results[0].date == "2023-12-21T18:10:08"
40
  assert results[0].eval_name == "bge-m3_NoReranker"
41
- assert len(results[0].results) == 3
42
  assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
43
- assert results[1].date == "2023-11-21T18:10:08"
44
  assert len(results[1].results) == 6
 
1
  from pathlib import Path
2
 
3
+ from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results
4
 
5
  cur_fp = Path(__file__)
6
 
 
19
  result_dict = result_list[0]
20
  assert result_dict["Retrieval Model"] == "bge-m3"
21
  assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
22
+ assert result_dict["wiki_en"] is not None
23
+ assert result_dict["wiki_zh"] is not None
 
 
 
 
 
 
 
24
 
25
 
26
  def test_get_raw_eval_results():
 
27
  results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
28
+ results = get_raw_eval_results(results_path)
29
  # only load the latest results
30
  assert len(results) == 2
 
31
  assert results[0].eval_name == "bge-m3_NoReranker"
32
+ assert len(results[0].results) == 6
33
  assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
 
34
  assert len(results[1].results) == 6
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-11-21T18-10-08.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "retrieval_model": "BAAI/bge-m3",
3
- "reranking_model": "NoReranker",
4
- "status": "FINISHED",
5
- "submitted_time": "2023-11-21T18:10:08"
6
- }
 
 
 
 
 
 
 
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-12-21T18-10-08.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "retrieval_model": "BAAI/bge-m3",
3
- "reranking_model": "NoReranker",
4
- "status": "FINISHED",
5
- "submitted_time": "2023-12-21T18:10:08"
6
- }
 
 
 
 
 
 
 
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-11-21T18-10-08.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "retrieval_model": "BAAI/bge-m3",
3
- "reranking_model": "BAAI/bge-reranker-v2-m3",
4
- "status": "FINISHED",
5
- "submitted_time": "2023-11-21T18:10:08"
6
- }
 
 
 
 
 
 
 
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-12-21T18-10-08.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "retrieval_model": "BAAI/bge-m3",
3
- "reranking_model": "BAAI/bge-reranker-v2-m3",
4
- "status": "RUNNING",
5
- "submitted_time": "2023-12-21T18:10:08"
6
- }
 
 
 
 
 
 
 
tests/toydata/test_results/bge-m3/NoReranker/results_2023-12-21T18-10-08.json DELETED
@@ -1,50 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "retrieval_model": "bge-m3",
5
- "reranking_model": "NoReranker",
6
- "task": "long_doc",
7
- "metric": "ndcg_at_1"
8
- },
9
- "results": [
10
- {
11
- "domain": "law",
12
- "lang": "en",
13
- "dataset": "lex_files_500K-600K",
14
- "value": 0.45723
15
- }
16
- ]
17
- },
18
- {
19
- "config": {
20
- "retrieval_model": "bge-m3",
21
- "reranking_model": "NoReranker",
22
- "task": "qa",
23
- "metric": "ndcg_at_1"
24
- },
25
- "results": [
26
- {
27
- "domain": "wiki",
28
- "lang": "en",
29
- "dataset": "unknown",
30
- "value": 0.39083
31
- }
32
- ]
33
- },
34
- {
35
- "config": {
36
- "retrieval_model": "bge-m3",
37
- "reranking_model": "NoReranker",
38
- "task": "qa",
39
- "metric": "ndcg_at_1"
40
- },
41
- "results": [
42
- {
43
- "domain": "wiki",
44
- "lang": "zh",
45
- "dataset": "unknown",
46
- "value": 0.78358
47
- }
48
- ]
49
- }
50
- ]