Spaces: AIR-Bench

nan committed
Commit 2d272e2
1 Parent(s): 649e0fb

fix: add the missing files

Files changed (1)
  1. src/models.py +138 -0
src/models.py ADDED
@@ -0,0 +1,138 @@
import json
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd

from src.benchmarks import get_safe_name
from src.display.column_names import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
    COL_NAME_RERANKING_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
from src.display.formatting import make_clickable_model


@dataclass
class EvalResult:
    """
    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different
    domains, languages, and datasets
    """
    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
    retrieval_model: str
    reranking_model: str
    results: list  # results on all the benchmarks stored as dict
    task: str
    metric: str
    timestamp: str = ""  # submission timestamp
    revision: str = ""
    is_anonymous: bool = False


@dataclass
class FullEvalResult:
    """
    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different tasks
    """
    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]
    retrieval_model: str
    reranking_model: str
    retrieval_model_link: str
    reranking_model_link: str
    results: List[EvalResult]  # results on all the EvalResults over different tasks and metrics.
    timestamp: str = ""
    revision: str = ""
    is_anonymous: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """
        Initiate from the result json file for a single model.
        The json file will be written only when the status is FINISHED.
        """
        with open(json_filepath) as fp:
            model_data = json.load(fp)

        # store all the results for different metrics and tasks
        result_list = []
        retrieval_model_link = ""
        reranking_model_link = ""
        revision = ""
        for item in model_data:
            config = item.get("config", {})
            # eval results for different metrics
            results = item.get("results", [])
            retrieval_model_link = config["retrieval_model_link"]
            if config["reranking_model_link"] is None:
                reranking_model_link = ""
            else:
                reranking_model_link = config["reranking_model_link"]
            eval_result = EvalResult(
                eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
                retrieval_model=config["retrieval_model"],
                reranking_model=config["reranking_model"],
                results=results,
                task=config["task"],
                metric=config["metric"],
                timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
                revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e"),
                is_anonymous=config.get("is_anonymous", False)
            )
            result_list.append(eval_result)
        return cls(
            eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
            retrieval_model=result_list[0].retrieval_model,
            reranking_model=result_list[0].reranking_model,
            retrieval_model_link=retrieval_model_link,
            reranking_model_link=reranking_model_link,
            results=result_list,
            timestamp=result_list[0].timestamp,
            revision=result_list[0].revision,
            is_anonymous=result_list[0].is_anonymous
        )

    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
        """
        Convert the results in all the EvalResults over different tasks and metrics. The output is a list of dict compatible with the dataframe UI
        """
        results = defaultdict(dict)
        for eval_result in self.results:
            if eval_result.metric != metric:
                continue
            if eval_result.task != task:
                continue
            results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = (
                make_clickable_model(self.retrieval_model, self.retrieval_model_link))
            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = (
                make_clickable_model(self.reranking_model, self.reranking_model_link))
            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
            results[eval_result.eval_name][COL_NAME_REVISION] = self.revision
            results[eval_result.eval_name][COL_NAME_TIMESTAMP] = self.timestamp
            results[eval_result.eval_name][COL_NAME_IS_ANONYMOUS] = self.is_anonymous

            # print(f'result loaded: {eval_result.eval_name}')
            for result in eval_result.results:
                # add result for each domain, language, and dataset
                domain = result["domain"]
                lang = result["lang"]
                dataset = result["dataset"]
                value = result["value"] * 100
                if dataset == 'default':
                    benchmark_name = f"{domain}_{lang}"
                else:
                    benchmark_name = f"{domain}_{lang}_{dataset}"
                results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
        return [v for v in results.values()]


@dataclass
class LeaderboardDataStore:
    raw_data: Optional[list]
    raw_df_qa: Optional[pd.DataFrame]
    raw_df_long_doc: Optional[pd.DataFrame]
    leaderboard_df_qa: Optional[pd.DataFrame]
    leaderboard_df_long_doc: Optional[pd.DataFrame]
    reranking_models: Optional[list]
    types_qa: Optional[list]
    types_long_doc: Optional[list]