fix: add the missing files
src/models.py +138 -0
src/models.py
ADDED
@@ -0,0 +1,138 @@
import json
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd

from src.benchmarks import get_safe_name
from src.display.column_names import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
    COL_NAME_RERANKING_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
from src.display.formatting import make_clickable_model


@dataclass
class EvalResult:
    """
    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different
    domains, languages, and datasets
    """
    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
    retrieval_model: str
    reranking_model: str
    results: list  # results on all the benchmarks stored as dict
    task: str
    metric: str
    timestamp: str = ""  # submission timestamp
    revision: str = ""
    is_anonymous: bool = False


@dataclass
class FullEvalResult:
    """
    Evaluation result of a single embedding model with a specific reranking model on benchmarks over different tasks
    """
    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]
    retrieval_model: str
    reranking_model: str
    retrieval_model_link: str
    reranking_model_link: str
    results: List[EvalResult]  # results on all the EvalResults over different tasks and metrics.
    timestamp: str = ""
    revision: str = ""
    is_anonymous: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """
        Initiate from the result json file for a single model.
        The json file will be written only when the status is FINISHED.
        """
        with open(json_filepath) as fp:
            model_data = json.load(fp)

        # store all the results for different metrics and tasks
        result_list = []
        retrieval_model_link = ""
        reranking_model_link = ""
        revision = ""
        for item in model_data:
            config = item.get("config", {})
            # eval results for different metrics
            results = item.get("results", [])
            retrieval_model_link = config["retrieval_model_link"]
            if config["reranking_model_link"] is None:
                reranking_model_link = ""
            else:
                reranking_model_link = config["reranking_model_link"]
            eval_result = EvalResult(
                eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
                retrieval_model=config["retrieval_model"],
                reranking_model=config["reranking_model"],
                results=results,
                task=config["task"],
                metric=config["metric"],
                timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
                revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e"),
                is_anonymous=config.get("is_anonymous", False)
            )
            result_list.append(eval_result)
        return cls(
            eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
            retrieval_model=result_list[0].retrieval_model,
            reranking_model=result_list[0].reranking_model,
            retrieval_model_link=retrieval_model_link,
            reranking_model_link=reranking_model_link,
            results=result_list,
            timestamp=result_list[0].timestamp,
            revision=result_list[0].revision,
            is_anonymous=result_list[0].is_anonymous
        )

    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
        """
        Convert the results in all the EvalResults over different tasks and metrics. The output is a list of dict compatible with the dataframe UI
        """
        results = defaultdict(dict)
        for eval_result in self.results:
            if eval_result.metric != metric:
                continue
            if eval_result.task != task:
                continue
            results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = (
                make_clickable_model(self.retrieval_model, self.retrieval_model_link))
            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = (
                make_clickable_model(self.reranking_model, self.reranking_model_link))
            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
            results[eval_result.eval_name][COL_NAME_REVISION] = self.revision
            results[eval_result.eval_name][COL_NAME_TIMESTAMP] = self.timestamp
            results[eval_result.eval_name][COL_NAME_IS_ANONYMOUS] = self.is_anonymous

            # print(f'result loaded: {eval_result.eval_name}')
            for result in eval_result.results:
                # add result for each domain, language, and dataset
                domain = result["domain"]
                lang = result["lang"]
                dataset = result["dataset"]
                value = result["value"] * 100
                if dataset == 'default':
                    benchmark_name = f"{domain}_{lang}"
                else:
                    benchmark_name = f"{domain}_{lang}_{dataset}"
                results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
        return [v for v in results.values()]


@dataclass
class LeaderboardDataStore:
    raw_data: Optional[list]
    raw_df_qa: Optional[pd.DataFrame]
    raw_df_long_doc: Optional[pd.DataFrame]
    leaderboard_df_qa: Optional[pd.DataFrame]
    leaderboard_df_long_doc: Optional[pd.DataFrame]
    reranking_models: Optional[list]
    types_qa: Optional[list]
    types_long_doc: Optional[list]
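
For context, a minimal usage sketch of the new module (not part of this commit): the file path, model names, and JSON layout below are assumptions inferred from how init_from_json_file and to_dict read the data, not values defined by the repository.

# Hypothetical example: load one model's result file and flatten it into rows
# for the leaderboard dataframe. Path and model names are made up.
from src.models import FullEvalResult

# Assumed layout of a single result JSON file (a list of entries, each with a
# "config" block and a "results" list), inferred from the parsing code above:
# [
#   {
#     "config": {
#       "retrieval_model": "my-embedding-model",
#       "retrieval_model_link": "https://huggingface.co/org/my-embedding-model",
#       "reranking_model": "NoReranker",
#       "reranking_model_link": null,
#       "task": "qa",
#       "metric": "ndcg_at_3",
#       "timestamp": "2024-05-12T12:24:02Z",
#       "revision": "abc123",
#       "is_anonymous": false
#     },
#     "results": [
#       {"domain": "wiki", "lang": "en", "dataset": "default", "value": 0.654}
#     ]
#   }
# ]

full_result = FullEvalResult.init_from_json_file("results/my-embedding-model/NoReranker/results.json")
rows = full_result.to_dict(task="qa", metric="ndcg_at_3")
# Each row keys benchmark columns by get_safe_name(f"{domain}_{lang}") (or
# f"{domain}_{lang}_{dataset}" when the dataset is not "default") and stores
# the score scaled by 100, so the example entry above becomes a column named
# get_safe_name("wiki_en") with value 65.4.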