feat: adapt to the latest data format
Files changed:
- app.py +4 -4
- src/benchmarks.py +2 -2
- src/display/utils.py +8 -0
- src/leaderboard/read_evals.py +19 -7
- tests/src/leaderboard/test_read_evals.py +17 -17
- utils.py +3 -3
app.py
CHANGED
@@ -27,12 +27,12 @@ try:
 except Exception:
     restart_space()
 
-raw_data = get_raw_eval_results(EVAL_RESULTS_PATH)
+raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
 original_df_qa = get_leaderboard_df(
     raw_data, task='qa', metric='ndcg_at_3')
 original_df_long_doc = get_leaderboard_df(
-    raw_data, task='…
+    raw_data, task='long-doc', metric='ndcg_at_3')
 print(f'raw data: {len(raw_data)}')
 print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
@@ -42,7 +42,7 @@ shown_columns_qa = get_default_cols('qa', leaderboard_df_qa.columns, add_fix_col
 leaderboard_df_qa = leaderboard_df_qa[shown_columns_qa]
 
 leaderboard_df_long_doc = original_df_long_doc.copy()
-shown_columns_long_doc = get_default_cols('…
+shown_columns_long_doc = get_default_cols('long-doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
 leaderboard_df_long_doc = leaderboard_df_long_doc[shown_columns_long_doc]
 
 
@@ -62,7 +62,7 @@ def update_metric_long_doc(
     reranking_model: list,
     query: str,
 ):
-    return update_metric(raw_data, …
+    return update_metric(raw_data, "long-doc", metric, domains, langs, reranking_model, query)
 
 
 demo = gr.Blocks(css=custom_css)
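In short, app.py now loads results from a versioned subfolder and uses the hyphenated 'long-doc' task name. A minimal sketch of what ends up being scanned, with EVAL_RESULTS_PATH stubbed so the snippet stands alone (the only layout constraints visible in this diff are the version folder name and the results.json filename filter applied in read_evals.py below):

import os

EVAL_RESULTS_PATH = "./eval_results"  # placeholder; the app imports this from its config

results_root = f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04"  # versioned folder introduced by this commit
for root, dirs, files in os.walk(results_root):
    for file in files:
        if file == "results.json":  # the new loader ignores every other filename
            print(os.path.join(root, file))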
src/benchmarks.py
CHANGED
@@ -40,7 +40,7 @@ dataset_dict = {
         "arxiv": {
             "en": ["Arxiv", ]},
     },
-    "…
+    "long-doc": {
         "arxiv": {
             "en": ["gpt-3", "llama2", "llm-survey", "gemini"],
         },
@@ -125,7 +125,7 @@ for task, domain_dict in dataset_dict.items():
             col_name = benchmark_name
             for metric in dataset_list:
                 qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
-        elif task == "…
+        elif task == "long-doc":
             for dataset in dataset_list:
                 benchmark_name = f"{domain}_{lang}_{dataset}"
                 benchmark_name = get_safe_name(benchmark_name)
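Each long-doc (domain, lang, dataset) triple becomes its own benchmark column, named via get_safe_name as in the loop above. A small standalone sketch of the naming, using a stand-in for the repo's get_safe_name (its exact sanitization rule is an assumption here):

import re

def get_safe_name(name: str) -> str:
    # Stand-in for src.benchmarks.get_safe_name; assumed to map
    # non-identifier characters to underscores.
    return re.sub(r"[^0-9a-zA-Z_]", "_", name)

domain, lang = "arxiv", "en"
for dataset in ["gpt-3", "llama2", "llm-survey", "gemini"]:
    print(get_safe_name(f"{domain}_{lang}_{dataset}"))
# -> arxiv_en_gpt_3, arxiv_en_llama2, arxiv_en_llm_survey, arxiv_en_gemini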
src/display/utils.py
CHANGED
@@ -22,6 +22,8 @@ class ColumnContent:
 COL_NAME_AVG = "Average ⬆️"
 COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
 COL_NAME_RERANKING_MODEL = "Reranking Model"
+COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
+COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
 COL_NAME_RANK = "Rank 🏆"
 
 def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
@@ -34,6 +36,12 @@ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
     auto_eval_column_dict.append(
         ["reranking_model", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True)]
     )
+    auto_eval_column_dict.append(
+        ["retrieval_model_link", ColumnContent, ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", False, hidden=True, never_hidden=False)]
+    )
+    auto_eval_column_dict.append(
+        ["reranking_model_link", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", False, hidden=True, never_hidden=False)]
+    )
     auto_eval_column_dict.append(
         ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
     )
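Both new columns are registered as hidden and excluded from the default view (the False and hidden=True arguments). They pass COL_NAME_RETRIEVAL_MODEL and COL_NAME_RERANKING_MODEL as the display name rather than the newly added *_LINK constants, which is what read_evals.py uses as keys when flattening results (see below). If separate headers for the link columns were intended, the entries would presumably read as in this sketch; that is an assumption about intent, with ColumnContent stubbed so the snippet is self-contained and its field names assumed:

from dataclasses import dataclass

@dataclass
class ColumnContent:  # stand-in for src.display.utils.ColumnContent; field names assumed
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"

auto_eval_column_dict = [
    ["retrieval_model_link", ColumnContent,
     ColumnContent(COL_NAME_RETRIEVAL_MODEL_LINK, "markdown", False, hidden=True)],
    ["reranking_model_link", ColumnContent,
     ColumnContent(COL_NAME_RERANKING_MODEL_LINK, "markdown", False, hidden=True)],
]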
src/leaderboard/read_evals.py
CHANGED
@@ -12,6 +12,8 @@ from src.display.formatting import has_no_nan_values
 from src.display.utils import (
     COL_NAME_RERANKING_MODEL,
     COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_RERANKING_MODEL_LINK,
+    COL_NAME_RETRIEVAL_MODEL_LINK,
     COLS_QA,
     QA_BENCHMARK_COLS,
     COLS_LONG_DOC,
@@ -44,6 +46,8 @@ class FullEvalResult:
     eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]
     retrieval_model: str
     reranking_model: str
+    retrieval_model_link: str
+    reranking_model_link: str
     results: List[EvalResult]  # results on all the EvalResults over different tasks and metrics.
     date: str = ""
@@ -58,10 +62,15 @@ class FullEvalResult:
 
         # store all the results for different metrics and tasks
         result_list = []
+        retrieval_model_link = ""
+        reranking_model_link = ""
         for item in model_data:
             config = item.get("config", {})
             # eval results for different metrics
             results = item.get("results", [])
+            retrieval_model_link=config["retreival_model_link"]
+            if config["reranking_model_link"] is not None:
+                reranking_model_link=""
             eval_result = EvalResult(
                 eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
                 retrieval_model=config["retrieval_model"],
@@ -75,6 +84,8 @@ class FullEvalResult:
             eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
             retrieval_model=result_list[0].retrieval_model,
             reranking_model=result_list[0].reranking_model,
+            retrieval_model_link=retrieval_model_link,
+            reranking_model_link=reranking_model_link,
             results=result_list
         )
 
@@ -91,6 +102,8 @@ class FullEvalResult:
             results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
             results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = self.retrieval_model
             results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = self.reranking_model
+            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
+            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
 
             # print(f'result loaded: {eval_result.eval_name}')
             for result in eval_result.results:
@@ -99,9 +112,9 @@ class FullEvalResult:
                 lang = result["lang"]
                 dataset = result["dataset"]
                 value = result["value"]
-                if …
+                if dataset == 'default':
                     benchmark_name = f"{domain}_{lang}"
-                …
+                else:
                     benchmark_name = f"{domain}_{lang}_{dataset}"
                 results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
         return [v for v in results.values()]
@@ -115,13 +128,12 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
     for root, dirs, files in os.walk(results_path):
         if len(files) == 0:
             continue
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
 
         # select the latest results
         for file in files:
+            if file != "results.json":
+                print(f'skip {file}')
+                continue
             model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
@@ -154,7 +166,7 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
     if task == "qa":
         cols = COLS_QA
         benchmark_cols = QA_BENCHMARK_COLS
-    elif task == "…
+    elif task == "long-doc":
         cols = COLS_LONG_DOC
         benchmark_cols = LONG_DOC_BENCHMARK_COLS
     else:
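For reference, the shape of a per-model results.json as the parser above reads it: the file holds a list of entries (one per metric, since eval_name embeds config['metric']). Field names are inferred from init_from_json_file and to_dict; the concrete values are made-up examples, and note the loader reads the link key spelled 'retreival_model_link' exactly as it appears above:

# Illustrative only: keys inferred from the parser, values invented for the example.
example_results_json = [
    {
        "config": {
            "retrieval_model": "bge-base-en-v1.5",
            "reranking_model": "NoReranker",
            "retreival_model_link": "<model card URL>",  # spelled this way in the loader
            "reranking_model_link": None,
            "metric": "ndcg_at_3",
        },
        "results": [
            # dataset == 'default' -> column name f"{domain}_{lang}"
            {"domain": "wiki", "lang": "en", "dataset": "default", "value": 0.69},
            # any other dataset -> column name f"{domain}_{lang}_{dataset}"
            {"domain": "arxiv", "lang": "en", "dataset": "gpt-3", "value": 0.42},
        ],
    },
]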
tests/src/leaderboard/test_read_evals.py
CHANGED
@@ -28,35 +28,35 @@ def test_to_dict():
 
 
 def test_get_raw_eval_results():
-    results_path = cur_fp.parents[2] / "toydata" / "…
+    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
     results = get_raw_eval_results(results_path)
     # only load the latest results
-    assert len(results) == …
-    assert results[0].eval_name == "bge-…
-    assert len(results[0].results) == …
-    assert results[…
-    assert len(results[1].results) == …
+    assert len(results) == 4
+    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
+    assert len(results[0].results) == 70
+    assert results[0].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
+    assert len(results[1].results) == 70
 
 
 def test_get_leaderboard_df():
-    results_path = cur_fp.parents[2] / "toydata" / "…
+    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
     raw_data = get_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, 'qa', '…
-    assert df.shape[0] == …
+    df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_3')
+    assert df.shape[0] == 4
     # the results contain only one embedding model
-    for i in range(…
-    …
-    # the results contain only two reranking model
-    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    assert df["Reranking Model"][1] == "NoReranker"
-    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
+    # for i in range(4):
+    #     assert df["Retrieval Model"][i] == "bge-m3"
+    # # the results contain only two reranking model
+    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
+    # assert df["Reranking Model"][1] == "NoReranker"
+    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
+    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
 
 
 def test_get_leaderboard_df_long_doc():
     results_path = cur_fp.parents[2] / "toydata" / "test_results"
     raw_data = get_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, '…
+    df = get_leaderboard_df(raw_data, 'long-doc', 'ndcg_at_1')
     assert df.shape[0] == 2
     # the results contain only one embedding model
     for i in range(2):
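The first two tests now resolve their toy fixtures against the new versioned folder, while test_get_leaderboard_df_long_doc still points at the old "test_results" directory. A quick sketch of the paths the tests end up using (pytest is assumed as the runner; cur_fp is stubbed here in place of the test module's own Path(__file__)):

from pathlib import Path

cur_fp = Path("tests/src/leaderboard/test_read_evals.py")  # placeholder for Path(__file__)
new_fixtures = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
old_fixtures = cur_fp.parents[2] / "toydata" / "test_results"  # still used by the long-doc test
print(new_fixtures)  # tests/toydata/eval_results/AIR-Bench_24.04
print(old_fixtures)  # tests/toydata/test_results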
utils.py
CHANGED
@@ -47,7 +47,7 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
 def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
     if task == "qa":
         cols = list(frozenset(COLS_QA).intersection(frozenset(BENCHMARK_COLS_QA)).intersection(frozenset(columns)))
-    elif task == "…
+    elif task == "long-doc":
         cols = list(frozenset(COLS_LONG_DOC).intersection(frozenset(BENCHMARK_COLS_LONG_DOC)).intersection(frozenset(columns)))
     else:
         raise NotImplemented
@@ -68,7 +68,7 @@ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, t
     for c in cols:
         if task == "qa":
             eval_col = BenchmarksQA[c].value
-        elif task == "…
+        elif task == "long-doc":
             eval_col = BenchmarksLongDoc[c].value
         if eval_col.domain not in domain_query:
             continue
@@ -127,7 +127,7 @@ def update_metric(
             reranking_model,
             query
         )
-    elif task == …
+    elif task == "long-doc":
        leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
        return update_table_long_doc(
            leaderboard_df,
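The "long-doc" literal now has to match byte-for-byte across app.py, src/benchmarks.py, src/leaderboard/read_evals.py and utils.py. A possible follow-up, sketched with hypothetical names that are not part of this commit, would be to centralize the task identifiers:

# Hypothetical constants (not defined anywhere in this commit).
TASK_QA = "qa"
TASK_LONG_DOC = "long-doc"

# e.g. get_default_cols(TASK_LONG_DOC, leaderboard_df_long_doc.columns, add_fix_cols=True)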