feat: improve the layout
Files changed:
- app.py +39 -37
- src/benchmarks.py +4 -2
- utils.py +3 -3
app.py
CHANGED
@@ -13,7 +13,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
 
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols, submit_results
-from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC,
+from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, DEFAULT_METRIC
 from src.display.utils import TYPES_QA, TYPES_LONG_DOC
 
 
@@ -31,9 +31,9 @@ except Exception:
 raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
 original_df_qa = get_leaderboard_df(
-    raw_data, task='qa', metric=
+    raw_data, task='qa', metric=DEFAULT_METRIC)
 original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long-doc', metric=
+    raw_data, task='long-doc', metric=DEFAULT_METRIC)
 print(f'raw data: {len(raw_data)}')
 print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
@@ -75,22 +75,33 @@ with demo:
         with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
-                    # search
+                    # search retrieval models
                     with gr.Row():
                         search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for
+                            placeholder=" 🔍 Search for retrieval models (separate multiple queries with `;`) and press ENTER...",
                             show_label=False,
                             elem_id="search-bar",
+                            info="Search the retrieval models"
                         )
+                    # select reranking model
+                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
+                    with gr.Row():
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select",
+                            interactive=True
+                        )
+                with gr.Column(min_width=320):
                     # select the metric
                     selected_metric = gr.Dropdown(
-                        choices=
-                        value=
+                        choices=METRIC_LIST,
+                        value=DEFAULT_METRIC,
                         label="Select the metric",
                         interactive=True,
                         elem_id="metric-select",
                     )
-                with gr.Column(min_width=320):
                     # select domain
                     with gr.Row():
                         selected_domains = gr.CheckboxGroup(
@@ -110,16 +121,6 @@ with demo:
                             multiselect=True,
                             interactive=True
                         )
-                    # select reranking model
-                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
-                    with gr.Row():
-                        selected_rerankings = gr.CheckboxGroup(
-                            choices=reranking_models,
-                            value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select",
-                            interactive=True
-                        )
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df_qa,
@@ -187,19 +188,30 @@ with demo:
                 with gr.Column():
                     with gr.Row():
                         search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for
+                            placeholder=" 🔍 Search for retrieval models (separate multiple queries with `;`) and press ENTER...",
                             show_label=False,
                             elem_id="search-bar-long-doc",
                         )
-
-
-
-
-
-
-
-
+                    # select reranking model
+                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
+                    with gr.Row():
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select-long-doc",
+                            interactive=True
+                        )
                 with gr.Column(min_width=320):
+                    # select the metric
+                    with gr.Row():
+                        selected_metric = gr.Dropdown(
+                            choices=METRIC_LIST,
+                            value=DEFAULT_METRIC,
+                            label="Select the metric",
+                            interactive=True,
+                            elem_id="metric-select-long-doc",
+                        )
                     # select domain
                     with gr.Row():
                         selected_domains = gr.CheckboxGroup(
@@ -219,16 +231,6 @@ with demo:
                             multiselect=True,
                             interactive=True
                         )
-                    # select reranking model
-                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
-                    with gr.Row():
-                        selected_rerankings = gr.CheckboxGroup(
-                            choices=reranking_models,
-                            value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select-long-doc",
-                            interactive=True
-                        )
 
             leaderboard_table_long_doc = gr.components.Dataframe(
                 value=leaderboard_df_long_doc,
src/benchmarks.py
CHANGED
@@ -70,7 +70,7 @@ dataset_dict = {
     }
 }
 
-
+METRIC_LIST = [
     "ndcg_at_1",
     "ndcg_at_3",
     "ndcg_at_5",
@@ -130,7 +130,7 @@ for task, domain_dict in dataset_dict.items():
            benchmark_name = f"{domain}_{lang}_{dataset}"
            benchmark_name = get_safe_name(benchmark_name)
            col_name = benchmark_name
-           for metric in
+           for metric in METRIC_LIST:
               long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain,
                                                                   lang, task)
 
@@ -145,3 +145,5 @@ LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
 
 DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
 LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
+
+DEFAULT_METRIC = "ndcg_at_10"
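For readers without src/benchmarks.py open, the hunks above are easier to follow with the surrounding loop in mind. The sketch below re-creates that shape with toy data; the Benchmark class, get_safe_name helper, and dataset names are assumptions, since only the changed lines appear in the diff.

# Illustrative re-creation only; the real Benchmark and get_safe_name
# definitions in the repository may differ.
import re
from dataclasses import dataclass

METRIC_LIST = ["ndcg_at_1", "ndcg_at_3", "ndcg_at_5", "ndcg_at_10"]
DEFAULT_METRIC = "ndcg_at_10"

@dataclass
class Benchmark:
    name: str      # safe benchmark name, also used as the dictionary key
    metric: str    # retrieval metric recorded for this entry
    col_name: str  # column shown in the leaderboard table
    domain: str
    lang: str
    task: str

def get_safe_name(name: str) -> str:
    # Assumed behaviour: lower-case and replace non-alphanumerics with "_".
    return re.sub(r"[^0-9a-zA-Z]", "_", name).lower()

# Toy stand-in for the real dataset_dict defined earlier in the file.
dataset_dict = {"long-doc": {"toy_domain": {"en": ["toy_dataset_v1"]}}}

long_doc_benchmark_dict = {}
for task, domain_dict in dataset_dict.items():
    for domain, lang_dict in domain_dict.items():
        for lang, datasets in lang_dict.items():
            for dataset in datasets:
                benchmark_name = get_safe_name(f"{domain}_{lang}_{dataset}")
                col_name = benchmark_name
                for metric in METRIC_LIST:
                    long_doc_benchmark_dict[benchmark_name] = Benchmark(
                        benchmark_name, metric, col_name, domain, lang, task)

DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
print(DOMAIN_COLS_LONG_DOC)  # ['toy_domain']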
utils.py
CHANGED
@@ -33,8 +33,8 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
     filtered_df = pd.concat(final_df)
     filtered_df = filtered_df.drop_duplicates(
         subset=[
-
-
+            COL_NAME_RETRIEVAL_MODEL,
+            COL_NAME_RERANKING_MODEL,
         ]
     )
 
@@ -42,7 +42,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
 
 
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[
+    return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]
 
 
 def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
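Taken together, the two touched helpers define the search behaviour behind the new search-bar placeholder: search_table is a case-insensitive substring match on the retrieval-model column, and filter_queries appears to run one such match per `;`-separated term before dropping duplicate retrieval/reranking pairs. Below is a small standalone sketch of that behaviour, with placeholder column names standing in for the COL_NAME_* constants.

# Standalone sketch; the column names are placeholders for the COL_NAME_*
# constants defined elsewhere in the repository, and the query-splitting
# logic is assumed from the search-bar placeholder text.
import pandas as pd

RETRIEVAL_COL = "Retrieval Model"
RERANKING_COL = "Reranking Model"

df = pd.DataFrame({
    RETRIEVAL_COL: ["bge-m3", "e5-mistral-7b", "jina-embeddings-v2"],
    RERANKING_COL: ["NoReranker", "NoReranker", "reranker-x"],
    "ndcg_at_10": [0.51, 0.49, 0.47],
})

def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Case-insensitive substring match on the retrieval-model column.
    return df[df[RETRIEVAL_COL].str.contains(query, case=False)]

def filter_queries(query: str, df: pd.DataFrame) -> pd.DataFrame:
    # One search per ';'-separated term, then drop duplicate model pairs.
    frames = [search_table(df, q.strip()) for q in query.split(";") if q.strip()]
    if not frames:
        return df
    return pd.concat(frames).drop_duplicates(subset=[RETRIEVAL_COL, RERANKING_COL])

print(filter_queries("BGE; jina", df))  # keeps bge-m3 and jina-embeddings-v2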