feat-switch-to-ndcg-for-qa-0607 (#19)

This change introduces per-task default leaderboard metrics (nDCG@10 for QA, Recall@10 for Long-Doc) and removes the dropdown hint that recommended recall_at_k.

Commits:
- feat: update the default metric (4aa2126070c952b9ae87eb77e3f6e03f652729a1)
- fix: fix the typo (f3888bb6aba9a4aeca13aae68aa10f48c58d2e29)
Files changed:
- app.py (+5, -5)
- src/benchmarks.py (+2, -1)
- src/display/gradio_formatting.py (+0, -1)
app.py CHANGED

@@ -9,7 +9,7 @@ from src.about import (
     EVALUATION_QUEUE_TEXT
 )
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
-
+    DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
 from src.display.css_html_js import custom_css
 from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
@@ -35,9 +35,9 @@ except Exception as e:
 raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
 original_df_qa = get_leaderboard_df(
-    raw_data, task='qa', metric=
+    raw_data, task='qa', metric=DEFAULT_METRIC_QA)
 original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long-doc', metric=
+    raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
 print(f'raw data: {len(raw_data)}')
 print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
@@ -103,7 +103,7 @@ with demo:
         with gr.Row():
             selected_version = get_version_dropdown()
             # select the metric
-            selected_metric = get_metric_dropdown(METRIC_LIST,
+            selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
         with gr.Row():
             show_anonymous = get_anonymous_checkbox()
         with gr.Row():
@@ -205,7 +205,7 @@ with demo:
             selected_version = get_version_dropdown()
             # select the metric
         with gr.Row():
-            selected_metric = get_metric_dropdown(METRIC_LIST,
+            selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
         with gr.Row():
             show_anonymous = get_anonymous_checkbox()
         with gr.Row():
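Note: get_leaderboard_df itself is not part of this diff; only its metric argument changes. As a rough, hypothetical illustration of what such an argument typically controls in a leaderboard app (the attribute and column names below are invented for illustration and are not taken from the AIR-Bench code), a function of this shape might keep the results for one task, pull out the requested metric per dataset, and rank models by their mean score:

import pandas as pd

def get_leaderboard_df(raw_data, task, metric):
    # Hypothetical sketch: filter to the requested task, extract the selected
    # metric per dataset, and rank models by their average score.
    rows = [
        {"model": r.model_name, "dataset": r.dataset, "score": r.scores[metric]}
        for r in raw_data
        if r.task == task and metric in r.scores
    ]
    table = pd.DataFrame(rows).pivot_table(index="model", columns="dataset", values="score")
    table["Average"] = table.mean(axis=1)
    return table.sort_values("Average", ascending=False).reset_index()

Under that reading, changing the default metric only changes which score column populates and orders the table on first load.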
src/benchmarks.py CHANGED

@@ -148,4 +148,5 @@ LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
 DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
 LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
 
-
+DEFAULT_METRIC_QA = "ndcg_at_10"
+DEFAULT_METRIC_LONG_DOC = "recall_at_10"
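For reference, the two defaults name standard retrieval metrics. A minimal, self-contained sketch of how they are usually computed for a single query with binary relevance labels (illustrative only, not code from the benchmark):

import math

def recall_at_k(ranked_ids, relevant_ids, k=10):
    # Fraction of the relevant documents that appear in the top-k results.
    if not relevant_ids:
        return 0.0
    hits = sum(1 for doc_id in ranked_ids[:k] if doc_id in relevant_ids)
    return hits / len(relevant_ids)

def ndcg_at_k(ranked_ids, relevant_ids, k=10):
    # nDCG@k with binary gains: relevant documents count more the higher they rank.
    dcg = sum(
        1.0 / math.log2(rank + 2)  # 0-based rank: position i contributes 1 / log2(i + 1)
        for rank, doc_id in enumerate(ranked_ids[:k])
        if doc_id in relevant_ids
    )
    ideal_hits = min(len(relevant_ids), k)
    idcg = sum(1.0 / math.log2(rank + 2) for rank in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0

Both scores lie in [0, 1]. Recall@10 ignores ordering inside the top 10, while nDCG@10 discounts relevant hits that appear lower in the ranking, which is the usual reason to prefer it as a headline metric for QA-style retrieval.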
src/display/gradio_formatting.py CHANGED

@@ -49,7 +49,6 @@ def get_metric_dropdown(metric_list, default_metrics):
         value=default_metrics,
         label="Select the metric",
         interactive=True,
-        info="Assuming that LLMs could generate correct answers when the correct context is retrieved, we recommend to use recall_at_k."
     )
 
 
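The removed info string only affects the dropdown's helper caption. For context, a minimal version of this helper consistent with the keyword arguments shown above might look as follows (the choices keyword and the function body are assumptions; only value, label, and interactive appear in the diff):

import gradio as gr

def get_metric_dropdown(metric_list, default_metrics):
    # Dropdown listing the available metrics, pre-selecting the task's default.
    return gr.Dropdown(
        choices=metric_list,
        value=default_metrics,
        label="Select the metric",
        interactive=True,
    )

With the recall_at_k recommendation gone from the UI, the suggested default now comes entirely from DEFAULT_METRIC_QA and DEFAULT_METRIC_LONG_DOC passed in from app.py.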