Spaces: AIR-Bench

nan committed
Commit 973bd2a
1 Parent(s): bbfe4c1

feat-switch-to-ndcg-for-qa-0607 (#19)


- feat: update the default metric (4aa2126070c952b9ae87eb77e3f6e03f652729a1)
- fix: fix the typo (f3888bb6aba9a4aeca13aae68aa10f48c58d2e29)
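
This change switches the QA leaderboard's default metric from recall_at_10 to ndcg_at_10, while the Long-Doc leaderboard keeps recall_at_10 (see the src/benchmarks.py hunk below). As a rough illustration of the difference between the two metrics, here is a minimal sketch, not the benchmark's own evaluation code: recall@10 only checks whether the relevant documents appear anywhere in the top 10, while nDCG@10 also rewards ranking them closer to the top.

```python
import math

def recall_at_k(ranked_ids, relevant_ids, k=10):
    # fraction of relevant documents that appear in the top-k results
    hits = sum(1 for doc_id in ranked_ids[:k] if doc_id in relevant_ids)
    return hits / len(relevant_ids)

def ndcg_at_k(ranked_ids, relevant_ids, k=10):
    # binary-relevance nDCG: rewards placing relevant documents near the top
    dcg = sum(1.0 / math.log2(rank + 2)
              for rank, doc_id in enumerate(ranked_ids[:k])
              if doc_id in relevant_ids)
    ideal = sum(1.0 / math.log2(rank + 2)
                for rank in range(min(len(relevant_ids), k)))
    return dcg / ideal if ideal else 0.0

ranked = ["d7", "d2", "d9", "d1", "d5"]
relevant = {"d2", "d5"}
print(recall_at_k(ranked, relevant))  # 1.0  (both relevant docs are in the top 10)
print(ndcg_at_k(ranked, relevant))    # ~0.62 (they are not ranked first and second)
```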

Files changed (3)
  1. app.py +5 -5
  2. src/benchmarks.py +2 -1
  3. src/display/gradio_formatting.py +0 -1
app.py CHANGED
@@ -9,7 +9,7 @@ from src.about import (
     EVALUATION_QUEUE_TEXT
 )
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
-    DEFAULT_METRIC
+    DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
 from src.display.css_html_js import custom_css
 from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
@@ -35,9 +35,9 @@ except Exception as e:
 raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
 original_df_qa = get_leaderboard_df(
-    raw_data, task='qa', metric=DEFAULT_METRIC)
+    raw_data, task='qa', metric=DEFAULT_METRIC_QA)
 original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long-doc', metric=DEFAULT_METRIC)
+    raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
 print(f'raw data: {len(raw_data)}')
 print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
@@ -103,7 +103,7 @@ with demo:
         with gr.Row():
             selected_version = get_version_dropdown()
         # select the metric
-        selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
+        selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
         with gr.Row():
             show_anonymous = get_anonymous_checkbox()
         with gr.Row():
@@ -205,7 +205,7 @@ with demo:
             selected_version = get_version_dropdown()
         # select the metric
         with gr.Row():
-            selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
+            selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
         with gr.Row():
             show_anonymous = get_anonymous_checkbox()
         with gr.Row():
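
For context, the diff only touches the call sites; get_leaderboard_df itself is not part of this commit. Below is a purely hypothetical sketch (the Space's real function is not shown here and its signature may differ) of how a metric name such as DEFAULT_METRIC_QA could drive the leaderboard construction:

```python
import pandas as pd

def build_leaderboard(raw_data, task, metric):
    # keep only results for the requested task and surface the requested metric
    rows = [
        {"model": r["model"], metric: r["scores"][metric]}
        for r in raw_data
        if r["task"] == task
    ]
    return pd.DataFrame(rows).sort_values(metric, ascending=False)

# Toy usage mirroring app.py after this commit:
toy = [
    {"model": "m1", "task": "qa", "scores": {"ndcg_at_10": 0.61, "recall_at_10": 0.82}},
    {"model": "m2", "task": "qa", "scores": {"ndcg_at_10": 0.58, "recall_at_10": 0.85}},
]
print(build_leaderboard(toy, task="qa", metric="ndcg_at_10"))
```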
src/benchmarks.py CHANGED
@@ -148,4 +148,5 @@ LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
 DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
 LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
 
-DEFAULT_METRIC = "recall_at_10"
+DEFAULT_METRIC_QA = "ndcg_at_10"
+DEFAULT_METRIC_LONG_DOC = "recall_at_10"
src/display/gradio_formatting.py CHANGED
@@ -49,7 +49,6 @@ def get_metric_dropdown(metric_list, default_metrics):
         value=default_metrics,
         label="Select the metric",
         interactive=True,
-        info="Assuming that LLMs could generate correct answers when the correct context is retrieved, we recommend to use recall_at_k."
     )
 
 
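
The only change to this helper is the removal of the info text recommending recall_at_k. Judging from the keyword arguments visible in the hunk, the surrounding function presumably builds a gr.Dropdown roughly as in the sketch below; this is a hedged reconstruction, not the file's exact contents:

```python
import gradio as gr

def get_metric_dropdown(metric_list, default_metrics):
    # presumed shape of the helper after this commit: the recall_at_k
    # recommendation in `info=` is gone, the remaining kwargs are unchanged
    return gr.Dropdown(
        choices=metric_list,
        value=default_metrics,
        label="Select the metric",
        interactive=True,
    )
```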