Spaces: AIR-Bench

nan committed
Commit 32ebf18 • 1 Parent(s): b33239d

feat: improve the layout

Files changed (3)
  1. app.py +39 -37
  2. src/benchmarks.py +4 -2
  3. utils.py +3 -3
app.py CHANGED
@@ -13,7 +13,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
 
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols, submit_results
-from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
+from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, DEFAULT_METRIC
 from src.display.utils import TYPES_QA, TYPES_LONG_DOC
 
 
@@ -31,9 +31,9 @@ except Exception:
 raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
 original_df_qa = get_leaderboard_df(
-    raw_data, task='qa', metric='ndcg_at_10')
+    raw_data, task='qa', metric=DEFAULT_METRIC)
 original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long-doc', metric='ndcg_at_10')
+    raw_data, task='long-doc', metric=DEFAULT_METRIC)
 print(f'raw data: {len(raw_data)}')
 print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
@@ -75,22 +75,33 @@ with demo:
         with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
-                    # search bar for model name
+                    # search retrieval models
                     with gr.Row():
                         search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            placeholder=" 🔍 Search for retrieval models (separate multiple queries with `;`) and press ENTER...",
                             show_label=False,
                             elem_id="search-bar",
+                            info="Search the retrieval models"
                         )
+                    # select reranking model
+                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
+                    with gr.Row():
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select",
+                            interactive=True
+                        )
+                with gr.Column(min_width=320):
                     # select the metric
                     selected_metric = gr.Dropdown(
-                        choices=metric_list,
-                        value=metric_list[1],
+                        choices=METRIC_LIST,
+                        value=DEFAULT_METRIC,
                         label="Select the metric",
                         interactive=True,
                         elem_id="metric-select",
                     )
-                with gr.Column(min_width=320):
                     # select domain
                     with gr.Row():
                         selected_domains = gr.CheckboxGroup(
@@ -110,16 +121,6 @@ with demo:
                            multiselect=True,
                            interactive=True
                        )
-                    # select reranking model
-                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
-                    with gr.Row():
-                        selected_rerankings = gr.CheckboxGroup(
-                            choices=reranking_models,
-                            value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select",
-                            interactive=True
-                        )
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df_qa,
@@ -187,19 +188,30 @@ with demo:
                with gr.Column():
                     with gr.Row():
                         search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            placeholder=" 🔍 Search for retrieval models (separate multiple queries with `;`) and press ENTER...",
                             show_label=False,
                             elem_id="search-bar-long-doc",
                         )
-                    # select the metric
-                    selected_metric = gr.Dropdown(
-                        choices=metric_list,
-                        value=metric_list[1],
-                        label="Select the metric",
-                        interactive=True,
-                        elem_id="metric-select-long-doc",
-                    )
+                    # select reranking model
+                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
+                    with gr.Row():
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select-long-doc",
+                            interactive=True
+                        )
                 with gr.Column(min_width=320):
+                    # select the metric
+                    with gr.Row():
+                        selected_metric = gr.Dropdown(
+                            choices=METRIC_LIST,
+                            value=DEFAULT_METRIC,
+                            label="Select the metric",
+                            interactive=True,
+                            elem_id="metric-select-long-doc",
+                        )
                     # select domain
                     with gr.Row():
                         selected_domains = gr.CheckboxGroup(
@@ -219,16 +231,6 @@ with demo:
                            multiselect=True,
                            interactive=True
                        )
-                    # select reranking model
-                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
-                    with gr.Row():
-                        selected_rerankings = gr.CheckboxGroup(
-                            choices=reranking_models,
-                            value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select-long-doc",
-                            interactive=True
-                        )
 
             leaderboard_table_long_doc = gr.components.Dataframe(
                 value=leaderboard_df_long_doc,
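For readers skimming the diff: after this change, each benchmark tab places the search bar and the reranking-model filter in the first column and the metric dropdown plus the domain/language filters in the second. Below is a minimal, self-contained sketch of that two-column Gradio layout; the sample dataframe, model names, and metric list are illustrative and are not taken from the repository.

# Minimal sketch of the two-column filter layout used in each tab.
# All data here is illustrative; the real app builds its dataframes
# from the AIR-Bench evaluation results.
import gradio as gr
import pandas as pd

demo_df = pd.DataFrame({
    "Retrieval Model": ["model-a", "model-b"],          # illustrative names
    "Reranking Model": ["NoReranker", "reranker-x"],    # illustrative names
    "Average": [52.3, 54.1],
})

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # search bar for retrieval models
            search_bar = gr.Textbox(
                placeholder="🔍 Search for retrieval models...",
                show_label=False,
            )
            # reranking-model filter lives next to the search bar
            selected_rerankings = gr.CheckboxGroup(
                choices=sorted(demo_df["Reranking Model"].unique()),
                value=sorted(demo_df["Reranking Model"].unique()),
                label="Select the reranking models",
                interactive=True,
            )
        with gr.Column(min_width=320):
            # metric selector sits in the second column, above the
            # domain/language filters in the real app
            selected_metric = gr.Dropdown(
                choices=["ndcg_at_1", "ndcg_at_3", "ndcg_at_10"],
                value="ndcg_at_10",
                label="Select the metric",
                interactive=True,
            )
    leaderboard_table = gr.components.Dataframe(value=demo_df)

if __name__ == "__main__":
    demo.launch()

In the actual app these controls are presumably wired to the imported update_metric / update_table / update_table_long_doc callbacks, which recompute the displayed dataframe; those callbacks are not shown in this diff.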
src/benchmarks.py CHANGED
@@ -70,7 +70,7 @@ dataset_dict = {
     }
 }
 
-metric_list = [
+METRIC_LIST = [
     "ndcg_at_1",
     "ndcg_at_3",
     "ndcg_at_5",
@@ -130,7 +130,7 @@ for task, domain_dict in dataset_dict.items():
                benchmark_name = f"{domain}_{lang}_{dataset}"
                benchmark_name = get_safe_name(benchmark_name)
                col_name = benchmark_name
-                for metric in metric_list:
+                for metric in METRIC_LIST:
                    long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain,
                                                                        lang, task)
 
@@ -145,3 +145,5 @@ LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
 
 DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
 LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
+
+DEFAULT_METRIC = "ndcg_at_10"
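The rename from metric_list to METRIC_LIST, together with the new DEFAULT_METRIC constant, lets app.py load both leaderboards with one shared default instead of hard-coding "ndcg_at_10" in several call sites. A small hypothetical sketch of how such constants are typically consumed; resolve_metric does not exist in the repository, and the metric list here is abbreviated (the diff truncates it after ndcg_at_5).

# Hypothetical consumer of the constants introduced above.
# METRIC_LIST / DEFAULT_METRIC mirror the diff; resolve_metric is illustrative.
METRIC_LIST = [
    "ndcg_at_1",
    "ndcg_at_3",
    "ndcg_at_5",
    "ndcg_at_10",  # assumed present, since DEFAULT_METRIC points at it
]
DEFAULT_METRIC = "ndcg_at_10"


def resolve_metric(metric: str = DEFAULT_METRIC) -> str:
    """Fall back to the shared default and reject unknown metric names."""
    if metric not in METRIC_LIST:
        raise ValueError(f"Unknown metric {metric!r}; expected one of {METRIC_LIST}")
    return metric


print(resolve_metric())             # ndcg_at_10
print(resolve_metric("ndcg_at_3"))  # ndcg_at_3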
utils.py CHANGED
@@ -33,8 +33,8 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
        filtered_df = pd.concat(final_df)
        filtered_df = filtered_df.drop_duplicates(
            subset=[
-                AutoEvalColumnQA.retrieval_model.name,
-                AutoEvalColumnQA.reranking_model.name,
+                COL_NAME_RETRIEVAL_MODEL,
+                COL_NAME_RERANKING_MODEL,
            ]
        )
 
@@ -42,7 +42,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
 
 
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumnQA.retrieval_model.name].str.contains(query, case=False))]
+    return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]
 
 
 def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
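utils.py switches from AutoEvalColumnQA attribute lookups to shared column-name constants, so the same search and de-duplication helpers can serve both the QA and Long-Doc tables. The sketch below shows that pattern end to end; the constant values and the sample dataframe are assumptions (the diff only shows the names COL_NAME_RETRIEVAL_MODEL and COL_NAME_RERANKING_MODEL), and the `;` splitting follows the search-bar placeholder rather than code visible in this commit.

# Sketch of the constant-based filtering helpers; values below are assumed.
import pandas as pd

COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"   # assumed column label
COL_NAME_RERANKING_MODEL = "Reranking Model"   # assumed column label


def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Case-insensitive substring match on the retrieval-model column.
    return df[df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False)]


def filter_queries(query: str, df: pd.DataFrame) -> pd.DataFrame:
    # Accept several `;`-separated queries, then de-duplicate on the
    # (retrieval model, reranking model) pair, mirroring the drop_duplicates
    # call in the diff above.
    frames = [search_table(df, q.strip()) for q in query.split(";") if q.strip()]
    if not frames:
        return df
    return pd.concat(frames).drop_duplicates(
        subset=[COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL]
    )


sample = pd.DataFrame({
    COL_NAME_RETRIEVAL_MODEL: ["model-a", "model-b", "model-a"],
    COL_NAME_RERANKING_MODEL: ["NoReranker", "NoReranker", "reranker-x"],
})
print(filter_queries("model-a; model-b", sample))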