refactor: reformat
- app.py +28 -26
- src/loaders.py +6 -9
- src/models.py +11 -4
- src/utils.py +13 -9
- tests/src/display/test_utils.py +0 -1
- tests/test_utils.py +9 -2
app.py
CHANGED
@@ -6,6 +6,7 @@ from huggingface_hub import snapshot_download
 
 from src.about import BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
 from src.benchmarks import LongDocBenchmarks, QABenchmarks
+from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
 from src.components import (
     get_anonymous_checkbox,
     get_domain_dropdown,
@@ -31,7 +32,6 @@ from src.envs import (
     RESULTS_REPO,
     TOKEN,
 )
-from src.columns import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL
 from src.loaders import load_eval_results
 from src.models import TaskType, model_hyperlink
 from src.utils import remove_html, reset_rank, set_listeners, submit_results, update_metric, upload_file
@@ -81,7 +81,7 @@ def update_qa_metric(
         reranking_model,
         query,
         show_anonymous,
-        show_revision_and_timestamp
+        show_revision_and_timestamp,
     )
 
 
@@ -173,7 +173,9 @@ with demo:
                     # shown_table
                     qa_df_elem_ret_rerank = get_leaderboard_table(datastore.qa_fmt_df, datastore.qa_types)
                     # Dummy leaderboard for handling the case when the user uses backspace key
-                    qa_df_elem_ret_rerank_hidden = get_leaderboard_table(datastore.qa_raw_df, datastore.qa_types, visible=False)
+                    qa_df_elem_ret_rerank_hidden = get_leaderboard_table(
+                        datastore.qa_raw_df, datastore.qa_types, visible=False
+                    )
 
                     version.change(
                         update_qa_version,
@@ -214,14 +216,24 @@ with demo:
                     qa_df_elem_ret = get_leaderboard_table(_qa_df_ret, datastore.qa_types)
 
                     # Dummy leaderboard for handling the case when the user uses backspace key
-                    _qa_df_ret_hidden = datastore.qa_raw_df[datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                    _qa_df_ret_hidden = datastore.qa_raw_df[
+                        datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                    ]
                     _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
-                    qa_df_elem_ret_hidden = get_leaderboard_table(_qa_df_ret_hidden, datastore.qa_types, visible=False)
+                    qa_df_elem_ret_hidden = get_leaderboard_table(
+                        _qa_df_ret_hidden, datastore.qa_types, visible=False
+                    )
 
                     version.change(
                         update_qa_version,
                         version,
-                        [domains, langs, models_ret, qa_df_elem_ret, qa_df_elem_ret_hidden],
+                        [
+                            domains,
+                            langs,
+                            models_ret,
+                            qa_df_elem_ret,
+                            qa_df_elem_ret_hidden,
+                        ],
                     )
 
                     set_listeners(
@@ -253,13 +265,9 @@ with demo:
                     )
 
                 with gr.TabItem("Reranking Only", id=12):
-                    _qa_df_rerank = datastore.qa_fmt_df[
-                        datastore.qa_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-                    ]
+                    _qa_df_rerank = datastore.qa_fmt_df[datastore.qa_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                     _qa_df_rerank = reset_rank(_qa_df_rerank)
-                    qa_rerank_models = (
-                        _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
-                    )
+                    qa_rerank_models = _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                     with gr.Row():
                         with gr.Column(scale=1):
                             qa_models_rerank = get_reranking_dropdown(qa_rerank_models)
@@ -269,7 +277,7 @@ with demo:
 
                     _qa_df_rerank_hidden = datastore.qa_raw_df[
                         datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-                        ]
+                    ]
                     _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
                     qa_df_elem_rerank_hidden = get_leaderboard_table(
                         _qa_df_rerank_hidden, datastore.qa_types, visible=False
@@ -333,9 +341,7 @@ with demo:
                 with gr.Column():
                     models = get_reranking_dropdown(datastore.reranking_models)
 
-                doc_df_elem_ret_rerank = get_leaderboard_table(
-                    datastore.doc_fmt_df, datastore.doc_types
-                )
+                doc_df_elem_ret_rerank = get_leaderboard_table(datastore.doc_fmt_df, datastore.doc_types)
 
                 # Dummy leaderboard for handling the case when the user uses backspace key
                 doc_df_elem_ret_rerank_hidden = get_leaderboard_table(
@@ -385,15 +391,13 @@ with demo:
 
                     _doc_df_ret = datastore.doc_fmt_df[
                         datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
-                        ]
+                    ]
                     _doc_df_ret = reset_rank(_doc_df_ret)
-                    doc_df_elem_ret = get_leaderboard_table(
-                        _doc_df_ret, datastore.doc_types
-                    )
+                    doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)
 
                     _doc_df_ret_hidden = datastore.doc_raw_df[
                         datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
-                        ]
+                    ]
                     _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
                     doc_df_elem_ret_hidden = get_leaderboard_table(
                         _doc_df_ret_hidden, datastore.doc_types, visible=False
@@ -435,22 +439,20 @@ with demo:
                 with gr.TabItem("Reranking Only", id=22):
                     _doc_df_rerank = datastore.doc_fmt_df[
                         datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-                        ]
+                    ]
                     _doc_df_rerank = reset_rank(_doc_df_rerank)
                     doc_rerank_models = (
                         _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                     )
                     with gr.Row():
                         with gr.Column(scale=1):
-                            doc_models_rerank = get_reranking_dropdown(
-                                doc_rerank_models
-                            )
+                            doc_models_rerank = get_reranking_dropdown(doc_rerank_models)
                         with gr.Column(scale=1):
                             doc_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
                     doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
                     _doc_df_rerank_hidden = datastore.doc_raw_df[
                         datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
-                        ]
+                    ]
                     _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
                     doc_df_elem_rerank_hidden = get_leaderboard_table(
                         _doc_df_rerank_hidden, datastore.doc_types, visible=False
src/loaders.py
CHANGED
@@ -1,14 +1,10 @@
 import os.path
-from typing import
+from typing import Dict, List
 
 import pandas as pd
 
-from src.envs import (
-    BENCHMARK_VERSION_LIST,
-    DEFAULT_METRIC_LONG_DOC,
-    DEFAULT_METRIC_QA,
-)
-from src.columns import COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
+from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
+from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
 from src.models import FullEvalResult, LeaderboardDataStore, TaskType
 from src.utils import get_default_cols, get_leaderboard_df
 
@@ -80,8 +76,9 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
     datastore.doc_fmt_df = datastore.doc_fmt_df[~datastore.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
     datastore.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
 
-    datastore.reranking_models = sorted(list(frozenset([eval_result.reranking_model
-                                                        for eval_result in datastore.raw_data])))
+    datastore.reranking_models = sorted(
+        list(frozenset([eval_result.reranking_model for eval_result in datastore.raw_data]))
+    )
     return datastore
 
 
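
The rewrapped assignment above deduplicates reranker names across all loaded results before they feed the dropdowns. A self-contained sketch of the same frozenset-then-sorted idiom, using a stand-in dataclass in place of FullEvalResult:

from dataclasses import dataclass
from typing import List


@dataclass
class EvalResultStub:
    # Stand-in for src.models.FullEvalResult; only the field used here.
    reranking_model: str


raw_data: List[EvalResultStub] = [
    EvalResultStub("NoReranker"),
    EvalResultStub("bge-reranker-v2-m3"),
    EvalResultStub("NoReranker"),
]

# Same idiom as the loader: frozenset drops duplicates, sorted() gives the
# dropdown a stable order.
reranking_models = sorted(frozenset(r.reranking_model for r in raw_data))
print(reranking_models)  # ['NoReranker', 'bge-reranker-v2-m3']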
src/models.py
CHANGED
@@ -1,14 +1,20 @@
 import json
-from enum import Enum
-
 from collections import defaultdict
 from dataclasses import dataclass
+from enum import Enum
 from typing import List, Optional
 
 import pandas as pd
 
-from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_RERANKING_MODEL, COL_NAME_RERANKING_MODEL_LINK, \
-    COL_NAME_RETRIEVAL_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP
+from src.columns import (
+    COL_NAME_IS_ANONYMOUS,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RERANKING_MODEL_LINK,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_RETRIEVAL_MODEL_LINK,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+)
 
 
 def get_safe_name(name: str):
@@ -16,6 +22,7 @@ def get_safe_name(name: str):
     name = name.replace("-", "_")
     return "".join(character.lower() for character in name if (character.isalnum() or character == "_"))
 
+
 @dataclass
 class EvalResult:
     """
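
The context lines above show get_safe_name in full, so its behavior can be checked directly; a quick usage example (the sample model name is made up):

def get_safe_name(name: str):
    # Body as shown in the context lines above.
    name = name.replace("-", "_")
    return "".join(character.lower() for character in name if (character.isalnum() or character == "_"))


# Slashes are dropped, dashes become underscores, everything is lowercased.
assert get_safe_name("BAAI/bge-reranker-v2-m3") == "baaibge_reranker_v2_m3"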
src/utils.py
CHANGED
@@ -6,16 +6,20 @@ from pathlib import Path
 
 import pandas as pd
 
-from src.models import TaskType
 from src.benchmarks import LongDocBenchmarks, QABenchmarks
-from src.columns import (
-    COL_NAME_IS_ANONYMOUS,
-    ...
+from src.columns import (
+    COL_NAME_AVG,
+    COL_NAME_IS_ANONYMOUS,
+    COL_NAME_RANK,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+    get_default_col_names_and_types,
+    get_fixed_col_names_and_types,
 )
+from src.envs import API, LATEST_BENCHMARK_VERSION, SEARCH_RESULTS_REPO
+from src.models import TaskType
 
 
 def calculate_mean(row):
@@ -200,7 +204,7 @@ def update_metric(
     elif task == TaskType.long_doc:
         update_func = update_doc_df_elem
     else:
-        raise
+        raise NotImplementedError
    df_elem = get_leaderboard_df(datastore, task=task, metric=metric)
     version = datastore.version
     return update_func(
tests/src/display/test_utils.py
CHANGED
@@ -1,4 +1,3 @@
-
 from src.display.utils import (
     COLS_LONG_DOC,
     COLS_QA,
tests/test_utils.py
CHANGED
@@ -2,8 +2,15 @@ import pandas as pd
 import pytest
 
 from app import update_table
-from src.columns import COL_NAME_AVG, COL_NAME_IS_ANONYMOUS, COL_NAME_RANK, COL_NAME_RERANKING_MODEL, \
-    COL_NAME_RETRIEVAL_MODEL, COL_NAME_REVISION, COL_NAME_TIMESTAMP
+from src.columns import (
+    COL_NAME_AVG,
+    COL_NAME_IS_ANONYMOUS,
+    COL_NAME_RANK,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_REVISION,
+    COL_NAME_TIMESTAMP,
+)
 from src.utils import (
     filter_models,
     filter_queries,