fix: fix the data loader
- app.py +2 -1
- src/envs.py +2 -2
- src/leaderboard/read_evals.py +8 -3
- src/populate.py +3 -0
- tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json} +0 -0
- tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-12-21T18-10-08.json → results_2023-12-21T18-10-08.json} +0 -0
- tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json} +0 -0
app.py
CHANGED
@@ -42,7 +42,8 @@ def restart_space():
 # restart_space()
 
 raw_data_qa, original_df_qa = get_leaderboard_df(
-    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='
+    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
+print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')
 leaderboard_df = original_df_qa.copy()
 
 # (
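Note: the explicit metric='ndcg_at_3' matters because FullEvalResult.to_dict (changed below in src/leaderboard/read_evals.py) drops every result whose metric or task does not match, so a missing or mistyped metric silently produces an empty leaderboard. A minimal sketch of that filter on made-up records (field values are hypothetical):

    results = [
        {"metric": "ndcg_at_3", "task": "qa", "score": 0.62},
        {"metric": "ndcg_at_10", "task": "qa", "score": 0.70},  # dropped: wrong metric
        {"metric": "ndcg_at_3", "task": "doc", "score": 0.55},  # dropped: wrong task
    ]
    kept = [r for r in results if r["metric"] == "ndcg_at_3" and r["task"] == "qa"]
    print(len(kept))  # 1 -- only matching rows ever reach the dataframe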
src/envs.py
CHANGED
@@ -17,8 +17,8 @@ RESULTS_REPO = f"{OWNER}/results"
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/
-EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/
+EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/toys/toydata/requests"  # os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/toys/toydata/results"  # os.path.join(CACHE_PATH, "eval-results")
 # EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 # EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
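Note: the hard-coded /Users/nanwang/... paths tie the Space to one developer's machine. A portable alternative (a sketch, not part of this commit) would let an environment variable override the cache-relative defaults that are now commented out:

    import os

    CACHE_PATH = os.getenv("HF_HOME", ".")
    # Fall back to the cached repos unless a local toy-data path is exported.
    EVAL_REQUESTS_PATH = os.getenv("EVAL_REQUESTS_PATH", os.path.join(CACHE_PATH, "eval-queue"))
    EVAL_RESULTS_PATH = os.getenv("EVAL_RESULTS_PATH", os.path.join(CACHE_PATH, "eval-results"))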
src/leaderboard/read_evals.py
CHANGED
@@ -62,19 +62,22 @@ class FullEvalResult:
             results=result_list
         )
 
-    def to_dict(self, task='qa', metric='
+    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
         """Convert FullEvalResult to a list of dict compatible with our dataframe UI
         """
         results = defaultdict(dict)
         for eval_result in self.results:
             if eval_result.metric != metric:
+                # print(f'result skipped: {metric} != {eval_result.metric}')
                 continue
             if eval_result.task != task:
+                # print(f'result skipped: {task} != {eval_result.task}')
                 continue
             results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
             results[eval_result.eval_name][AutoEvalColumnQA.retrieval_model.name] = self.retrieval_model
             results[eval_result.eval_name][AutoEvalColumnQA.reranking_model.name] = self.reranking_model
 
+            print(f'result loaded: {eval_result.eval_name}')
             for result in eval_result.results:
                 # add result for each domain, language, and dataset
                 domain = result["domain"]
@@ -136,7 +139,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
@@ -152,9 +155,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
         eval_result.update_with_request_file(requests_path)
         latest_date_str = eval_result.date.replace(":", "-")
         model_result_date_str = model_result_filepath.split('/')[-1
-        ].removeprefix("
+        ].removeprefix("results_").removesuffix(".json")
         if latest_date_str != model_result_date_str:
+            print(f'file skipped: {model_result_filepath}')
             continue
+        print(f'file loaded: {model_result_filepath}')
         eval_name = eval_result.eval_name
         eval_results[eval_name] = eval_result
 
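Note: the test-file renames below (results_demo_* → results_*) match the new removeprefix("results_") calls. A small illustration of both string operations on one of the renamed toy files (requires Python 3.9+ for removeprefix/removesuffix; the eval_result.date value is an assumed example):

    filepath = "tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json"
    fname = filepath.split('/')[-1]

    # sort key used when ordering result files newest-first
    sort_key = fname.removesuffix(".json").removeprefix("results_")[:-7]
    print(sort_key)  # '2023-11-21T1'

    # date comparison used to keep only the latest result file
    model_result_date_str = fname.removeprefix("results_").removesuffix(".json")
    latest_date_str = "2023-11-21T18:10:08".replace(":", "-")  # assumed eval_result.date
    print(model_result_date_str == latest_date_str)  # True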
src/populate.py
CHANGED
@@ -12,11 +12,14 @@ from typing import Tuple
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task: str, metric: str) -> Tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(f"raw_data loaded: {len(raw_data)}")
     all_data_json = []
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
 
+    print(f'records loaded: {len(all_data_json)}')
     df = pd.DataFrame.from_records(all_data_json)
+    print(f'dataframe created: {df.shape}')
     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
     df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
     df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
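Note: for reference, a self-contained sketch of the averaging and sorting step on toy records ('Average' stands in for AutoEvalColumnQA.average.name; the benchmark column names are hypothetical):

    import pandas as pd

    records = [
        {"eval_name": "bge-m3_NoReranker", "wiki_en": 0.62, "news_zh": 0.55},
        {"eval_name": "bge-m3_bge-reranker-v2-m3", "wiki_en": 0.71, "news_zh": 0.60},
    ]
    benchmark_cols = ["wiki_en", "news_zh", "law_fr"]  # 'law_fr' has no results yet

    df = pd.DataFrame.from_records(records)
    # Average only over benchmark columns that actually exist in the dataframe.
    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
    df["Average"] = df[list(_benchmark_cols)].mean(axis=1)
    df = df.sort_values(by=["Average"], ascending=False)
    print(df)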
tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json}
RENAMED
File without changes
tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-12-21T18-10-08.json → results_2023-12-21T18-10-08.json}
RENAMED
File without changes
tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json}
RENAMED
File without changes