XufengDuan committed
Commit 39125ad
Parent: 0e2fd0d

updated scripts

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -9,10 +9,11 @@ import src.display.utils as utils
 import src.envs as envs
 import src.populate as populate
 import src.submission.submit as submit
-
-
+import os
+TOKEN = os.environ.get("HF_TOKEN", None)
+print("TOKEN", TOKEN)
 def restart_space():
-    envs.API.restart_space(repo_id=envs.REPO_ID, token=envs.TOKEN)
+    envs.API.restart_space(repo_id=envs.REPO_ID, token=TOKEN)
 
 try:
     print(envs.EVAL_REQUESTS_PATH)
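For orientation (an illustrative sketch, not the repository's code): the added lines read the Hub token from the `HF_TOKEN` environment variable and pass it to the Space-restart call instead of `envs.TOKEN`. A minimal standalone version of that pattern, with a hypothetical Space ID standing in for `envs.REPO_ID`:

```python
import os

from huggingface_hub import HfApi

# Sketch only: read the token from the environment, as the new app.py does,
# and use it to restart a hypothetical Space. envs.REPO_ID plays this role
# in the real code.
TOKEN = os.environ.get("HF_TOKEN", None)
api = HfApi()

def restart_space(repo_id: str = "user/humanlike-leaderboard") -> None:
    # token=None falls back to locally cached credentials, if any
    api.restart_space(repo_id=repo_id, token=TOKEN)
```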
main_backend.py CHANGED
@@ -111,7 +111,7 @@ def main():
     parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
 
     # Optional arguments
-    parser.add_argument("--reproduce", type=bool, default=True, help="Reproduce the evaluation results")
+    parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
     parser.add_argument("--model", type=str, default=None, help="Your Model ID")
     parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
     parser.add_argument("--publish", type=bool, default=False, help="whether directly publish the evaluation results on HF")
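A note on the hunk above (illustrative sketch, not the repository's code): with `type=bool`, argparse converts the raw string via `bool()`, so any non-empty value, including `--reproduce False`, parses as `True`; only the default applies when the flag is omitted. The more common pattern for on/off switches looks like this:

```python
import argparse

# Sketch only: boolean switches are usually declared with store_true actions
# rather than type=bool, because bool("False") evaluates to True.
parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
parser.add_argument("--reproduce", action="store_true", help="Reproduce the evaluation results")
parser.add_argument("--publish", action="store_true", help="Directly publish the evaluation results on HF")

args = parser.parse_args(["--reproduce"])
print(args.reproduce, args.publish)  # -> True False
```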
src/.DS_Store CHANGED
Binary files a/src/.DS_Store and b/src/.DS_Store differ
 
src/Makefile DELETED
@@ -1,13 +0,0 @@
-.PHONY: style format
-
-
-style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
-
-
-quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .
src/README.md DELETED
@@ -1,47 +0,0 @@
----
-title: Humanlike Evaluation Leaderboard
-emoji: 🥇
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.37.1
-app_file: app.py
-pinned: true
-license: apache-2.0
-tags:
-  - leaderboard
-models:
-  - google/gemma-2-9b
----
-
-
-python>3.10
-pip spacy
-python -m spacy download en_core_web_sm
-pip install google.generativeai
-python -m spacy download en_core_web_trf
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
-Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
-
-Results files should have the following format:
-```
-{
-  "config": {
-    "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-    "model_name": "path of the model on the hub: org/model",
-    "model_sha": "revision on the hub",
-  },
-  "results": {
-    "task_name": {
-      "metric_name": score,
-    },
-    "task_name2": {
-      "metric_name": score,
-    }
-  }
-}
-```
-
-Request files are created automatically by this tool.
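As a side illustration (a sketch with hypothetical task names and scores, not code from this repository): the results-file format described in the deleted README is plain JSON and could be produced like this:

```python
import json

# Sketch only: hypothetical scores written in the results-file format that
# the deleted src/README.md documents.
results = {
    "config": {
        "model_dtype": "torch.float16",  # or torch.bfloat16 / 8bit / 4bit
        "model_name": "org/model",       # path of the model on the hub
        "model_sha": "main",             # revision on the hub
    },
    "results": {
        "task_name": {"metric_name": 0.5},
        "task_name2": {"metric_name": 0.5},
    },
}

with open("results_org_model.json", "w") as f:
    json.dump(results, f, indent=2)
```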
src/app.py DELETED
@@ -1,329 +0,0 @@
-import gradio as gr
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-
-import src.display.about as about
-from src.display.css_html_js import custom_css
-import src.display.utils as utils
-import src.envs as envs
-import src.populate as populate
-import src.submission.submit as submit
-import os
-TOKEN = os.environ.get("HF_TOKEN", None)
-print("TOKEN", TOKEN)
-def restart_space():
-    envs.API.restart_space(repo_id=envs.REPO_ID, token=TOKEN)
-
-try:
-    print(envs.EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-except Exception:
-    restart_space()
-try:
-    print(envs.EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-except Exception:
-    restart_space()
-
-raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
-leaderboard_df = original_df.copy()
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
-
-
-# Searching and filtering
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
-):
-    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
-    filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
-    return df
-
-
-def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[utils.AutoEvalColumn.dummy.name].str.contains(query, case=False))]
-
-
-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        utils.AutoEvalColumn.model_type_symbol.name,
-        utils.AutoEvalColumn.model.name,
-    ]
-    # We use COLS to maintain sorting
-    filtered_df = df[
-        always_here_cols + [c for c in utils.COLS if c in df.columns and c in columns] + [utils.AutoEvalColumn.dummy.name]
-    ]
-    return filtered_df
-
-
-def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
-    final_df = []
-    if query != "":
-        queries = [q.strip() for q in query.split(";")]
-        for _q in queries:
-            _q = _q.strip()
-            if _q != "":
-                temp_filtered_df = search_table(filtered_df, _q)
-                if len(temp_filtered_df) > 0:
-                    final_df.append(temp_filtered_df)
-        if len(final_df) > 0:
-            filtered_df = pd.concat(final_df)
-            filtered_df = filtered_df.drop_duplicates(
-                subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
-            )
-
-    return filtered_df
-
-
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-) -> pd.DataFrame:
-    # Show all models
-    # if show_deleted:
-    #     filtered_df = df
-    # else: # Show only still on the hub models
-    #     filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]
-
-    filtered_df = df
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
-
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(about.TITLE)
-    gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in utils.fields(utils.AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden and not c.dummy
-                            ],
-                            value=[
-                                c.name
-                                for c in utils.fields(utils.AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
-                        )
-                with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in utils.ModelType],
-                        value=[t.to_str() for t in utils.ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in utils.Precision],
-                        value=[i.value.name for i in utils.Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(utils.NUMERIC_INTERVALS.keys()),
-                        value=list(utils.NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                    + [utils.AutoEvalColumn.dummy.name]
-                ],
-                headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=utils.TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-                column_widths=["2%", "33%"]
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[utils.COLS],
-                headers=utils.COLS,
-                datatype=utils.TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=utils.EVAL_COLS,
-                                datatype=utils.EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=utils.EVAL_COLS,
-                                datatype=utils.EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=utils.EVAL_COLS,
-                                datatype=utils.EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in utils.WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                submit.add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=about.CITATION_BUTTON_TEXT,
-                label=about.CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
src/backend/.DS_Store ADDED
Binary file (6.15 kB)
 
src/backend/model_operations.py CHANGED
@@ -166,14 +166,14 @@ class SummaryGenerator:
             Stimuli_2_column = df_sheet["Stimuli-2"]
 
             # Iterate over the values in the Prompt0 column
-            for j, prompt_value in enumerate(tqdm(prompt_column[0:2], desc=f"Processing {sheet_name}"), start=0):
+            for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=0):
                 ID = 'E' + str(i)
                 # q_ID = ID + '_' + str(j)
 
                 # print(ID, q_ID, prompt_value)
                 system_prompt = envs.SYSTEM_PROMPT
                 _user_prompt = prompt_value
-                for ii in range(2):
+                for ii in range(10):
                     # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
                     while True:
                         try:
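Reading the hunk above: every prompt in the sheet is now processed (previously only the first two), and ten responses are sampled per prompt inside a retry loop. A self-contained sketch of that pattern, with a hypothetical `generate()` callable standing in for the model-specific call the diff truncates:

```python
from tqdm import tqdm

# Sketch only: the sampling/retry pattern implied by the hunk above.
# `generate` is a hypothetical stand-in for the model-specific call.
def sample_responses(prompt_column, system_prompt, generate, n_samples=10):
    responses = []
    for j, prompt_value in enumerate(tqdm(prompt_column, desc="Processing sheet")):
        for ii in range(n_samples):
            while True:  # retry until a sample is obtained
                try:
                    responses.append(generate(system_prompt, prompt_value))
                    break
                except Exception as err:
                    print(f"Prompt {j}, sample {ii} failed, retrying: {err}")
    return responses
```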
src/main_backend.py DELETED
@@ -1,126 +0,0 @@
-import argparse
-import logging
-import pprint
-import os
-
-from huggingface_hub import snapshot_download
-
-import src.backend.run_eval_suite as run_eval_suite
-import src.backend.manage_requests as manage_requests
-import src.backend.sort_queue as sort_queue
-import src.envs as envs
-
-os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-
-logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-# import os
-snapshot_download(repo_id=envs.RESULTS_REPO, revision="main",
-                  local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-
-snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
-                  local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-# exit()
-
-def run_auto_eval(args):
-    if not args.reproduce:
-        current_pending_status = [PENDING_STATUS]
-        print('_________________')
-        manage_requests.check_completed_evals(
-            api=envs.API,
-            checked_status=RUNNING_STATUS,
-            completed_status=FINISHED_STATUS,
-            failed_status=FAILED_STATUS,
-            hf_repo=envs.QUEUE_REPO,
-            local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
-            hf_repo_results=envs.RESULTS_REPO,
-            local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
-        )
-        logging.info("Checked completed evals")
-        eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
-                                                          hf_repo=envs.QUEUE_REPO,
-                                                          local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
-        logging.info("Got eval requests")
-        eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
-        logging.info("Sorted eval requests")
-
-        print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-        print(eval_requests)
-        if len(eval_requests) == 0:
-            print("No eval requests found. Exiting.")
-            return
-
-        if args.model is not None:
-            eval_request = manage_requests.EvalRequest(
-                model=args.model,
-                status=PENDING_STATUS,
-                precision=args.precision
-            )
-            pp.pprint(eval_request)
-        else:
-            eval_request = eval_requests[0]
-            pp.pprint(eval_request)
-
-        # manage_requests.set_eval_request(
-        #     api=envs.API,
-        #     eval_request=eval_request,
-        #     new_status=RUNNING_STATUS,
-        #     hf_repo=envs.QUEUE_REPO,
-        #     local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
-        # )
-        # logging.info("Set eval request to running, now running eval")
-
-        run_eval_suite.run_evaluation(
-            eval_request=eval_request,
-            local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
-            results_repo=envs.RESULTS_REPO,
-            batch_size=1,
-            device=envs.DEVICE,
-            no_cache=True,
-            need_check=not args.publish,
-            write_results=args.update
-        )
-        logging.info("Eval finished, now setting status to finished")
-    else:
-        eval_request = manage_requests.EvalRequest(
-            model=args.model,
-            status=PENDING_STATUS,
-            precision=args.precision
-        )
-        pp.pprint(eval_request)
-        logging.info("Running reproducibility eval")
-
-        run_eval_suite.run_evaluation(
-            eval_request=eval_request,
-            local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
-            results_repo=envs.RESULTS_REPO,
-            batch_size=1,
-            device=envs.DEVICE,
-            need_check=not args.publish,
-            write_results=args.update
-        )
-        logging.info("Reproducibility eval finished")
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
-
-    # Optional arguments
-    parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
-    parser.add_argument("--model", type=str, default=None, help="Your Model ID")
-    parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
-    parser.add_argument("--publish", type=bool, default=False, help="whether directly publish the evaluation results on HF")
-    parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
-
-    args = parser.parse_args()
-
-    run_auto_eval(args)
-
-
-if __name__ == "__main__":
-    main()
src/pyproject.toml DELETED
@@ -1,13 +0,0 @@
-[tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
-line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
-
-[tool.isort]
-profile = "black"
-line_length = 119
-
-[tool.black]
-line-length = 119
src/requirements.txt DELETED
@@ -1,17 +0,0 @@
-APScheduler==3.10.1
-black==23.11.0
-click==8.1.3
-datasets==2.14.5
-gradio==4.4.0
-gradio_client==0.7.0
-huggingface-hub>=0.18.0
-litellm==1.15.1
-matplotlib==3.7.1
-numpy==1.24.2
-pandas==2.0.0
-python-dateutil==2.8.2
-requests==2.28.2
-tqdm==4.65.0
-transformers==4.35.2
-tokenizers>=0.15.0
-sentence-transformers==2.2.2