terryyz committed on
Commit
aca9a0c
1 Parent(s): e42e086

Update app.py

Files changed (1)
  1. app.py +329 -630
app.py CHANGED
@@ -1,647 +1,346 @@
 
 
 
1
  import os
2
- import logging
 
3
  import time
4
- import datetime
5
- import gradio as gr
6
- from threading import Thread
7
- import datasets
8
- from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
9
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
10
- from apscheduler.schedulers.background import BackgroundScheduler
11
-
12
- # Start ephemeral Spaces on PRs (see config in README.md)
13
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
14
-
15
- from src.display.about import (
16
- CITATION_BUTTON_LABEL,
17
- CITATION_BUTTON_TEXT,
18
- # INTRODUCTION_TEXT,
19
- TITLE,
20
- ABOUT_TEXT,
21
- SUBMISSION_TEXT_3,
22
- )
23
- from src.display.css_html_js import custom_css
24
- from src.display.utils import (
25
- COLS,
26
- EVAL_COLS,
27
- EVAL_TYPES,
28
- AutoEvalColumn,
29
- fields,
30
- EvalQueueColumn
31
- )
32
- from src.envs import (
33
- API,
34
- EVAL_REQUESTS_PATH,
35
- RESULT_REPO,
36
- DATA_VERSION,
37
- DATA_REPO,
38
- HARD_RESULT_REPO,
39
- ELO_REPO,
40
- HARD_ELO_REPO,
41
- SOLVE_REPO,
42
- HARD_SOLVE_REPO,
43
- HF_TOKEN,
44
- QUEUE_REPO,
45
- REPO_ID,
46
- VOTES_REPO,
47
- VOTES_PATH,
48
- HF_HOME,
49
- )
50
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
51
- from src.execute import generate_command, is_running, default_command, stream_logs, find_result_file
52
- from src.tools.plots import plot_elo_mle, plot_solve_rate
53
- # from src.voting.vote_system import VoteManager, run_scheduler
54
-
55
- # Configure logging
56
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
57
-
58
- # Start ephemeral Spaces on PRs (see config in README.md)
59
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
60
-
61
- # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
62
- # This controls whether a full initialization should be performed.
63
- DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
64
- NEW_DATA_ON_LEADERBOARD = True
65
- LEADERBOARD_DF = None
66
- HARD_LEADERBOARD_DF = None
67
- ELO_TASK_DF = None
68
- ELO_BENCH_DF = None
69
- HARD_ELO_TASK_DF = None
70
- HARD_ELO_BENCH_DF = None
71
- COMPLETE_SOLVE_DF = None
72
- INSTRUCT_SOLVE_DF = None
73
- HARD_COMPLETE_SOLVE_DF = None
74
- HARD_INSTRUCT_SOLVE_DF = None
75
-
76
- DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
77
- verification_mode="no_checks")
78
-
79
-
80
- def filter_data(data, keyword):
81
- if not keyword:
82
- return data
83
- filtered_data = [item for item in data if keyword.lower() in item['complete_prompt'].lower()]
84
- return filtered_data
85
-
86
-
87
- def update_display(search_keyword, index, show_test):
88
- filtered_data = filter_data(DATA, search_keyword)
89
-
90
- if not filtered_data:
91
- return ["No data available. Check the search criteria."] + [""] * 4 + [0, gr.update(maximum=0, value=0)]
92
-
93
- max_index = len(filtered_data) - 1
94
- index = min(max(0, index), max_index)
95
 
96
- task_id = filtered_data[index]['task_id']
97
- snippet1 = filtered_data[index]['complete_prompt']
98
- snippet2 = filtered_data[index]['instruct_prompt']
99
- # snippet3 = filtered_data[index]['canonical_solution'] if show_solution else ""
100
- snippet4 = filtered_data[index]['test'] if show_test else ""
101
-
102
- return [
103
- task_id,
104
- snippet1,
105
- snippet2,
106
- # snippet3,
107
- snippet4,
108
- len(filtered_data),
109
- gr.update(maximum=max_index, value=index)
110
- ]
111
-
112
- def restart_space():
113
- API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
114
-
115
-
116
- def time_diff_wrapper(func):
117
- def wrapper(*args, **kwargs):
118
- start_time = time.time()
119
- result = func(*args, **kwargs)
120
- end_time = time.time()
121
- diff = end_time - start_time
122
- logging.info(f"Time taken for {func.__name__}: {diff} seconds")
123
- return result
124
-
125
- return wrapper
126
-
127
-
128
- @time_diff_wrapper
129
- def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
130
- """Download dataset with exponential backoff retries."""
131
- attempt = 0
132
- while attempt < max_attempts:
133
- try:
134
- logging.info(f"Downloading {repo_id} to {local_dir}")
135
- snapshot_download(
136
- repo_id=repo_id,
137
- local_dir=local_dir,
138
- repo_type=repo_type,
139
- tqdm_class=None,
140
- etag_timeout=30,
141
- max_workers=8,
142
- )
143
- logging.info("Download successful")
144
- return
145
- except Exception as e:
146
- wait_time = backoff_factor**attempt
147
- logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
148
- time.sleep(wait_time)
149
- attempt += 1
150
- raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
151
-
152
- def get_latest_data_leaderboard(
153
- leaderboard_initial_df = None,
154
- hard_leaderboard_initial_df = None,
155
- elo_task_df = None,
156
- elo_bench_df = None,
157
- hard_elo_task_df = None,
158
- hard_elo_bench_df = None,
159
- complete_solve_df = None,
160
- instruct_solve_df = None,
161
- hard_complete_solve_df = None,
162
- hard_instruct_solve_df = None
163
- ):
164
- global NEW_DATA_ON_LEADERBOARD
165
- global LEADERBOARD_DF
166
- global HARD_LEADERBOARD_DF
167
- global ELO_TASK_DF
168
- global ELO_BENCH_DF
169
- global HARD_ELO_TASK_DF
170
- global HARD_ELO_BENCH_DF
171
- global COMPLETE_SOLVE_DF
172
- global INSTRUCT_SOLVE_DF
173
- global HARD_COMPLETE_SOLVE_DF
174
- global HARD_INSTRUCT_SOLVE_DF
175
-
176
- if NEW_DATA_ON_LEADERBOARD:
177
- print("Leaderboard updated at reload!")
178
- leaderboard_dataset = datasets.load_dataset(
179
- RESULT_REPO,
180
- "default",
181
- split="train",
182
- cache_dir=HF_HOME,
183
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
184
- verification_mode="no_checks"
185
- )
186
- LEADERBOARD_DF = get_leaderboard_df(
187
- leaderboard_dataset=leaderboard_dataset,
188
- cols=COLS,
189
- )
190
- hard_leaderboard_dataset = datasets.load_dataset(
191
- HARD_RESULT_REPO,
192
- "default",
193
- split="train",
194
- cache_dir=HF_HOME,
195
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
196
- verification_mode="no_checks"
197
- )
198
- hard_leaderboard_df = get_leaderboard_df(
199
- leaderboard_dataset=hard_leaderboard_dataset,
200
- cols=COLS,
201
- )
202
- HARD_LEADERBOARD_DF = hard_leaderboard_df
203
-
204
- elo_task_df = datasets.load_dataset(
205
- ELO_REPO,
206
- "default",
207
- split="task_no_tie",
208
- cache_dir=HF_HOME,
209
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
210
- verification_mode="no_checks"
211
- ).to_pandas()
212
- elo_bench_df = datasets.load_dataset(
213
- ELO_REPO,
214
- "default",
215
- split="benchmark_tie",
216
- cache_dir=HF_HOME,
217
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
218
- verification_mode="no_checks"
219
- ).to_pandas()
220
- ELO_TASK_DF = elo_task_df
221
- ELO_BENCH_DF = elo_bench_df
222
 
223
- hard_elo_task_df = datasets.load_dataset(
224
- HARD_ELO_REPO,
225
- "default",
226
- split="task_no_tie",
227
- cache_dir=HF_HOME,
228
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
229
- verification_mode="no_checks"
230
- ).to_pandas()
231
- hard_elo_bench_df = datasets.load_dataset(
232
- HARD_ELO_REPO,
233
- "default",
234
- split="benchmark_tie",
235
- cache_dir=HF_HOME,
236
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
237
- verification_mode="no_checks"
238
- ).to_pandas()
239
- HARD_ELO_TASK_DF = hard_elo_task_df
240
- HARD_ELO_BENCH_DF = hard_elo_bench_df
241
-
242
- complete_solve_df = datasets.load_dataset(
243
- SOLVE_REPO,
244
- "default",
245
- split="complete",
246
- cache_dir=HF_HOME,
247
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
248
- verification_mode="no_checks"
249
- ).to_pandas()
250
- instruct_solve_df = datasets.load_dataset(
251
- SOLVE_REPO,
252
- "default",
253
- split="instruct",
254
- cache_dir=HF_HOME,
255
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
256
- verification_mode="no_checks"
257
- ).to_pandas()
258
- COMPLETE_SOLVE_DF = complete_solve_df
259
- INSTRUCT_SOLVE_DF = instruct_solve_df
260
-
261
- hard_complete_solve_df = datasets.load_dataset(
262
- HARD_SOLVE_REPO,
263
- "default",
264
- split="complete",
265
- cache_dir=HF_HOME,
266
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
267
- verification_mode="no_checks"
268
- ).to_pandas()
269
- hard_instruct_solve_df = datasets.load_dataset(
270
- HARD_SOLVE_REPO,
271
- "default",
272
- split="instruct",
273
- cache_dir=HF_HOME,
274
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
275
- verification_mode="no_checks"
276
- ).to_pandas()
277
- HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
278
- HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
279
-
280
- NEW_DATA_ON_LEADERBOARD = False
281
 
282
- else:
283
- LEADERBOARD_DF = leaderboard_initial_df
284
- # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
285
- ELO_TASK_DF = elo_task_df
286
- # ELO_BENCH_DF = elo_bench_df
287
- # HARD_ELO_TASK_DF = hard_elo_task_df
288
- HARD_ELO_BENCH_DF = hard_elo_bench_df
289
- COMPLETE_SOLVE_DF = complete_solve_df
290
- # INSTRUCT_SOLVE_DF = instruct_solve_df
291
- # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
292
- HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
293
-
294
- return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
295
- # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
296
-
297
-
298
- def init_space():
299
- """Initializes the application space, loading only necessary data."""
300
-
301
- # Always redownload the leaderboard DataFrame
302
- global LEADERBOARD_DF
303
- global HARD_LEADERBOARD_DF
304
- global ELO_TASK_DF
305
- global ELO_BENCH_DF
306
- global HARD_ELO_TASK_DF
307
- global HARD_ELO_BENCH_DF
308
- global COMPLETE_SOLVE_DF
309
- global INSTRUCT_SOLVE_DF
310
- global HARD_COMPLETE_SOLVE_DF
311
- global HARD_INSTRUCT_SOLVE_DF
312
 
313
- LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
314
- # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
315
-
316
- return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
317
- # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
318
-
319
- # Initialize VoteManager
320
- # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
321
-
322
-
323
- # Schedule the upload_votes method to run every 15 minutes
324
- # schedule.every(15).minutes.do(vote_manager.upload_votes)
325
-
326
- # Start the scheduler in a separate thread
327
- # scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
328
- # scheduler_thread.start()
329
-
330
- # Call init_space() to initialize the various DataFrames
331
- # used throughout the application.
332
- LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
333
- ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
334
- COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
335
- HARD_INSTRUCT_SOLVE_DF = init_space()
336
- # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
337
-
338
- # Data processing for plots now only on demand in the respective Gradio tab
339
- # def load_and_create_plots():
340
- # plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
341
- # return plot_df
342
-
343
- # Function to check if a user is logged in
344
- def check_login(profile: gr.OAuthProfile | None) -> bool:
345
- if profile is None:
346
- return False
347
- return True
348
-
349
- def init_leaderboard(dataframe):
350
- if dataframe is None or dataframe.empty:
351
- raise ValueError("Leaderboard DataFrame is empty or None.")
352
- return Leaderboard(
353
- value=dataframe,
354
- datatype=[c.type for c in fields(AutoEvalColumn)],
355
- select_columns=SelectColumns(
356
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
357
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
358
- label="Select Columns to Display:",
359
- ),
360
- search_columns=[AutoEvalColumn.model.name],
361
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
362
- filter_columns=[
363
- ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
364
- ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
365
- ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
366
- ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
367
- ],
368
- bool_checkboxgroup_label="Hide models",
369
- interactive=False,
370
- )
371
 
 
 
372
 
373
- def init_others(dataframe):
374
- if dataframe is None or dataframe.empty:
375
- raise ValueError("Gradio DataFrame is empty or None.")
376
- return gr.Dataframe(dataframe, visible=False)
 
 
377
 
378
- main_block = gr.Blocks(css=custom_css)
379
- with main_block as demo:
380
- with gr.Row(elem_id="header-row"):
381
- gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
 
 
 
382
 
383
- # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
384
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
385
- with gr.Tab("💎 Hard Set") as hard_tabs:
386
- with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
387
- hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
388
- gr.Markdown(
389
- """
390
- **Notes:**
391
- - For efficiency reasons, we only display the Hard Set leaderboard.
392
- - _Hard Set_ vs _Full Set_:
393
- - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
394
- - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
395
- - _Complete_ vs _Instruct_:
396
- - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests whether models are good at coding.
397
- - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests whether models can truly understand human intent and turn it into code.
398
- - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
399
- - `Average` is the average of `Complete` and `Instruct` when both are available.
400
- - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
401
- - `#Act Params (B)` is the number of activated model parameters during inference.
402
- - Model providers are responsible for avoiding data contamination. Models trained on closed data may be affected by contamination.
403
- - For more details, see the 📝 About section.
404
- """,
405
- elem_classes="markdown-text",
406
  )
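
Reader's sketch (not part of the commit): the `Elo Rating` bullet in the notes above describes a task-level, bootstrapped maximum-likelihood Elo. The leaderboard itself only loads precomputed ratings from ELO_REPO / HARD_ELO_REPO and plots them via plot_elo_mle; the snippet below is a hedged, minimal illustration of the procedure the note describes (Bradley-Terry MLE over win/loss battles, resampled 500 times, centered on a 1000 baseline). Function and variable names here are illustrative assumptions, not the leaderboard's code.

import numpy as np

def mle_elo(battles, models, base=1000.0, scale=400.0, steps=500, lr=10.0):
    # battles: iterable of (winner, loser) model-name pairs; fit ratings by gradient ascent
    idx = {m: i for i, m in enumerate(models)}
    r = np.zeros(len(models))
    for _ in range(steps):
        grad = np.zeros(len(models))
        for winner, loser in battles:
            w, l = idx[winner], idx[loser]
            p = 1.0 / (1.0 + 10.0 ** ((r[l] - r[w]) / scale))  # P(winner beats loser)
            grad[w] += 1.0 - p
            grad[l] -= 1.0 - p
        r += lr * grad / max(len(battles), 1)
        r -= r.mean()  # center ratings so the scale stays anchored
    return {m: base + r[idx[m]] for m in models}

def bootstrap_elo(battles, models, rounds=500, seed=0):
    # resample the battle list with replacement `rounds` times and report the median rating
    rng = np.random.default_rng(seed)
    battles = list(battles)
    samples = [
        mle_elo([battles[i] for i in rng.integers(0, len(battles), len(battles))], models)
        for _ in range(rounds)
    ]
    return {m: float(np.median([s[m] for s in samples])) for m in models}
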
407
-
408
- with gr.TabItem("📊 Elo Rating", id="hard_elo"):
409
- with gr.Column():
410
- with gr.Group():
411
- gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
412
- hard_task_elo_map = gr.Plot()
413
- hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
414
- demo.load(plot_elo_mle, [hard_elo_task_gr],
415
- hard_task_elo_map)
416
- with gr.Group():
417
- gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
418
- hard_bench_elo_map = gr.Plot()
419
- hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
420
- demo.load(plot_elo_mle, [hard_elo_bench_gr],
421
- hard_bench_elo_map)
422
-
423
- with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
424
- with gr.Column():
425
- hard_complete_map = gr.Plot()
426
- hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
427
- demo.load(plot_solve_rate, [hard_complete_solve_gr,
428
- gr.Textbox("Complete", visible=False),
429
- gr.Number(10, visible=False),
430
- gr.Number(16, visible=False),
431
- ], hard_complete_map)
432
- hard_instruct_map = gr.Plot()
433
- hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
434
- demo.load(plot_solve_rate, [hard_instruct_solve_gr,
435
- gr.Textbox("Instruct", visible=False),
436
- gr.Number(10, visible=False),
437
- gr.Number(16, visible=False),
438
- ], hard_instruct_map)
439
- with gr.Tab("🎯 Full Set") as full_tabs:
440
- with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
441
- leaderboard = init_leaderboard(LEADERBOARD_DF)
442
- gr.Markdown(
443
- """
444
- **Notes:**
445
- - _Complete_ vs _Instruct_:
446
- - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests whether models are good at coding.
447
- - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests whether models can truly understand human intent and turn it into code.
448
- - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
449
- - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
450
- - `size` is the number of activated model parameters during inference.
451
- - Model providers are responsible for avoiding data contamination. Models trained on closed data may be affected by contamination.
452
- - For more details, see the 📝 About section.
453
- """,
454
- elem_classes="markdown-text",
455
  )
456
-
457
- with gr.TabItem("📊 Elo Rating", id="full_elo"):
458
- with gr.Column():
459
- with gr.Group():
460
-
461
- gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
462
- task_elo_map = gr.Plot()
463
- elo_task_gr = init_others(ELO_TASK_DF)
464
- demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
465
- with gr.Group():
466
- gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
467
- bench_elo_map = gr.Plot()
468
- elo_bench_gr = init_others(ELO_BENCH_DF)
469
- demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
470
-
471
- with gr.TabItem("🧩 Solve Rate", id="full_solve"):
472
- with gr.Column():
473
- complete_map = gr.Plot()
474
- complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
475
- demo.load(plot_solve_rate, [complete_solve_gr,
476
- gr.Textbox("Complete", visible=False),
477
- ], complete_map)
478
- instruct_map = gr.Plot()
479
- instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
480
- demo.load(plot_solve_rate, [instruct_solve_gr,
481
- gr.Textbox("Instruct", visible=False),
482
- ], instruct_map)
483
- with gr.TabItem("📝 About", id=3):
484
- gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
485
- with gr.TabItem("🔎 Data Viewer", id="viewer"):
486
- search_input = gr.Textbox(label="Search by keyword")
487
- count_output = gr.Number(label="Number of filtered items")
488
- index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
489
- # show_solution = gr.Checkbox(label="Show Solution")
490
- show_test = gr.Checkbox(label="Show Test Cases")
491
- update_button = gr.Button("Update")
492
-
493
- task_id_output = gr.Textbox(label="Task ID")
494
- code_completion = gr.Code(language="python", label="Code Completion")
495
- nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")
496
- # solution = gr.Code(language="python", label="Solution")
497
- test_cases = gr.Code(language="python", label="Test Cases")
498
-
499
- update_button.click(
500
- update_display,
501
- inputs=[search_input, index_slider, show_test],
502
- outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
503
- )
504
-
505
- # Initial load
506
- demo.load(
507
- update_display,
508
- inputs=[search_input, index_slider, show_test],
509
- outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
510
- )
511
 
512
- with gr.TabItem("🚀 Request", id=4):
513
- gr.Markdown(SUBMISSION_TEXT_3)
514
-
515
- with gr.TabItem("🛠️ Execute", id=5):
516
- gr.Markdown("# BigCodeBench Evaluator")
517
-
518
- with gr.Row():
519
- jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
520
- split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
521
- subset = gr.Dropdown(choices=["hard", "full"], label="Subset", value="hard")
522
-
523
- with gr.Row():
524
- parallel = gr.Number(label="Parallel (optional)", precision=0)
525
- min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
526
- max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
527
-
528
- with gr.Row():
529
- max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
530
- max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
531
- check_gt_only = gr.Checkbox(label="Check GT Only")
532
- no_gt = gr.Checkbox(label="No GT")
533
-
534
- command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
535
- with gr.Row():
536
- submit_btn = gr.Button("Run Evaluation")
537
- download_btn = gr.DownloadButton(label="Download Result")
538
- log_output = gr.Textbox(label="Execution Logs", lines=20)
539
-
540
- input_components = [
541
- jsonl_file, split, subset, parallel,
542
- min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
543
- check_gt_only, no_gt
544
- ]
545
-
546
- for component in input_components:
547
- component.change(generate_command, inputs=input_components, outputs=command_output)
548
-
549
-
550
- def start_evaluation(command, jsonl_file, subset, split):
551
- extra = subset + "_" if subset != "full" else ""
552
- if jsonl_file is not None:
553
- result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
554
- else:
555
- result_path = None
556
-
557
- for log in stream_logs(command, jsonl_file):
558
- if jsonl_file is not None:
559
- yield log, gr.update(value=result_path, label=result_path), gr.update()
560
- else:
561
- yield log, gr.update(), gr.update()
562
- result_file = find_result_file()
563
- if result_file:
564
- return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
565
- # gr.Button(visible=False)#,
566
- # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
567
- else:
568
- return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
569
- # gr.Button("Run Evaluation", visible=True),
570
- # gr.DownloadButton(visible=False))
571
- submit_btn.click(start_evaluation,
572
- inputs=[command_output, jsonl_file, subset, split],
573
- outputs=[log_output, download_btn])
574
 
575
- with gr.Row():
576
- with gr.Accordion("📙 Citation", open=False):
577
- citation_button = gr.Textbox(
578
- value=CITATION_BUTTON_TEXT,
579
- label=CITATION_BUTTON_LABEL,
580
- lines=20,
581
- elem_id="citation-button",
582
- show_copy_button=True,
583
- )
584
 
585
- main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
586
- # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
587
- # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
588
- # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
589
-
590
- main_block.queue(default_concurrency_limit=100)
591
-
592
-
593
- def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
594
- # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
595
- # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
596
- # ht to Lucain!
597
- if SPACE_ID is None:
598
- print("Not in a Space: Space CI disabled.")
599
- return WebhooksServer(ui=main_block)
600
-
601
- if IS_EPHEMERAL_SPACE:
602
- print("In an ephemeral Space: Space CI disabled.")
603
- return WebhooksServer(ui=main_block)
604
-
605
- card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
606
- config = card.data.get("space_ci", {})
607
- print(f"Enabling Space CI with config from README: {config}")
608
-
609
- return configure_space_ci(
610
- blocks=ui,
611
- trusted_authors=config.get("trusted_authors"),
612
- private=config.get("private", "auto"),
613
- variables=config.get("variables", "auto"),
614
- secrets=config.get("secrets"),
615
- hardware=config.get("hardware"),
616
- storage=config.get("storage"),
617
  )
 
 
618
 
619
- # Create webhooks server (with CI url if in Space and not ephemeral)
620
- webhooks_server = enable_space_ci_and_return_server(ui=main_block)
621
-
622
- # Add webhooks
623
- @webhooks_server.add_webhook
624
- def update_leaderboard(payload: WebhookPayload) -> None:
625
- """Redownloads the leaderboard dataset each time it updates"""
626
- if payload.repo.type == "dataset" and payload.event.action == "update":
627
- global NEW_DATA_ON_LEADERBOARD
628
- if NEW_DATA_ON_LEADERBOARD:
629
- return
630
- NEW_DATA_ON_LEADERBOARD = True
631
-
632
- for repo in [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]:
633
- datasets.load_dataset(
634
- repo,
635
- "default",
636
- cache_dir=HF_HOME,
637
- download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
638
- verification_mode="no_checks"
639
- )
640
-
641
-
642
-
643
- webhooks_server.launch()
644
-
645
- scheduler = BackgroundScheduler()
646
- scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
647
- scheduler.start()
 
1
+ import gradio as gr
2
+ import json
3
+ import multiprocessing
4
  import os
5
+ import pickle
6
+ import threading
7
  import time
8
+ from collections import Counter, defaultdict
9
+ from concurrent.futures import ProcessPoolExecutor, as_completed, wait, FIRST_COMPLETED
10
+ from datetime import datetime
11
+ from typing import Any, Dict, List, Tuple
12
+ from warnings import warn
13
+
14
+ import numpy as np
15
+ from termcolor import cprint
16
+ from tqdm import tqdm
17
+
18
+ from bigcodebench.data import get_bigcodebench, get_bigcodebench_hash, load_solutions
19
+ from bigcodebench.data.utils import CACHE_DIR
20
+ from bigcodebench.eval import PASS, compatible_eval_result, estimate_pass_at_k, untrusted_check
21
+ from bigcodebench.gen.util import trusted_check
22
+
23
+ Result = Tuple[str, List[bool]]
24
+
25
+
26
+ def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit):
27
+ cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
28
+ if os.path.exists(cache_file):
29
+ if check_gt_only:
30
+ os.remove(cache_file)
31
+ else:
32
+ print(f"Load from ground-truth from {cache_file}")
33
+ with open(cache_file, "rb") as f:
34
+ return pickle.load(f)
35
+
36
+ os.makedirs(CACHE_DIR, exist_ok=True)
37
+ print("\nAsserting the groundtruth...")
38
+ tbegin = time.time()
39
 
40
+ with ProcessPoolExecutor(max_workers=n_workers) as executor:
41
+ futures = []
42
+ n_samples = 0
43
+ expected_time = dict()
44
 
45
+ for problem in problems.values():
46
+ args = (
47
+ problem["complete_prompt"] + "\n" + problem["canonical_solution"],
48
+ problem["test"],
49
+ problem["task_id"],
50
+ max_as_limit,
51
+ max_data_limit,
52
+ max_stack_limit,
53
+ min_time_limit,
54
+ )
55
+
56
+ futures.append(executor.submit(trusted_check, *args))
57
+ n_samples += 1
58
 
59
+ for future in tqdm(as_completed(futures), total=n_samples):
60
+ result = future.result()
61
+ expected_time[result["task_id"]] = result["time"]
62
 
63
+ print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")
64
+
65
+ if any(expected_time.values()):
66
+ with open(cache_file, "wb") as f:
67
+ pickle.dump(expected_time, f)
68
+
69
+ return expected_time
70
+
71
+
72
+ def check_correctness(
73
+ completion_id: int,
74
+ problem: Dict[str, Any],
75
+ solution: str,
76
+ max_as_limit: float,
77
+ max_data_limit: float,
78
+ max_stack_limit: float,
79
+ identifier=None,
80
+ min_time_limit: float = 0.1,
81
+ gt_time_limit: float = 2.0,
82
+ ) -> Dict[str, Result]:
83
+ ret = {
84
+ "completion_id": completion_id,
85
+ "task_id": problem["task_id"],
86
+ "_identifier": identifier,
87
+ "solution": solution,
88
+ }
89
+ ret["base"] = untrusted_check(
90
+ solution,
91
+ problem["test"],
92
+ problem["entry_point"],
93
+ max_as_limit,
94
+ max_data_limit,
95
+ max_stack_limit,
96
+ min_time_limit,
97
+ gt_time_limit,
98
+ )
99
+ return ret
100
+
101
+
102
+ def evaluate(
103
+ split: str,
104
+ subset: str,
105
+ samples: str,
106
+ pass_k: str="1,5,10",
107
+ parallel: int = None,
108
+ min_time_limit: float = 1,
109
+ max_as_limit: int = 30 * 1024,
110
+ max_data_limit: int = 30 * 1024,
111
+ max_stack_limit: int = 10,
112
+ check_gt_only: bool = False,
113
+ no_gt: bool = False,
114
+ ):
115
+ pass_k = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
116
+ if parallel is None:
117
+ n_workers = max(1, multiprocessing.cpu_count() // 2)
118
+ else:
119
+ n_workers = parallel
 
120
 
121
+ if check_gt_only:
122
+ samples = "__dummy__.jsonl"
123
 
124
+ extra = subset + "_" if subset != "full" else ""
125
+ if os.path.isdir(samples):
126
+ result_path = os.path.join(samples, f"{extra}eval_results.json")
127
+ else:
128
+ assert samples.endswith(".jsonl")
129
+ result_path = samples.replace(".jsonl", f"_{extra}eval_results.json")
130
 
131
+ problems = get_bigcodebench(subset=subset)
132
+ dataset_hash = get_bigcodebench_hash(subset=subset)
133
+
134
+ if not no_gt:
135
+ expected_time = get_groundtruth(n_workers, problems, dataset_hash, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit)
136
+ else:
137
+ expected_time = {task_id: None for task_id in problems}
138
 
139
+ gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
140
+ failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]
141
+
142
+ if os.path.isfile(result_path):
143
+ with open(result_path, "r") as f:
144
+ results = json.load(f)
145
+ results = compatible_eval_result(results)
146
+ else:
147
+ if check_gt_only:
148
+ if gt_pass_rate > 0.99:
149
+ cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
150
+ else:
151
+ cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
152
+ if len(failed_tasks) > 0:
153
+ cprint(f"Failed tasks: {failed_tasks}", "red")
154
+ return {"gt_pass_rate":float(gt_pass_rate), "failed_tasks": failed_tasks}
155
+
156
+ results = {
157
+ "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
158
+ "eval": {},
159
+ }
160
+
161
+ with ProcessPoolExecutor(max_workers=n_workers) as executor:
162
+ futures = []
163
+ completion_id = Counter()
164
+ n_samples = 0
165
+ eval_results = defaultdict(list)  # task_id -> list of check_correctness results
166
+ remainings = set()
167
+
168
+ print("Reading samples...")
169
+ for sample in tqdm(load_solutions(samples)):
170
+ task_id = sample["task_id"]
171
+
172
+ if task_id not in problems:
173
+ warn(
174
+ f"Task {task_id} is found in the samples but not found in the dataset"
175
+ )
176
+ continue
177
+ solution = (
178
+ sample["solution"]
179
+ if "solution" in sample
180
+ else problems[task_id]["complete_prompt"] + sample["completion"]
181
  )
182
+ if "sanitized-calibrated" in samples:
183
+ solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
184
+ remainings.add(sample["_identifier"])
185
+ args = (
186
+ completion_id[task_id],
187
+ problems[task_id],
188
+ solution,
189
+ max_as_limit,
190
+ max_data_limit,
191
+ max_stack_limit,
192
+ sample["_identifier"],
193
+ min_time_limit,
194
+ expected_time[task_id] if expected_time[task_id] else 20
195
+ )
196
+ futures.append(executor.submit(check_correctness, *args))
197
+ completion_id[task_id] += 1
198
+ n_samples += 1
199
+
200
+ assert n_samples == len(remainings), "Missing problems in unfinished"
201
+ assert len(completion_id) == len(problems), "Missing problems in samples"
202
+
203
+ def stucking_checker():
204
+ not_done = futures
205
+ while len(not_done) > 0:
206
+ done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)
207
+
208
+ if len(done) == 0:
209
+ warn("No samples have finished testing in the last 240s")
210
+ warn(f"{len(remainings)} samples to be tested: {remainings}")
211
+
212
+ threading.Thread(target=stucking_checker).start()
213
+
214
+ for future in tqdm(as_completed(futures), total=n_samples):
215
+ result = future.result()
216
+ remainings.remove(result["_identifier"])
217
+ eval_results[result["task_id"]].append(result)
218
+
219
+
220
+ # sort the results for each problem by completion_id
221
+ for task_id, task_results in eval_results.items():
222
+ task_results.sort(key=lambda x: x["completion_id"])
223
+ results["eval"][task_id] = []
224
+ for res in task_results:
225
+ stat, details = res["base"]
226
+ results["eval"][task_id].append(
227
+ {
228
+ "task_id": task_id,
229
+ "solution": res["solution"],
230
+ "status": stat,
231
+ "details": details,
232
+ }
233
  )
234
 
235
+ # Calculate pass@k.
236
+ total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
237
+ base_correct = []
238
+
239
+ for key, res in results["eval"].items():
240
+ if key not in problems:
241
+ continue
242
+ bc = sum([r["status"] == PASS for r in res])
243
+ base_correct.append(bc)
244
+
245
+ base_correct = np.array(base_correct)
246
+
247
+ pass_at_k = {
248
+ f"pass@{k}": float(estimate_pass_at_k(total, base_correct, k).mean())
249
+ for k in pass_k
250
+ if total.min() >= k
251
+ }
252
+ pass_at_k["gt_pass_rate"] = float(gt_pass_rate)
253
+ pass_at_k["failed_tasks"] = failed_tasks
254
+ return pass_at_k
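
Reader's sketch (not part of the commit): estimate_pass_at_k is imported from bigcodebench.eval above. Assuming it follows the standard unbiased estimator from the Codex paper, a scalar version looks like the hedged snippet below; the real helper is presumably vectorized over the `total` and `base_correct` arrays.

import numpy as np

def pass_at_k_single(n: int, c: int, k: int) -> float:
    # n = samples generated for one task, c = samples that passed, k = selection budget
    if n - c < k:
        return 1.0  # any draw of k samples must include a passing one
    # 1 - C(n-c, k) / C(n, k), computed as a numerically stable running product
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# Example: 10 samples with 3 passing gives pass@1 = 0.3 and pass@5 ≈ 0.917.
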
255
+
256
+ # mode = "-calibrated" if "sanitized-calibrated" in samples else ""
257
+ # extra = subset.capitalize()
258
+ # split = split.capitalize()
259
+ # cprint(f"BigCodeBench-{split}{mode} ({extra})", "green")
260
+
261
+ # if no_gt:
262
+ # cprint(f"Groundtruth is not checked", "yellow")
263
+ # else:
264
+ # if gt_pass_rate > 0.99:
265
+ # cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
266
+ # else:
267
+ # cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
268
+
269
+ # if len(failed_tasks) > 0:
270
+ # cprint(f"Failed tasks: {failed_tasks}", "red")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
+ # for k, v in pass_at_k.items():
273
+ # cprint(f"{k}:\t{v:.3f}", "green")
274
+
275
+ # # save results
276
+ # if os.path.isfile(result_path):
277
+ # decision = ""
278
+ # while decision.lower() not in ["y", "n"]:
279
+ # print(f"{result_path} already exists. Press [Y/N] to overwrite or exit...")
280
+ # decision = input()
281
+
282
+ # if decision.lower() == "y":
283
+ # # mv the file to a backup
284
+ # new_path = result_path + ".bak"
285
+ # while os.path.isfile(new_path):
286
+ # new_path += ".bak"
287
+ # os.rename(result_path, new_path)
288
+ # print(f"Backup {result_path} to {new_path}")
289
+
290
+ # if not os.path.isfile(result_path):
291
+ # with open(result_path, "w") as f:
292
+ # json.dump(results, f, indent=2)
293
+
294
+ # if save_pass_rate:
295
+ # pass_at_k_path = result_path.replace("_eval_results.json", "_pass_at_k.json")
296
+ # pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
297
+ # pass_at_k["calibrated"] = "sanitized-calibrated" in samples
298
+ # pass_at_k["subset"] = subset
299
+
300
+ # def save_pass_at_k():
301
+ # with open(pass_at_k_path, "w") as f:
302
+ # json.dump(pass_at_k, f, indent=2)
303
+
304
+ # if os.path.isfile(pass_at_k_path):
305
+ # saved_pass_at_k = json.load(open(pass_at_k_path, "r"))
306
+ # # compare saved_pass_at_k with pass_at_k
307
+ # for k in saved_pass_at_k.keys():
308
+ # if pass_at_k[k] != saved_pass_at_k[k]:
309
+ # cprint(f"Warning: {k} is different from the saved one", "yellow")
310
 
311
+ # # ask user whether to save the pass@k
312
+ # decision = ""
313
+ # while decision.lower() not in ["y", "n"]:
314
+ # print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
315
+ # decision = input()
316
+ # if decision.lower() == "y":
317
+ # save_pass_at_k()
318
+
319
+ # else:
320
+ # save_pass_at_k()
321
+
322
+ def run_gradio():
323
+ interface = gr.Interface(
324
+ fn=evaluate,
325
+ inputs=[
326
+ gr.Dropdown(["complete", "instruct"], label="Split"),
327
+ gr.Dropdown(["full", "hard"], label="Subset"),
328
+ gr.File(label="Samples Path (.jsonl)"),
329
+ gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
330
+ gr.Slider(1, multiprocessing.cpu_count(), step=1, label="Parallel Workers"),
331
+ gr.Slider(0.1, 10, step=0.1, label="Min Time Limit", value=1),
332
+ gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
333
+ gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
334
+ gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
335
+ gr.Checkbox(label="Check GT Only"),
336
+ gr.Checkbox(label="No GT"),
337
+ ],
338
+ outputs="text",
339
+ # concurrency_limit=None
 
 
 
340
  )
341
+ interface.queue(default_concurrency_limit=None)
342
+ interface.launch(show_error=True)
343
 
344
+ if __name__ == "__main__":
345
+ run_gradio()
346
+ # evaluate("complete", "hard", "meta-llama--Llama-3.2-3B-Instruct--bigcodebench-instruct--vllm-0-1.jsonl")