Terry Zhuo committed on
Commit
de45929
1 Parent(s): 5489a0a

revert back to eval only

Browse files
Files changed (3) hide show
  1. _app.py +648 -0
  2. app.py +152 -622
  3. demo.py +0 -178
_app.py ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import time
4
+ import datetime
5
+ import gradio as gr
6
+ from threading import Thread
7
+ import datasets
8
+ from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
9
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
10
+ from apscheduler.schedulers.background import BackgroundScheduler
11
+
12
+ # Start ephemeral Spaces on PRs (see config in README.md)
13
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
14
+
15
+ from src.display.about import (
16
+ CITATION_BUTTON_LABEL,
17
+ CITATION_BUTTON_TEXT,
18
+ # INTRODUCTION_TEXT,
19
+ TITLE,
20
+ ABOUT_TEXT,
21
+ SUBMISSION_TEXT_3,
22
+ )
23
+ from src.display.css_html_js import custom_css
24
+ from src.display.utils import (
25
+ COLS,
26
+ EVAL_COLS,
27
+ EVAL_TYPES,
28
+ AutoEvalColumn,
29
+ fields,
30
+ EvalQueueColumn
31
+ )
32
+ from src.envs import (
33
+ API,
34
+ EVAL_REQUESTS_PATH,
35
+ RESULT_REPO,
36
+ DATA_VERSION,
37
+ DATA_REPO,
38
+ HARD_RESULT_REPO,
39
+ ELO_REPO,
40
+ HARD_ELO_REPO,
41
+ SOLVE_REPO,
42
+ HARD_SOLVE_REPO,
43
+ HF_TOKEN,
44
+ QUEUE_REPO,
45
+ REPO_ID,
46
+ VOTES_REPO,
47
+ VOTES_PATH,
48
+ HF_HOME,
49
+ )
50
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
51
+ from src.execute import generate_command, is_running, lock, stream_logs, find_result_file
52
+ from src.tools.plots import plot_elo_mle, plot_solve_rate
53
+ # from src.voting.vote_system import VoteManager, run_scheduler
54
+
55
+ # Configure logging
56
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
57
+
58
+ # Start ephemeral Spaces on PRs (see config in README.md)
59
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
60
+
61
+ # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
62
+ # This controls whether a full initialization should be performed.
63
+ DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
64
+ NEW_DATA_ON_LEADERBOARD = True
65
+ LEADERBOARD_DF = None
66
+ HARD_LEADERBOARD_DF = None
67
+ ELO_TASK_DF = None
68
+ ELO_BENCH_DF = None
69
+ HARD_ELO_TASK_DF = None
70
+ HARD_ELO_BENCH_DF = None
71
+ COMPLETE_SOLVE_DF = None
72
+ INSTRUCT_SOLVE_DF = None
73
+ HARD_COMPLETE_SOLVE_DF = None
74
+ HARD_INSTRUCT_SOLVE_DF = None
75
+
76
+ DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
77
+ verification_mode="no_checks")
78
+
79
+
80
def filter_data(data, keyword):
    """Return the records of ``data`` whose ``complete_prompt`` contains ``keyword``.

    The match is a case-insensitive substring test.

    Args:
        data: Iterable of mapping-like records, each exposing a string under
            the ``'complete_prompt'`` key.
        keyword: Text to search for. A falsy value (``""`` or ``None``)
            disables filtering.

    Returns:
        ``data`` itself (the same object) when ``keyword`` is falsy; otherwise
        a new list holding the matching records in their original order.
    """
    if not keyword:
        return data
    # Lower-case the keyword once instead of once per record.
    needle = keyword.lower()
    return [item for item in data if needle in item['complete_prompt'].lower()]
85
+
86
+
87
def update_display(search_keyword, index, show_test):
    """Build the Data Viewer outputs for the selected task.

    Args:
        search_keyword: Keyword handed to ``filter_data`` to narrow ``DATA``.
        index: Requested position within the filtered records; it is clamped
            to the valid range.
        show_test: When truthy, the record's ``test`` field is included;
            otherwise an empty string is returned in its place.

    Returns:
        A six-item list: task id, complete prompt, instruct prompt, test code
        (or ``""``), number of filtered records, and a ``gr.update`` carrying
        the slider's new maximum and value.
    """
    filtered_data = filter_data(DATA, search_keyword)

    if not filtered_data:
        # One message plus three blank snippets keeps this branch at six
        # values, the same length as the list returned below. (The previous
        # padding of four blanks dated from when a solution snippet was shown.)
        return ["No data available. Check the search criteria."] + [""] * 3 + [0, gr.update(maximum=0, value=0)]

    max_index = len(filtered_data) - 1
    index = min(max(0, index), max_index)

    record = filtered_data[index]
    return [
        record['task_id'],
        record['complete_prompt'],
        record['instruct_prompt'],
        record['test'] if show_test else "",
        len(filtered_data),
        gr.update(maximum=max_index, value=index),
    ]
111
+
112
def restart_space():
    """Restart the Space named by ``REPO_ID`` by calling ``API.restart_space``."""
    API.restart_space(token=HF_TOKEN, repo_id=REPO_ID)
114
+
115
+
116
def time_diff_wrapper(func):
    """Decorate ``func`` so that each call logs how long it took.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, logs the elapsed wall-clock time at INFO level, and returns
        whatever ``func`` returned. The wrapper keeps ``func``'s metadata
        (``__name__``, ``__doc__``, ...) via ``functools.wraps``.
    """
    # Imported locally so the decorator does not depend on a file-level import.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        diff = time.time() - start_time
        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
        return result

    return wrapper
126
+
127
+
128
@time_diff_wrapper
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
    """Download a repository snapshot, retrying with exponential backoff.

    Args:
        repo_id: Identifier of the repository passed to ``snapshot_download``.
        local_dir: Directory the snapshot is written to.
        repo_type: Repository type forwarded to ``snapshot_download``.
        max_attempts: Maximum number of download attempts.
        backoff_factor: Base of the backoff; the wait before retry ``k``
            (0-based) is ``backoff_factor ** k`` seconds.

    Returns:
        None, as soon as one attempt succeeds.

    Raises:
        Exception: When every attempt failed. The last underlying error is
            attached as the cause.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            logging.info(f"Downloading {repo_id} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type=repo_type,
                tqdm_class=None,
                etag_timeout=30,
                max_workers=8,
            )
            logging.info("Download successful")
            return
        except Exception as e:
            last_error = e
            if attempt + 1 >= max_attempts:
                # Nothing left to retry: do not sleep or promise another attempt.
                logging.error(f"Error downloading {repo_id}: {e}, no attempts left")
                break
            wait_time = backoff_factor**attempt
            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts") from last_error
151
+
152
def get_latest_data_leaderboard(
    leaderboard_initial_df=None,
    hard_leaderboard_initial_df=None,
    elo_task_df=None,
    elo_bench_df=None,
    hard_elo_task_df=None,
    hard_elo_bench_df=None,
    complete_solve_df=None,
    instruct_solve_df=None,
    hard_complete_solve_df=None,
    hard_instruct_solve_df=None,
):
    """Refresh the module-level DataFrames and return all ten of them.

    When ``NEW_DATA_ON_LEADERBOARD`` is set, every result, Elo and solve-rate
    dataset is loaded again, the globals are overwritten, and the flag is
    cleared. Otherwise five of the passed-in frames (leaderboard, Elo task,
    hard Elo bench, complete solve, hard instruct solve) are copied into their
    globals and the other five globals keep their previous values.

    Returns:
        Tuple ``(LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF,
        ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)``.
    """
    global NEW_DATA_ON_LEADERBOARD
    global LEADERBOARD_DF, HARD_LEADERBOARD_DF
    global ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF

    def _load_split(repo, split_name):
        # Every load in this function shares the same options.
        return datasets.load_dataset(
            repo,
            "default",
            split=split_name,
            cache_dir=HF_HOME,
            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
            verification_mode="no_checks",
        )

    if not NEW_DATA_ON_LEADERBOARD:
        LEADERBOARD_DF = leaderboard_initial_df
        ELO_TASK_DF = elo_task_df
        HARD_ELO_BENCH_DF = hard_elo_bench_df
        COMPLETE_SOLVE_DF = complete_solve_df
        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
    else:
        print("Leaderboard updated at reload!")

        LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_split(RESULT_REPO, "train"),
            cols=COLS,
        )
        HARD_LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_split(HARD_RESULT_REPO, "train"),
            cols=COLS,
        )

        # Each pair is evaluated in full before either global is rebound.
        ELO_TASK_DF, ELO_BENCH_DF = (
            _load_split(ELO_REPO, "task_no_tie").to_pandas(),
            _load_split(ELO_REPO, "benchmark_tie").to_pandas(),
        )
        HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF = (
            _load_split(HARD_ELO_REPO, "task_no_tie").to_pandas(),
            _load_split(HARD_ELO_REPO, "benchmark_tie").to_pandas(),
        )
        COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF = (
            _load_split(SOLVE_REPO, "complete").to_pandas(),
            _load_split(SOLVE_REPO, "instruct").to_pandas(),
        )
        HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = (
            _load_split(HARD_SOLVE_REPO, "complete").to_pandas(),
            _load_split(HARD_SOLVE_REPO, "instruct").to_pandas(),
        )

        NEW_DATA_ON_LEADERBOARD = False

    return (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    )
295
+ # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
296
+
297
+
298
def init_space():
    """Populate the module-level DataFrames and return them.

    Calls ``get_latest_data_leaderboard()`` with no arguments, binds its ten
    results to the corresponding globals, and returns them in the same order.
    """
    global LEADERBOARD_DF, HARD_LEADERBOARD_DF
    global ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF

    (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    ) = get_latest_data_leaderboard()

    return (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    )
317
+ # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
318
+
319
+ # Initialize VoteManager
320
+ # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
321
+
322
+
323
+ # Schedule the upload_votes method to run every 15 minutes
324
+ # schedule.every(15).minutes.do(vote_manager.upload_votes)
325
+
326
+ # Start the scheduler in a separate thread
327
+ # scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
328
+ # scheduler_thread.start()
329
+
330
+ # Calls init_space() (which takes no arguments) to load the leaderboard, Elo, and solve-rate data.
331
+ # The returned DataFrames are bound to the module-level variables used throughout the application.
332
+ LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
333
+ ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
334
+ COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
335
+ HARD_INSTRUCT_SOLVE_DF = init_space()
336
+ # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
337
+
338
+ # Data processing for plots now only on demand in the respective Gradio tab
339
+ # def load_and_create_plots():
340
+ # plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
341
+ # return plot_df
342
+
343
+ # Function to check if a user is logged in
344
def check_login(profile: gr.OAuthProfile | None) -> bool:
    """Return ``True`` when an OAuth profile is present, ``False`` when it is ``None``."""
    return profile is not None
348
+
349
def init_leaderboard(dataframe):
    """Create the ``Leaderboard`` component for a results DataFrame.

    Column types, default selection, locked columns and hidden columns are
    all derived from ``fields(AutoEvalColumn)``.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Build each argument up front, in the same order the call evaluates them.
    column_types = [c.type for c in fields(AutoEvalColumn)]
    column_picker = SelectColumns(
        default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
        cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
        label="Select Columns to Display:",
    )
    searchable = [AutoEvalColumn.model.name]
    hidden = [c.name for c in fields(AutoEvalColumn) if c.hidden]
    filters = [
        ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
        ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
        ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
        ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
    ]

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=searchable,
        hide_columns=hidden,
        filter_columns=filters,
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
371
+
372
+
373
def init_others(dataframe):
    """Wrap ``dataframe`` in a hidden ``gr.Dataframe`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Gradio DataFrame is empty or None.")
    return gr.Dataframe(dataframe, visible=False)
377
+
378
+ main_block = gr.Blocks(css=custom_css)
379
+ with main_block as demo:
380
+ with gr.Row(elem_id="header-row"):
381
+ gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
382
+
383
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
384
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
385
+ with gr.Tab("💎 Hard Set") as hard_tabs:
386
+ with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
387
+ hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
388
+ gr.Markdown(
389
+ """
390
+ **Notes:**
391
+ - For the efficiency reasons, we only display the Hard Set leaderboard.
392
+ - _Hard Set_ vs _Full Set_:
393
+ - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
394
+ - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
395
+ - _Complete_ vs _Instruct_:
396
+ - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
397
+ - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
398
+ - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
399
+ - `Average` is the average of `Complete` and `Instruct` when both are available.
400
+ - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
401
+ - `#Act Params (B)` is the number of activated model parameters during inference.
402
+ - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
403
+ - For more details check the 📝 About section.
404
+ """,
405
+ elem_classes="markdown-text",
406
+ )
407
+
408
+ with gr.TabItem("📊 Elo Rating", id="hard_elo"):
409
+ with gr.Column():
410
+ with gr.Group():
411
+ gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
412
+ hard_task_elo_map = gr.Plot()
413
+ hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
414
+ demo.load(plot_elo_mle, [hard_elo_task_gr],
415
+ hard_task_elo_map)
416
+ with gr.Group():
417
+ gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
418
+ hard_bench_elo_map = gr.Plot()
419
+ hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
420
+ demo.load(plot_elo_mle, [hard_elo_bench_gr],
421
+ hard_bench_elo_map)
422
+
423
+ with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
424
+ with gr.Column():
425
+ hard_complete_map = gr.Plot()
426
+ hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
427
+ demo.load(plot_solve_rate, [hard_complete_solve_gr,
428
+ gr.Textbox("Complete", visible=False),
429
+ gr.Number(10, visible=False),
430
+ gr.Number(16, visible=False),
431
+ ], hard_complete_map)
432
+ hard_instruct_map = gr.Plot()
433
+ hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
434
+ demo.load(plot_solve_rate, [hard_instruct_solve_gr,
435
+ gr.Textbox("Instruct", visible=False),
436
+ gr.Number(10, visible=False),
437
+ gr.Number(16, visible=False),
438
+ ], hard_instruct_map)
439
+ with gr.Tab("🎯 Full Set") as full_tabs:
440
+ with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
441
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
442
+ gr.Markdown(
443
+ """
444
+ **Notes:**
445
+ - _Complete_ vs _Instruct_:
446
+ - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
447
+ - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
448
+ - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
449
+ - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
450
+ - `size` is the amount of activated model weight during inference.
451
+ - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
452
+ - For more details check the 📝 About section.
453
+ """,
454
+ elem_classes="markdown-text",
455
+ )
456
+
457
+ with gr.TabItem("📊 Elo Rating", id="full_elo"):
458
+ with gr.Column():
459
+ with gr.Group():
460
+
461
+ gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
462
+ task_elo_map = gr.Plot()
463
+ elo_task_gr = init_others(ELO_TASK_DF)
464
+ demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
465
+ with gr.Group():
466
+ gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
467
+ bench_elo_map = gr.Plot()
468
+ elo_bench_gr = init_others(ELO_BENCH_DF)
469
+ demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
470
+
471
+ with gr.TabItem("🧩 Solve Rate", id="full_solve"):
472
+ with gr.Column():
473
+ complete_map = gr.Plot()
474
+ complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
475
+ demo.load(plot_solve_rate, [complete_solve_gr,
476
+ gr.Textbox("Complete", visible=False),
477
+ ], complete_map)
478
+ instruct_map = gr.Plot()
479
+ instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
480
+ demo.load(plot_solve_rate, [instruct_solve_gr,
481
+ gr.Textbox("Instruct", visible=False),
482
+ ], instruct_map)
483
+ with gr.TabItem("📝 About", id=3):
484
+ gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
485
+ with gr.TabItem("🔎 Data Viewer", id="viewer"):
486
+ search_input = gr.Textbox(label="Search by keyword")
487
+ count_output = gr.Number(label="Number of filtered items")
488
+ index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
489
+ # show_solution = gr.Checkbox(label="Show Solution")
490
+ show_test = gr.Checkbox(label="Show Test Cases")
491
+ update_button = gr.Button("Update")
492
+
493
+ task_id_output = gr.Textbox(label="Task ID")
494
+ code_completion = gr.Code(language="python", label="Code Completion")
495
+ nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")
496
+ # solution = gr.Code(language="python", label="Solution")
497
+ test_cases = gr.Code(language="python", label="Test Cases")
498
+
499
+ update_button.click(
500
+ update_display,
501
+ inputs=[search_input, index_slider, show_test],
502
+ outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
503
+ )
504
+
505
+ # Initial load
506
+ demo.load(
507
+ update_display,
508
+ inputs=[search_input, index_slider, show_test],
509
+ outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
510
+ )
511
+
512
+ with gr.TabItem("🚀 Request", id=4):
513
+ gr.Markdown(SUBMISSION_TEXT_3)
514
+
515
+ with gr.TabItem("🛠️ Execute", id=5):
516
+ gr.Markdown("# BigCodeBench Evaluator")
517
+
518
+ with gr.Row():
519
+ jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
520
+ split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
521
+ subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
522
+
523
+ with gr.Row():
524
+ parallel = gr.Number(label="Parallel (optional)", precision=0)
525
+ min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
526
+ max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
527
+
528
+ with gr.Row():
529
+ max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
530
+ max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
531
+ check_gt_only = gr.Checkbox(label="Check GT Only")
532
+ no_gt = gr.Checkbox(label="No GT")
533
+
534
+ command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
535
+ with gr.Row():
536
+ submit_btn = gr.Button("Run Evaluation")
537
+ download_btn = gr.DownloadButton(label="Download Result")
538
+ log_output = gr.Textbox(label="Execution Logs", lines=20)
539
+
540
+ input_components = [
541
+ jsonl_file, split, subset, parallel,
542
+ min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
543
+ check_gt_only, no_gt
544
+ ]
545
+
546
+ for component in input_components:
547
+ component.change(generate_command, inputs=input_components, outputs=command_output)
548
+
549
+
550
def start_evaluation(command, jsonl_file, subset, split):
    """Run the evaluation command and stream its progress to the UI.

    This is a generator callback. Each yielded item is a 2-tuple matching the
    two outputs wired to the click handler: the first value targets the log
    textbox and the second the download button.

    Args:
        command: Command passed to ``stream_logs``.
        jsonl_file: Uploaded file object (its ``.name`` is read), or ``None``.
        subset: Subset name; any value other than ``"full"`` is added to the
            expected result file name as a ``<subset>_`` infix.
        split: Accepted for interface compatibility; not used here.

    Yields:
        ``(log, download_update)`` for every log chunk, then one final pair
        reporting whether ``find_result_file()`` located a result file.
    """
    extra = subset + "_" if subset != "full" else ""
    if jsonl_file is not None:
        result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
    else:
        result_path = None

    for log in stream_logs(command, jsonl_file):
        if jsonl_file is not None:
            yield log, gr.update(value=result_path, label=result_path)
        else:
            yield log, gr.update()

    # NOTE: the former `is_running = False` here only bound a local name and
    # never touched the flag imported from src.execute, so it was dropped.
    result_file = find_result_file()
    # A `return <value>` inside a generator is discarded by the caller, so the
    # final status has to be yielded like every other update.
    if result_file:
        yield gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
    else:
        yield gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
570
+ # gr.Button("Run Evaluation", visible=True),
571
+ # gr.DownloadButton(visible=False))
572
+ submit_btn.click(start_evaluation,
573
+ inputs=[command_output, jsonl_file, subset, split],
574
+ outputs=[log_output, download_btn])
575
+
576
+ with gr.Row():
577
+ with gr.Accordion("📙 Citation", open=False):
578
+ citation_button = gr.Textbox(
579
+ value=CITATION_BUTTON_TEXT,
580
+ label=CITATION_BUTTON_LABEL,
581
+ lines=20,
582
+ elem_id="citation-button",
583
+ show_copy_button=True,
584
+ )
585
+
586
+ main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
587
+ # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
588
+ # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
589
+ # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
590
+
591
+ main_block.queue(default_concurrency_limit=100)
592
+
593
+
594
def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
    """Return a webhooks server for ``ui``, with Space CI enabled when applicable.

    Space CI is skipped when ``SPACE_ID`` is ``None`` or when
    ``IS_EPHEMERAL_SPACE`` is truthy; a plain ``WebhooksServer`` wrapping
    ``ui`` is returned in those cases. Otherwise the ``space_ci`` section of
    the Space's repo card is read and passed to ``configure_space_ci``.

    Args:
        ui: The Gradio Blocks app to serve.

    Returns:
        The server object produced by ``WebhooksServer`` or
        ``configure_space_ci``.
    """
    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
    # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
    # ht to Lucain!
    if SPACE_ID is None:
        print("Not in a Space: Space CI disabled.")
        # Use the argument rather than the module-level `main_block`, so the
        # function serves whichever app it was given.
        return WebhooksServer(ui=ui)

    if IS_EPHEMERAL_SPACE:
        print("In an ephemeral Space: Space CI disabled.")
        return WebhooksServer(ui=ui)

    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
    config = card.data.get("space_ci", {})
    print(f"Enabling Space CI with config from README: {config}")

    return configure_space_ci(
        blocks=ui,
        trusted_authors=config.get("trusted_authors"),
        private=config.get("private", "auto"),
        variables=config.get("variables", "auto"),
        secrets=config.get("secrets"),
        hardware=config.get("hardware"),
        storage=config.get("storage"),
    )
619
+
620
+ # Create webhooks server (with CI url if in Space and not ephemeral)
621
+ webhooks_server = enable_space_ci_and_return_server(ui=main_block)
622
+
623
+ # Add webhooks
624
@webhooks_server.add_webhook
def update_leaderboard(payload: WebhookPayload) -> None:
    """Force a re-download of every leaderboard dataset on a dataset update.

    Payloads that are not a dataset ``update`` event are ignored, as are
    events that arrive while ``NEW_DATA_ON_LEADERBOARD`` is already set.
    Otherwise the flag is raised and each repository is loaded with
    ``FORCE_REDOWNLOAD``.
    """
    global NEW_DATA_ON_LEADERBOARD

    is_dataset_update = payload.repo.type == "dataset" and payload.event.action == "update"
    if not is_dataset_update:
        return
    if NEW_DATA_ON_LEADERBOARD:
        return
    NEW_DATA_ON_LEADERBOARD = True

    repos_to_refresh = (
        RESULT_REPO,
        HARD_RESULT_REPO,
        ELO_REPO,
        HARD_ELO_REPO,
        SOLVE_REPO,
        HARD_SOLVE_REPO,
    )
    for repo in repos_to_refresh:
        datasets.load_dataset(
            repo,
            "default",
            cache_dir=HF_HOME,
            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
            verification_mode="no_checks",
        )
641
+
642
+
643
+
644
+ webhooks_server.launch()
645
+
646
+ scheduler = BackgroundScheduler()
647
+ scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
648
+ scheduler.start()
app.py CHANGED
@@ -1,648 +1,178 @@
 
 
 
1
  import os
2
- import logging
3
  import time
4
- import datetime
5
- import gradio as gr
6
- from threading import Thread
7
- import datasets
8
- from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
9
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
10
  from apscheduler.schedulers.background import BackgroundScheduler
11
 
12
- # Start ephemeral Spaces on PRs (see config in README.md)
13
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
14
-
15
- from src.display.about import (
16
- CITATION_BUTTON_LABEL,
17
- CITATION_BUTTON_TEXT,
18
- # INTRODUCTION_TEXT,
19
- TITLE,
20
- ABOUT_TEXT,
21
- SUBMISSION_TEXT_3,
22
- )
23
- from src.display.css_html_js import custom_css
24
- from src.display.utils import (
25
- COLS,
26
- EVAL_COLS,
27
- EVAL_TYPES,
28
- AutoEvalColumn,
29
- fields,
30
- EvalQueueColumn
31
- )
32
- from src.envs import (
33
- API,
34
- EVAL_REQUESTS_PATH,
35
- RESULT_REPO,
36
- DATA_VERSION,
37
- DATA_REPO,
38
- HARD_RESULT_REPO,
39
- ELO_REPO,
40
- HARD_ELO_REPO,
41
- SOLVE_REPO,
42
- HARD_SOLVE_REPO,
43
- HF_TOKEN,
44
- QUEUE_REPO,
45
- REPO_ID,
46
- VOTES_REPO,
47
- VOTES_PATH,
48
- HF_HOME,
49
- )
50
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
51
- from src.execute import generate_command, default_command, is_running, lock, stream_logs, find_result_file
52
- from src.tools.plots import plot_elo_mle, plot_solve_rate
53
- # from src.voting.vote_system import VoteManager, run_scheduler
54
-
55
- # Configure logging
56
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
57
-
58
- # Start ephemeral Spaces on PRs (see config in README.md)
59
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
60
-
61
- # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
62
- # This controls whether a full initialization should be performed.
63
- DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
64
- NEW_DATA_ON_LEADERBOARD = True
65
- LEADERBOARD_DF = None
66
- HARD_LEADERBOARD_DF = None
67
- ELO_TASK_DF = None
68
- ELO_BENCH_DF = None
69
- HARD_ELO_TASK_DF = None
70
- HARD_ELO_BENCH_DF = None
71
- COMPLETE_SOLVE_DF = None
72
- INSTRUCT_SOLVE_DF = None
73
- HARD_COMPLETE_SOLVE_DF = None
74
- HARD_INSTRUCT_SOLVE_DF = None
75
-
76
- DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
77
- verification_mode="no_checks")
78
-
79
-
80
- def filter_data(data, keyword):
81
- if not keyword:
82
- return data
83
- filtered_data = [item for item in data if keyword.lower() in item['complete_prompt'].lower()]
84
- return filtered_data
85
 
86
-
87
- def update_display(search_keyword, index, show_test):
88
- filtered_data = filter_data(DATA, search_keyword)
 
 
 
89
 
90
- if not filtered_data:
91
- return ["No data available. Check the search criteria."] + [""] * 4 + [0, gr.update(maximum=0, value=0)]
 
 
 
92
 
93
- max_index = len(filtered_data) - 1
94
- index = min(max(0, index), max_index)
95
 
96
- task_id = filtered_data[index]['task_id']
97
- snippet1 = filtered_data[index]['complete_prompt']
98
- snippet2 = filtered_data[index]['instruct_prompt']
99
- # snippet3 = filtered_data[index]['canonical_solution'] if show_solution else ""
100
- snippet4 = filtered_data[index]['test'] if show_test else ""
101
 
102
- return [
103
- task_id,
104
- snippet1,
105
- snippet2,
106
- # snippet3,
107
- snippet4,
108
- len(filtered_data),
109
- gr.update(maximum=max_index, value=index)
110
- ]
111
-
112
- def restart_space():
113
- API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
114
-
115
-
116
- def time_diff_wrapper(func):
117
- def wrapper(*args, **kwargs):
118
- start_time = time.time()
119
- result = func(*args, **kwargs)
120
- end_time = time.time()
121
- diff = end_time - start_time
122
- logging.info(f"Time taken for {func.__name__}: {diff} seconds")
123
- return result
124
-
125
- return wrapper
126
 
127
 
128
- @time_diff_wrapper
129
- def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
130
- """Download dataset with exponential backoff retries."""
131
- attempt = 0
132
- while attempt < max_attempts:
 
133
  try:
134
- logging.info(f"Downloading {repo_id} to {local_dir}")
135
- snapshot_download(
136
- repo_id=repo_id,
137
- local_dir=local_dir,
138
- repo_type=repo_type,
139
- tqdm_class=None,
140
- etag_timeout=30,
141
- max_workers=8,
142
- )
143
- logging.info("Download successful")
144
- return
145
  except Exception as e:
146
- wait_time = backoff_factor**attempt
147
- logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
148
- time.sleep(wait_time)
149
- attempt += 1
150
- raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
151
-
152
- def get_latest_data_leaderboard(
153
- leaderboard_initial_df = None,
154
- hard_leaderboard_initial_df = None,
155
- elo_task_df = None,
156
- elo_bench_df = None,
157
- hard_elo_task_df = None,
158
- hard_elo_bench_df = None,
159
- complete_solve_df = None,
160
- instruct_solve_df = None,
161
- hard_complete_solve_df = None,
162
- hard_instruct_solve_df = None
163
- ):
164
- global NEW_DATA_ON_LEADERBOARD
165
- global LEADERBOARD_DF
166
- global HARD_LEADERBOARD_DF
167
- global ELO_TASK_DF
168
- global ELO_BENCH_DF
169
- global HARD_ELO_TASK_DF
170
- global HARD_ELO_BENCH_DF
171
- global COMPLETE_SOLVE_DF
172
- global INSTRUCT_SOLVE_DF
173
- global HARD_COMPLETE_SOLVE_DF
174
- global HARD_INSTRUCT_SOLVE_DF
175
 
176
- if NEW_DATA_ON_LEADERBOARD:
177
- print("Leaderboard updated at reload!")
178
- leaderboard_dataset = datasets.load_dataset(
179
- RESULT_REPO,
180
- "default",
181
- split="train",
182
- cache_dir=HF_HOME,
183
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
184
- verification_mode="no_checks"
185
- )
186
- LEADERBOARD_DF = get_leaderboard_df(
187
- leaderboard_dataset=leaderboard_dataset,
188
- cols=COLS,
189
- )
190
- hard_leaderboard_dataset = datasets.load_dataset(
191
- HARD_RESULT_REPO,
192
- "default",
193
- split="train",
194
- cache_dir=HF_HOME,
195
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
196
- verification_mode="no_checks"
197
- )
198
- hard_leaderboard_df = get_leaderboard_df(
199
- leaderboard_dataset=hard_leaderboard_dataset,
200
- cols=COLS,
201
- )
202
- HARD_LEADERBOARD_DF = hard_leaderboard_df
203
 
204
- elo_task_df = datasets.load_dataset(
205
- ELO_REPO,
206
- "default",
207
- split="task_no_tie",
208
- cache_dir=HF_HOME,
209
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
210
- verification_mode="no_checks"
211
- ).to_pandas()
212
- elo_bench_df = datasets.load_dataset(
213
- ELO_REPO,
214
- "default",
215
- split="benchmark_tie",
216
- cache_dir=HF_HOME,
217
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
218
- verification_mode="no_checks"
219
- ).to_pandas()
220
- ELO_TASK_DF = elo_task_df
221
- ELO_BENCH_DF = elo_bench_df
222
 
223
- hard_elo_task_df = datasets.load_dataset(
224
- HARD_ELO_REPO,
225
- "default",
226
- split="task_no_tie",
227
- cache_dir=HF_HOME,
228
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
229
- verification_mode="no_checks"
230
- ).to_pandas()
231
- hard_elo_bench_df = datasets.load_dataset(
232
- HARD_ELO_REPO,
233
- "default",
234
- split="benchmark_tie",
235
- cache_dir=HF_HOME,
236
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
237
- verification_mode="no_checks"
238
- ).to_pandas()
239
- HARD_ELO_TASK_DF = hard_elo_task_df
240
- HARD_ELO_BENCH_DF = hard_elo_bench_df
241
 
242
- complete_solve_df = datasets.load_dataset(
243
- SOLVE_REPO,
244
- "default",
245
- split="complete",
246
- cache_dir=HF_HOME,
247
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
248
- verification_mode="no_checks"
249
- ).to_pandas()
250
- instruct_solve_df = datasets.load_dataset(
251
- SOLVE_REPO,
252
- "default",
253
- split="instruct",
254
- cache_dir=HF_HOME,
255
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
256
- verification_mode="no_checks"
257
- ).to_pandas()
258
- COMPLETE_SOLVE_DF = complete_solve_df
259
- INSTRUCT_SOLVE_DF = instruct_solve_df
260
 
261
- hard_complete_solve_df = datasets.load_dataset(
262
- HARD_SOLVE_REPO,
263
- "default",
264
- split="complete",
265
- cache_dir=HF_HOME,
266
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
267
- verification_mode="no_checks"
268
- ).to_pandas()
269
- hard_instruct_solve_df = datasets.load_dataset(
270
- HARD_SOLVE_REPO,
271
- "default",
272
- split="instruct",
273
- cache_dir=HF_HOME,
274
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
275
- verification_mode="no_checks"
276
- ).to_pandas()
277
- HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
278
- HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
279
 
280
- NEW_DATA_ON_LEADERBOARD = False
281
-
282
- else:
283
- LEADERBOARD_DF = leaderboard_initial_df
284
- # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
285
- ELO_TASK_DF = elo_task_df
286
- # ELO_BENCH_DF = elo_bench_df
287
- # HARD_ELO_TASK_DF = hard_elo_task_df
288
- HARD_ELO_BENCH_DF = hard_elo_bench_df
289
- COMPLETE_SOLVE_DF = complete_solve_df
290
- # INSTRUCT_SOLVE_DF = instruct_solve_df
291
- # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
292
- HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
293
 
294
- return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
295
- # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
296
-
297
-
298
- def init_space():
299
- """Initializes the application space, loading only necessary data."""
300
-
301
- # Always redownload the leaderboard DataFrame
302
- global LEADERBOARD_DF
303
- global HARD_LEADERBOARD_DF
304
- global ELO_TASK_DF
305
- global ELO_BENCH_DF
306
- global HARD_ELO_TASK_DF
307
- global HARD_ELO_BENCH_DF
308
- global COMPLETE_SOLVE_DF
309
- global INSTRUCT_SOLVE_DF
310
- global HARD_COMPLETE_SOLVE_DF
311
- global HARD_INSTRUCT_SOLVE_DF
312
 
313
- LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
314
- # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
315
-
316
- return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
317
- # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
318
-
319
- # Initialize VoteManager
320
- # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
321
-
322
-
323
- # Schedule the upload_votes method to run every 15 minutes
324
- # schedule.every(15).minutes.do(vote_manager.upload_votes)
325
-
326
- # Start the scheduler in a separate thread
327
- # scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
328
- # scheduler_thread.start()
329
 
330
- # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
331
- # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
332
- LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
333
- ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
334
- COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
335
- HARD_INSTRUCT_SOLVE_DF = init_space()
336
- # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
337
-
338
- # Data processing for plots now only on demand in the respective Gradio tab
339
- # def load_and_create_plots():
340
- # plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
341
- # return plot_df
342
-
343
- # Function to check if a user is logged in
344
- def check_login(profile: gr.OAuthProfile | None) -> bool:
345
- if profile is None:
346
- return False
347
- return True
348
-
349
- def init_leaderboard(dataframe):
350
- if dataframe is None or dataframe.empty:
351
- raise ValueError("Leaderboard DataFrame is empty or None.")
352
- return Leaderboard(
353
- value=dataframe,
354
- datatype=[c.type for c in fields(AutoEvalColumn)],
355
- select_columns=SelectColumns(
356
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
357
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
358
- label="Select Columns to Display:",
359
- ),
360
- search_columns=[AutoEvalColumn.model.name],
361
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
362
- filter_columns=[
363
- ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
364
- ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
365
- ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
366
- ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
367
- ],
368
- bool_checkboxgroup_label="Hide models",
369
- interactive=False,
370
- )
371
-
372
-
373
- def init_others(dataframe):
374
- if dataframe is None or dataframe.empty:
375
- raise ValueError("Gradio DataFrame is empty or None.")
376
- return gr.Dataframe(dataframe, visible=False)
377
-
378
- main_block = gr.Blocks(css=custom_css)
379
- with main_block as demo:
380
- with gr.Row(elem_id="header-row"):
381
- gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
382
 
383
- # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
384
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
385
- with gr.Tab("💎 Hard Set") as hard_tabs:
386
- with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
387
- hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
388
- gr.Markdown(
389
- """
390
- **Notes:**
391
- - For the efficiency reasons, we only display the Hard Set leaderboard.
392
- - _Hard Set_ vs _Full Set_:
393
- - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
394
- - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
395
- - _Complete_ vs _Instruct_:
396
- - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
397
- - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
398
- - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
399
- - `Average` is the average of `Complete` and `Instruct` when both are available.
400
- - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
401
- - `#Act Params (B)` is the number of activated model parameters during inference.
402
- - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
403
- - For more details check the 📝 About section.
404
- """,
405
- elem_classes="markdown-text",
406
- )
407
-
408
- with gr.TabItem("📊 Elo Rating", id="hard_elo"):
409
- with gr.Column():
410
- with gr.Group():
411
- gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
412
- hard_task_elo_map = gr.Plot()
413
- hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
414
- demo.load(plot_elo_mle, [hard_elo_task_gr],
415
- hard_task_elo_map)
416
- with gr.Group():
417
- gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
418
- hard_bench_elo_map = gr.Plot()
419
- hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
420
- demo.load(plot_elo_mle, [hard_elo_bench_gr],
421
- hard_bench_elo_map)
422
-
423
- with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
424
- with gr.Column():
425
- hard_complete_map = gr.Plot()
426
- hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
427
- demo.load(plot_solve_rate, [hard_complete_solve_gr,
428
- gr.Textbox("Complete", visible=False),
429
- gr.Number(10, visible=False),
430
- gr.Number(16, visible=False),
431
- ], hard_complete_map)
432
- hard_instruct_map = gr.Plot()
433
- hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
434
- demo.load(plot_solve_rate, [hard_instruct_solve_gr,
435
- gr.Textbox("Instruct", visible=False),
436
- gr.Number(10, visible=False),
437
- gr.Number(16, visible=False),
438
- ], hard_instruct_map)
439
- with gr.Tab("🎯 Full Set") as full_tabs:
440
- with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
441
- leaderboard = init_leaderboard(LEADERBOARD_DF)
442
- gr.Markdown(
443
- """
444
- **Notes:**
445
- - _Complete_ vs _Instruct_:
446
- - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
447
- - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
448
- - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
449
- - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
450
- - `size` is the amount of activated model weight during inference.
451
- - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
452
- - For more details check the 📝 About section.
453
- """,
454
- elem_classes="markdown-text",
455
- )
456
-
457
- with gr.TabItem("📊 Elo Rating", id="full_elo"):
458
- with gr.Column():
459
- with gr.Group():
460
-
461
- gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
462
- task_elo_map = gr.Plot()
463
- elo_task_gr = init_others(ELO_TASK_DF)
464
- demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
465
- with gr.Group():
466
- gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
467
- bench_elo_map = gr.Plot()
468
- elo_bench_gr = init_others(ELO_BENCH_DF)
469
- demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
470
-
471
- with gr.TabItem("🧩 Solve Rate", id="full_solve"):
472
- with gr.Column():
473
- complete_map = gr.Plot()
474
- complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
475
- demo.load(plot_solve_rate, [complete_solve_gr,
476
- gr.Textbox("Complete", visible=False),
477
- ], complete_map)
478
- instruct_map = gr.Plot()
479
- instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
480
- demo.load(plot_solve_rate, [instruct_solve_gr,
481
- gr.Textbox("Instruct", visible=False),
482
- ], instruct_map)
483
- with gr.TabItem("📝 About", id=3):
484
- gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
485
- with gr.TabItem("🔎 Data Viewer", id="viewer"):
486
- search_input = gr.Textbox(label="Search by keyword")
487
- count_output = gr.Number(label="Number of filtered items")
488
- index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
489
- # show_solution = gr.Checkbox(label="Show Solution")
490
- show_test = gr.Checkbox(label="Show Test Cases")
491
- update_button = gr.Button("Update")
492
-
493
- task_id_output = gr.Textbox(label="Task ID")
494
- code_completion = gr.Code(language="python", label="Code Completion")
495
- nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")
496
- # solution = gr.Code(language="python", label="Solution")
497
- test_cases = gr.Code(language="python", label="Test Cases")
498
-
499
- update_button.click(
500
- update_display,
501
- inputs=[search_input, index_slider, show_test],
502
- outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
503
- )
504
-
505
- # Initial load
506
- demo.load(
507
- update_display,
508
- inputs=[search_input, index_slider, show_test],
509
- outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
510
- )
511
-
512
- with gr.TabItem("🚀 Request", id=4):
513
- gr.Markdown(SUBMISSION_TEXT_3)
514
-
515
- with gr.TabItem("🛠️ Execute", id=5):
516
- gr.Markdown("# BigCodeBench Evaluator")
517
-
518
- with gr.Row():
519
- jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
520
- split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
521
- subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
522
-
523
- with gr.Row():
524
- parallel = gr.Number(label="Parallel (optional)", precision=0)
525
- min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
526
- max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
527
-
528
- with gr.Row():
529
- max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
530
- max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
531
- check_gt_only = gr.Checkbox(label="Check GT Only")
532
- no_gt = gr.Checkbox(label="No GT")
533
-
534
- command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
535
- with gr.Row():
536
- submit_btn = gr.Button("Run Evaluation")
537
- download_btn = gr.DownloadButton(label="Download Result")
538
- log_output = gr.Textbox(label="Execution Logs", lines=20)
539
-
540
- input_components = [
541
- jsonl_file, split, subset, parallel,
542
- min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
543
- check_gt_only, no_gt
544
- ]
545
-
546
- for component in input_components:
547
- component.change(generate_command, inputs=input_components, outputs=command_output)
548
-
549
-
550
- def start_evaluation(command, jsonl_file, subset, split):
551
- extra = subset + "_" if subset != "full" else ""
552
- if jsonl_file is not None:
553
- result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
554
- else:
555
- result_path = None
556
-
557
- for log in stream_logs(command, jsonl_file):
558
- if jsonl_file is not None:
559
- yield log, gr.update(value=result_path, label=result_path), gr.update()
560
- else:
561
- yield log, gr.update(), gr.update()
562
- is_running = False
563
- result_file = find_result_file()
564
- if result_file:
565
- return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
566
- # gr.Button(visible=False)#,
567
- # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
568
- else:
569
- return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
570
- # gr.Button("Run Evaluation", visible=True),
571
- # gr.DownloadButton(visible=False))
572
- submit_btn.click(start_evaluation,
573
- inputs=[command_output, jsonl_file, subset, split],
574
- outputs=[log_output, download_btn])
575
 
576
  with gr.Row():
577
- with gr.Accordion("📙 Citation", open=False):
578
- citation_button = gr.Textbox(
579
- value=CITATION_BUTTON_TEXT,
580
- label=CITATION_BUTTON_LABEL,
581
- lines=20,
582
- elem_id="citation-button",
583
- show_copy_button=True,
584
- )
585
-
586
- main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
587
- # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
588
- # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
589
- # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
590
-
591
- main_block.queue(default_concurrency_limit=100)
592
-
593
-
594
- def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
595
- # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
596
- # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
597
- # ht to Lucain!
598
- if SPACE_ID is None:
599
- print("Not in a Space: Space CI disabled.")
600
- return WebhooksServer(ui=main_block)
601
-
602
- if IS_EPHEMERAL_SPACE:
603
- print("In an ephemeral Space: Space CI disabled.")
604
- return WebhooksServer(ui=main_block)
605
-
606
- card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
607
- config = card.data.get("space_ci", {})
608
- print(f"Enabling Space CI with config from README: {config}")
609
-
610
- return configure_space_ci(
611
- blocks=ui,
612
- trusted_authors=config.get("trusted_authors"),
613
- private=config.get("private", "auto"),
614
- variables=config.get("variables", "auto"),
615
- secrets=config.get("secrets"),
616
- hardware=config.get("hardware"),
617
- storage=config.get("storage"),
618
- )
619
-
620
- # Create webhooks server (with CI url if in Space and not ephemeral)
621
- webhooks_server = enable_space_ci_and_return_server(ui=main_block)
622
-
623
- # Add webhooks
624
- @webhooks_server.add_webhook
625
- def update_leaderboard(payload: WebhookPayload) -> None:
626
- """Redownloads the leaderboard dataset each time it updates"""
627
- if payload.repo.type == "dataset" and payload.event.action == "update":
628
- global NEW_DATA_ON_LEADERBOARD
629
- if NEW_DATA_ON_LEADERBOARD:
630
- return
631
- NEW_DATA_ON_LEADERBOARD = True
632
-
633
- for repo in [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]:
634
- datasets.load_dataset(
635
- repo,
636
- "default",
637
- cache_dir=HF_HOME,
638
- download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
639
- verification_mode="no_checks"
640
- )
641
-
642
 
643
-
644
- webhooks_server.launch()
645
-
646
- scheduler = BackgroundScheduler()
647
- scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
648
- scheduler.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import sys
4
  import os
5
+ import threading
6
  import time
7
+ import uuid
8
+ import glob
9
+ import shutil
10
+ from pathlib import Path
 
 
11
  from apscheduler.schedulers.background import BackgroundScheduler
12
 
13
+ default_command = "bigcodebench.evaluate"
14
+ is_running = False
15
+ lock = threading.Lock()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ def generate_command(
18
+ jsonl_file, split, subset, parallel,
19
+ min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
20
+ check_gt_only, no_gt
21
+ ):
22
+ command = [default_command]
23
 
24
+ if jsonl_file is not None:
25
+ # Copy the uploaded file to the current directory
26
+ local_filename = os.path.basename(jsonl_file.name)
27
+ shutil.copy(jsonl_file.name, local_filename)
28
+ command.extend(["--samples", local_filename])
29
 
30
+ command.extend(["--split", split, "--subset", subset])
 
31
 
32
+ if parallel is not None and parallel != 0:
33
+ command.extend(["--parallel", str(int(parallel))])
 
 
 
34
 
35
+ command.extend([
36
+ "--min-time-limit", str(min_time_limit),
37
+ "--max-as-limit", str(int(max_as_limit)),
38
+ "--max-data-limit", str(int(max_data_limit)),
39
+ "--max-stack-limit", str(int(max_stack_limit))
40
+ ])
41
+
42
+ if check_gt_only:
43
+ command.append("--check-gt-only")
44
+
45
+ if no_gt:
46
+ command.append("--no-gt")
47
+
48
+ return " ".join(command)
 
 
 
 
 
 
 
 
 
 
49
 
50
 
51
+ def cleanup_previous_files(jsonl_file):
52
+ if jsonl_file is not None:
53
+ file_list = ['Dockerfile', 'app.py', 'README.md', os.path.basename(jsonl_file.name), "__pycache__"]
54
+ else:
55
+ file_list = ['Dockerfile', 'app.py', 'README.md', "__pycache__"]
56
+ for file in glob.glob("*"):
57
  try:
58
+ if file not in file_list:
59
+ os.remove(file)
 
 
 
 
 
 
 
 
 
60
  except Exception as e:
61
+ print(f"Error during cleanup of {file}: {e}")
62
+
63
+ def find_result_file():
64
+ json_files = glob.glob("*.json")
65
+ if json_files:
66
+ return max(json_files, key=os.path.getmtime)
67
+ return None
68
+
69
+ def run_bigcodebench(command):
70
+ global is_running
71
+ with lock:
72
+ if is_running:
73
+ yield "A command is already running. Please wait for it to finish.\n"
74
+ return
75
+ is_running = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
+ try:
78
+ yield f"Executing command: {command}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
+ process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
+ for line in process.stdout:
83
+ yield line
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ # process.wait()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
+ if process.returncode != 0:
88
+ yield f"Error: Command exited with status {process.returncode}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ yield "Evaluation completed.\n"
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ result_file = find_result_file()
93
+ if result_file:
94
+ yield f"Result file found: {result_file}\n"
95
+ else:
96
+ yield "No result file found.\n"
97
+ finally:
98
+ with lock:
99
+ is_running = False
100
+
101
+ def stream_logs(command, jsonl_file=None):
102
+ global is_running
103
+
104
+ if is_running:
105
+ yield "A command is already running. Please wait for it to finish.\n"
106
+ return
 
 
 
107
 
108
+ cleanup_previous_files(jsonl_file)
109
+ yield "Cleaned up previous files.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
+ log_content = []
112
+ for log_line in run_bigcodebench(command):
113
+ log_content.append(log_line)
114
+ yield "".join(log_content)
115
+
116
+ with gr.Blocks() as demo:
117
+ gr.Markdown("# BigCodeBench Evaluator")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ with gr.Row():
120
+ jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
121
+ split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
122
+ subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  with gr.Row():
125
+ parallel = gr.Number(label="Parallel (optional)", precision=0)
126
+ min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
127
+ max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
128
+
129
+ with gr.Row():
130
+ max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
131
+ max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
132
+ check_gt_only = gr.Checkbox(label="Check GT Only")
133
+ no_gt = gr.Checkbox(label="No GT")
134
+
135
+ command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
136
+ with gr.Row():
137
+ submit_btn = gr.Button("Run Evaluation")
138
+ download_btn = gr.DownloadButton(label="Download Result")
139
+ log_output = gr.Textbox(label="Execution Logs", lines=20)
140
+
141
+ input_components = [
142
+ jsonl_file, split, subset, parallel,
143
+ min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
144
+ check_gt_only, no_gt
145
+ ]
146
+
147
+ for component in input_components:
148
+ component.change(generate_command, inputs=input_components, outputs=command_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
+
151
+ def start_evaluation(command, jsonl_file, subset, split):
152
+ extra = subset + "_" if subset != "full" else ""
153
+ if jsonl_file is not None:
154
+ result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
155
+ else:
156
+ result_path = None
157
+
158
+ for log in stream_logs(command, jsonl_file):
159
+ if jsonl_file is not None:
160
+ yield log, gr.update(value=result_path, label=result_path), gr.update()
161
+ else:
162
+ yield log, gr.update(), gr.update()
163
+ is_running = False
164
+ result_file = find_result_file()
165
+ if result_file:
166
+ return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
167
+ # gr.Button(visible=False)#,
168
+ # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
169
+ else:
170
+ return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
171
+ # gr.Button("Run Evaluation", visible=True),
172
+ # gr.DownloadButton(visible=False))
173
+ submit_btn.click(start_evaluation,
174
+ inputs=[command_output, jsonl_file, subset, split],
175
+ outputs=[log_output, download_btn])
176
+
177
+ demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
178
+ scheduler = BackgroundScheduler()
demo.py DELETED
@@ -1,178 +0,0 @@
1
- import gradio as gr
2
- import subprocess
3
- import sys
4
- import os
5
- import threading
6
- import time
7
- import uuid
8
- import glob
9
- import shutil
10
- from pathlib import Path
11
- from apscheduler.schedulers.background import BackgroundScheduler
12
-
13
# Executable name that every generated command line starts with.
default_command: str = "bigcodebench.evaluate"

# True while an evaluation subprocess is running; updated under `lock`.
is_running: bool = False

# Guards reads and writes of `is_running`.
lock = threading.Lock()
16
-
17
def generate_command(
    jsonl_file, split, subset, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
    """Build the evaluation command line from the current form values.

    Side effect: when a samples file is supplied it is copied into the
    current working directory, and the command refers to that local copy.

    Args:
        jsonl_file: Uploaded samples file (an object exposing ``.name``),
            or ``None``.
        split: Value passed to ``--split``.
        subset: Value passed to ``--subset``.
        parallel: Worker count for ``--parallel``; omitted when ``None``
            or ``0``.
        min_time_limit: Value for ``--min-time-limit``.
        max_as_limit: Value for ``--max-as-limit`` (rendered as an int).
        max_data_limit: Value for ``--max-data-limit`` (rendered as an int).
        max_stack_limit: Value for ``--max-stack-limit`` (rendered as an int).
        check_gt_only: Append ``--check-gt-only`` when truthy.
        no_gt: Append ``--no-gt`` when truthy.

    Returns:
        The command as a single space-separated string. Tokens are not
        quoted, so file names containing whitespace are not supported.
    """
    command = [default_command]

    if jsonl_file is not None:
        # Copy the uploaded file to the current directory
        local_filename = os.path.basename(jsonl_file.name)
        shutil.copy(jsonl_file.name, local_filename)
        command.extend(["--samples", local_filename])

    command.extend(["--split", split, "--subset", subset])

    if parallel is not None and parallel != 0:
        command.extend(["--parallel", str(int(parallel))])

    # A cleared numeric field arrives as None, exactly like `parallel`.
    # Skip such options instead of crashing on int(None); presumably the
    # evaluator then falls back to its own default — confirm.
    limit_options = (
        ("--min-time-limit", min_time_limit, str),
        ("--max-as-limit", max_as_limit, lambda value: str(int(value))),
        ("--max-data-limit", max_data_limit, lambda value: str(int(value))),
        ("--max-stack-limit", max_stack_limit, lambda value: str(int(value))),
    )
    for flag, value, render in limit_options:
        if value is not None:
            command.extend([flag, render(value)])

    if check_gt_only:
        command.append("--check-gt-only")

    if no_gt:
        command.append("--no-gt")

    return " ".join(command)
49
-
50
-
51
def cleanup_previous_files(jsonl_file):
    """Delete leftover files from the current working directory.

    Every entry matched by ``glob.glob("*")`` (so dot-files are never
    touched) is removed unless it is on the keep-list: the application
    files, ``__pycache__`` and, when given, the local copy of the
    uploaded samples file.

    Directories are skipped: ``os.remove`` cannot delete them, so trying
    would only produce a spurious error message on every run.

    Args:
        jsonl_file: Uploaded samples file (an object exposing ``.name``)
            whose basename must be preserved, or ``None``.
    """
    keep = {'Dockerfile', 'app.py', 'README.md', "__pycache__"}
    if jsonl_file is not None:
        keep.add(os.path.basename(jsonl_file.name))

    for entry in glob.glob("*"):
        if entry in keep:
            continue
        # Real directories cannot be unlinked; symlinks (even to
        # directories) can, so they are still cleaned up.
        if os.path.isdir(entry) and not os.path.islink(entry):
            continue
        try:
            os.remove(entry)
        except OSError as e:
            # Best effort: report the failure and keep cleaning.
            print(f"Error during cleanup of {entry}: {e}")
62
-
63
def find_result_file():
    """Return the most recently modified ``*.json`` file in the current
    working directory, or ``None`` when there is none."""
    return max(glob.glob("*.json"), key=os.path.getmtime, default=None)
68
-
69
def run_bigcodebench(command):
    """Run *command* as a subprocess and yield its output line by line.

    Only one command may run at a time; the module-level ``is_running``
    flag, guarded by ``lock``, enforces this.

    Args:
        command: Command line string; it is split on whitespace to form
            the argument list.

    Yields:
        Status messages and the subprocess's combined stdout/stderr
        lines, each terminated by a newline.
    """
    global is_running
    # Decide under the lock, but yield outside it: a generator suspended
    # at a `yield` inside `with lock` would keep the lock held until the
    # consumer resumed it.
    with lock:
        already_running = is_running
        if not already_running:
            is_running = True

    if already_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return

    try:
        yield f"Executing command: {command}\n"

        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)

        for line in process.stdout:
            yield line

        # Reap the child so `returncode` is populated; without this it is
        # still None here and the error line below fires on every run.
        process.stdout.close()
        process.wait()

        if process.returncode != 0:
            yield f"Error: Command exited with status {process.returncode}\n"

        yield "Evaluation completed.\n"

        result_file = find_result_file()
        if result_file:
            yield f"Result file found: {result_file}\n"
        else:
            yield "No result file found.\n"
    finally:
        with lock:
            is_running = False
100
-
101
def stream_logs(command, jsonl_file=None):
    """Clean the working directory, run *command*, and stream its log.

    Args:
        command: Command line string handed to ``run_bigcodebench``.
        jsonl_file: Uploaded samples file to preserve during cleanup,
            or ``None``.

    Yields:
        First a cleanup notice, then the accumulated log text so far:
        each item is the full transcript, not just the newest line. If a
        command is already running, a single "busy" message is yielded
        instead.
    """
    if is_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return

    cleanup_previous_files(jsonl_file)
    yield "Cleaned up previous files.\n"

    # Grow one transcript string so each yield carries the whole log.
    transcript = ""
    for chunk in run_bigcodebench(command):
        transcript += chunk
        yield transcript
115
-
116
# Gradio UI: form inputs -> read-only command preview -> run button,
# download button and streaming log output.
with gr.Blocks() as demo:
    gr.Markdown("# BigCodeBench Evaluator")

    with gr.Row():
        jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
        split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
        subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")

    with gr.Row():
        parallel = gr.Number(label="Parallel (optional)", precision=0)
        min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
        max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)

    with gr.Row():
        max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
        max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
        check_gt_only = gr.Checkbox(label="Check GT Only")
        no_gt = gr.Checkbox(label="No GT")

    command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
    with gr.Row():
        submit_btn = gr.Button("Run Evaluation")
        download_btn = gr.DownloadButton(label="Download Result")
    log_output = gr.Textbox(label="Execution Logs", lines=20)

    input_components = [
        jsonl_file, split, subset, parallel,
        min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
        check_gt_only, no_gt
    ]

    # Rebuild the command preview whenever any input changes.
    for component in input_components:
        component.change(generate_command, inputs=input_components, outputs=command_output)

    def start_evaluation(command, jsonl_file, subset, split):
        """Run the evaluation command and stream its progress to the UI.

        Generator event handler wired to exactly two outputs
        (``log_output`` and ``download_btn``), so every item it yields is
        a 2-tuple ``(log_update, download_update)``.

        Args:
            command: Command line to execute (the command preview text).
            jsonl_file: Uploaded samples file (an object exposing
                ``.name``), or ``None``.
            subset: Benchmark subset; used to derive the expected result
                file name (``"full"`` adds no subset tag).
            split: Benchmark split. Not used here; kept so the handler's
                input list stays unchanged.

        Yields:
            One pair per log update while the command runs, then a final
            pair reporting whether a result file was found.
        """
        if jsonl_file is not None:
            subset_tag = "" if subset == "full" else f"{subset}_"
            result_path = os.path.basename(jsonl_file.name).replace(
                ".jsonl", f"_{subset_tag}eval_results.json"
            )
        else:
            result_path = None

        for log in stream_logs(command, jsonl_file):
            if result_path is not None:
                # NOTE(review): the result file may not exist yet at this
                # point; the button is only pre-labelled with its name.
                yield log, gr.update(value=result_path, label=result_path)
            else:
                yield log, gr.update()

        # The final status must be yielded: a `return <value>` inside a
        # generator is never delivered to the consumer as an item.
        result_file = find_result_file()
        if result_file:
            yield (
                gr.update(label="Evaluation completed. Result file found."),
                gr.update(value=result_file),
            )
        else:
            yield (
                gr.update(label="Evaluation completed. No result file found."),
                gr.update(value=result_path),
            )

    submit_btn.click(start_evaluation,
                     inputs=[command_output, jsonl_file, subset, split],
                     outputs=[log_output, download_btn])
176
-
177
# Enable the event queue (max_size=300), then serve the app on all
# interfaces at port 7860 with a public share link.
demo.queue(max_size=300).launch(
    share=True,
    server_name="0.0.0.0",
    server_port=7860,
)
# NOTE(review): this scheduler is created but never started or given a job
# in the visible code — confirm whether it is still needed.
scheduler = BackgroundScheduler()