bigcodebench-evaluator-2

Sleeping

App Files Files Community

bigcodebench-evaluator-2 / app.py

terryyz

Update app.py

fc44da7 verified 4 months ago

raw

history blame

30 kB

	import os
	import logging
	import time
	import datetime
	import gradio as gr
	from threading import Thread
	import datasets
	from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
	from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
	from apscheduler.schedulers.background import BackgroundScheduler

	# Start ephemeral Spaces on PRs (see config in README.md)
	from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci

	from src.display.about import (
	CITATION_BUTTON_LABEL,
	CITATION_BUTTON_TEXT,
	# INTRODUCTION_TEXT,
	TITLE,
	ABOUT_TEXT,
	SUBMISSION_TEXT_3,
	)
	from src.display.css_html_js import custom_css
	from src.display.utils import (
	COLS,
	EVAL_COLS,
	EVAL_TYPES,
	AutoEvalColumn,
	fields,
	EvalQueueColumn
	)
	from src.envs import (
	API,
	EVAL_REQUESTS_PATH,
	RESULT_REPO,
	DATA_VERSION,
	DATA_REPO,
	HARD_RESULT_REPO,
	ELO_REPO,
	HARD_ELO_REPO,
	SOLVE_REPO,
	HARD_SOLVE_REPO,
	HF_TOKEN,
	QUEUE_REPO,
	REPO_ID,
	VOTES_REPO,
	VOTES_PATH,
	HF_HOME,
	)
	from src.populate import get_evaluation_queue_df, get_leaderboard_df
	from src.execute import generate_command, default_command, is_running, lock, stream_logs, find_result_file
	from src.tools.plots import plot_elo_mle, plot_solve_rate
	# from src.voting.vote_system import VoteManager, run_scheduler

	# Configure logging
	logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

	# Start ephemeral Spaces on PRs (see config in README.md)
	# NOTE(review): this repeats an import statement that already appears earlier in the file.
	from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci

	# Full-initialization flag. It is currently hard-coded to True; the lookup of the
	# "LEADERBOARD_FULL_INIT" environment variable is left commented out on the same line.
	DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
	# When True, get_latest_data_leaderboard() reloads every table and then clears the flag;
	# the update_leaderboard webhook sets it back to True.
	NEW_DATA_ON_LEADERBOARD = True
	# Module-level tables, populated by get_latest_data_leaderboard() / init_space().
	LEADERBOARD_DF = None
	HARD_LEADERBOARD_DF = None
	ELO_TASK_DF = None
	ELO_BENCH_DF = None
	HARD_ELO_TASK_DF = None
	HARD_ELO_BENCH_DF = None
	COMPLETE_SOLVE_DF = None
	INSTRUCT_SOLVE_DF = None
	HARD_COMPLETE_SOLVE_DF = None
	HARD_INSTRUCT_SOLVE_DF = None

	# Benchmark tasks shown in the "Data Viewer" tab; loaded once at import time.
	DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
	verification_mode="no_checks")


	def filter_data(data, keyword):
	    """Return the items of *data* whose ``complete_prompt`` contains *keyword*.

	    The match is a case-insensitive substring test. A falsy keyword (empty
	    string or ``None``) disables filtering.

	    Args:
	        data: Iterable of mappings that each provide a ``'complete_prompt'`` string.
	        keyword: Text to search for, or a falsy value for "no filter".

	    Returns:
	        *data* itself when no keyword is given, otherwise a new list holding
	        the matching items in their original order.
	    """
	    if not keyword:
	        return data
	    # Lower-case the keyword once instead of once per item.
	    needle = keyword.lower()
	    return [item for item in data if needle in item['complete_prompt'].lower()]


	def update_display(search_keyword, index, show_test):
	    """Compute the Data Viewer outputs for the selected task.

	    Args:
	        search_keyword: Keyword used to filter ``DATA`` by ``complete_prompt``.
	        index: Position chosen on the index slider; clamped to the valid range.
	        show_test: When truthy, include the task's test code in the output.

	    Returns:
	        A list of six values, one per output component: task id, completion
	        prompt, instruction prompt, test code (or ""), number of filtered
	        items, and a ``gr.update`` that re-ranges the index slider.
	    """
	    filtered_data = filter_data(DATA, search_keyword)

	    if not filtered_data:
	        # Must yield exactly six values like the normal path: one message plus
	        # three empty text fields, then the count and the slider update.
	        return ["No data available. Check the search criteria."] + [""] * 3 + [0, gr.update(maximum=0, value=0)]

	    max_index = len(filtered_data) - 1
	    # The slider may deliver None or a float; normalise before clamping/indexing.
	    index = min(max(0, int(index or 0)), max_index)

	    row = filtered_data[index]
	    task_id = row['task_id']
	    snippet1 = row['complete_prompt']
	    snippet2 = row['instruct_prompt']
	    # snippet3 = row['canonical_solution'] if show_solution else ""
	    snippet4 = row['test'] if show_test else ""

	    return [
	        task_id,
	        snippet1,
	        snippet2,
	        # snippet3,
	        snippet4,
	        len(filtered_data),
	        gr.update(maximum=max_index, value=index),
	    ]

	def restart_space():
	    """Call ``API.restart_space`` for ``REPO_ID``, passing ``HF_TOKEN`` as the token."""
	    restart_kwargs = {"repo_id": REPO_ID, "token": HF_TOKEN}
	    API.restart_space(**restart_kwargs)


	def time_diff_wrapper(func):
	    """Decorator that logs the wall-clock duration of each successful call to *func*.

	    Args:
	        func: The callable to wrap.

	    Returns:
	        A wrapper that forwards all positional and keyword arguments to
	        *func*, logs the elapsed time at INFO level and returns *func*'s
	        result. Exceptions raised by *func* propagate unchanged (nothing is
	        logged in that case).
	    """
	    import functools  # local import so the file's import block need not change

	    @functools.wraps(func)  # keep the wrapped function's name and docstring
	    def wrapper(*args, **kwargs):
	        start_time = time.time()
	        result = func(*args, **kwargs)
	        diff = time.time() - start_time
	        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
	        return result

	    return wrapper


	@time_diff_wrapper
	def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
	    """Download a repository snapshot, retrying with exponential backoff.

	    Args:
	        repo_id: Identifier of the repository to download.
	        local_dir: Directory the snapshot is written to.
	        repo_type: Repository type passed to ``snapshot_download``.
	        max_attempts: Maximum number of download attempts.
	        backoff_factor: Base of the wait time; the wait before retry *k*
	            (0-based) is ``backoff_factor ** k`` seconds.

	    Raises:
	        Exception: When every attempt failed. The last underlying error is
	            attached as the cause.
	    """
	    last_error = None
	    for attempt in range(max_attempts):
	        try:
	            logging.info(f"Downloading {repo_id} to {local_dir}")
	            snapshot_download(
	                repo_id=repo_id,
	                local_dir=local_dir,
	                repo_type=repo_type,
	                tqdm_class=None,
	                etag_timeout=30,
	                max_workers=8,
	            )
	        except Exception as e:
	            last_error = e
	            if attempt + 1 >= max_attempts:
	                # No retry will follow, so do not sleep or announce one.
	                logging.error(f"Error downloading {repo_id}: {e}, no attempts left")
	                break
	            wait_time = backoff_factor**attempt
	            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
	            time.sleep(wait_time)
	        else:
	            logging.info("Download successful")
	            return
	    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts") from last_error

	def _load_cached_split(repo, split):
	    """Load *split* of *repo*'s "default" config, reusing the copy cached under ``HF_HOME`` if present."""
	    return datasets.load_dataset(
	        repo,
	        "default",
	        split=split,
	        cache_dir=HF_HOME,
	        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
	        verification_mode="no_checks",
	    )


	def get_latest_data_leaderboard(
	    leaderboard_initial_df=None,
	    hard_leaderboard_initial_df=None,
	    elo_task_df=None,
	    elo_bench_df=None,
	    hard_elo_task_df=None,
	    hard_elo_bench_df=None,
	    complete_solve_df=None,
	    instruct_solve_df=None,
	    hard_complete_solve_df=None,
	    hard_instruct_solve_df=None,
	):
	    """Refresh the module-level leaderboard tables and return them.

	    When ``NEW_DATA_ON_LEADERBOARD`` is set, every table is reloaded through
	    ``datasets.load_dataset`` (leaderboard tables additionally go through
	    ``get_leaderboard_df``; the others are converted with ``to_pandas()``),
	    stored in the corresponding global, and the flag is cleared. Otherwise a
	    subset of the globals is overwritten with the values passed in.

	    Args:
	        leaderboard_initial_df, hard_leaderboard_initial_df, elo_task_df,
	        elo_bench_df, hard_elo_task_df, hard_elo_bench_df, complete_solve_df,
	        instruct_solve_df, hard_complete_solve_df, hard_instruct_solve_df:
	            Current values of the tables; only consulted when no reload is
	            pending.

	    Returns:
	        A 10-tuple ``(LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF,
	        ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF,
	        INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)``.
	    """
	    global NEW_DATA_ON_LEADERBOARD
	    global LEADERBOARD_DF, HARD_LEADERBOARD_DF
	    global ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF
	    global COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF

	    if NEW_DATA_ON_LEADERBOARD:
	        print("Leaderboard updated at reload!")
	        # Tables are loaded and published one at a time, in this order.
	        LEADERBOARD_DF = get_leaderboard_df(
	            leaderboard_dataset=_load_cached_split(RESULT_REPO, "train"),
	            cols=COLS,
	        )
	        HARD_LEADERBOARD_DF = get_leaderboard_df(
	            leaderboard_dataset=_load_cached_split(HARD_RESULT_REPO, "train"),
	            cols=COLS,
	        )

	        ELO_TASK_DF = _load_cached_split(ELO_REPO, "task_no_tie").to_pandas()
	        ELO_BENCH_DF = _load_cached_split(ELO_REPO, "benchmark_tie").to_pandas()

	        HARD_ELO_TASK_DF = _load_cached_split(HARD_ELO_REPO, "task_no_tie").to_pandas()
	        HARD_ELO_BENCH_DF = _load_cached_split(HARD_ELO_REPO, "benchmark_tie").to_pandas()

	        COMPLETE_SOLVE_DF = _load_cached_split(SOLVE_REPO, "complete").to_pandas()
	        INSTRUCT_SOLVE_DF = _load_cached_split(SOLVE_REPO, "instruct").to_pandas()

	        HARD_COMPLETE_SOLVE_DF = _load_cached_split(HARD_SOLVE_REPO, "complete").to_pandas()
	        HARD_INSTRUCT_SOLVE_DF = _load_cached_split(HARD_SOLVE_REPO, "instruct").to_pandas()

	        # Only cleared once every table above loaded without raising.
	        NEW_DATA_ON_LEADERBOARD = False

	    else:
	        # NOTE(review): only half of the globals are overwritten here; the
	        # remaining assignments are commented out. Kept as found - confirm
	        # whether the asymmetry is intentional.
	        LEADERBOARD_DF = leaderboard_initial_df
	        # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
	        ELO_TASK_DF = elo_task_df
	        # ELO_BENCH_DF = elo_bench_df
	        # HARD_ELO_TASK_DF = hard_elo_task_df
	        HARD_ELO_BENCH_DF = hard_elo_bench_df
	        COMPLETE_SOLVE_DF = complete_solve_df
	        # INSTRUCT_SOLVE_DF = instruct_solve_df
	        # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
	        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df

	    return (
	        LEADERBOARD_DF,
	        HARD_LEADERBOARD_DF,
	        ELO_TASK_DF,
	        ELO_BENCH_DF,
	        HARD_ELO_TASK_DF,
	        HARD_ELO_BENCH_DF,
	        COMPLETE_SOLVE_DF,
	        INSTRUCT_SOLVE_DF,
	        HARD_COMPLETE_SOLVE_DF,
	        HARD_INSTRUCT_SOLVE_DF,
	    )
	    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)


	def init_space():
	    """Populate the module-level tables and return them.

	    Calls ``get_latest_data_leaderboard()`` without arguments, stores the ten
	    returned tables in the matching ``*_DF`` globals and returns them as a
	    tuple in the same order.
	    """
	    global LEADERBOARD_DF, HARD_LEADERBOARD_DF
	    global ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF
	    global COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF

	    tables = get_latest_data_leaderboard()
	    (
	        LEADERBOARD_DF,
	        HARD_LEADERBOARD_DF,
	        ELO_TASK_DF,
	        ELO_BENCH_DF,
	        HARD_ELO_TASK_DF,
	        HARD_ELO_BENCH_DF,
	        COMPLETE_SOLVE_DF,
	        INSTRUCT_SOLVE_DF,
	        HARD_COMPLETE_SOLVE_DF,
	        HARD_INSTRUCT_SOLVE_DF,
	    ) = tables
	    # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()

	    return (
	        LEADERBOARD_DF,
	        HARD_LEADERBOARD_DF,
	        ELO_TASK_DF,
	        ELO_BENCH_DF,
	        HARD_ELO_TASK_DF,
	        HARD_ELO_BENCH_DF,
	        COMPLETE_SOLVE_DF,
	        INSTRUCT_SOLVE_DF,
	        HARD_COMPLETE_SOLVE_DF,
	        HARD_INSTRUCT_SOLVE_DF,
	    )
	    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)

	# Initialize VoteManager
	# vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)


	# Schedule the upload_votes method to run every 15 minutes
	# schedule.every(15).minutes.do(vote_manager.upload_votes)

	# Start the scheduler in a separate thread
	# scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
	# scheduler_thread.start()

	# Populate the module-level tables once at import time. init_space() takes no
	# arguments and returns the ten tables in the order unpacked below.
	(
	    LEADERBOARD_DF,
	    HARD_LEADERBOARD_DF,
	    ELO_TASK_DF,
	    ELO_BENCH_DF,
	    HARD_ELO_TASK_DF,
	    HARD_ELO_BENCH_DF,
	    COMPLETE_SOLVE_DF,
	    INSTRUCT_SOLVE_DF,
	    HARD_COMPLETE_SOLVE_DF,
	    HARD_INSTRUCT_SOLVE_DF,
	) = init_space()
	# HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()

	# Data processing for plots now only on demand in the respective Gradio tab
	# def load_and_create_plots():
	# plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
	# return plot_df

	# Function to check if a user is logged in
	def check_login(profile: gr.OAuthProfile | None) -> bool:
	    """Return True when an OAuth profile is present, False when it is ``None``."""
	    return profile is not None

	def init_leaderboard(dataframe):
	    """Build the ``Leaderboard`` component for *dataframe*.

	    Column types, default selection, hidden columns and filters are all
	    derived from the fields of ``AutoEvalColumn``.

	    Raises:
	        ValueError: If *dataframe* is ``None`` or empty.
	    """
	    if dataframe is None or dataframe.empty:
	        raise ValueError("Leaderboard DataFrame is empty or None.")

	    def names_where(predicate):
	        # Names of the AutoEvalColumn fields for which *predicate* holds.
	        return [c.name for c in fields(AutoEvalColumn) if predicate(c)]

	    column_types = [c.type for c in fields(AutoEvalColumn)]
	    column_picker = SelectColumns(
	        default_selection=names_where(lambda c: c.displayed_by_default),
	        cant_deselect=names_where(lambda c: c.never_hidden or c.dummy),
	        label="Select Columns to Display:",
	    )
	    hidden = names_where(lambda c: c.hidden)
	    filters = [
	        ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
	        ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
	        ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
	        ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
	    ]

	    return Leaderboard(
	        value=dataframe,
	        datatype=column_types,
	        select_columns=column_picker,
	        search_columns=[AutoEvalColumn.model.name],
	        hide_columns=hidden,
	        filter_columns=filters,
	        bool_checkboxgroup_label="Hide models",
	        interactive=False,
	    )


	def init_others(dataframe):
	    """Wrap *dataframe* in an invisible ``gr.Dataframe`` component.

	    Raises:
	        ValueError: If *dataframe* is ``None`` or empty.
	    """
	    has_rows = dataframe is not None and not dataframe.empty
	    if not has_rows:
	        raise ValueError("Gradio DataFrame is empty or None.")
	    return gr.Dataframe(dataframe, visible=False)

	# Build the Gradio UI: a header row, the "Hard Set" and "Full Set" leaderboard tabs
	# (benchmark table, Elo plots, solve-rate plots), then the About, Data Viewer,
	# Request and Execute tabs.
	main_block = gr.Blocks(css=custom_css)
	with main_block as demo:
	with gr.Row(elem_id="header-row"):
	# The header reports the number of rows in the Hard Set leaderboard table.
	gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")

	# gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	with gr.Tab("💎 Hard Set") as hard_tabs:
	with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
	hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
	gr.Markdown(
	"""
	Notes:
	- For the efficiency reasons, we only display the Hard Set leaderboard.
	- _Hard Set_ vs _Full Set_:
	- <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
	- <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
	- _Complete_ vs _Instruct_:
	- <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
	- <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
	- `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
	- `Average` is the average of `Complete` and `Instruct` when both are available.
	- `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
	- `#Act Params (B)` is the number of activated model parameters during inference.
	- Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
	- For more details check the 📝 About section.
	""",
	elem_classes="markdown-text",
	)

	with gr.TabItem("📊 Elo Rating", id="hard_elo"):
	with gr.Column():
	with gr.Group():
	gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
	hard_task_elo_map = gr.Plot()
	# Hidden DataFrame component holding the Elo table; demo.load passes it to
	# plot_elo_mle and writes the result into the plot above.
	hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
	demo.load(plot_elo_mle, [hard_elo_task_gr],
	hard_task_elo_map)
	with gr.Group():
	gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
	hard_bench_elo_map = gr.Plot()
	hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
	demo.load(plot_elo_mle, [hard_elo_bench_gr],
	hard_bench_elo_map)

	with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
	with gr.Column():
	hard_complete_map = gr.Plot()
	hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
	# Invisible Textbox/Number components supply the extra plot_solve_rate arguments.
	demo.load(plot_solve_rate, [hard_complete_solve_gr,
	gr.Textbox("Complete", visible=False),
	gr.Number(10, visible=False),
	gr.Number(16, visible=False),
	], hard_complete_map)
	hard_instruct_map = gr.Plot()
	hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
	demo.load(plot_solve_rate, [hard_instruct_solve_gr,
	gr.Textbox("Instruct", visible=False),
	gr.Number(10, visible=False),
	gr.Number(16, visible=False),
	], hard_instruct_map)
	with gr.Tab("🎯 Full Set") as full_tabs:
	with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
	leaderboard = init_leaderboard(LEADERBOARD_DF)
	gr.Markdown(
	"""
	Notes:
	- _Complete_ vs _Instruct_:
	- <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
	- <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
	- `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
	- `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
	- `size` is the amount of activated model weight during inference.
	- Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
	- For more details check the 📝 About section.
	""",
	elem_classes="markdown-text",
	)

	with gr.TabItem("📊 Elo Rating", id="full_elo"):
	with gr.Column():
	with gr.Group():

	gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
	task_elo_map = gr.Plot()
	elo_task_gr = init_others(ELO_TASK_DF)
	demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
	with gr.Group():
	gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
	bench_elo_map = gr.Plot()
	elo_bench_gr = init_others(ELO_BENCH_DF)
	demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)

	with gr.TabItem("🧩 Solve Rate", id="full_solve"):
	with gr.Column():
	complete_map = gr.Plot()
	complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
	# NOTE(review): only the split name is passed here, whereas the Hard Set calls also
	# pass two numbers (10, 16) - presumably plot_solve_rate has defaults for them; confirm.
	demo.load(plot_solve_rate, [complete_solve_gr,
	gr.Textbox("Complete", visible=False),
	], complete_map)
	instruct_map = gr.Plot()
	instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
	demo.load(plot_solve_rate, [instruct_solve_gr,
	gr.Textbox("Instruct", visible=False),
	], instruct_map)
	with gr.TabItem("📝 About", id=3):
	gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
	with gr.TabItem("🔎 Data Viewer", id="viewer"):
	search_input = gr.Textbox(label="Search by keyword")
	count_output = gr.Number(label="Number of filtered items")
	# The slider initially spans the whole dataset; update_display re-ranges it
	# to the filtered result through its last return value.
	index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
	# show_solution = gr.Checkbox(label="Show Solution")
	show_test = gr.Checkbox(label="Show Test Cases")
	update_button = gr.Button("Update")

	task_id_output = gr.Textbox(label="Task ID")
	code_completion = gr.Code(language="python", label="Code Completion")
	nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")
	# solution = gr.Code(language="python", label="Solution")
	test_cases = gr.Code(language="python", label="Test Cases")

	# update_display must return one value per component listed in `outputs` (six).
	update_button.click(
	update_display,
	inputs=[search_input, index_slider, show_test],
	outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
	)

	# Initial load
	demo.load(
	update_display,
	inputs=[search_input, index_slider, show_test],
	outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
	)

	with gr.TabItem("🚀 Request", id=4):
	gr.Markdown(SUBMISSION_TEXT_3)

	with gr.TabItem("🛠️ Execute", id=5):
	gr.Markdown("# BigCodeBench Evaluator")

	# Evaluation inputs: the uploaded samples file plus the split/subset selectors.
	with gr.Row():
	jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
	split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
	subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")

	with gr.Row():
	parallel = gr.Number(label="Parallel (optional)", precision=0)
	min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
	max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)

	with gr.Row():
	max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
	max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
	check_gt_only = gr.Checkbox(label="Check GT Only")
	no_gt = gr.Checkbox(label="No GT")

	# Read-only preview of the command, kept in sync with the inputs below.
	command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
	with gr.Row():
	submit_btn = gr.Button("Run Evaluation")
	download_btn = gr.DownloadButton(label="Download Result")
	log_output = gr.Textbox(label="Execution Logs", lines=20)

	input_components = [
	jsonl_file, split, subset, parallel,
	min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
	check_gt_only, no_gt
	]

	# Regenerate the displayed command whenever any of the inputs changes.
	for component in input_components:
	component.change(generate_command, inputs=input_components, outputs=command_output)

	def start_evaluation(command, jsonl_file, subset, split):
	    """Run the evaluation command and stream its logs to the UI.

	    Generator used as the click handler of the "Run Evaluation" button.
	    Every yielded pair carries one value per output component wired to that
	    click event: the log textbox first, the download button second.

	    Args:
	        command: Command passed to ``stream_logs``.
	        jsonl_file: Uploaded samples file, or ``None`` when nothing was uploaded.
	        subset: Benchmark subset; any value other than ``"full"`` is embedded
	            in the expected result file name.
	        split: Accepted for interface compatibility; not used here.

	    Yields:
	        ``(log, download_update)`` while the command runs, then one final
	        pair of updates once ``find_result_file()`` has been consulted.
	    """
	    extra = subset + "_" if subset != "full" else ""
	    if jsonl_file is not None:
	        result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
	    else:
	        result_path = None

	    for log in stream_logs(command, jsonl_file):
	        if jsonl_file is not None:
	            yield log, gr.update(value=result_path, label=result_path)
	        else:
	            yield log, gr.update()

	    # NOTE(review): the original assigned `is_running = False` here, which only
	    # bound a local name and never touched the flag imported from src.execute.
	    # The dead assignment is dropped; reset the real flag inside src.execute
	    # if that was the intent.
	    result_file = find_result_file()
	    # The final state must be yielded: a value `return`ed from a generator is
	    # not emitted as an output.
	    if result_file:
	        yield gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
	    else:
	        yield gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
	# Run the evaluation on click; its outputs go to the log textbox and the download button.
	submit_btn.click(start_evaluation,
	inputs=[command_output, jsonl_file, subset, split],
	outputs=[log_output, download_btn])

	# Collapsible citation box with a copy button.
	with gr.Row():
	with gr.Accordion("📙 Citation", open=False):
	citation_button = gr.Textbox(
	value=CITATION_BUTTON_TEXT,
	label=CITATION_BUTTON_LABEL,
	lines=20,
	elem_id="citation-button",
	show_copy_button=True,
	)

	# On load, pass the current component values through get_latest_data_leaderboard and
	# write its ten return values back into the same components (same order in both lists).
	main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
	# main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
	# leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
	# pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])

	# Enable the event queue with a default concurrency limit of 100.
	main_block.queue(default_concurrency_limit=100)


	def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
	    """Return the webhooks server for *ui*, enabling Space CI when applicable.

	    Space CI is skipped (a plain ``WebhooksServer`` wrapping *ui* is
	    returned) when ``SPACE_ID`` is ``None`` or ``IS_EPHEMERAL_SPACE`` is
	    set. Otherwise the ``space_ci`` section of the Space's README card is
	    read and forwarded to ``configure_space_ci``.

	    Args:
	        ui: The Gradio Blocks app to serve.

	    Returns:
	        The server on which additional webhooks can be registered.
	    """
	    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
	    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
	    # ht to Lucain!
	    if SPACE_ID is None:
	        print("Not in a Space: Space CI disabled.")
	        # Use the `ui` argument rather than the module-level `main_block`.
	        return WebhooksServer(ui=ui)

	    if IS_EPHEMERAL_SPACE:
	        print("In an ephemeral Space: Space CI disabled.")
	        return WebhooksServer(ui=ui)

	    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
	    config = card.data.get("space_ci", {})
	    print(f"Enabling Space CI with config from README: {config}")

	    return configure_space_ci(
	        blocks=ui,
	        trusted_authors=config.get("trusted_authors"),
	        private=config.get("private", "auto"),
	        variables=config.get("variables", "auto"),
	        secrets=config.get("secrets"),
	        hardware=config.get("hardware"),
	        storage=config.get("storage"),
	    )

	# Create webhooks server (with CI url if in Space and not ephemeral)
	webhooks_server = enable_space_ci_and_return_server(ui=main_block)

	# Add webhooks
	@webhooks_server.add_webhook
	def update_leaderboard(payload: WebhookPayload) -> None:
	    """Flag new data and force a re-download of every result dataset.

	    Only reacts to payloads whose repo type is "dataset" and whose event
	    action is "update". If ``NEW_DATA_ON_LEADERBOARD`` is already set the
	    call returns immediately; otherwise the flag is set and each repository
	    is loaded with ``FORCE_REDOWNLOAD``.
	    """
	    global NEW_DATA_ON_LEADERBOARD

	    is_dataset_update = payload.repo.type == "dataset" and payload.event.action == "update"
	    if not is_dataset_update:
	        return
	    if NEW_DATA_ON_LEADERBOARD:
	        # A refresh is already pending.
	        return
	    NEW_DATA_ON_LEADERBOARD = True

	    refresh_options = {
	        "cache_dir": HF_HOME,
	        "download_mode": datasets.DownloadMode.FORCE_REDOWNLOAD,
	        "verification_mode": "no_checks",
	    }
	    repos_to_refresh = (RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO)
	    for repo in repos_to_refresh:
	        datasets.load_dataset(repo, "default", **refresh_options)



	# Start the backup-restart scheduler BEFORE launching the server:
	# `webhooks_server.launch()` blocks the main thread by default, so statements
	# placed after it would not run while the app is serving.
	scheduler = BackgroundScheduler()
	scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
	scheduler.start()

	webhooks_server.launch()