# model-evaluator / app.py
import inspect
import os
import uuid
from pathlib import Path
import pandas as pd
import streamlit as st
from datasets import get_dataset_config_names, list_metrics, load_metric
from dotenv import load_dotenv
from huggingface_hub import list_datasets
from tqdm import tqdm
from evaluation import filter_evaluated_models
from utils import (
format_col_mapping,
get_compatible_models,
get_key,
get_metadata,
http_get,
http_post,
)
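# Load secrets (API token, AutoTrain backend, datasets preview endpoint) from a local .env file if present.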
if Path(".env").is_file():
load_dotenv(".env")
HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
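# Map each supported task name to its AutoTrain task ID.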
TASK_TO_ID = {
"binary_classification": 1,
"multi_class_classification": 2,
# "multi_label_classification": 3, # Not fully supported in AutoTrain
"entity_extraction": 4,
"extractive_question_answering": 5,
"translation": 6,
"summarization": 8,
}
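# Default metrics computed for each task; users can select additional metrics in the form below.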
TASK_TO_DEFAULT_METRICS = {
"binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
"multi_class_classification": [
"f1_micro",
"f1_macro",
"f1_weighted",
"precision_macro",
"precision_micro",
"precision_weighted",
"recall_macro",
"recall_micro",
"recall_weighted",
"accuracy",
],
"entity_extraction": ["precision", "recall", "f1", "accuracy"],
"extractive_question_answering": [],
"translation": ["sacrebleu", "gen_len"],
"summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum", "gen_len"],
}
SUPPORTED_TASKS = list(TASK_TO_ID.keys())
@st.cache
def get_supported_metrics():
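    """Return the Hub metrics whose `compute` method can be called with only `predictions` and `references`."""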
metrics = list_metrics()
supported_metrics = []
for metric in tqdm(metrics):
try:
metric_func = load_metric(metric)
        except Exception as e:
            print(e)
            print("Skipping the following metric, which cannot load:", metric)
            # Skip metrics that fail to load so a stale or undefined `metric_func` is never inspected
            continue
        argspec = inspect.getfullargspec(metric_func.compute)
if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
# We require that "references" and "predictions" are arguments
# to the metric function. We also require that the other arguments
# besides "references" and "predictions" have defaults and so do not
# need to be specified explicitly.
defaults = True
for key, value in argspec.kwonlydefaults.items():
if key not in ("references", "predictions"):
if value is None:
defaults = False
break
if defaults:
supported_metrics.append(metric)
return supported_metrics
supported_metrics = get_supported_metrics()
#######
# APP #
#######
st.title("Evaluation as a Service")
st.markdown(
"""
Welcome to Hugging Face's Evaluation as a Service! This application allows
you to evaluate πŸ€— Transformers models with a dataset on the Hub. Please
select the dataset and configuration below. The results of your evaluation
will be displayed on the public leaderboard
[here](https://huggingface.co/spaces/autoevaluate/leaderboards).
"""
)
all_datasets = [d.id for d in list_datasets()]
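# Pre-select the dataset from the URL query string so shared links open on the chosen dataset.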
query_params = st.experimental_get_query_params()
default_dataset = all_datasets[0]
if "dataset" in query_params:
if len(query_params["dataset"]) > 0 and query_params["dataset"][0] in all_datasets:
default_dataset = query_params["dataset"][0]
selected_dataset = st.selectbox("Select a dataset", all_datasets, index=all_datasets.index(default_dataset))
st.experimental_set_query_params(**{"dataset": [selected_dataset]})
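# Fetch any evaluation metadata declared on the dataset card to pre-fill the configuration below.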
metadata = get_metadata(selected_dataset)
print(metadata)
if metadata is None:
st.warning("No evaluation metadata found. Please configure the evaluation job below.")
with st.expander("Advanced configuration"):
# Select task
selected_task = st.selectbox(
"Select a task",
SUPPORTED_TASKS,
index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
)
# Select config
configs = get_dataset_config_names(selected_dataset)
selected_config = st.selectbox("Select a config", configs)
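    # Ask the datasets preview backend which splits exist, then keep those belonging to the selected config.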
# Select splits
splits_resp = http_get(
path="/splits",
domain=DATASETS_PREVIEW_API,
params={"dataset": selected_dataset},
)
if splits_resp.status_code == 200:
split_names = []
all_splits = splits_resp.json()
for split in all_splits["splits"]:
if split["config"] == selected_config:
split_names.append(split["split"])
selected_split = st.selectbox(
"Select a split",
split_names,
index=split_names.index(metadata[0]["splits"]["eval_split"]) if metadata is not None else 0,
)
# Select columns
rows_resp = http_get(
path="/rows",
domain=DATASETS_PREVIEW_API,
params={
"dataset": selected_dataset,
"config": selected_config,
"split": selected_split,
},
).json()
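        # Flatten one sample row to recover the column names (nested fields become dotted names, e.g. answers.text).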
col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
st.markdown("**Map your data columns**")
col1, col2 = st.columns(2)
# TODO: find a better way to layout these items
# TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
col_mapping = {}
if selected_task in ["binary_classification", "multi_class_classification"]:
with col1:
st.markdown("`text` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`target` column")
with col2:
text_col = st.selectbox(
"This column should contain the text you want to classify",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
)
target_col = st.selectbox(
"This column should contain the labels you want to assign to the text",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
)
col_mapping[text_col] = "text"
col_mapping[target_col] = "target"
elif selected_task == "entity_extraction":
with col1:
st.markdown("`tokens` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`tags` column")
with col2:
tokens_col = st.selectbox(
"This column should contain the array of tokens",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "tokens")) if metadata is not None else 0,
)
tags_col = st.selectbox(
"This column should contain the labels to associate to each part of the text",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "tags")) if metadata is not None else 0,
)
col_mapping[tokens_col] = "tokens"
col_mapping[tags_col] = "tags"
elif selected_task == "translation":
with col1:
st.markdown("`source` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`target` column")
with col2:
text_col = st.selectbox(
"This column should contain the text you want to translate",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "source")) if metadata is not None else 0,
)
target_col = st.selectbox(
"This column should contain an example translation of the source text",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
)
col_mapping[text_col] = "source"
col_mapping[target_col] = "target"
elif selected_task == "summarization":
with col1:
st.markdown("`text` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`target` column")
with col2:
text_col = st.selectbox(
"This column should contain the text you want to summarize",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
)
target_col = st.selectbox(
"This column should contain an example summarization of the text",
col_names,
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
)
col_mapping[text_col] = "text"
col_mapping[target_col] = "target"
elif selected_task == "extractive_question_answering":
col_mapping = metadata[0]["col_mapping"]
# Hub YAML parser converts periods to hyphens, so we remap them here
col_mapping = format_col_mapping(col_mapping)
with col1:
st.markdown("`context` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`question` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`answers.text` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`answers.answer_start` column")
with col2:
context_col = st.selectbox(
"This column should contain the question's context",
col_names,
index=col_names.index(get_key(col_mapping, "context")) if metadata is not None else 0,
)
question_col = st.selectbox(
"This column should contain the question to be answered, given the context",
col_names,
index=col_names.index(get_key(col_mapping, "question")) if metadata is not None else 0,
)
answers_text_col = st.selectbox(
"This column should contain example answers to the question, extracted from the context",
col_names,
index=col_names.index(get_key(col_mapping, "answers.text")) if metadata is not None else 0,
)
answers_start_col = st.selectbox(
"This column should contain the indices in the context of the first character of each answers.text",
col_names,
index=col_names.index(get_key(col_mapping, "answers.answer_start")) if metadata is not None else 0,
)
col_mapping[context_col] = "context"
col_mapping[question_col] = "question"
col_mapping[answers_text_col] = "answers.text"
col_mapping[answers_start_col] = "answers.answer_start"
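# Submission form: pick extra metrics and models, then create an AutoTrain evaluation project on submit.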
with st.form(key="form"):
compatible_models = get_compatible_models(selected_task, selected_dataset)
st.markdown("The following metrics will be computed")
html_string = " ".join(
[
'<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
+ '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
+ 'padding-left:5px;color:white">'
+ metric
+ "</div></div>"
for metric in TASK_TO_DEFAULT_METRICS[selected_task]
]
)
st.markdown(html_string, unsafe_allow_html=True)
selected_metrics = st.multiselect(
"(Optional) Select additional metrics",
list(set(supported_metrics) - set(TASK_TO_DEFAULT_METRICS[selected_task])),
)
st.info(
"Note: user-selected metrics will be run with their default arguments from "
+ "[here](https://github.com/huggingface/datasets/tree/master/metrics)"
)
selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
print("Selected models:", selected_models)
if len(selected_models) > 0:
selected_models = filter_evaluated_models(
selected_models,
selected_task,
selected_dataset,
selected_config,
selected_split,
)
print("Selected models:", selected_models)
submit_button = st.form_submit_button("Make submission")
if submit_button:
if len(selected_models) > 0:
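            # Short random suffix so AutoTrain project names stay distinct across submissions.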
project_id = str(uuid.uuid4())[:3]
payload = {
"username": AUTOTRAIN_USERNAME,
"proj_name": f"eval-project-{project_id}",
"task": TASK_TO_ID[selected_task],
"config": {
"language": "en"
if selected_task != "translation"
else "en2de", # Need this dummy pair to enable translation
"max_models": 5,
"instance": {
"provider": "aws",
"instance_type": "ml.g4dn.4xlarge",
"max_runtime_seconds": 172800,
"num_instances": 1,
"disk_size_gb": 150,
},
"evaluation": {
"metrics": selected_metrics,
"models": selected_models,
},
},
}
print(f"Payload: {payload}")
project_json_resp = http_post(
path="/projects/create",
payload=payload,
token=HF_TOKEN,
domain=AUTOTRAIN_BACKEND_API,
).json()
print(project_json_resp)
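            # If the project was created, register the dataset with AutoTrain before launching the evaluation.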
if project_json_resp["created"]:
payload = {
"split": 4, # use "auto" split choice in AutoTrain
"col_mapping": col_mapping,
"load_config": {"max_size_bytes": 0, "shuffle": False},
}
data_json_resp = http_post(
path=f"/projects/{project_json_resp['id']}/data/{selected_dataset}",
payload=payload,
token=HF_TOKEN,
domain=AUTOTRAIN_BACKEND_API,
params={
"type": "dataset",
"config_name": selected_config,
"split_name": selected_split,
},
).json()
print(data_json_resp)
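                # A download_status of 1 means the dataset was added successfully; start processing to launch the run.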
if data_json_resp["download_status"] == 1:
train_json_resp = http_get(
path=f"/projects/{project_json_resp['id']}/data/start_process",
token=HF_TOKEN,
domain=AUTOTRAIN_BACKEND_API,
).json()
print(train_json_resp)
if train_json_resp["success"]:
st.success(f"βœ… Successfully submitted evaluation job with project ID {project_id}")
st.markdown(
f"""
                        Evaluation takes approximately 1 hour to complete, so grab a β˜• or 🍡 while you wait:
πŸ“Š Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) \
to view the results from your submission
"""
)
else:
st.error("πŸ™ˆ Oh no, there was an error submitting your evaluation job!")
else:
st.warning("⚠️ No models were selected for evaluation!")