model-evaluator / app.py
lewtun's picture
lewtun HF staff
Add info
4f5bbb3
raw
history blame
4.5 kB
import os
from pathlib import Path
import streamlit as st
from dotenv import load_dotenv
from utils import get_compatible_models, get_metadata, http_post
if Path(".env").is_file():
load_dotenv(".env")
HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
TASK_TO_ID = {
"binary_classification": 1,
"multi_class_classification": 2,
"multi_label_classification": 3,
"entity_extraction": 4,
"extractive_question_answering": 5,
"translation": 6,
"summarization": 8,
"single_column_regression": 10,
}
# TODO: remove this hardcorded logic and accept any dataset on the Hub
DATASETS_TO_EVALUATE = ["emotion", "conll2003", "imdb", "squad", "xsum", "ncbi_disease", "go_emotions"]
###########
### APP ###
###########
st.title("Evaluation as a Service")
st.markdown(
"""
Welcome to Hugging Face's Evaluation as a Service! This application allows
you to evaluate any πŸ€— Transformers model with a dataset on the Hub. Please
select the dataset and configuration below. The results of your evaluation
will be displayed on the public leaderboard
[here](https://huggingface.co/spaces/huggingface/leaderboards).
"""
)
dataset_name = st.selectbox("Select a dataset", [f"lewtun/autoevaluate__{dset}" for dset in DATASETS_TO_EVALUATE])
with st.form(key="form"):
# TODO: remove this step once we select real datasets
# Strip out original dataset name
original_dataset_name = dataset_name.split("/")[-1].split("__")[-1]
# In general this will be a list of multiple configs => need to generalise logic here
metadata = get_metadata(dataset_name)
dataset_config = st.selectbox("Select a config", [metadata[0]["config"]])
splits = metadata[0]["splits"]
split_names = list(splits.values())
eval_split = splits.get("eval_split", split_names[0])
selected_split = st.selectbox("Select a split", split_names, index=split_names.index(eval_split))
# TODO: add a function to handle the mapping task <--> column mapping
col_mapping = metadata[0]["col_mapping"]
col_names = list(col_mapping.keys())
# TODO: figure out how to get all dataset column names (i.e. features) without download dataset itself
st.markdown("**Map your data columns**")
col1, col2 = st.columns(2)
# TODO: find a better way to layout these items
# TODO: propagate this information to payload
with col1:
st.markdown("`text` column")
st.text("")
st.text("")
st.text("")
st.text("")
st.markdown("`target` column")
with col2:
st.selectbox("This column should contain the text you want to classify", col_names, index=0)
st.selectbox("This column should contain the labels you want to assign to the text", col_names, index=1)
compatible_models = get_compatible_models(metadata[0]["task"], original_dataset_name)
selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models, compatible_models[0])
submit_button = st.form_submit_button("Make submission")
if submit_button:
for model in selected_models:
payload = {
"username": AUTOTRAIN_USERNAME,
"task": TASK_TO_ID[metadata[0]["task_id"]],
"model": model,
"col_mapping": metadata[0]["col_mapping"],
"split": selected_split,
"dataset": original_dataset_name,
"config": dataset_config,
}
json_resp = http_post(
path="/evaluate/create", payload=payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
).json()
if json_resp["status"] == 1:
st.success(f"βœ… Successfully submitted model {model} for evaluation with job ID {json_resp['id']}")
st.markdown(
f"""
Evaluation takes appoximately 1 hour to complete, so grab a β˜• or 🍡 while you wait:
* πŸ“Š Click [here](https://huggingface.co/spaces/huggingface/leaderboards) to view the results from your submission
* πŸ’Ύ Click [here](https://huggingface.co/datasets/autoevaluate/eval-staging-{json_resp['id']}) to view the stored predictions on the Hugging Face Hub
"""
)
else:
st.error("πŸ™ˆ Oh noes, there was an error submitting your submission!")