Spaces:

autoevaluate
/

model-evaluator

Runtime error

App Files Community

lewtun HF staff committed on Jun 17, 2022

Commit

ef65795

•

1 Parent(s): 1d91658

Pin supported metrics and tweak info

Browse files

Files changed (4) hide show

app.py +59 -51
evaluation.py +3 -3
requirements.txt +0 -2
utils.py +44 -1

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import inspect
 import os
 import uuid
 from pathlib import Path
@@ -7,9 +6,7 @@ import pandas as pd
 import streamlit as st
 from datasets import get_dataset_config_names
 from dotenv import load_dotenv
-from evaluate import load
-from huggingface_hub import list_datasets, list_metrics
-from tqdm import tqdm
 from evaluation import filter_evaluated_models
 from utils import (
@@ -57,42 +54,53 @@ TASK_TO_DEFAULT_METRICS = {
 SUPPORTED_TASKS = list(TASK_TO_ID.keys())
-@st.experimental_memo
-def get_supported_metrics():
-    metrics = [metric.id for metric in list_metrics()]
-    supported_metrics = []
-    for metric in tqdm(metrics):
-        # TODO: this currently requires all metric dependencies to be installed
-        # in the same environment. Refactor to avoid needing to actually load
-        # the metric.
-        try:
-            print(f"INFO -- Attempting to load metric: {metric}")
-            metric_func = load(metric)
-        except Exception as e:
-            print(e)
-            print("WARNING -- Skipping the following metric, which cannot load:", metric)
-            continue
-        argspec = inspect.getfullargspec(metric_func.compute)
-        if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
-            # We require that "references" and "predictions" are arguments
-            # to the metric function. We also require that the other arguments
-            # besides "references" and "predictions" have defaults and so do not
-            # need to be specified explicitly.
-            defaults = True
-            for key, value in argspec.kwonlydefaults.items():
-                if key not in ("references", "predictions"):
-                    if value is None:
-                        defaults = False
-                        break
-            if defaults:
-                supported_metrics.append(metric)
-    return supported_metrics
-supported_metrics = get_supported_metrics()
 #######
@@ -101,12 +109,13 @@ supported_metrics = get_supported_metrics()
 st.title("Evaluation on the Hub")
 st.markdown(
     """
-    Welcome to Hugging Face's automatic model evaluator! This application allows
-    you to evaluate 🤗 Transformers
     [models](https://huggingface.co/models?library=transformers&sort=downloads)
-    across a wide variety of datasets on the Hub. Please select the dataset and
-    configuration below. The results of your evaluation will be displayed on the
-    [public
     leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards).
     """
 )
@@ -363,11 +372,10 @@ with st.expander("Advanced configuration"):
     st.markdown(html_string, unsafe_allow_html=True)
     selected_metrics = st.multiselect(
         "(Optional) Select additional metrics",
-        list(set(supported_metrics) - set(TASK_TO_DEFAULT_METRICS[selected_task])),
-    )
-    st.info(
-        """Note: user-selected metrics will be run with their default arguments. \
-            Check out the [available metrics](https://huggingface.co/metrics) for more details."""
     )
 with st.form(key="form"):
@@ -375,7 +383,7 @@ with st.form(key="form"):
     selected_models = st.multiselect(
         "Select the models you wish to evaluate",
         compatible_models,
-        help="""Don't see your model in this list? Add the dataset and task it was trained on to the \
             [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
     )
     print("INFO -- Selected models before filter:", selected_models)

 import os
 import uuid
 from pathlib import Path
 import streamlit as st
 from datasets import get_dataset_config_names
 from dotenv import load_dotenv
+from huggingface_hub import list_datasets
 from evaluation import filter_evaluated_models
 from utils import (
 SUPPORTED_TASKS = list(TASK_TO_ID.keys())
+# Static list captured from the output of utils.get_supported_metrics().
+# Hardcoded for now due to speed and caching constraints.
+SUPPORTED_METRICS = [
+    "accuracy",
+    "bertscore",
+    "bleu",
+    "cer",
+    "chrf",
+    "code_eval",
+    "comet",
+    "competition_math",
+    "coval",
+    "cuad",
+    "exact_match",
+    "f1",
+    "frugalscore",
+    "google_bleu",
+    "mae",
+    "mahalanobis",
+    "matthews_correlation",
+    "mean_iou",
+    "meteor",
+    "mse",
+    "pearsonr",
+    "perplexity",
+    "precision",
+    "recall",
+    "roc_auc",
+    "rouge",
+    "sacrebleu",
+    "sari",
+    "seqeval",
+    "spearmanr",
+    "squad",
+    "squad_v2",
+    "ter",
+    "trec_eval",
+    "wer",
+    "wiki_split",
+    "xnli",
+    "angelina-wang/directional_bias_amplification",
+    "jordyvl/ece",
+    "lvwerra/ai4code",
+    "lvwerra/amex",
+    "lvwerra/test",
+    "lvwerra/test_metric",
+]
 #######
 st.title("Evaluation on the Hub")
 st.markdown(
     """
+    Welcome to Hugging Face's automatic model evaluator 👋!
+    This application allows you to evaluate 🤗 Transformers
     [models](https://huggingface.co/models?library=transformers&sort=downloads)
+    across a wide variety of [datasets](https://huggingface.co/datasets) on the
+    Hub. Please select the dataset and configuration below. The results of your
+    evaluation will be displayed on the [public
     leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards).
     """
 )
     st.markdown(html_string, unsafe_allow_html=True)
     selected_metrics = st.multiselect(
         "(Optional) Select additional metrics",
+        sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
+        help="""User-selected metrics will be computed with their default arguments. \
+            For example, `f1` will report results for binary labels. \
+            Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
     )
 with st.form(key="form"):
     selected_models = st.multiselect(
         "Select the models you wish to evaluate",
         compatible_models,
+        help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
             [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
     )
     print("INFO -- Selected models before filter:", selected_models)

evaluation.py CHANGED Viewed

@@ -42,8 +42,8 @@ def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_
             dataset_split=dataset_split,
         )
         candidate_id = hash(evaluation_info)
-        if candidate_id in evaluation_ids:
-            st.info(f"Model `{model}` has already been evaluated on this configuration. Skipping evaluation...")
-            models.pop(idx)
     return models

             dataset_split=dataset_split,
         )
         candidate_id = hash(evaluation_info)
+        # if candidate_id in evaluation_ids:
+        #     st.info(f"Model `{model}` has already been evaluated on this configuration. Skipping evaluation...")
+        #     models.pop(idx)
     return models

requirements.txt CHANGED Viewed

@@ -7,7 +7,5 @@ jsonlines
 # Dataset specific deps
 py7zr<0.19
 openpyxl<3.1
-# Metric specific deps
-scikit-learn<1.2
 # Dirty bug from Google
 protobuf<=3.20.1

 # Dataset specific deps
 py7zr<0.19
 openpyxl<3.1
 # Dirty bug from Google
 protobuf<=3.20.1

utils.py CHANGED Viewed

@@ -1,8 +1,12 @@
 from typing import Dict, List, Union
 import jsonlines
 import requests
-from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info
 AUTOTRAIN_TASK_TO_HUB_TASK = {
     "binary_classification": "text-classification",
@@ -128,3 +132,42 @@ def commit_evaluation_log(evaluation_log, hf_access_token=None):
         commit_message=f"Evaluation submitted with project name {evaluation_log['payload']['proj_name']}"
     )
     print("INFO -- Pushed evaluation logs to the Hub")

+import inspect
 from typing import Dict, List, Union
 import jsonlines
 import requests
+import streamlit as st
+from evaluate import load
+from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info, list_metrics
+from tqdm import tqdm
 AUTOTRAIN_TASK_TO_HUB_TASK = {
     "binary_classification": "text-classification",
         commit_message=f"Evaluation submitted with project name {evaluation_log['payload']['proj_name']}"
     )
     print("INFO -- Pushed evaluation logs to the Hub")
+@st.experimental_memo
+def get_supported_metrics():
+    """Helper function to get all metrics compatible with evaluation service.
+    Requires all metric dependencies installed in the same environment, so wait until
+    https://github.com/huggingface/evaluate/issues/138 is resolved before using this.
+    """
+    metrics = [metric.id for metric in list_metrics()]
+    supported_metrics = []
+    for metric in tqdm(metrics):
+        # TODO: this currently requires all metric dependencies to be installed
+        # in the same environment. Refactor to avoid needing to actually load
+        # the metric.
+        try:
+            print(f"INFO -- Attempting to load metric: {metric}")
+            metric_func = load(metric)
+        except Exception as e:
+            print(e)
+            print("WARNING -- Skipping the following metric, which cannot load:", metric)
+            continue
+        argspec = inspect.getfullargspec(metric_func.compute)
+        if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
+            # We require that "references" and "predictions" are arguments
+            # to the metric function. We also require that the other arguments
+            # besides "references" and "predictions" have defaults and so do not
+            # need to be specified explicitly.
+            defaults = True
+            for key, value in argspec.kwonlydefaults.items():
+                if key not in ("references", "predictions"):
+                    if value is None:
+                        defaults = False
+                        break
+            if defaults:
+                supported_metrics.append(metric)
+    return supported_metrics