Commit ef65795 by lewtun (HF staff) • 1 Parent(s): 1d91658

Pin supported metrics and tweak info

Files changed (4):
  1. app.py +59 -51
  2. evaluation.py +3 -3
  3. requirements.txt +0 -2
  4. utils.py +44 -1
app.py CHANGED
@@ -1,4 +1,3 @@
- import inspect
  import os
  import uuid
  from pathlib import Path
@@ -7,9 +6,7 @@ import pandas as pd
  import streamlit as st
  from datasets import get_dataset_config_names
  from dotenv import load_dotenv
- from evaluate import load
- from huggingface_hub import list_datasets, list_metrics
- from tqdm import tqdm
+ from huggingface_hub import list_datasets

  from evaluation import filter_evaluated_models
  from utils import (
@@ -57,42 +54,53 @@ TASK_TO_DEFAULT_METRICS = {

  SUPPORTED_TASKS = list(TASK_TO_ID.keys())

-
- @st.experimental_memo
- def get_supported_metrics():
-     metrics = [metric.id for metric in list_metrics()]
-     supported_metrics = []
-     for metric in tqdm(metrics):
-         # TODO: this currently requires all metric dependencies to be installed
-         # in the same environment. Refactor to avoid needing to actually load
-         # the metric.
-         try:
-             print(f"INFO -- Attempting to load metric: {metric}")
-             metric_func = load(metric)
-         except Exception as e:
-             print(e)
-             print("WARNING -- Skipping the following metric, which cannot load:", metric)
-             continue
-
-         argspec = inspect.getfullargspec(metric_func.compute)
-         if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
-             # We require that "references" and "predictions" are arguments
-             # to the metric function. We also require that the other arguments
-             # besides "references" and "predictions" have defaults and so do not
-             # need to be specified explicitly.
-             defaults = True
-             for key, value in argspec.kwonlydefaults.items():
-                 if key not in ("references", "predictions"):
-                     if value is None:
-                         defaults = False
-                         break
-
-             if defaults:
-                 supported_metrics.append(metric)
-     return supported_metrics
-
-
- supported_metrics = get_supported_metrics()
+ # Extracted from utils.get_supported_metrics
+ # Hardcoded for now due to speed / caching constraints
+ SUPPORTED_METRICS = [
+     "accuracy",
+     "bertscore",
+     "bleu",
+     "cer",
+     "chrf",
+     "code_eval",
+     "comet",
+     "competition_math",
+     "coval",
+     "cuad",
+     "exact_match",
+     "f1",
+     "frugalscore",
+     "google_bleu",
+     "mae",
+     "mahalanobis",
+     "matthews_correlation",
+     "mean_iou",
+     "meteor",
+     "mse",
+     "pearsonr",
+     "perplexity",
+     "precision",
+     "recall",
+     "roc_auc",
+     "rouge",
+     "sacrebleu",
+     "sari",
+     "seqeval",
+     "spearmanr",
+     "squad",
+     "squad_v2",
+     "ter",
+     "trec_eval",
+     "wer",
+     "wiki_split",
+     "xnli",
+     "angelina-wang/directional_bias_amplification",
+     "jordyvl/ece",
+     "lvwerra/ai4code",
+     "lvwerra/amex",
+     "lvwerra/test",
+     "lvwerra/test_metric",
+ ]


  #######
@@ -101,12 +109,13 @@ supported_metrics = get_supported_metrics()
  st.title("Evaluation on the Hub")
  st.markdown(
      """
-     Welcome to Hugging Face's automatic model evaluator! This application allows
-     you to evaluate 🤗 Transformers
+     Welcome to Hugging Face's automatic model evaluator 👋!
+
+     This application allows you to evaluate 🤗 Transformers
      [models](https://huggingface.co/models?library=transformers&sort=downloads)
-     across a wide variety of datasets on the Hub. Please select the dataset and
-     configuration below. The results of your evaluation will be displayed on the
-     [public
+     across a wide variety of [datasets](https://huggingface.co/datasets) on the
+     Hub. Please select the dataset and configuration below. The results of your
+     evaluation will be displayed on the [public
      leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards).
      """
  )
@@ -363,11 +372,10 @@ with st.expander("Advanced configuration"):
      st.markdown(html_string, unsafe_allow_html=True)
      selected_metrics = st.multiselect(
          "(Optional) Select additional metrics",
-         list(set(supported_metrics) - set(TASK_TO_DEFAULT_METRICS[selected_task])),
-     )
-     st.info(
-         """Note: user-selected metrics will be run with their default arguments. \
-         Check out the [available metrics](https://huggingface.co/metrics) for more details."""
+         sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
+         help="""User-selected metrics will be computed with their default arguments. \
+         For example, `f1` will report results for binary labels. \
+         Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
      )

  with st.form(key="form"):
@@ -375,7 +383,7 @@ with st.form(key="form"):
      selected_models = st.multiselect(
          "Select the models you wish to evaluate",
          compatible_models,
-         help="""Don't see your model in this list? Add the dataset and task it was trained on to the \
+         help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
          [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
      )
      print("INFO -- Selected models before filter:", selected_models)
evaluation.py CHANGED
@@ -42,8 +42,8 @@ def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_
              dataset_split=dataset_split,
          )
          candidate_id = hash(evaluation_info)
-         if candidate_id in evaluation_ids:
-             st.info(f"Model `{model}` has already been evaluated on this configuration. Skipping evaluation...")
-             models.pop(idx)
+         # if candidate_id in evaluation_ids:
+         #     st.info(f"Model `{model}` has already been evaluated on this configuration. Skipping evaluation...")
+         #     models.pop(idx)

      return models
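The duplicate-evaluation check in filter_evaluated_models is commented out rather than deleted. If it is restored later, one way to apply it without calling models.pop(idx) while iterating is a list comprehension; the sketch below is purely hypothetical, with made-up ids and model names rather than code from this repository.

    # Hypothetical stand-ins for the hashes of already-submitted evaluations
    # and for the per-model candidate ids computed in filter_evaluated_models.
    evaluation_ids = {111, 333}
    candidate_ids = {"model-a": 111, "model-b": 222, "model-c": 333}
    models = ["model-a", "model-b", "model-c"]

    # Keep only the models whose configuration hash has not been evaluated yet.
    models = [m for m in models if candidate_ids[m] not in evaluation_ids]
    print(models)  # ['model-b']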
requirements.txt CHANGED
@@ -7,7 +7,5 @@ jsonlines
  # Dataset specific deps
  py7zr<0.19
  openpyxl<3.1
- # Metric specific deps
- scikit-learn<1.2
  # Dirty bug from Google
  protobuf<=3.20.1
utils.py CHANGED
@@ -1,8 +1,12 @@
+ import inspect
  from typing import Dict, List, Union

  import jsonlines
  import requests
- from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info
+ import streamlit as st
+ from evaluate import load
+ from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info, list_metrics
+ from tqdm import tqdm

  AUTOTRAIN_TASK_TO_HUB_TASK = {
      "binary_classification": "text-classification",
@@ -128,3 +132,42 @@ def commit_evaluation_log(evaluation_log, hf_access_token=None):
          commit_message=f"Evaluation submitted with project name {evaluation_log['payload']['proj_name']}"
      )
      print("INFO -- Pushed evaluation logs to the Hub")
+
+
+ @st.experimental_memo
+ def get_supported_metrics():
+     """Helper function to get all metrics compatible with evaluation service.
+
+     Requires all metric dependencies installed in the same environment, so wait until
+     https://github.com/huggingface/evaluate/issues/138 is resolved before using this.
+     """
+     metrics = [metric.id for metric in list_metrics()]
+     supported_metrics = []
+     for metric in tqdm(metrics):
+         # TODO: this currently requires all metric dependencies to be installed
+         # in the same environment. Refactor to avoid needing to actually load
+         # the metric.
+         try:
+             print(f"INFO -- Attempting to load metric: {metric}")
+             metric_func = load(metric)
+         except Exception as e:
+             print(e)
+             print("WARNING -- Skipping the following metric, which cannot load:", metric)
+             continue
+
+         argspec = inspect.getfullargspec(metric_func.compute)
+         if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
+             # We require that "references" and "predictions" are arguments
+             # to the metric function. We also require that the other arguments
+             # besides "references" and "predictions" have defaults and so do not
+             # need to be specified explicitly.
+             defaults = True
+             for key, value in argspec.kwonlydefaults.items():
+                 if key not in ("references", "predictions"):
+                     if value is None:
+                         defaults = False
+                         break
+
+             if defaults:
+                 supported_metrics.append(metric)
+     return supported_metrics
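For reference, the compatibility rule that get_supported_metrics encodes can be exercised on its own: a metric qualifies when its compute() method accepts "predictions" and "references" as keyword-only arguments and every other keyword-only argument has a non-None default. The standalone sketch below applies the same inspect-based check to a mock compute() signature rather than a real metric.

    import inspect

    # Mock signature standing in for an evaluate metric's compute() method.
    def mock_compute(*, predictions=None, references=None, average="binary"):
        ...

    argspec = inspect.getfullargspec(mock_compute)
    is_supported = (
        "references" in argspec.kwonlyargs
        and "predictions" in argspec.kwonlyargs
        and all(
            value is not None
            for key, value in (argspec.kwonlydefaults or {}).items()
            if key not in ("references", "predictions")
        )
    )
    print(is_supported)  # True: "average" has a non-None default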