lewtun committed
Commit 2da95ac
2 parents: 326ac2a ffb1f74

Merge branch 'main' into fix-app

Files changed (2):
  1. app.py +76 -2
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,12 +1,14 @@
+import inspect
 import os
 import uuid
 from pathlib import Path

 import pandas as pd
 import streamlit as st
-from datasets import get_dataset_config_names
+from datasets import get_dataset_config_names, list_metrics, load_metric
 from dotenv import load_dotenv
 from huggingface_hub import list_datasets
+from tqdm import tqdm

 from evaluation import filter_evaluated_models
 from utils import (
@@ -37,9 +39,61 @@ TASK_TO_ID = {
     "summarization": 8,
 }

+TASK_TO_DEFAULT_METRICS = {
+    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
+    "multi_class_classification": [
+        "f1_micro",
+        "f1_macro",
+        "f1_weighted",
+        "precision_macro",
+        "precision_micro",
+        "precision_weighted",
+        "recall_macro",
+        "recall_micro",
+        "recall_weighted",
+        "accuracy",
+    ],
+    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
+    "extractive_question_answering": [],
+    "translation": ["sacrebleu", "gen_len"],
+    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum", "gen_len"],
+}
+
 SUPPORTED_TASKS = list(TASK_TO_ID.keys())


+@st.cache
+def get_supported_metrics():
+    metrics = list_metrics()
+    supported_metrics = []
+    for metric in tqdm(metrics):
+        try:
+            metric_func = load_metric(metric)
+        except Exception as e:
+            print(e)
+            print("Skipping the following metric, which cannot load:", metric)
+
+        argspec = inspect.getfullargspec(metric_func.compute)
+        if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
+            # We require that "references" and "predictions" are arguments
+            # to the metric function. We also require that the other arguments
+            # besides "references" and "predictions" have defaults and so do not
+            # need to be specified explicitly.
+            defaults = True
+            for key, value in argspec.kwonlydefaults.items():
+                if key not in ("references", "predictions"):
+                    if value is None:
+                        defaults = False
+                        break
+
+            if defaults:
+                supported_metrics.append(metric)
+    return supported_metrics
+
+
+supported_metrics = get_supported_metrics()
+
+
 #######
 # APP #
 #######
@@ -256,6 +310,26 @@ with st.expander("Advanced configuration"):
 with st.form(key="form"):

     compatible_models = get_compatible_models(selected_task, selected_dataset)
+    st.markdown("The following metrics will be computed")
+    html_string = " ".join(
+        [
+            '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
+            + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
+            + 'padding-left:5px;color:white">'
+            + metric
+            + "</div></div>"
+            for metric in TASK_TO_DEFAULT_METRICS[selected_task]
+        ]
+    )
+    st.markdown(html_string, unsafe_allow_html=True)
+    selected_metrics = st.multiselect(
+        "(Optional) Select additional metrics",
+        list(set(supported_metrics) - set(TASK_TO_DEFAULT_METRICS[selected_task])),
+    )
+    st.info(
+        "Note: user-selected metrics will be run with their default arguments from "
+        + "[here](https://github.com/huggingface/datasets/tree/master/metrics)"
+    )

     selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
     print("Selected models:", selected_models)
@@ -292,7 +366,7 @@ with st.form(key="form"):
                 "disk_size_gb": 150,
             },
             "evaluation": {
-                "metrics": [],
+                "metrics": selected_metrics,
                 "models": selected_models,
             },
         },
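
For context on the new get_supported_metrics helper: it keeps a metric only when its compute() method exposes references and predictions as keyword-only arguments and every other keyword-only argument carries a usable (non-None) default, so the metric can run without extra configuration. Below is a minimal standalone sketch of that signature check, not part of the commit; it assumes a datasets release that still provides load_metric, and the helper name and the "accuracy" metric are purely illustrative.

import inspect

from datasets import load_metric


def runs_with_defaults(metric_name: str) -> bool:
    # Load the metric script and inspect the signature of its compute() method.
    metric = load_metric(metric_name)
    argspec = inspect.getfullargspec(metric.compute)

    # "references" and "predictions" must both be keyword-only arguments.
    if not {"references", "predictions"} <= set(argspec.kwonlyargs):
        return False

    # Every other keyword-only argument needs a non-None default, so the
    # metric can be computed without metric-specific settings.
    defaults = argspec.kwonlydefaults or {}
    return all(
        defaults.get(name) is not None
        for name in argspec.kwonlyargs
        if name not in ("references", "predictions")
    )


print(runs_with_defaults("accuracy"))  # illustrative call
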
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 huggingface-hub==0.4.0
 python-dotenv
 streamlit==1.2.0
+datasets
 py7zr
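
The unpinned datasets entry is what provides the list_metrics and load_metric calls imported in app.py above. A quick sanity check that the dependency resolves, assuming an installed datasets release that still exposes the metrics API (newer releases moved metrics to the separate evaluate library):

import datasets

print(datasets.__version__)
print(len(datasets.list_metrics()), "metric scripts available on the Hub")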