jogonba2 committed
Commit 64096c9
1 Parent(s): 9ddbb93

added alignscore and removed bleurt

Files changed (4):
  1. app.py +1 -1
  2. generation_evaluator.py +53 -26
  3. gradio_tst.py +21 -11
  4. requirements.txt +4 -2
app.py CHANGED
@@ -2,4 +2,4 @@ import evaluate
 from gradio_tst import launch_gradio_widget2
 
 module = evaluate.load("generation_evaluator.py")
-launch_gradio_widget2(module)
+launch_gradio_widget2(module)
generation_evaluator.py CHANGED
@@ -1,6 +1,8 @@
 import datasets
 import evaluate
 import numpy as np
+import spacy
+from alignscore import AlignScore
 
 _CITATION = """\
 @inproceedings{lin-2004-rouge,
@@ -77,10 +79,8 @@ Moreover, BERTScore computes precision, recall, and F1 measure, which can be useful for evaluating different language
 generation tasks.
 See the project's README at https://github.com/Tiiiger/bert_score#readme for more information.
 
-BLEURT a learnt evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning starting from a pretrained BERT model (Devlin et al. 2018)
-and then employing another pre-training phrase using synthetic data. Finally it is trained on WMT human annotations. You may run BLEURT out-of-the-box or fine-tune
-it for your specific application (the latter is expected to perform better).
-See the project's README at https://github.com/google-research/bleurt#readme for more information.
+AlignScore evaluates whether all the information in b is contained in a (b does not contradict a).
+See https://github.com/yuh-zha/AlignScore for more information.
 
 ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
 and ChrF++ adds word n-grams as well which correlates more strongly with direct assessment. We use the implementation
@@ -119,8 +119,8 @@ BERT_SCORE:{
     "f1": F1 score.
     "hashcode": Hashcode of the library.
 },
-BLEURT:{
-    "scores": List of scores.
+AlignScore:{
+    "score": mean align scores using roberta-large as scorer
 },
 CHRF:{
     'score' (float): The chrF (chrF++) score,
@@ -130,6 +130,14 @@ CHRF:{
 }
 """
 
+ALIGNSCORE_ARGS = {
+    "model": "roberta-large",
+    "batch_size": 32,
+    "device": "cuda",
+    "ckpt_path": "https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt",
+    "evaluation_mode": "nli_sp",
+}
+
 
 class GenerationEvaluator(evaluate.Metric):
     def _info(self):
@@ -152,52 +160,71 @@ class GenerationEvaluator(evaluate.Metric):
             ],
         )
 
-    def _compute(self, predictions, references):
-
+    def _download_and_prepare(self, dl_manager):
+        # Download Spacy en_core_web_sm model for AlignScore
+        try:
+            spacy.load("en_core_web_sm")
+        except OSError:
+            spacy.cli.download("en_core_web_sm")
+
+        # Download AlignScore checkpoint
+        model_path = dl_manager.download(ALIGNSCORE_ARGS["ckpt_path"])
+        ALIGNSCORE_ARGS["ckpt_path"] = model_path
+        self.align_scorer = AlignScore(**ALIGNSCORE_ARGS)
+
+    def _compute(self, predictions, references):
+        # Compute ROUGE
         rouge_score = evaluate.load("rouge")
 
         rouge_results = rouge_score.compute(
             predictions=predictions, references=references
         )
+
+        # Compute BLEU
         bleu_score = evaluate.load("bleu")
         bleu_results = bleu_score.compute(
            predictions=predictions, references=references
        )
 
+        # Compute Exact Match
         exact_match_score = evaluate.load("exact_match")
         exact_match_results = exact_match_score.compute(
             predictions=predictions, references=references
         )
 
+        # Compute BERTScore
         bert_score = evaluate.load("bertscore")
         bert_score_results = bert_score.compute(
             predictions=predictions, references=references, lang="en"
         )
-
-        mean_precision = np.mean(bert_score_results['precision'])
-        mean_recall = np.mean(bert_score_results['recall'])
-        mean_f1 = np.mean(bert_score_results['f1'])
-
-        bert_score_results['precision'] = round(mean_precision, 4)
-        bert_score_results['recall'] = round(mean_recall, 4)
-        bert_score_results['f1'] = round(mean_f1, 4)
-
-        bleurt_score = evaluate.load("bleurt", module_type="metric")
-        bleurt_results = bleurt_score.compute(
-            predictions=predictions, references=references
+
+        mean_precision = np.mean(bert_score_results["precision"])
+        mean_recall = np.mean(bert_score_results["recall"])
+        mean_f1 = np.mean(bert_score_results["f1"])
+
+        bert_score_results["precision"] = round(mean_precision, 4)
+        bert_score_results["recall"] = round(mean_recall, 4)
+        bert_score_results["f1"] = round(mean_f1, 4)
+
+        # Compute AlignScore
+        align_score = round(
+            np.mean(
+                self.align_scorer.score(contexts=references, claims=predictions)
+            ),
+            4,
         )
-
-        mean_bleurt_score = np.mean(bleurt_results['scores'])
-        bleurt_results['scores'] = round(mean_bleurt_score, 4)
-
+
+        # Compute CHRF
         chrf = evaluate.load("chrf")
-        chrf_results = chrf.compute(predictions=predictions, references=references)
+        chrf_results = chrf.compute(
+            predictions=predictions, references=references
+        )
 
         return {
             "ROUGE": rouge_results,
             "BLEU": bleu_results,
             "EXACT_MATCH": exact_match_results,
             "BERT_SCORE": bert_score_results,
-            "BLEURT": bleurt_results,
-            "CHRF": chrf_results
+            "CHRF": chrf_results,
+            "ALIGN_SCORE": align_score,
         }
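
A minimal usage sketch of the updated evaluator outside the Gradio widget, assuming the checkpoint download in _download_and_prepare succeeds and a CUDA device is available (ALIGNSCORE_ARGS pins device="cuda"); the example strings are illustrative only:

import evaluate

# Load the combined metric script and score a single prediction/reference pair.
evaluator = evaluate.load("generation_evaluator.py")
results = evaluator.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
)

# Keys follow the dict returned by _compute above.
print(results["BERT_SCORE"]["f1"], results["ALIGN_SCORE"], results["CHRF"]["score"])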
gradio_tst.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import logging
 import os
 import re
 import sys
@@ -7,10 +8,6 @@ from pathlib import Path
 import numpy as np
 from datasets import Value
 
-import logging
-
-
-
 REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")
 
 
@@ -27,7 +24,9 @@ def infer_gradio_input_types(feature_types):
     for feature_type in feature_types:
         input_type = "json"
         if isinstance(feature_type, Value):
-            if feature_type.dtype.startswith("int") or feature_type.dtype.startswith("float"):
+            if feature_type.dtype.startswith(
+                "int"
+            ) or feature_type.dtype.startswith("float"):
                 input_type = "number"
             elif feature_type.dtype == "string":
                 input_type = "str"
@@ -59,9 +58,13 @@ def parse_gradio_data(data, input_types):
     data.dropna(inplace=True)
     for feature_name, input_type in zip(data, input_types):
         if input_type == "json":
-            metric_inputs[feature_name] = [json.loads(d) for d in data[feature_name].to_list()]
+            metric_inputs[feature_name] = [
+                json.loads(d) for d in data[feature_name].to_list()
+            ]
         elif input_type == "str":
-            metric_inputs[feature_name] = [d.strip('"') for d in data[feature_name].to_list()]
+            metric_inputs[feature_name] = [
+                d.strip('"') for d in data[feature_name].to_list()
+            ]
         else:
             metric_inputs[feature_name] = data[feature_name]
     return metric_inputs
@@ -79,9 +82,13 @@ def parse_test_cases(test_cases, feature_names, input_types):
         parsed_cases = []
         for feat, input_type in zip(feature_names, input_types):
             if input_type == "json":
-                parsed_cases.append([str(element) for element in test_case[feat]])
+                parsed_cases.append(
+                    [str(element) for element in test_case[feat]]
+                )
             elif input_type == "str":
-                parsed_cases.append(['"' + element + '"' for element in test_case[feat]])
+                parsed_cases.append(
+                    ['"' + element + '"' for element in test_case[feat]]
+                )
             else:
                 parsed_cases.append(test_case[feat])
         examples.append([list(i) for i in zip(*parsed_cases)])
@@ -94,7 +101,9 @@ def launch_gradio_widget2(metric):
     try:
         import gradio as gr
     except ImportError as error:
-        logging.error("To create a metric widget with Gradio make sure gradio is installed.")
+        logging.error(
+            "To create a metric widget with Gradio make sure gradio is installed."
+        )
         raise error
 
     local_path = Path(sys.path[0])
@@ -118,7 +127,8 @@ def launch_gradio_widget2(metric):
         ),
         outputs=gr.Textbox(label=metric.name),
         description=(
-            metric.info.description + "\nIf this is a text-based metric, make sure to wrap you input in double quotes."
+            metric.info.description
+            + "\nIf this is a text-based metric, make sure to wrap you input in double quotes."
             " Alternatively you can use a JSON-formatted list as input."
         ),
         title=f"Metric: {metric.name}",
requirements.txt CHANGED
@@ -3,7 +3,9 @@ datasets
 scikit-learn
 gradio
 bert_score
-git+https://github.com/google-research/bleurt.git
+rouge_score
 numpy
 git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d
-sacrebleu
+sacrebleu
+git+ssh://[email protected]/yuh-zha/AlignScore.git
+spacy
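
To sanity-check the new alignscore and spacy requirements in isolation, a sketch along these lines mirrors the scorer constructed in _download_and_prepare; the local checkpoint path is a placeholder (the metric script downloads it from the Hugging Face Hub), en_core_web_sm is assumed to be installed, and a GPU is assumed:

from alignscore import AlignScore

# Same arguments as ALIGNSCORE_ARGS in generation_evaluator.py, except that
# ckpt_path points to an already-downloaded local checkpoint (assumption).
scorer = AlignScore(
    model="roberta-large",
    batch_size=32,
    device="cuda",
    ckpt_path="AlignScore-large.ckpt",
    evaluation_mode="nli_sp",
)

# As in _compute: references are treated as contexts, predictions as claims.
scores = scorer.score(
    contexts=["a cat was sitting on the mat"],
    claims=["the cat sat on the mat"],
)
print(round(sum(scores) / len(scores), 4))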