from logging import warning
from datasets import load_dataset
from alignscore.inference import Inferencer
import numpy as np
from scipy.stats import pearsonr, kendalltau, spearmanr
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, balanced_accuracy_score, matthews_corrcoef
import pandas as pd
import torch
import json
import pickle
import os
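
# Hugging Face datasets used for evaluation. Each value is either
# [dataset path, config name, split] or [dataset path, split].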
HUGGINGFACE_DATASETS = { | |
'stsb': ['glue', 'stsb', 'validation'], | |
'mrpc': ['glue', 'mrpc', 'test'], | |
'axb': ['super_glue', 'axb', 'test'], | |
'axg': ['super_glue', 'axg', 'test'], | |
'cb': ['super_glue', 'cb', 'validation'], | |
'rte': ['super_glue', 'rte', 'validation'], | |
'wnli': ['SetFit/wnli', 'validation'], | |
'paws': ['paws', 'labeled_final', 'test'], | |
'mnli_matched': ['multi_nli', 'validation_matched'], | |
'mnli_mismatched': ['multi_nli', 'validation_mismatched'], | |
'nli_fever': ['pietrolesci/nli_fever', 'dev'], | |
'doc_nli': ['saattrupdan/doc-nli', 'test'], | |
'sem_eval': ['sem_eval_2014_task_1', 'test'], | |
'sick': ['sick', 'default', 'test'], | |
'race_m': ['race', 'middle', 'test'], | |
'race_h': ['race', 'high', 'test'], | |
'boolq': ['boolq', 'validation'], | |
'anli_1': ['anli', 'test_r1'], | |
'anli_2': ['anli', 'test_r2'], | |
'anli_3': ['anli', 'test_r3'], | |
'snli': ['snli', 'test'], | |
'vitaminc': ['tals/vitaminc', 'test'], | |
'qqp': ['glue', 'qqp', 'validation'], | |
    # the tasks below are from https://arxiv.org/pdf/2104.14690.pdf
    'sst2': ['SetFit/sst2', 'test'],
    # could not find MR on the Hub
    'cr': ['SetFit/SentEval-CR', 'test'],
    # could not find MPQA on the Hub
    'subj': ['SetFit/subj', 'test'],
    # could not find OS on the Hub
    'imdb': ['SetFit/imdb', 'test'],  # note: cannot confirm this is the same dataset used in that paper;
    # the original dataset is no longer accessible
'cola': ['glue', 'cola', 'validation'], | |
'yelp_efl': ['SetFit/yelp_review_full', 'test'], | |
'ag_news': ['SetFit/ag_news', 'test'], | |
'trec': ['SetFit/TREC-QC', 'test',], | |
'dream': ['dream', 'test'], | |
'quartz': ['quartz', 'test'], | |
'eraser_multi_rc': ['eraser_multi_rc', 'test'], | |
'quail': ['quail', 'challenge'], | |
'sciq': ['sciq', 'test'], | |
'gap': ['gap', 'test'], | |
'qnli': ['glue', 'qnli', 'validation'] | |
} | |
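
# Datasets loaded from local pickle files under data/eval/.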
PICKLE_DATASETS = [ | |
'newsroom', | |
'rank19', | |
'bagel', | |
'sfhot', | |
'sfres' | |
] | |
ALL_TASKS = {  # all supported tasks; the value selects which align_func output a task uses
    'stsb': 0,  # 0 = regression output, 1 = binary output, 2 = tri-label output
'sick': 0, | |
'race_m': 1, | |
'race_h': 1, | |
'boolq': 1, | |
'anli_1': 2, | |
'anli_2': 2, | |
'anli_3': 2, | |
'snli': 2, | |
'vitaminc': 2, | |
'mrpc': 1, | |
'paws': 1, | |
'mnli_matched': 2, | |
'mnli_mismatched': 2, | |
'sem_eval': 1, | |
'summeval': 1, | |
'qags_xsum': 1, | |
'qags_cnndm': 1, | |
'frank': 1, | |
'xsumfaith': 1, | |
'samsum': 1, | |
'yelp': 1, | |
'persona_chat': 1, | |
'topical_chat': 1, | |
'paws_qqp': 1, | |
'qqp': 1, | |
'newsroom': 1, | |
'rank19': 1, | |
'bagel': 1, | |
'sfhot': 1, | |
'sfres': 1, | |
'wmt17': 0, | |
'wmt18': 0, | |
'wmt19': 0, | |
'sst2': 1, | |
'cr': 1, | |
'subj': 1, | |
'imdb': 1, | |
'cola': 1, | |
'yelp_efl': 1, | |
'ag_news': 1, | |
'trec': 1, | |
'axb': 1, | |
'axg': 1, | |
'cb': 2, | |
'rte': 2, | |
'wnli': 2, | |
'dream': 1, | |
'quartz': 1, | |
'nli_fever': 2, | |
'doc_nli': 1, | |
'eraser_multi_rc': 1, | |
'quail': 1, | |
'sciq': 1, | |
'gap': 1, | |
'qnli': 1 | |
} | |
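
# Few-shot evaluation settings (number of shots and candidate random seeds); not referenced elsewhere in this file.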
FEW_SHOT_N = 8 | |
FEW_SHOT_SEEDS = [30247, 38252, 29050, 1091, 35554, 25309, 79319, 35079, 35256, 46744] | |
class Evaluator(): | |
def __init__(self, eval_tasks, align_func, save_all_tables=False, clean_data=True) -> None: | |
self.align_func = align_func | |
self.eval_tasks = eval_tasks # ['stsb', 'paws', ...] | |
self.result_save_name = "Default_result_name" | |
self.result_tables = [] | |
self.result_dicts = [] | |
self.clean_data = clean_data | |
self.init_eval_dataset() | |
self.should_save_all_tables = save_all_tables | |
warning(f"Saving the result is: {self.should_save_all_tables}") | |
def init_eval_dataset(self): | |
self.dataset = dict() | |
for eval_task in self.eval_tasks: | |
if eval_task in HUGGINGFACE_DATASETS: | |
if len(HUGGINGFACE_DATASETS[eval_task]) == 3: | |
self.dataset[eval_task] = load_dataset(HUGGINGFACE_DATASETS[eval_task][0], HUGGINGFACE_DATASETS[eval_task][1])[HUGGINGFACE_DATASETS[eval_task][2]] | |
elif len(HUGGINGFACE_DATASETS[eval_task]) == 2: | |
if isinstance(HUGGINGFACE_DATASETS[eval_task][1], tuple): | |
dataset = load_dataset(HUGGINGFACE_DATASETS[eval_task][0]) | |
self.dataset[eval_task] = {split:dataset[split] for split in HUGGINGFACE_DATASETS[eval_task][1]} | |
else: | |
self.dataset[eval_task] = load_dataset(HUGGINGFACE_DATASETS[eval_task][0])[HUGGINGFACE_DATASETS[eval_task][1]] | |
elif eval_task == 'paws_qqp': | |
self.dataset[eval_task] = pd.read_csv('data/paws_qqp/output/dev_and_test.tsv', sep='\t') | |
            elif eval_task == 'beir':
                print("BEIR datasets are downloaded and loaded inside evaluate_beir()")
                self.dataset[eval_task] = "BEIR Benchmark"
elif eval_task in PICKLE_DATASETS: | |
with open(f'data/eval/{eval_task}.pkl', 'rb') as f: | |
self.dataset[eval_task] = pickle.load(f) | |
elif 'wmt' in eval_task: | |
self.dataset[eval_task] = [] | |
with open(f'data/eval/{eval_task}_eval.jsonl', 'r', encoding='utf8') as f: | |
for example in f: | |
self.dataset[eval_task].append(json.loads(example)) | |
elif 'true' == eval_task: | |
for each_true_sub in os.listdir('data/eval/true'): | |
if 'qags' in each_true_sub: | |
each_true_sub_name = 'true_' + '_'.join(each_true_sub.split('_')[:2]) | |
else: | |
each_true_sub_name = 'true_' + '_'.join(each_true_sub.split('_')[:1]) | |
self.dataset[each_true_sub_name] = pd.read_csv(os.path.join('data/eval/true', each_true_sub)) | |
elif 'summac' == eval_task: | |
from summac.benchmark import SummaCBenchmark | |
self.summac_validation_set = dict() | |
summac_benchmark = SummaCBenchmark(benchmark_folder="./data/eval/summac/benchmark", cut='test') | |
for each in summac_benchmark.datasets: | |
summac_dt_name = each['name'] | |
self.dataset['summac_'+summac_dt_name] = each['dataset'] | |
summac_benchmark_valid = SummaCBenchmark(benchmark_folder="./data/eval/summac/benchmark", cut='val') | |
for each in summac_benchmark_valid.datasets: | |
summac_dt_name = each['name'] | |
self.summac_validation_set['summac_'+summac_dt_name] = each['dataset'] | |
            else:
                with open(f'data/eval/{eval_task}.json') as f:
                    self.dataset[eval_task] = json.load(f)
def print_result_table(self, table): | |
self.result_tables.append(pd.DataFrame(table).to_markdown()) | |
self.result_dicts.append(table) | |
print(self.result_tables[-1]) | |
def print_all_tables(self): | |
print("\n All Evaluation Results:") | |
for each in self.result_tables: | |
print(each) | |
print('='*100) | |
def save_all_tables(self): | |
with open(f'exp_results/{self.result_save_name}.pkl', 'wb') as f: | |
pickle.dump(self.result_dicts, f, protocol=pickle.HIGHEST_PROTOCOL) | |
    def evaluate(self):
        for each_task in self.dataset:
            getattr(self, f'evaluate_{each_task}')()
if self.should_save_all_tables: | |
self.save_all_tables() | |
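    # Metric helpers. Binary metrics threshold the alignment score at 0.5 unless a threshold is given.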
def get_accuracy(self, true_score, pred_score): | |
return [accuracy_score(true_score, [m>0.5 for m in pred_score])] | |
def get_balanced_accuracy(self, true_score, pred_score, thres=0.5): | |
return [balanced_accuracy_score(true_score, [m>thres for m in pred_score])] | |
def get_f1(self, true_score, pred_score): | |
return [f1_score(true_score, [m>0.5 for m in pred_score])] | |
def get_3label_f1(self, true_score, pred_score): | |
return [f1_score(true_score, pred_score, average='micro')] | |
def get_pearson(self, true_score, pred_score): | |
return pearsonr(pred_score, true_score) | |
def get_kendalltau(self, true_score, pred_score): | |
return kendalltau(pred_score, true_score) | |
def get_spearman(self, true_score, pred_score): | |
return spearmanr(pred_score, true_score) | |
def get_matthews_corr(self, true_score, pred_score): | |
return [matthews_corrcoef(true_score, [s>0.5 for s in pred_score])] | |
def clean_text(self, context, claims): | |
from nltk.tokenize import sent_tokenize | |
if not self.clean_data: | |
return claims | |
word_cases = {token.lower():token for token in context.strip().split()} | |
def clean(text): | |
text = ' '.join(word_cases.get(token.lower(), token) for token in text.strip().split()) | |
text = text.replace('“', '"').replace('”', '"').replace('’', '\'').replace('‘', '\'').replace('`', '\'').replace('-lrb-', '(').replace('-rrb-', ')') | |
text= ' '.join(each.strip()[0].capitalize()+each.strip()[1:] for each in sent_tokenize(text)) | |
return text | |
if isinstance(claims, str): | |
return clean(claims) | |
return [clean(text) for text in claims] | |
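    # Each evaluate_<task> method builds (context, claim) pairs, scores them with align_func,
    # and reports the relevant correlation / classification metrics via print_result_table.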
def evaluate_newsroom(self): | |
true_score = [] | |
true_score_rel = [] | |
true_score_binary = [] | |
sent1 = [] | |
sent2 = [] | |
for sample in self.dataset['newsroom'].values(): | |
summaries, informativeness, relevance = zip(*( | |
(s['sys_summ'], s['scores']['informativeness'], s['scores']['relevance']) | |
for s in sample['sys_summs'].values() | |
)) | |
cleaned_summaries = self.clean_text(sample['src'], summaries) | |
for summary, inf_score, rel_score in zip(cleaned_summaries, informativeness, relevance): | |
sent1.append(sample['src']) | |
sent2.append(summary) | |
true_score.append(inf_score) | |
true_score_rel.append(rel_score) | |
true_score_binary.append(int(inf_score >= 4)) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['newsroom']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'newsroom', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score_binary, pred_score), | |
'Pearson_rel': self.get_pearson(true_score_rel, pred_score), | |
'Spearman_rel': self.get_spearman(true_score_rel, pred_score), | |
'Kendall_rel': self.get_kendalltau(true_score_rel, pred_score), | |
}) | |
def evaluate_rank19(self): | |
def chunks(lst, n): | |
"""Yield successive n-sized chunks from lst.""" | |
for i in range(0, len(lst), n): | |
yield lst[i:i + n] | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['rank19']: | |
for example_summs in self.dataset['rank19'][example]['sys_summs']: | |
sent1.append(self.dataset['rank19'][example]['src']) | |
sent2.append(self.dataset['rank19'][example]['sys_summs'][example_summs]['sys_summ']) | |
true_score.append(self.dataset['rank19'][example]['sys_summs'][example_summs]['scores']['fact']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['rank19']].tolist() | |
pred_score_bin = [] | |
assert len(pred_score) % 2 == 0 | |
for i, pair in enumerate(chunks(pred_score, 2)): | |
pred_score_bin.extend([0, 1] if pair[1] > pair[0] else [1, 0]) | |
self.print_result_table({ | |
'Dataset_name': 'rank19', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score_bin)[0], | |
'AUC': roc_auc_score(true_score, pred_score_bin) | |
}) | |
def evaluate_bagel(self): | |
true_score = [] | |
true_score_binary = [] | |
sent1 = [] | |
sent2 = [] | |
pred_score = [] | |
for example in self.dataset['bagel']: | |
sent1.append(' '.join(self.dataset['bagel'][example]['ref_summs'])) | |
sent2.append(self.dataset['bagel'][example]['sys_summ']) | |
true_score.append(self.dataset['bagel'][example]['scores']['informativeness']) | |
if(self.dataset['bagel'][example]['scores']['informativeness'] >= 4.0): | |
true_score_binary.append(1) | |
else: | |
true_score_binary.append(0) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['bagel']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'bagel', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score_binary, pred_score) | |
}) | |
def evaluate_sfhot(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
pred_score = [] | |
for example in self.dataset['sfhot']: | |
for ref in self.dataset['sfhot'][example]['ref_summs']: | |
sent1.append(self.dataset['sfhot'][example]['sys_summ']) | |
sent2.append(ref) | |
pred_score.append(max(self.align_func(sent1, sent2)[ALL_TASKS['sfhot']].tolist())) | |
sent1 = [] | |
sent2 = [] | |
if(self.dataset['sfhot'][example]['scores']['quality'] >= 4.0): | |
true_score.append(1) | |
else: | |
true_score.append(0) | |
self.print_result_table({ | |
'Dataset_name': 'sfhot', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_sfres(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
pred_score = [] | |
for example in self.dataset['sfres']: | |
for ref in self.dataset['sfres'][example]['ref_summs']: | |
sent1.append(self.dataset['sfres'][example]['sys_summ']) | |
sent2.append(ref) | |
pred_score.append(max(self.align_func(sent1, sent2)[ALL_TASKS['sfres']].tolist())) | |
sent1 = [] | |
sent2 = [] | |
if(self.dataset['sfres'][example]['scores']['quality'] >= 4.0): | |
true_score.append(1) | |
else: | |
true_score.append(0) | |
self.print_result_table({ | |
'Dataset_name': 'sfres', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_stsb(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['stsb']: | |
sent1.append(example['sentence1']) | |
sent2.append(example['sentence2']) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['stsb']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'stsb', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_sick(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['sick']: | |
sent1.append(example['sentence_A']) | |
sent2.append(example['sentence_B']) | |
true_score.append(example['relatedness_score']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['sick']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'sick-r', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_race_m(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['race_m']: | |
for i, option in enumerate(example['options']): | |
article.append(example['article']) | |
qa.append(example['question']+" "+option+" " if "_" not in example['question'] else ' '.join(example['question'].replace("_", " "+option+" ").split())) | |
if i == ord(example['answer'])-65: | |
true_score.append(i) # 0,1,2,3 | |
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['race_m']].tolist() | |
        for a, b, c, d in zip(*[iter(pred_score_temp)]*4):
            pred_score.append(np.argmax([a,b,c,d]))
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'race-m', | |
'Accuracy': [acc], | |
}) | |
def evaluate_race_h(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['race_h']: | |
for i, option in enumerate(example['options']): | |
article.append(example['article']) | |
qa.append(example['question']+" "+option+" " if "_" not in example['question'] else ' '.join(example['question'].replace("_", " "+option+" ").split())) | |
if i == ord(example['answer'])-65: | |
true_score.append(i) # 0,1,2,3 | |
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['race_h']].tolist() | |
for a, b, c, d in zip(*[iter(pred_score_temp)]*4): | |
pred_score.append(np.argmax([a,b,c,d])) | |
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'race-h', | |
'Accuracy': [acc] | |
}) | |
    # boolq: pair the passage with the question followed by "No." / "Yes." and take the higher-scoring option
def evaluate_boolq(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['boolq']: | |
for i in range(2): | |
article.append(example['passage']) | |
if i == 0: | |
qa.append(example['question']+" "+"No.") # 0 | |
else: | |
qa.append(example['question']+" "+"Yes.") # 1 | |
true_score.append(int(example['answer'])) | |
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['boolq']].tolist() | |
for a, b in zip(*[iter(pred_score_temp)]*2): | |
pred_score.append(np.argmax([a,b])) | |
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'boolq', | |
'Accuracy': [acc] | |
}) | |
def evaluate_anli_1(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['anli_1']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(example['label'] if example['label']!=-1 else 1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['anli_1']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'anli-1', | |
'Accuracy': [accuracy_score(true_score, pred_score)] | |
}) | |
def evaluate_anli_2(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['anli_2']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(example['label'] if example['label']!=-1 else 1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['anli_2']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'anli-2', | |
'Accuracy': [accuracy_score(true_score, pred_score)] | |
}) | |
def evaluate_anli_3(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['anli_3']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(example['label'] if example['label']!=-1 else 1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['anli_3']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'anli-3', | |
'Accuracy': [accuracy_score(true_score, pred_score)] | |
}) | |
def evaluate_nli_fever(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['nli_fever']: | |
            sent1.append(example['hypothesis'])  # the original dataset has premise and hypothesis swapped
sent2.append(example['premise']) | |
true_score.append(example['label'] if example['label']!=-1 else 1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['nli_fever']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'nli_fever', | |
'Accuracy': [accuracy_score(true_score, pred_score)] | |
}) | |
def evaluate_snli(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['snli']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(example['label'] if example['label']!=-1 else 1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['snli']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'snli', | |
'Accuracy': [accuracy_score(true_score, pred_score)] | |
}) | |
def evaluate_axb(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['axb']: | |
sent1.append(example['sentence1']) | |
sent2.append(example['sentence2']) | |
true_score.append(1 if example['label']==0 else 0) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['axb']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'axb', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)], | |
'Matthews': self.get_matthews_corr(true_score, pred_score) | |
}) | |
def evaluate_axg(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['axg']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(1 if example['label']==0 else 0) | |
pred_score = self.align_func(sent1, sent2)[2][:,0].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'axg', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)], | |
}) | |
def evaluate_cb(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['cb']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
if example['label'] == 0: | |
label = 0 | |
elif example['label'] == 1: | |
label = 2 | |
elif example['label'] == 2: | |
label = 1 | |
true_score.append(label) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['cb']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'cb', | |
'Accuracy': [accuracy_score(true_score, pred_score)], | |
}) | |
def evaluate_rte(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['rte']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(1 if example['label']==0 else 0) | |
pred_score = self.align_func(sent1, sent2)[2][:,0].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'rte', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)], | |
}) | |
def evaluate_wnli(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['wnli']: | |
sent1.append(example['text1']) | |
sent2.append(example['text2']) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[2][:,0].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'wnli', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)], | |
}) | |
def evaluate_doc_nli(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['doc_nli']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(1 if example['label'] == 'entailment' else 0) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['doc_nli']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'doc_nli', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)], | |
}) | |
def evaluate_qnli(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['qnli']: | |
sent1.append(example['sentence']) | |
sent2.append(example['question']) | |
true_score.append(1 if example['label'] == 0 else 0) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qnli']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'qnli', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)], | |
}) | |
def evaluate_dream(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['dream']: | |
for i, option in enumerate(example['choice']): | |
article.append(' '.join(example['dialogue'])) | |
qa.append(example['question']+" "+option+" ") | |
if option == example['answer']: | |
true_score.append(i) # 0,1,2,3 | |
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['dream']].tolist() | |
        for a, b, c in zip(*[iter(pred_score_temp)]*3):
            pred_score.append(np.argmax([a,b,c]))
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'dream', | |
'Accuracy': [acc], | |
}) | |
def evaluate_quartz(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['quartz']: | |
for i, option in enumerate(example['choices']['text']): | |
article.append(example['para']) | |
qa.append(example['question']+" "+option+" ") | |
if i == ord(example['answerKey'])-65: | |
true_score.append(i) # 0,1,2,3 | |
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['quartz']].tolist() | |
        for a, b in zip(*[iter(pred_score_temp)]*2):
            pred_score.append(np.argmax([a,b]))
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'quartz', | |
'Accuracy': [acc], | |
}) | |
def evaluate_eraser_multi_rc(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['eraser_multi_rc']: | |
sent1.append(example['passage']) | |
sent2.append(example['query_and_answer'].replace("|", "")) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['eraser_multi_rc']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'eraser_multi_rc', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)] | |
}) | |
def evaluate_quail(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['quail']: | |
for i, option in enumerate(example['answers']): | |
article.append(example['context']) | |
qa.append(example['question']+" "+option+" ") | |
if i == example['correct_answer_id']: | |
true_score.append(i) # 0,1,2,3 | |
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['quail']].tolist() | |
        for a, b, c, d in zip(*[iter(pred_score_temp)]*4):
            pred_score.append(np.argmax([a,b,c,d]))
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'quail', | |
'Accuracy': [acc], | |
}) | |
def evaluate_sciq(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['sciq']: | |
options = [example['correct_answer'], example['distractor1'], example['distractor2'], example['distractor3']] | |
for i, option in enumerate(options): | |
article.append(example['support']) | |
qa.append(example['question']+" "+option+" ") | |
if i == 0: | |
true_score.append(i) # 0,1,2,3, always 0 | |
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['sciq']].tolist() | |
        for a, b, c, d in zip(*[iter(pred_score_temp)]*4):
            pred_score.append(np.argmax([a,b,c,d]))
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'sciq', | |
'Accuracy': [acc], | |
}) | |
def evaluate_gap(self): | |
true_score = [] | |
article = [] | |
qa = [] | |
for example in self.dataset['gap']: | |
options = [example['Text'][:example['Pronoun-offset']]+example['A']+example['Text'][(example['Pronoun-offset']+len(example['Pronoun'])):], | |
example['Text'][:example['Pronoun-offset']]+example['B']+example['Text'][(example['Pronoun-offset']+len(example['Pronoun'])):]] | |
for i, option in enumerate(options): | |
article.append(example['Text']) | |
qa.append(option) | |
            true_score.append(1 if example['B-coref'] else 0)  # 1 if candidate B is the referent, else 0
pred_score = [] | |
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['gap']].tolist() | |
for a, b in zip(*[iter(pred_score_temp)]*2): | |
pred_score.append(np.argmax([a,b])) | |
assert len(pred_score) == len(true_score) | |
acc = [int(p==t) for p, t in zip(pred_score, true_score)] | |
acc = sum(acc) / len(acc) | |
self.print_result_table({ | |
'Dataset_name': 'gap', | |
'Accuracy': [acc], | |
}) | |
    # Fact verification
def evaluate_vitaminc(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['vitaminc']: | |
sent1.append(example['evidence']) | |
sent2.append(example['claim']) | |
if example['label'] == 'SUPPORTS': | |
true_score.append(0) | |
elif example['label'] == 'REFUTES': | |
true_score.append(2) | |
else: | |
true_score.append(1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['vitaminc']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'vitaminc', | |
'F1': self.get_3label_f1(true_score, pred_score), | |
'Accuracy': [accuracy_score(true_score, pred_score)], | |
}) | |
def evaluate_mrpc(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['mrpc']: | |
sent1.append(example['sentence1']) | |
sent2.append(example['sentence2']) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['mrpc']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'mrpc', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)] | |
}) | |
def evaluate_paws(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['paws']: | |
sent1.append(example['sentence1']) | |
sent2.append(example['sentence2']) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['paws']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'paws', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)] | |
}) | |
def evaluate_mnli_matched(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['mnli_matched']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(example['label'] if example['label']!=-1 else 1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['mnli_matched']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'mnli_matched', | |
'Accuracy': [accuracy_score(true_score, pred_score)] | |
}) | |
def evaluate_mnli_mismatched(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['mnli_mismatched']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
true_score.append(example['label'] if example['label']!=-1 else 1) | |
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['mnli_mismatched']], dim=-1).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'mnli_mismatched', | |
'Accuracy': [accuracy_score(true_score, pred_score)] | |
}) | |
    def evaluate_sem_eval(self):
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['sem_eval']: | |
sent1.append(example['premise']) | |
sent2.append(example['hypothesis']) | |
if example['entailment_judgment'] == 1: | |
true_score.append(1) | |
else: | |
true_score.append(0) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['sem_eval']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'sem_eval', | |
'Accuracy': self.get_accuracy(true_score, pred_score) | |
}) | |
def evaluate_summeval(self): | |
true_score = [] | |
true_score_rel = [] | |
true_score_binary = [] | |
pred_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['summeval']: | |
cleaned_summary = self.clean_text(example['document'], example['summary']) | |
sent1.append(example['document']) | |
sent2.append(cleaned_summary) | |
true_score.append(example['consistency']) | |
true_score_rel.append(example['relevance']) | |
true_score_binary.append(1 if example['consistency'] == 5.0 else 0) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['summeval']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'summeval', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score_binary, pred_score), | |
'Pearson_rel': self.get_pearson(true_score_rel, pred_score), | |
'Spearman_rel': self.get_spearman(true_score_rel, pred_score), | |
'Kendall_rel': self.get_kendalltau(true_score_rel, pred_score), | |
}) | |
def evaluate_qags_xsum(self): | |
true_score = [] | |
pred_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['qags_xsum']: | |
sent1.append(example['document']) | |
sent2.append(example['summary']) | |
true_score.append(example['consistency']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qags_xsum']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'qags_xsum', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_qags_cnndm(self): | |
true_score = [] | |
pred_score = [] | |
sent1 = [] | |
sent2 = [] | |
true_score_binary = [] | |
for example in self.dataset['qags_cnndm']: | |
sent1.append(example['document']) | |
sent2.append(example['summary']) | |
true_score.append(example['consistency']) | |
true_score_binary.append(1 if example['consistency'] == 1.0 else 0) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qags_cnndm']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'qags_cnndm', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score_binary, pred_score) | |
}) | |
def evaluate_frank(self): | |
from spacy.lang.en import English | |
nlp = English() | |
nlp.add_pipe("sentencizer") | |
for d in self.dataset['frank']: | |
if d['dataset'] == 'cnndm': | |
continue | |
d['document'] = ' '.join([each.text for each in nlp(d['document']).sents]) | |
true_score_xsum = [] | |
true_score_cnndm = [] | |
pred_score_xsum = [] | |
pred_score_cnndm = [] | |
sent1_xsum = [] | |
sent1_cnndm = [] | |
sent2_xsum = [] | |
sent2_cnndm = [] | |
true_score_binary_cnndm = [] | |
true_score_binary_xsum = [] | |
for example in self.dataset['frank']: | |
if example['dataset'] == 'cnndm': | |
sent1_cnndm.append(example['document']) | |
sent2_cnndm.append(self.clean_text(example['document'], example['summary'])) | |
true_score_cnndm.append(example['score']) | |
true_score_binary_cnndm.append(1 if example['score'] == 1.0 else 0) | |
elif example['dataset'] == 'xsum': | |
sent1_xsum.append(example['document']) | |
sent2_xsum.append(self.clean_text(example['document'], example['summary'])) | |
true_score_xsum.append(example['score']) | |
true_score_binary_xsum.append(1 if example['score'] == 1.0 else 0) | |
pred_score_xsum = self.align_func(sent1_xsum, sent2_xsum)[ALL_TASKS['frank']].tolist() # | |
pred_score_cnndm = self.align_func(sent1_cnndm, sent2_cnndm)[ALL_TASKS['frank']].tolist() # | |
self.print_result_table({ | |
'Dataset_name': 'frank-xsum', | |
'Pearson': self.get_pearson(true_score_xsum, pred_score_xsum), | |
'Spearman': self.get_spearman(true_score_xsum, pred_score_xsum), | |
'Kendall': self.get_kendalltau(true_score_xsum, pred_score_xsum), | |
'AUC': roc_auc_score(true_score_binary_xsum, pred_score_xsum) | |
}) | |
self.print_result_table({ | |
'Dataset_name': 'frank-cnndm', | |
'Pearson': self.get_pearson(true_score_cnndm, pred_score_cnndm), | |
'Spearman': self.get_spearman(true_score_cnndm, pred_score_cnndm), | |
'Kendall': self.get_kendalltau(true_score_cnndm, pred_score_cnndm), | |
'AUC': roc_auc_score(true_score_binary_cnndm, pred_score_cnndm) | |
}) | |
self.print_result_table({ | |
'Dataset_name': 'frank-all', | |
'Pearson': self.get_pearson(true_score_xsum+true_score_cnndm, pred_score_xsum+pred_score_cnndm), | |
'Spearman': self.get_spearman(true_score_xsum+true_score_cnndm, pred_score_xsum+pred_score_cnndm), | |
'Kendall': self.get_kendalltau(true_score_xsum+true_score_cnndm, pred_score_xsum+pred_score_cnndm), | |
'AUC': roc_auc_score(true_score_binary_xsum+true_score_binary_cnndm, pred_score_xsum+pred_score_cnndm) | |
}) | |
def evaluate_xsumfaith(self): | |
dataset_name = 'xsumfaith' | |
true_score = [] | |
pred_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset[dataset_name]: | |
sent1.append(example['document']) | |
sent2.append(self.clean_text(example['document'], example['claim'])) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS[dataset_name]].tolist() | |
self.print_result_table({ | |
'Dataset_name': dataset_name, | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
}) | |
def evaluate_samsum(self): | |
dataset_name = 'samsum' | |
label_mapping = { | |
'factual': 1, | |
'factually incorrect': 0, | |
'too incoherent': 0 | |
} | |
import string | |
printable = set(string.printable) | |
true_score = [] | |
pred_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset[dataset_name]: | |
cleaned_doc = ''.join(filter(lambda x: x in printable, example['article'])) | |
sent1.append(cleaned_doc) | |
sent2.append(example['summary']) | |
true_score.append(label_mapping[example['label']]) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS[dataset_name]].tolist() | |
self.print_result_table({ | |
'Dataset_name': dataset_name, | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_yelp(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['yelp']: | |
sent1.append(example['input_sent']) | |
sent2.append(example['output_sent']) | |
true_score.append(example['preservation']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['yelp']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'yelp', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_persona_chat(self): | |
true_score = [] | |
pred_score = [] | |
premise = [] | |
hypothesis = [] | |
for example in self.dataset['persona_chat']: | |
premise.append(example['dialog_history']+example['fact']) | |
hypothesis.append(example['response']) | |
true_score.append(example['engaging']) | |
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['persona_chat']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'persona_chat_eng', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
true_score = [] | |
pred_score = [] | |
premise = [] | |
hypothesis = [] | |
for example in self.dataset['persona_chat']: | |
premise.append(example['fact']) | |
hypothesis.append(example['response']) | |
true_score.append(example['uses_knowledge']) | |
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['persona_chat']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'persona_chat_grd', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_topical_chat(self): | |
true_score = [] | |
pred_score = [] | |
premise = [] | |
hypothesis = [] | |
for example in self.dataset['topical_chat']: | |
premise.append(example['dialog_history']+example['fact']) | |
hypothesis.append(example['response']) | |
true_score.append(example['engaging']) | |
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['topical_chat']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'topical_chat_eng', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
true_score = [] | |
pred_score = [] | |
premise = [] | |
hypothesis = [] | |
for example in self.dataset['topical_chat']: | |
premise.append(example['fact']) | |
hypothesis.append(example['response']) | |
true_score.append(example['uses_knowledge']) | |
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['topical_chat']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'topical_chat_grd', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_paws_qqp(self): | |
sent1 = [] | |
sent2 = [] | |
true_score = [] | |
for i in range(self.dataset['paws_qqp']['label'].size): | |
sent1.append(self.dataset['paws_qqp']['sentence1'][i][2:-1]) | |
sent2.append(self.dataset['paws_qqp']['sentence2'][i][2:-1]) | |
true_score.append(self.dataset['paws_qqp']['label'][i]) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['paws_qqp']].tolist() | |
roc_auc = roc_auc_score(true_score, pred_score) | |
self.print_result_table({ | |
'Dataset_name': 'paws_qqp', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc] | |
}) | |
def evaluate_qqp(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['qqp']: | |
sent1.append(example['question1']) | |
sent2.append(example['question2']) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qqp']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'qqp', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc_score(true_score, pred_score)] | |
}) | |
def evaluate_wmt17(self): | |
lang_pair = list(set([each['lang'] for each in self.dataset['wmt17']])) | |
for each_lang_pair in lang_pair: | |
true_score = [] | |
premise = [] | |
hypothesis = [] | |
for example in self.dataset['wmt17']: | |
if example['lang'] != each_lang_pair: | |
continue | |
premise.append(example['reference']) | |
hypothesis.append(example['candidate']) | |
true_score.append(example['score']) | |
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['wmt17']].tolist() | |
self.print_result_table({ | |
'Dataset_name': f'wmt17-{each_lang_pair}', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_wmt18(self): | |
lang_pair = list(set([each['lang'] for each in self.dataset['wmt18']])) | |
for each_lang_pair in lang_pair: | |
true_score = [] | |
premise = [] | |
hypothesis = [] | |
for example in self.dataset['wmt18']: | |
if example['lang'] != each_lang_pair: | |
continue | |
premise.append(example['reference']) | |
hypothesis.append(example['candidate']) | |
true_score.append(example['score']) | |
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['wmt18']].tolist() | |
self.print_result_table({ | |
'Dataset_name': f'wmt18-{each_lang_pair}', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_wmt19(self): | |
lang_pair = list(set([each['lang'] for each in self.dataset['wmt19']])) | |
for each_lang_pair in lang_pair: | |
true_score = [] | |
premise = [] | |
hypothesis = [] | |
for example in self.dataset['wmt19']: | |
if example['lang'] != each_lang_pair: | |
continue | |
premise.append(example['reference']) | |
hypothesis.append(example['candidate']) | |
true_score.append(example['score']) | |
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['wmt19']].tolist() | |
self.print_result_table({ | |
'Dataset_name': f'wmt19-{each_lang_pair}', | |
'Pearson': self.get_pearson(true_score, pred_score), | |
'Spearman': self.get_spearman(true_score, pred_score), | |
'Kendall': self.get_kendalltau(true_score, pred_score) | |
}) | |
def evaluate_sst2(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['sst2']: | |
sent1.append(example['text']) | |
sent2.append('It was great.') | |
true_score.append(int(example['label_text'] == 'positive')) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['sst2']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'sst2', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_cr(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['cr']: | |
sent1.append(example['text']) | |
sent2.append('It was great.') | |
true_score.append(int(example['label_text'] == 'positive')) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['cr']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'cr', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_subj(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['subj']: | |
sent1.append(example['text']) | |
sent2.append('It was objective.') | |
true_score.append(int(example['label_text'] == 'objective')) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['subj']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'subj', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_imdb(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['imdb']: | |
sent1.append(example['text']) | |
sent2.append('It was great.') | |
true_score.append(int(example['label_text'] == 'positive')) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['imdb']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'imdb', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_imdb_knn(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['imdb']: | |
sent1.append(example['text']) | |
sent2.append('It was great.') | |
true_score.append(int(example['label_text'] == 'positive')) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['imdb']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'imdb', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_cola(self): | |
true_score = [] | |
sent1 = [] | |
sent2 = [] | |
for example in self.dataset['cola']: | |
sent1.append(example['sentence']) | |
sent2.append('It was correct.') | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['cola']].tolist() | |
self.print_result_table({ | |
'Dataset_name': 'cola', | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': roc_auc_score(true_score, pred_score) | |
}) | |
def evaluate_yelp_efl(self): | |
sent = [] | |
label = [] | |
for example in self.dataset['yelp_efl']: | |
sent.append(example['text']) | |
label.append(example['label']) | |
templates = [ | |
'It was terrible.', | |
'It was bad.', | |
'It was ok.', | |
'It was good.', | |
'It was great.', | |
] | |
template_lists = [[template] * len(sent) for template in templates] | |
predictions = [ | |
self.align_func(sent, template_list)[ALL_TASKS['yelp_efl']] | |
for template_list in template_lists | |
] | |
pred_label = torch.argmax(torch.stack(predictions), dim=0).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'yelp_efl', | |
'Accuracy': [accuracy_score(label, pred_label)] | |
}) | |
def evaluate_ag_news(self): | |
sent = [] | |
label = [] | |
for example in self.dataset['ag_news']: | |
sent.append(example['text']) | |
label.append(example['label']) | |
templates = [ | |
'It is world news.', | |
'It is sports news.', | |
'It is business news.', | |
'It is science news.', | |
] | |
template_lists = [[template] * len(sent) for template in templates] | |
predictions = [ | |
self.align_func(sent, template_list)[ALL_TASKS['ag_news']] | |
for template_list in template_lists | |
] | |
pred_label = torch.argmax(torch.stack(predictions), dim=0).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'ag_news', | |
'Accuracy': [accuracy_score(label, pred_label)] | |
}) | |
def evaluate_trec(self): | |
sent = [] | |
label = [] | |
for example in self.dataset['trec']: | |
sent.append(example['text']) | |
label.append(example['label_coarse']) | |
templates = [ | |
'It is description.', | |
'It is entity.', | |
'It is expression.', | |
'It is human.', | |
'It is number.', | |
'It is location.', | |
] | |
template_lists = [[template] * len(sent) for template in templates] | |
predictions = [ | |
self.align_func(sent, template_list)[ALL_TASKS['trec']] | |
for template_list in template_lists | |
] | |
pred_label = torch.argmax(torch.stack(predictions), dim=0).tolist() | |
self.print_result_table({ | |
'Dataset_name': 'trec', | |
'Accuracy': [accuracy_score(label, pred_label)] | |
}) | |
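    # TRUE benchmark subsets: each CSV provides a 'grounding' text, a 'generated_text' claim,
    # and a binary 'label'; pairs are scored with the binary output (index 1) of align_func.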
def true_task_helper(self, dataset_name): | |
sent1 = [] | |
sent2 = [] | |
true_score = [] | |
for i in range(len(self.dataset[dataset_name])): | |
context = self.dataset[dataset_name].iloc[i]['grounding'] | |
claim = self.dataset[dataset_name].iloc[i]['generated_text'] | |
sent1.append(context) | |
sent2.append(self.clean_text(context, claim)) | |
true_score.append(self.dataset[dataset_name].iloc[i]['label']) | |
pred_score = self.align_func(sent1, sent2)[1].tolist() | |
roc_auc = roc_auc_score(true_score, pred_score) | |
self.print_result_table({ | |
'Dataset_name': dataset_name, | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'AUC': [roc_auc] | |
}) | |
def evaluate_true_begin(self): | |
dataset_name = 'true_begin' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_dialfact(self): | |
dataset_name = 'true_dialfact' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_fever(self): | |
dataset_name = 'true_fever' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_frank(self): | |
dataset_name = 'true_frank' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_mnbm(self): | |
dataset_name = 'true_mnbm' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_paws(self): | |
dataset_name = 'true_paws' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_q2(self): | |
dataset_name = 'true_q2' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_qags_cnndm(self): | |
dataset_name = 'true_qags_cnndm' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_qags_xsum(self): | |
dataset_name = 'true_qags_xsum' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_summeval(self): | |
dataset_name = 'true_summeval' | |
self.true_task_helper(dataset_name) | |
def evaluate_true_vitc(self): | |
dataset_name = 'true_vitc' | |
self.true_task_helper(dataset_name) | |
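    # SummaC benchmark: balanced accuracy is reported at a decision threshold tuned on the
    # corresponding validation split by sweeping thresholds from 0 to 1 in steps of 0.001.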
def get_summac_thres(self, dataset_name): | |
sent1 = [] | |
sent2 = [] | |
true_score = [] | |
for example in self.summac_validation_set[dataset_name]: | |
sent1.append(example['document']) | |
sent2.append(self.clean_text(example['document'], example['claim'])) # | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[1].tolist() | |
thres_result = [] | |
for i in range(1001): | |
thres = i / 1000 | |
thres_result.append((thres, balanced_accuracy_score(true_score, [p>thres for p in pred_score]))) | |
best_thres = sorted(thres_result, key=lambda x: x[1], reverse=True)[0] | |
print(f"best thres for {dataset_name} is {best_thres[0]} @ {best_thres[1]}") | |
return best_thres[0] | |
def summac_task_helper(self, dataset_name): | |
sent1 = [] | |
sent2 = [] | |
true_score = [] | |
for example in self.dataset[dataset_name]: | |
sent1.append(example['document']) | |
sent2.append(self.clean_text(example['document'], example['claim'])) | |
true_score.append(example['label']) | |
pred_score = self.align_func(sent1, sent2)[1].tolist() | |
roc_auc = roc_auc_score(true_score, pred_score) | |
balanced_acc_thres = self.get_summac_thres(dataset_name) | |
self.print_result_table({ | |
'Dataset_name': dataset_name, | |
'F1': self.get_f1(true_score, pred_score), | |
'Accuracy': self.get_accuracy(true_score, pred_score), | |
'BalancedAcc': self.get_balanced_accuracy(true_score, pred_score, thres=balanced_acc_thres), | |
'threshold': balanced_acc_thres, | |
'AUC': [roc_auc] | |
}) | |
def evaluate_summac_cogensumm(self): | |
dataset_name = 'summac_cogensumm' | |
self.summac_task_helper(dataset_name) | |
def evaluate_summac_xsumfaith(self): | |
dataset_name = 'summac_xsumfaith' | |
self.summac_task_helper(dataset_name) | |
def evaluate_summac_polytope(self): | |
dataset_name = 'summac_polytope' | |
self.summac_task_helper(dataset_name) | |
def evaluate_summac_factcc(self): | |
dataset_name = 'summac_factcc' | |
self.summac_task_helper(dataset_name) | |
def evaluate_summac_summeval(self): | |
dataset_name = 'summac_summeval' | |
self.summac_task_helper(dataset_name) | |
def evaluate_summac_frank(self): | |
dataset_name = 'summac_frank' | |
self.summac_task_helper(dataset_name) | |
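    # BEIR: retrieve candidates with BM25 (via a local Elasticsearch instance), then rerank the
    # top-100 results with align_func passed to beir's Rerank, and report NDCG/MAP/recall/precision.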
def evaluate_beir(self): | |
from beir import util, LoggingHandler | |
from beir.datasets.data_loader import GenericDataLoader | |
from beir.retrieval.evaluation import EvaluateRetrieval | |
from beir.retrieval.search.lexical import BM25Search as BM25 | |
from beir.reranking.models import CrossEncoder | |
from beir.reranking import Rerank | |
import pathlib, os | |
import logging | |
import random | |
#### Just some code to print debug information to stdout | |
logging.basicConfig(format='%(asctime)s - %(message)s', | |
datefmt='%Y-%m-%d %H:%M:%S', | |
level=logging.INFO, | |
handlers=[LoggingHandler()]) | |
#### /print debug information to stdout | |
#### Download trec-covid.zip dataset and unzip the dataset | |
for beir_dataset_name in ['msmarco', 'trec-covid', 'nfcorpus', 'nq', 'hotpotqa', 'fiqa', | |
'arguana', 'webis-touche2020', 'cqadupstack', 'quora', | |
'dbpedia-entity', 'scidocs', 'fever', 'climate-fever', 'scifact']: | |
# for beir_dataset_name in ['fever']: | |
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(beir_dataset_name) | |
# out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets") | |
out_dir = f"./data/eval/beir/{beir_dataset_name}/" | |
data_path = util.download_and_unzip(url, out_dir) | |
#### Provide the data path where trec-covid has been downloaded and unzipped to the data loader | |
# data folder would contain these files: | |
# (1) trec-covid/corpus.jsonl (format: jsonlines) | |
# (2) trec-covid/queries.jsonl (format: jsonlines) | |
# (3) trec-covid/qrels/test.tsv (format: tsv ("\t")) | |
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test") | |
######################################### | |
#### (1) RETRIEVE Top-100 docs using BM25 | |
######################################### | |
#### Provide parameters for Elasticsearch | |
# print(corpus) | |
hostname = "localhost" #localhost | |
index_name = beir_dataset_name # trec-covid | |
initialize = True # False | |
model = BM25(index_name=index_name, hostname=hostname, initialize=initialize) | |
retriever = EvaluateRetrieval(model, k_values=[1,3,5,10,100,1000]) | |
#### Retrieve dense results (format of results is identical to qrels) | |
results = retriever.retrieve(corpus, queries) | |
# Rerank top-100 results using the reranker provided | |
reranker = Rerank(self.align_func) | |
rerank_results = reranker.rerank(corpus, queries, results, top_k=100) | |
#### Evaluate your retrieval using NDCG@k, MAP@K ... | |
ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, rerank_results, retriever.k_values) | |
self.print_result_table({ | |
'Dataset_name': beir_dataset_name, | |
'ndcg': ndcg, | |
'map': _map, | |
'recall': recall, | |
'precision': precision | |
}) | |
def evaluate_xxx(self): | |
pass | |
class evaluateMultiCheckpoints: | |
def __init__(self, config, device='cuda:0') -> None: | |
        # Each entry in `config` is expected to look like:
        # {
        #     'backbone': 'roberta-base',
        #     'task_name': 'align-wo-finetune | align-finetune | roberta-finetune-baseline | nli-wo-finetune | nli-finetune',
        #     'path': 'some path',
        #     'result_save_path': 'some path'
        # }
        self.config = config  # a list of checkpoint dicts in the format above
self.device = device | |
self.tasks = [ | |
'summeval', 'qags_xsum', 'qags_cnndm', 'persona_chat', 'topical_chat', | |
'mnli_mismatched', 'mnli_matched', | |
'sick', 'yelp', 'stsb', | |
'anli_1','anli_2', 'anli_3', 'snli', 'vitaminc', | |
'mrpc', 'paws', 'sem_eval', 'paws_qqp', 'qqp', | |
'newsroom', 'rank19', 'bagel', 'race_m', 'race_h' | |
] | |
def experimentForSlide1216(self): | |
for ckpt in self.config: | |
self.evaluateOneCheckpoint(ckpt) | |
def evaluateOneCheckpoint(self, ckpt): | |
model_name = ckpt['path'].split('/')[-1].split('.ckpt')[0] | |
infer = Inferencer(ckpt_path=ckpt['path'], | |
model=ckpt['backbone'], batch_size=32, device=self.device) | |
evaluator = Evaluator(eval_tasks=self.tasks, align_func=infer.inference, save_all_tables=True) | |
evaluator.result_save_name = f"{ckpt['result_save_path']}{model_name}" | |
evaluator.evaluate() |
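
# --- Usage sketch (illustrative; the checkpoint path and result prefix below are placeholders) ---
if __name__ == '__main__':
    example_config = [{
        'backbone': 'roberta-base',                  # backbone name, as in the sample config above
        'task_name': 'align-finetune',
        'path': 'checkpoints/alignscore-base.ckpt',  # hypothetical checkpoint file
        'result_save_path': '',                      # prefix prepended to the checkpoint name when saving results
    }]
    runner = evaluateMultiCheckpoints(example_config, device='cuda:0')
    runner.experimentForSlide1216()  # runs the Evaluator over every checkpoint in example_config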