from logging import warning
from datasets import load_dataset
from alignscore.inference import Inferencer
import numpy as np
from scipy.stats import pearsonr, kendalltau, spearmanr
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, balanced_accuracy_score, matthews_corrcoef
import pandas as pd
import torch
import json
import pickle
import os
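# Each HUGGINGFACE_DATASETS entry is either [dataset_name, config_name, split] or
# [dataset_name, split]; init_eval_dataset() below dispatches on the entry length.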
HUGGINGFACE_DATASETS = {
'stsb': ['glue', 'stsb', 'validation'],
'mrpc': ['glue', 'mrpc', 'test'],
'axb': ['super_glue', 'axb', 'test'],
'axg': ['super_glue', 'axg', 'test'],
'cb': ['super_glue', 'cb', 'validation'],
'rte': ['super_glue', 'rte', 'validation'],
'wnli': ['SetFit/wnli', 'validation'],
'paws': ['paws', 'labeled_final', 'test'],
'mnli_matched': ['multi_nli', 'validation_matched'],
'mnli_mismatched': ['multi_nli', 'validation_mismatched'],
'nli_fever': ['pietrolesci/nli_fever', 'dev'],
'doc_nli': ['saattrupdan/doc-nli', 'test'],
'sem_eval': ['sem_eval_2014_task_1', 'test'],
'sick': ['sick', 'default', 'test'],
'race_m': ['race', 'middle', 'test'],
'race_h': ['race', 'high', 'test'],
'boolq': ['boolq', 'validation'],
'anli_1': ['anli', 'test_r1'],
'anli_2': ['anli', 'test_r2'],
'anli_3': ['anli', 'test_r3'],
'snli': ['snli', 'test'],
'vitaminc': ['tals/vitaminc', 'test'],
'qqp': ['glue', 'qqp', 'validation'],
# below are tasks from https://arxiv.org/pdf/2104.14690.pdf
'sst2': ['SetFit/sst2', 'test'],
# can't find MR
'cr': ['SetFit/SentEval-CR', 'test'],
# can't find MPQA
'subj': ['SetFit/subj', 'test'],
# can't find OS
    'imdb': ['SetFit/imdb', 'test'], # note: unverified whether this matches the dataset used in that paper;
    # the original dataset is no longer accessible
'cola': ['glue', 'cola', 'validation'],
'yelp_efl': ['SetFit/yelp_review_full', 'test'],
'ag_news': ['SetFit/ag_news', 'test'],
    'trec': ['SetFit/TREC-QC', 'test'],
'dream': ['dream', 'test'],
'quartz': ['quartz', 'test'],
'eraser_multi_rc': ['eraser_multi_rc', 'test'],
'quail': ['quail', 'challenge'],
'sciq': ['sciq', 'test'],
'gap': ['gap', 'test'],
'qnli': ['glue', 'qnli', 'validation']
}
PICKLE_DATASETS = [
'newsroom',
'rank19',
'bagel',
'sfhot',
'sfres'
]
ALL_TASKS = { # all supported tasks, mapped to the model output each one uses
    'stsb': 0, # output index: 0 = regression, 1 = binary, 2 = tri-label (NLI)
'sick': 0,
'race_m': 1,
'race_h': 1,
'boolq': 1,
'anli_1': 2,
'anli_2': 2,
'anli_3': 2,
'snli': 2,
'vitaminc': 2,
'mrpc': 1,
'paws': 1,
'mnli_matched': 2,
'mnli_mismatched': 2,
'sem_eval': 1,
'summeval': 1,
'qags_xsum': 1,
'qags_cnndm': 1,
'frank': 1,
'xsumfaith': 1,
'samsum': 1,
'yelp': 1,
'persona_chat': 1,
'topical_chat': 1,
'paws_qqp': 1,
'qqp': 1,
'newsroom': 1,
'rank19': 1,
'bagel': 1,
'sfhot': 1,
'sfres': 1,
'wmt17': 0,
'wmt18': 0,
'wmt19': 0,
'sst2': 1,
'cr': 1,
'subj': 1,
'imdb': 1,
'cola': 1,
'yelp_efl': 1,
'ag_news': 1,
'trec': 1,
'axb': 1,
'axg': 1,
'cb': 2,
'rte': 2,
'wnli': 2,
'dream': 1,
'quartz': 1,
'nli_fever': 2,
'doc_nli': 1,
'eraser_multi_rc': 1,
'quail': 1,
'sciq': 1,
'gap': 1,
'qnli': 1
}
FEW_SHOT_N = 8
FEW_SHOT_SEEDS = [30247, 38252, 29050, 1091, 35554, 25309, 79319, 35079, 35256, 46744]
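# Few-shot sampling settings (examples per run and the seeds used to draw them);
# not referenced by the Evaluator below.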
class Evaluator():
def __init__(self, eval_tasks, align_func, save_all_tables=False, clean_data=True) -> None:
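        """
        eval_tasks: list of task names to run, e.g. ['stsb', 'paws', ...].
        align_func: callable taking (list_of_texts_a, list_of_texts_b) and returning the model
            outputs indexed by ALL_TASKS (0: regression, 1: binary, 2: tri-label).
        save_all_tables: if True, pickle all result tables to exp_results/<result_save_name>.pkl.
        clean_data: if True, clean_text() normalizes claims against their context before scoring.
        """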
self.align_func = align_func
self.eval_tasks = eval_tasks # ['stsb', 'paws', ...]
self.result_save_name = "Default_result_name"
self.result_tables = []
self.result_dicts = []
self.clean_data = clean_data
self.init_eval_dataset()
self.should_save_all_tables = save_all_tables
warning(f"Saving the result is: {self.should_save_all_tables}")
def init_eval_dataset(self):
self.dataset = dict()
for eval_task in self.eval_tasks:
            if eval_task in HUGGINGFACE_DATASETS:
                dataset_args = HUGGINGFACE_DATASETS[eval_task]
                if len(dataset_args) == 3:
                    self.dataset[eval_task] = load_dataset(dataset_args[0], dataset_args[1])[dataset_args[2]]
                elif len(dataset_args) == 2:
                    if isinstance(dataset_args[1], tuple):
                        dataset = load_dataset(dataset_args[0])
                        self.dataset[eval_task] = {split: dataset[split] for split in dataset_args[1]}
                    else:
                        self.dataset[eval_task] = load_dataset(dataset_args[0])[dataset_args[1]]
elif eval_task == 'paws_qqp':
self.dataset[eval_task] = pd.read_csv('data/paws_qqp/output/dev_and_test.tsv', sep='\t')
elif eval_task == 'beir':
print("beir load by itself")
self.dataset[eval_task] = "BEIR Benchmark"
elif eval_task in PICKLE_DATASETS:
with open(f'data/eval/{eval_task}.pkl', 'rb') as f:
self.dataset[eval_task] = pickle.load(f)
elif 'wmt' in eval_task:
self.dataset[eval_task] = []
with open(f'data/eval/{eval_task}_eval.jsonl', 'r', encoding='utf8') as f:
for example in f:
self.dataset[eval_task].append(json.loads(example))
elif 'true' == eval_task:
for each_true_sub in os.listdir('data/eval/true'):
if 'qags' in each_true_sub:
each_true_sub_name = 'true_' + '_'.join(each_true_sub.split('_')[:2])
else:
each_true_sub_name = 'true_' + '_'.join(each_true_sub.split('_')[:1])
self.dataset[each_true_sub_name] = pd.read_csv(os.path.join('data/eval/true', each_true_sub))
elif 'summac' == eval_task:
from summac.benchmark import SummaCBenchmark
self.summac_validation_set = dict()
summac_benchmark = SummaCBenchmark(benchmark_folder="./data/eval/summac/benchmark", cut='test')
for each in summac_benchmark.datasets:
summac_dt_name = each['name']
self.dataset['summac_'+summac_dt_name] = each['dataset']
summac_benchmark_valid = SummaCBenchmark(benchmark_folder="./data/eval/summac/benchmark", cut='val')
for each in summac_benchmark_valid.datasets:
summac_dt_name = each['name']
self.summac_validation_set['summac_'+summac_dt_name] = each['dataset']
else:
                with open(f'data/eval/{eval_task}.json') as f:
                    self.dataset[eval_task] = json.load(f)
def print_result_table(self, table):
self.result_tables.append(pd.DataFrame(table).to_markdown())
self.result_dicts.append(table)
print(self.result_tables[-1])
def print_all_tables(self):
print("\n All Evaluation Results:")
for each in self.result_tables:
print(each)
print('='*100)
def save_all_tables(self):
with open(f'exp_results/{self.result_save_name}.pkl', 'wb') as f:
pickle.dump(self.result_dicts, f, protocol=pickle.HIGHEST_PROTOCOL)
def evaluate(self):
for each_task in self.dataset:
            getattr(self, f'evaluate_{each_task}')()
if self.should_save_all_tables:
self.save_all_tables()
def get_accuracy(self, true_score, pred_score):
return [accuracy_score(true_score, [m>0.5 for m in pred_score])]
def get_balanced_accuracy(self, true_score, pred_score, thres=0.5):
return [balanced_accuracy_score(true_score, [m>thres for m in pred_score])]
def get_f1(self, true_score, pred_score):
return [f1_score(true_score, [m>0.5 for m in pred_score])]
def get_3label_f1(self, true_score, pred_score):
return [f1_score(true_score, pred_score, average='micro')]
def get_pearson(self, true_score, pred_score):
return pearsonr(pred_score, true_score)
def get_kendalltau(self, true_score, pred_score):
return kendalltau(pred_score, true_score)
def get_spearman(self, true_score, pred_score):
return spearmanr(pred_score, true_score)
def get_matthews_corr(self, true_score, pred_score):
return [matthews_corrcoef(true_score, [s>0.5 for s in pred_score])]
def clean_text(self, context, claims):
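        """Normalize claims against their source context: restore the casing used in the context,
        standardize quote characters and -lrb-/-rrb- brackets, and re-capitalize sentence starts."""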
from nltk.tokenize import sent_tokenize
if not self.clean_data:
return claims
word_cases = {token.lower():token for token in context.strip().split()}
def clean(text):
text = ' '.join(word_cases.get(token.lower(), token) for token in text.strip().split())
text = text.replace('“', '"').replace('”', '"').replace('’', '\'').replace('‘', '\'').replace('`', '\'').replace('-lrb-', '(').replace('-rrb-', ')')
text= ' '.join(each.strip()[0].capitalize()+each.strip()[1:] for each in sent_tokenize(text))
return text
if isinstance(claims, str):
return clean(claims)
return [clean(text) for text in claims]
def evaluate_newsroom(self):
true_score = []
true_score_rel = []
true_score_binary = []
sent1 = []
sent2 = []
for sample in self.dataset['newsroom'].values():
summaries, informativeness, relevance = zip(*(
(s['sys_summ'], s['scores']['informativeness'], s['scores']['relevance'])
for s in sample['sys_summs'].values()
))
cleaned_summaries = self.clean_text(sample['src'], summaries)
for summary, inf_score, rel_score in zip(cleaned_summaries, informativeness, relevance):
sent1.append(sample['src'])
sent2.append(summary)
true_score.append(inf_score)
true_score_rel.append(rel_score)
true_score_binary.append(int(inf_score >= 4))
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['newsroom']].tolist()
self.print_result_table({
'Dataset_name': 'newsroom',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score_binary, pred_score),
'Pearson_rel': self.get_pearson(true_score_rel, pred_score),
'Spearman_rel': self.get_spearman(true_score_rel, pred_score),
'Kendall_rel': self.get_kendalltau(true_score_rel, pred_score),
})
def evaluate_rank19(self):
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['rank19']:
for example_summs in self.dataset['rank19'][example]['sys_summs']:
sent1.append(self.dataset['rank19'][example]['src'])
sent2.append(self.dataset['rank19'][example]['sys_summs'][example_summs]['sys_summ'])
true_score.append(self.dataset['rank19'][example]['sys_summs'][example_summs]['scores']['fact'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['rank19']].tolist()
pred_score_bin = []
assert len(pred_score) % 2 == 0
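        # rank19 examples come in pairs of summaries; within each pair, mark the summary with the
        # higher predicted score as factual (1) and the other as not (0).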
        for pair in chunks(pred_score, 2):
            pred_score_bin.extend([0, 1] if pair[1] > pair[0] else [1, 0])
self.print_result_table({
'Dataset_name': 'rank19',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score_bin)[0],
'AUC': roc_auc_score(true_score, pred_score_bin)
})
def evaluate_bagel(self):
true_score = []
true_score_binary = []
sent1 = []
sent2 = []
pred_score = []
for example in self.dataset['bagel']:
sent1.append(' '.join(self.dataset['bagel'][example]['ref_summs']))
sent2.append(self.dataset['bagel'][example]['sys_summ'])
true_score.append(self.dataset['bagel'][example]['scores']['informativeness'])
if(self.dataset['bagel'][example]['scores']['informativeness'] >= 4.0):
true_score_binary.append(1)
else:
true_score_binary.append(0)
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['bagel']].tolist()
self.print_result_table({
'Dataset_name': 'bagel',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score_binary, pred_score)
})
def evaluate_sfhot(self):
true_score = []
sent1 = []
sent2 = []
pred_score = []
for example in self.dataset['sfhot']:
for ref in self.dataset['sfhot'][example]['ref_summs']:
sent1.append(self.dataset['sfhot'][example]['sys_summ'])
sent2.append(ref)
pred_score.append(max(self.align_func(sent1, sent2)[ALL_TASKS['sfhot']].tolist()))
sent1 = []
sent2 = []
if(self.dataset['sfhot'][example]['scores']['quality'] >= 4.0):
true_score.append(1)
else:
true_score.append(0)
self.print_result_table({
'Dataset_name': 'sfhot',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_sfres(self):
true_score = []
sent1 = []
sent2 = []
pred_score = []
for example in self.dataset['sfres']:
for ref in self.dataset['sfres'][example]['ref_summs']:
sent1.append(self.dataset['sfres'][example]['sys_summ'])
sent2.append(ref)
pred_score.append(max(self.align_func(sent1, sent2)[ALL_TASKS['sfres']].tolist()))
sent1 = []
sent2 = []
if(self.dataset['sfres'][example]['scores']['quality'] >= 4.0):
true_score.append(1)
else:
true_score.append(0)
self.print_result_table({
'Dataset_name': 'sfres',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_stsb(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['stsb']:
sent1.append(example['sentence1'])
sent2.append(example['sentence2'])
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['stsb']].tolist()
self.print_result_table({
'Dataset_name': 'stsb',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_sick(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['sick']:
sent1.append(example['sentence_A'])
sent2.append(example['sentence_B'])
true_score.append(example['relatedness_score'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['sick']].tolist()
self.print_result_table({
'Dataset_name': 'sick-r',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_race_m(self):
true_score = []
article = []
qa = []
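        # Build one (article, question+option) pair per answer option; for cloze-style questions
        # (containing "_"), substitute the option into the blank instead of appending it.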
for example in self.dataset['race_m']:
for i, option in enumerate(example['options']):
article.append(example['article'])
qa.append(example['question']+" "+option+" " if "_" not in example['question'] else ' '.join(example['question'].replace("_", " "+option+" ").split()))
if i == ord(example['answer'])-65:
true_score.append(i) # 0,1,2,3
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['race_m']].tolist()
        for a, b, c, d in zip(*[iter(pred_score_temp)]*4):
            pred_score.append(np.argmax([a, b, c, d]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'race-m',
'Accuracy': [acc],
})
def evaluate_race_h(self):
true_score = []
article = []
qa = []
for example in self.dataset['race_h']:
for i, option in enumerate(example['options']):
article.append(example['article'])
qa.append(example['question']+" "+option+" " if "_" not in example['question'] else ' '.join(example['question'].replace("_", " "+option+" ").split()))
if i == ord(example['answer'])-65:
true_score.append(i) # 0,1,2,3
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['race_h']].tolist()
for a, b, c, d in zip(*[iter(pred_score_temp)]*4):
pred_score.append(np.argmax([a,b,c,d]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'race-h',
'Accuracy': [acc]
})
    # boolq: pair the passage with "question + No." and "question + Yes." and pick the higher-scoring option
def evaluate_boolq(self):
true_score = []
article = []
qa = []
for example in self.dataset['boolq']:
for i in range(2):
article.append(example['passage'])
if i == 0:
qa.append(example['question']+" "+"No.") # 0
else:
qa.append(example['question']+" "+"Yes.") # 1
true_score.append(int(example['answer']))
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['boolq']].tolist()
for a, b in zip(*[iter(pred_score_temp)]*2):
pred_score.append(np.argmax([a,b]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'boolq',
'Accuracy': [acc]
})
def evaluate_anli_1(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['anli_1']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(example['label'] if example['label']!=-1 else 1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['anli_1']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'anli-1',
'Accuracy': [accuracy_score(true_score, pred_score)]
})
def evaluate_anli_2(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['anli_2']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(example['label'] if example['label']!=-1 else 1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['anli_2']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'anli-2',
'Accuracy': [accuracy_score(true_score, pred_score)]
})
def evaluate_anli_3(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['anli_3']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(example['label'] if example['label']!=-1 else 1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['anli_3']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'anli-3',
'Accuracy': [accuracy_score(true_score, pred_score)]
})
def evaluate_nli_fever(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['nli_fever']:
sent1.append(example['hypothesis']) # the original dataset flipped
sent2.append(example['premise'])
true_score.append(example['label'] if example['label']!=-1 else 1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['nli_fever']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'nli_fever',
'Accuracy': [accuracy_score(true_score, pred_score)]
})
def evaluate_snli(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['snli']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(example['label'] if example['label']!=-1 else 1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['snli']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'snli',
'Accuracy': [accuracy_score(true_score, pred_score)]
})
def evaluate_axb(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['axb']:
sent1.append(example['sentence1'])
sent2.append(example['sentence2'])
true_score.append(1 if example['label']==0 else 0)
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['axb']].tolist()
self.print_result_table({
'Dataset_name': 'axb',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)],
'Matthews': self.get_matthews_corr(true_score, pred_score)
})
def evaluate_axg(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['axg']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(1 if example['label']==0 else 0)
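        # Use the entailment probability (column 0 of the tri-label output) as the alignment score.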
pred_score = self.align_func(sent1, sent2)[2][:,0].tolist()
self.print_result_table({
'Dataset_name': 'axg',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)],
})
def evaluate_cb(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['cb']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
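            # SuperGLUE CB labels are 0=entailment, 1=contradiction, 2=neutral; remap to the
            # model's tri-label order (0=entailment, 1=neutral, 2=contradiction).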
if example['label'] == 0:
label = 0
elif example['label'] == 1:
label = 2
elif example['label'] == 2:
label = 1
true_score.append(label)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['cb']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'cb',
'Accuracy': [accuracy_score(true_score, pred_score)],
})
def evaluate_rte(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['rte']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(1 if example['label']==0 else 0)
pred_score = self.align_func(sent1, sent2)[2][:,0].tolist()
self.print_result_table({
'Dataset_name': 'rte',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)],
})
def evaluate_wnli(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['wnli']:
sent1.append(example['text1'])
sent2.append(example['text2'])
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[2][:,0].tolist()
self.print_result_table({
'Dataset_name': 'wnli',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)],
})
def evaluate_doc_nli(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['doc_nli']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(1 if example['label'] == 'entailment' else 0)
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['doc_nli']].tolist()
self.print_result_table({
'Dataset_name': 'doc_nli',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)],
})
def evaluate_qnli(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['qnli']:
sent1.append(example['sentence'])
sent2.append(example['question'])
true_score.append(1 if example['label'] == 0 else 0)
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qnli']].tolist()
self.print_result_table({
'Dataset_name': 'qnli',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)],
})
def evaluate_dream(self):
true_score = []
article = []
qa = []
for example in self.dataset['dream']:
for i, option in enumerate(example['choice']):
article.append(' '.join(example['dialogue']))
qa.append(example['question']+" "+option+" ")
if option == example['answer']:
true_score.append(i) # 0,1,2,3
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['dream']].tolist()
        for a, b, c in zip(*[iter(pred_score_temp)]*3):
            pred_score.append(np.argmax([a, b, c]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'dream',
'Accuracy': [acc],
})
def evaluate_quartz(self):
true_score = []
article = []
qa = []
for example in self.dataset['quartz']:
for i, option in enumerate(example['choices']['text']):
article.append(example['para'])
qa.append(example['question']+" "+option+" ")
if i == ord(example['answerKey'])-65:
true_score.append(i) # 0,1,2,3
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['quartz']].tolist()
        for a, b in zip(*[iter(pred_score_temp)]*2):
            pred_score.append(np.argmax([a, b]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'quartz',
'Accuracy': [acc],
})
def evaluate_eraser_multi_rc(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['eraser_multi_rc']:
sent1.append(example['passage'])
sent2.append(example['query_and_answer'].replace("|", ""))
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['eraser_multi_rc']].tolist()
self.print_result_table({
'Dataset_name': 'eraser_multi_rc',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)]
})
def evaluate_quail(self):
true_score = []
article = []
qa = []
for example in self.dataset['quail']:
for i, option in enumerate(example['answers']):
article.append(example['context'])
qa.append(example['question']+" "+option+" ")
if i == example['correct_answer_id']:
true_score.append(i) # 0,1,2,3
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['quail']].tolist()
        for a, b, c, d in zip(*[iter(pred_score_temp)]*4):
            pred_score.append(np.argmax([a, b, c, d]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'quail',
'Accuracy': [acc],
})
def evaluate_sciq(self):
true_score = []
article = []
qa = []
for example in self.dataset['sciq']:
options = [example['correct_answer'], example['distractor1'], example['distractor2'], example['distractor3']]
for i, option in enumerate(options):
article.append(example['support'])
qa.append(example['question']+" "+option+" ")
if i == 0:
                true_score.append(i) # the correct answer is always listed first, so the gold label is 0
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['sciq']].tolist()
        for a, b, c, d in zip(*[iter(pred_score_temp)]*4):
            pred_score.append(np.argmax([a, b, c, d]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'sciq',
'Accuracy': [acc],
})
def evaluate_gap(self):
true_score = []
article = []
qa = []
for example in self.dataset['gap']:
options = [example['Text'][:example['Pronoun-offset']]+example['A']+example['Text'][(example['Pronoun-offset']+len(example['Pronoun'])):],
example['Text'][:example['Pronoun-offset']]+example['B']+example['Text'][(example['Pronoun-offset']+len(example['Pronoun'])):]]
for i, option in enumerate(options):
article.append(example['Text'])
qa.append(option)
            true_score.append(1 if example['B-coref'] else 0) # 1 if the pronoun refers to candidate B, else 0
pred_score = []
pred_score_temp = self.align_func(article, qa)[ALL_TASKS['gap']].tolist()
for a, b in zip(*[iter(pred_score_temp)]*2):
pred_score.append(np.argmax([a,b]))
assert len(pred_score) == len(true_score)
acc = [int(p==t) for p, t in zip(pred_score, true_score)]
acc = sum(acc) / len(acc)
self.print_result_table({
'Dataset_name': 'gap',
'Accuracy': [acc],
})
# How to group fact checking
def evaluate_vitaminc(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['vitaminc']:
sent1.append(example['evidence'])
sent2.append(example['claim'])
if example['label'] == 'SUPPORTS':
true_score.append(0)
elif example['label'] == 'REFUTES':
true_score.append(2)
else:
true_score.append(1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['vitaminc']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'vitaminc',
'F1': self.get_3label_f1(true_score, pred_score),
'Accuracy': [accuracy_score(true_score, pred_score)],
})
def evaluate_mrpc(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['mrpc']:
sent1.append(example['sentence1'])
sent2.append(example['sentence2'])
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['mrpc']].tolist()
self.print_result_table({
'Dataset_name': 'mrpc',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)]
})
def evaluate_paws(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['paws']:
sent1.append(example['sentence1'])
sent2.append(example['sentence2'])
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['paws']].tolist()
self.print_result_table({
'Dataset_name': 'paws',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)]
})
def evaluate_mnli_matched(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['mnli_matched']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(example['label'] if example['label']!=-1 else 1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['mnli_matched']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'mnli_matched',
'Accuracy': [accuracy_score(true_score, pred_score)]
})
def evaluate_mnli_mismatched(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['mnli_mismatched']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
true_score.append(example['label'] if example['label']!=-1 else 1)
pred_score = torch.argmax(self.align_func(sent1, sent2)[ALL_TASKS['mnli_mismatched']], dim=-1).tolist()
self.print_result_table({
'Dataset_name': 'mnli_mismatched',
'Accuracy': [accuracy_score(true_score, pred_score)]
})
def evaluate_sem_eval(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['sem_eval']:
sent1.append(example['premise'])
sent2.append(example['hypothesis'])
if example['entailment_judgment'] == 1:
true_score.append(1)
else:
true_score.append(0)
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['sem_eval']].tolist()
self.print_result_table({
'Dataset_name': 'sem_eval',
'Accuracy': self.get_accuracy(true_score, pred_score)
})
def evaluate_summeval(self):
true_score = []
true_score_rel = []
true_score_binary = []
pred_score = []
sent1 = []
sent2 = []
for example in self.dataset['summeval']:
cleaned_summary = self.clean_text(example['document'], example['summary'])
sent1.append(example['document'])
sent2.append(cleaned_summary)
true_score.append(example['consistency'])
true_score_rel.append(example['relevance'])
true_score_binary.append(1 if example['consistency'] == 5.0 else 0)
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['summeval']].tolist()
self.print_result_table({
'Dataset_name': 'summeval',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score_binary, pred_score),
'Pearson_rel': self.get_pearson(true_score_rel, pred_score),
'Spearman_rel': self.get_spearman(true_score_rel, pred_score),
'Kendall_rel': self.get_kendalltau(true_score_rel, pred_score),
})
def evaluate_qags_xsum(self):
true_score = []
pred_score = []
sent1 = []
sent2 = []
for example in self.dataset['qags_xsum']:
sent1.append(example['document'])
sent2.append(example['summary'])
true_score.append(example['consistency'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qags_xsum']].tolist()
self.print_result_table({
'Dataset_name': 'qags_xsum',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_qags_cnndm(self):
true_score = []
pred_score = []
sent1 = []
sent2 = []
true_score_binary = []
for example in self.dataset['qags_cnndm']:
sent1.append(example['document'])
sent2.append(example['summary'])
true_score.append(example['consistency'])
true_score_binary.append(1 if example['consistency'] == 1.0 else 0)
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qags_cnndm']].tolist()
self.print_result_table({
'Dataset_name': 'qags_cnndm',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score_binary, pred_score)
})
def evaluate_frank(self):
from spacy.lang.en import English
nlp = English()
nlp.add_pipe("sentencizer")
for d in self.dataset['frank']:
if d['dataset'] == 'cnndm':
continue
d['document'] = ' '.join([each.text for each in nlp(d['document']).sents])
true_score_xsum = []
true_score_cnndm = []
pred_score_xsum = []
pred_score_cnndm = []
sent1_xsum = []
sent1_cnndm = []
sent2_xsum = []
sent2_cnndm = []
true_score_binary_cnndm = []
true_score_binary_xsum = []
for example in self.dataset['frank']:
if example['dataset'] == 'cnndm':
sent1_cnndm.append(example['document'])
sent2_cnndm.append(self.clean_text(example['document'], example['summary']))
true_score_cnndm.append(example['score'])
true_score_binary_cnndm.append(1 if example['score'] == 1.0 else 0)
elif example['dataset'] == 'xsum':
sent1_xsum.append(example['document'])
sent2_xsum.append(self.clean_text(example['document'], example['summary']))
true_score_xsum.append(example['score'])
true_score_binary_xsum.append(1 if example['score'] == 1.0 else 0)
        pred_score_xsum = self.align_func(sent1_xsum, sent2_xsum)[ALL_TASKS['frank']].tolist()
        pred_score_cnndm = self.align_func(sent1_cnndm, sent2_cnndm)[ALL_TASKS['frank']].tolist()
self.print_result_table({
'Dataset_name': 'frank-xsum',
'Pearson': self.get_pearson(true_score_xsum, pred_score_xsum),
'Spearman': self.get_spearman(true_score_xsum, pred_score_xsum),
'Kendall': self.get_kendalltau(true_score_xsum, pred_score_xsum),
'AUC': roc_auc_score(true_score_binary_xsum, pred_score_xsum)
})
self.print_result_table({
'Dataset_name': 'frank-cnndm',
'Pearson': self.get_pearson(true_score_cnndm, pred_score_cnndm),
'Spearman': self.get_spearman(true_score_cnndm, pred_score_cnndm),
'Kendall': self.get_kendalltau(true_score_cnndm, pred_score_cnndm),
'AUC': roc_auc_score(true_score_binary_cnndm, pred_score_cnndm)
})
self.print_result_table({
'Dataset_name': 'frank-all',
'Pearson': self.get_pearson(true_score_xsum+true_score_cnndm, pred_score_xsum+pred_score_cnndm),
'Spearman': self.get_spearman(true_score_xsum+true_score_cnndm, pred_score_xsum+pred_score_cnndm),
'Kendall': self.get_kendalltau(true_score_xsum+true_score_cnndm, pred_score_xsum+pred_score_cnndm),
'AUC': roc_auc_score(true_score_binary_xsum+true_score_binary_cnndm, pred_score_xsum+pred_score_cnndm)
})
def evaluate_xsumfaith(self):
dataset_name = 'xsumfaith'
true_score = []
pred_score = []
sent1 = []
sent2 = []
for example in self.dataset[dataset_name]:
sent1.append(example['document'])
sent2.append(self.clean_text(example['document'], example['claim']))
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS[dataset_name]].tolist()
self.print_result_table({
'Dataset_name': dataset_name,
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
})
def evaluate_samsum(self):
dataset_name = 'samsum'
label_mapping = {
'factual': 1,
'factually incorrect': 0,
'too incoherent': 0
}
import string
printable = set(string.printable)
true_score = []
pred_score = []
sent1 = []
sent2 = []
for example in self.dataset[dataset_name]:
cleaned_doc = ''.join(filter(lambda x: x in printable, example['article']))
sent1.append(cleaned_doc)
sent2.append(example['summary'])
true_score.append(label_mapping[example['label']])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS[dataset_name]].tolist()
self.print_result_table({
'Dataset_name': dataset_name,
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_yelp(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['yelp']:
sent1.append(example['input_sent'])
sent2.append(example['output_sent'])
true_score.append(example['preservation'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['yelp']].tolist()
self.print_result_table({
'Dataset_name': 'yelp',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_persona_chat(self):
true_score = []
pred_score = []
premise = []
hypothesis = []
for example in self.dataset['persona_chat']:
premise.append(example['dialog_history']+example['fact'])
hypothesis.append(example['response'])
true_score.append(example['engaging'])
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['persona_chat']].tolist()
self.print_result_table({
'Dataset_name': 'persona_chat_eng',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
true_score = []
pred_score = []
premise = []
hypothesis = []
for example in self.dataset['persona_chat']:
premise.append(example['fact'])
hypothesis.append(example['response'])
true_score.append(example['uses_knowledge'])
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['persona_chat']].tolist()
self.print_result_table({
'Dataset_name': 'persona_chat_grd',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_topical_chat(self):
true_score = []
pred_score = []
premise = []
hypothesis = []
for example in self.dataset['topical_chat']:
premise.append(example['dialog_history']+example['fact'])
hypothesis.append(example['response'])
true_score.append(example['engaging'])
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['topical_chat']].tolist()
self.print_result_table({
'Dataset_name': 'topical_chat_eng',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
true_score = []
pred_score = []
premise = []
hypothesis = []
for example in self.dataset['topical_chat']:
premise.append(example['fact'])
hypothesis.append(example['response'])
true_score.append(example['uses_knowledge'])
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['topical_chat']].tolist()
self.print_result_table({
'Dataset_name': 'topical_chat_grd',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_paws_qqp(self):
sent1 = []
sent2 = []
true_score = []
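        # Sentence fields look like stringified byte literals (b'...'); [2:-1] strips the
        # leading b' and the trailing quote.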
for i in range(self.dataset['paws_qqp']['label'].size):
sent1.append(self.dataset['paws_qqp']['sentence1'][i][2:-1])
sent2.append(self.dataset['paws_qqp']['sentence2'][i][2:-1])
true_score.append(self.dataset['paws_qqp']['label'][i])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['paws_qqp']].tolist()
roc_auc = roc_auc_score(true_score, pred_score)
self.print_result_table({
'Dataset_name': 'paws_qqp',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc]
})
def evaluate_qqp(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['qqp']:
sent1.append(example['question1'])
sent2.append(example['question2'])
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['qqp']].tolist()
self.print_result_table({
'Dataset_name': 'qqp',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc_score(true_score, pred_score)]
})
def evaluate_wmt17(self):
lang_pair = list(set([each['lang'] for each in self.dataset['wmt17']]))
for each_lang_pair in lang_pair:
true_score = []
premise = []
hypothesis = []
for example in self.dataset['wmt17']:
if example['lang'] != each_lang_pair:
continue
premise.append(example['reference'])
hypothesis.append(example['candidate'])
true_score.append(example['score'])
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['wmt17']].tolist()
self.print_result_table({
'Dataset_name': f'wmt17-{each_lang_pair}',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_wmt18(self):
lang_pair = list(set([each['lang'] for each in self.dataset['wmt18']]))
for each_lang_pair in lang_pair:
true_score = []
premise = []
hypothesis = []
for example in self.dataset['wmt18']:
if example['lang'] != each_lang_pair:
continue
premise.append(example['reference'])
hypothesis.append(example['candidate'])
true_score.append(example['score'])
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['wmt18']].tolist()
self.print_result_table({
'Dataset_name': f'wmt18-{each_lang_pair}',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_wmt19(self):
lang_pair = list(set([each['lang'] for each in self.dataset['wmt19']]))
for each_lang_pair in lang_pair:
true_score = []
premise = []
hypothesis = []
for example in self.dataset['wmt19']:
if example['lang'] != each_lang_pair:
continue
premise.append(example['reference'])
hypothesis.append(example['candidate'])
true_score.append(example['score'])
pred_score = self.align_func(premise, hypothesis)[ALL_TASKS['wmt19']].tolist()
self.print_result_table({
'Dataset_name': f'wmt19-{each_lang_pair}',
'Pearson': self.get_pearson(true_score, pred_score),
'Spearman': self.get_spearman(true_score, pred_score),
'Kendall': self.get_kendalltau(true_score, pred_score)
})
def evaluate_sst2(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['sst2']:
sent1.append(example['text'])
sent2.append('It was great.')
true_score.append(int(example['label_text'] == 'positive'))
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['sst2']].tolist()
self.print_result_table({
'Dataset_name': 'sst2',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_cr(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['cr']:
sent1.append(example['text'])
sent2.append('It was great.')
true_score.append(int(example['label_text'] == 'positive'))
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['cr']].tolist()
self.print_result_table({
'Dataset_name': 'cr',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_subj(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['subj']:
sent1.append(example['text'])
sent2.append('It was objective.')
true_score.append(int(example['label_text'] == 'objective'))
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['subj']].tolist()
self.print_result_table({
'Dataset_name': 'subj',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_imdb(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['imdb']:
sent1.append(example['text'])
sent2.append('It was great.')
true_score.append(int(example['label_text'] == 'positive'))
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['imdb']].tolist()
self.print_result_table({
'Dataset_name': 'imdb',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_imdb_knn(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['imdb']:
sent1.append(example['text'])
sent2.append('It was great.')
true_score.append(int(example['label_text'] == 'positive'))
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['imdb']].tolist()
self.print_result_table({
'Dataset_name': 'imdb',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_cola(self):
true_score = []
sent1 = []
sent2 = []
for example in self.dataset['cola']:
sent1.append(example['sentence'])
sent2.append('It was correct.')
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[ALL_TASKS['cola']].tolist()
self.print_result_table({
'Dataset_name': 'cola',
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': roc_auc_score(true_score, pred_score)
})
def evaluate_yelp_efl(self):
sent = []
label = []
for example in self.dataset['yelp_efl']:
sent.append(example['text'])
label.append(example['label'])
templates = [
'It was terrible.',
'It was bad.',
'It was ok.',
'It was good.',
'It was great.',
]
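        # EFL-style zero-shot classification: score the review against each rating template
        # and predict the highest-scoring one.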
template_lists = [[template] * len(sent) for template in templates]
predictions = [
self.align_func(sent, template_list)[ALL_TASKS['yelp_efl']]
for template_list in template_lists
]
pred_label = torch.argmax(torch.stack(predictions), dim=0).tolist()
self.print_result_table({
'Dataset_name': 'yelp_efl',
'Accuracy': [accuracy_score(label, pred_label)]
})
def evaluate_ag_news(self):
sent = []
label = []
for example in self.dataset['ag_news']:
sent.append(example['text'])
label.append(example['label'])
templates = [
'It is world news.',
'It is sports news.',
'It is business news.',
'It is science news.',
]
template_lists = [[template] * len(sent) for template in templates]
predictions = [
self.align_func(sent, template_list)[ALL_TASKS['ag_news']]
for template_list in template_lists
]
pred_label = torch.argmax(torch.stack(predictions), dim=0).tolist()
self.print_result_table({
'Dataset_name': 'ag_news',
'Accuracy': [accuracy_score(label, pred_label)]
})
def evaluate_trec(self):
sent = []
label = []
for example in self.dataset['trec']:
sent.append(example['text'])
label.append(example['label_coarse'])
templates = [
'It is description.',
'It is entity.',
'It is expression.',
'It is human.',
'It is number.',
'It is location.',
]
template_lists = [[template] * len(sent) for template in templates]
predictions = [
self.align_func(sent, template_list)[ALL_TASKS['trec']]
for template_list in template_lists
]
pred_label = torch.argmax(torch.stack(predictions), dim=0).tolist()
self.print_result_table({
'Dataset_name': 'trec',
'Accuracy': [accuracy_score(label, pred_label)]
})
def true_task_helper(self, dataset_name):
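        """Score one TRUE-benchmark subset: align each generated_text against its grounding text
        with the binary output and report F1/accuracy/AUC against the gold label column."""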
sent1 = []
sent2 = []
true_score = []
for i in range(len(self.dataset[dataset_name])):
context = self.dataset[dataset_name].iloc[i]['grounding']
claim = self.dataset[dataset_name].iloc[i]['generated_text']
sent1.append(context)
sent2.append(self.clean_text(context, claim))
true_score.append(self.dataset[dataset_name].iloc[i]['label'])
pred_score = self.align_func(sent1, sent2)[1].tolist()
roc_auc = roc_auc_score(true_score, pred_score)
self.print_result_table({
'Dataset_name': dataset_name,
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'AUC': [roc_auc]
})
def evaluate_true_begin(self):
dataset_name = 'true_begin'
self.true_task_helper(dataset_name)
def evaluate_true_dialfact(self):
dataset_name = 'true_dialfact'
self.true_task_helper(dataset_name)
def evaluate_true_fever(self):
dataset_name = 'true_fever'
self.true_task_helper(dataset_name)
def evaluate_true_frank(self):
dataset_name = 'true_frank'
self.true_task_helper(dataset_name)
def evaluate_true_mnbm(self):
dataset_name = 'true_mnbm'
self.true_task_helper(dataset_name)
def evaluate_true_paws(self):
dataset_name = 'true_paws'
self.true_task_helper(dataset_name)
def evaluate_true_q2(self):
dataset_name = 'true_q2'
self.true_task_helper(dataset_name)
def evaluate_true_qags_cnndm(self):
dataset_name = 'true_qags_cnndm'
self.true_task_helper(dataset_name)
def evaluate_true_qags_xsum(self):
dataset_name = 'true_qags_xsum'
self.true_task_helper(dataset_name)
def evaluate_true_summeval(self):
dataset_name = 'true_summeval'
self.true_task_helper(dataset_name)
def evaluate_true_vitc(self):
dataset_name = 'true_vitc'
self.true_task_helper(dataset_name)
def get_summac_thres(self, dataset_name):
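        """Pick the decision threshold that maximizes balanced accuracy on the SummaC validation
        split of the given subset (swept over 0.000-1.000 in steps of 0.001)."""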
sent1 = []
sent2 = []
true_score = []
for example in self.summac_validation_set[dataset_name]:
sent1.append(example['document'])
            sent2.append(self.clean_text(example['document'], example['claim']))
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[1].tolist()
thres_result = []
for i in range(1001):
thres = i / 1000
thres_result.append((thres, balanced_accuracy_score(true_score, [p>thres for p in pred_score])))
best_thres = sorted(thres_result, key=lambda x: x[1], reverse=True)[0]
print(f"best thres for {dataset_name} is {best_thres[0]} @ {best_thres[1]}")
return best_thres[0]
def summac_task_helper(self, dataset_name):
sent1 = []
sent2 = []
true_score = []
for example in self.dataset[dataset_name]:
sent1.append(example['document'])
sent2.append(self.clean_text(example['document'], example['claim']))
true_score.append(example['label'])
pred_score = self.align_func(sent1, sent2)[1].tolist()
roc_auc = roc_auc_score(true_score, pred_score)
balanced_acc_thres = self.get_summac_thres(dataset_name)
self.print_result_table({
'Dataset_name': dataset_name,
'F1': self.get_f1(true_score, pred_score),
'Accuracy': self.get_accuracy(true_score, pred_score),
'BalancedAcc': self.get_balanced_accuracy(true_score, pred_score, thres=balanced_acc_thres),
'threshold': balanced_acc_thres,
'AUC': [roc_auc]
})
def evaluate_summac_cogensumm(self):
dataset_name = 'summac_cogensumm'
self.summac_task_helper(dataset_name)
def evaluate_summac_xsumfaith(self):
dataset_name = 'summac_xsumfaith'
self.summac_task_helper(dataset_name)
def evaluate_summac_polytope(self):
dataset_name = 'summac_polytope'
self.summac_task_helper(dataset_name)
def evaluate_summac_factcc(self):
dataset_name = 'summac_factcc'
self.summac_task_helper(dataset_name)
def evaluate_summac_summeval(self):
dataset_name = 'summac_summeval'
self.summac_task_helper(dataset_name)
def evaluate_summac_frank(self):
dataset_name = 'summac_frank'
self.summac_task_helper(dataset_name)
def evaluate_beir(self):
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.reranking.models import CrossEncoder
from beir.reranking import Rerank
import pathlib, os
import logging
import random
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO,
handlers=[LoggingHandler()])
#### /print debug information to stdout
#### Download trec-covid.zip dataset and unzip the dataset
for beir_dataset_name in ['msmarco', 'trec-covid', 'nfcorpus', 'nq', 'hotpotqa', 'fiqa',
'arguana', 'webis-touche2020', 'cqadupstack', 'quora',
'dbpedia-entity', 'scidocs', 'fever', 'climate-fever', 'scifact']:
# for beir_dataset_name in ['fever']:
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(beir_dataset_name)
# out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
out_dir = f"./data/eval/beir/{beir_dataset_name}/"
data_path = util.download_and_unzip(url, out_dir)
#### Provide the data path where trec-covid has been downloaded and unzipped to the data loader
# data folder would contain these files:
# (1) trec-covid/corpus.jsonl (format: jsonlines)
# (2) trec-covid/queries.jsonl (format: jsonlines)
# (3) trec-covid/qrels/test.tsv (format: tsv ("\t"))
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
#########################################
#### (1) RETRIEVE Top-100 docs using BM25
#########################################
#### Provide parameters for Elasticsearch
# print(corpus)
hostname = "localhost" #localhost
index_name = beir_dataset_name # trec-covid
initialize = True # False
model = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
retriever = EvaluateRetrieval(model, k_values=[1,3,5,10,100,1000])
#### Retrieve dense results (format of results is identical to qrels)
results = retriever.retrieve(corpus, queries)
# Rerank top-100 results using the reranker provided
reranker = Rerank(self.align_func)
rerank_results = reranker.rerank(corpus, queries, results, top_k=100)
#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, rerank_results, retriever.k_values)
self.print_result_table({
'Dataset_name': beir_dataset_name,
'ndcg': ndcg,
'map': _map,
'recall': recall,
'precision': precision
})
def evaluate_xxx(self):
pass
class evaluateMultiCheckpoints:
def __init__(self, config, device='cuda:0') -> None:
sample_checkpoint = {
'backbone': 'roberta-base',
'task_name': 'align-wo-finetune | align-finetune | roberta-finetune-baseline | nli-wo-finetune | nli-finetune',
'path': 'some path',
'result_save_path': 'some path'
}
self.config = config ## a dictionary
self.device = device
self.tasks = [
'summeval', 'qags_xsum', 'qags_cnndm', 'persona_chat', 'topical_chat',
'mnli_mismatched', 'mnli_matched',
'sick', 'yelp', 'stsb',
'anli_1','anli_2', 'anli_3', 'snli', 'vitaminc',
'mrpc', 'paws', 'sem_eval', 'paws_qqp', 'qqp',
'newsroom', 'rank19', 'bagel', 'race_m', 'race_h'
]
def experimentForSlide1216(self):
for ckpt in self.config:
self.evaluateOneCheckpoint(ckpt)
def evaluateOneCheckpoint(self, ckpt):
model_name = ckpt['path'].split('/')[-1].split('.ckpt')[0]
infer = Inferencer(ckpt_path=ckpt['path'],
model=ckpt['backbone'], batch_size=32, device=self.device)
evaluator = Evaluator(eval_tasks=self.tasks, align_func=infer.inference, save_all_tables=True)
evaluator.result_save_name = f"{ckpt['result_save_path']}{model_name}"
evaluator.evaluate()
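
# Minimal usage sketch: wires an Inferencer checkpoint into the Evaluator and runs a couple of
# tasks. The checkpoint path and task list below are placeholders, not values shipped with this file.
if __name__ == '__main__':
    infer = Inferencer(ckpt_path='path/to/checkpoint.ckpt', model='roberta-base',
                       batch_size=32, device='cuda:0')
    evaluator = Evaluator(eval_tasks=['stsb', 'paws'], align_func=infer.inference,
                          save_all_tables=False)
    evaluator.evaluate()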