import os.path as osp
from collections import defaultdict
from typing import List, Optional

import mmengine
import numpy as np
from mmengine import ConfigDict
from rich import print
from rich.table import Table

from opencompass.utils import (dataset_abbr_from_cfg, get_infer_output_path,
                               get_logger, model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash

# Metrics preferred when a dataset reports several of them; the whitelist order
# decides which metric is treated as the primary one (listed first).
METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1',
                    'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
# Bookkeeping values that should never be shown as scores.
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
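
# For instance (illustrative values, not real results): a result file containing
# {'accuracy': 55.6, 'sys_len': 1024} would keep only 'accuracy', since blacklisted
# keys and non-numeric values are dropped in load(); if both 'score' and 'rouge1'
# were present, 'score' would be listed first because it appears earlier in
# METRIC_WHITELIST.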

# The first four table columns are metadata: dataset, version, metric, mode.
META_COL_COUNT = 4
# Tolerance used when comparing floating-point scores for the best value.
EPS = 1e-6


def bold(text):
    return f'[bold]{text}[/bold]'


def green_bold(text):
    return f'[green][bold]{text}[/bold][/green]'


def format_float(v):
    return f'{v:.2f}'


def to_float(text: str):
    # Non-numeric cells (e.g. '-') are treated as 0.
    try:
        return float(text)
    except ValueError:
        return 0


def is_section_row(row: List[str]) -> bool:
    # A section header row looks like ['--------- Exam ---------', '-', '-', ...].
    return row[-1] == '-' and row[0][0] == '-'


def average_rows(name, rows: List[List[str]]) -> List[str]:
    # Build a row whose score columns are the column-wise mean of `rows`,
    # keeping the meta columns as '-'.
    new_row = ['-'] * len(rows[0])
    new_row[0] = bold(name)

    all_accs = defaultdict(list)
    for row in rows:
        for i, acc in enumerate(row[META_COL_COUNT:]):
            all_accs[i].append(to_float(acc))

    for i, accs in enumerate(all_accs.values()):
        new_row[META_COL_COUNT + i] = format_float(np.mean(accs))
    return new_row


def create_section_row(row_i: int, row: List[str], table) -> List[str]:
    # Average all rows that belong to the section starting at `row_i`.
    section_name = bold('[' + row[0].replace('-', '').strip() + ']')

    section_rows = []
    for next_row in table[row_i + 1:]:
        if is_section_row(next_row):
            break
        section_rows.append(next_row)
    return average_rows(section_name, section_rows)


def create_win_row(rows: List[List[str]]) -> List[str]:
    # Count, per model column, how often it achieves the best score in a row.
    win_count = defaultdict(int)
    for row in rows:
        all_scores = [to_float(_) for _ in row[META_COL_COUNT:]]
        best_indices = [i for i, s in enumerate(all_scores) if s > np.max(all_scores) - EPS]
        for best_index in best_indices:
            win_count[best_index] += 1
    new_row = ['-'] * len(rows[0])
    new_row[0] = bold('Win Count')
    for i, count in win_count.items():
        new_row[META_COL_COUNT + i] = str(count)
    return new_row


def highlight(row: List[str], meta_col_count: int = META_COL_COUNT) -> List[str]:
    # Wrap the best score(s) of a row in green bold rich markup.
    new_row = list(row)
    all_scores = [to_float(_) for _ in row[meta_col_count:]]
    best_indices = [i + meta_col_count for i, s in enumerate(all_scores) if s > np.max(all_scores) - EPS]
    for best_index in best_indices:
        new_row[best_index] = green_bold(row[best_index])
    return new_row
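
# A minimal sketch of how the row helpers compose, using toy values rather than
# real benchmark results (the dataset/version strings below are illustrative):
#
#   rows = [
#       ['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32', '81.10'],
#       ['ARC-e', '1e0de5', 'accuracy', 'gen', '85.36', '84.90'],
#   ]
#   average_rows('Average', rows)  # -> ['[bold]Average[/bold]', '-', '-', '-', '82.34', '83.00']
#   create_win_row(rows)           # -> ['[bold]Win Count[/bold]', '-', '-', '-', '1', '1']
#   highlight(rows[0])             # wraps '81.10' in green bold markup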


class MultiModelSummarizer:
    """Summarize the results of multiple models side by side.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
        dataset_abbrs (list[str], optional): Dataset abbreviations to be
            listed in the summary.
        summary_groups (list): The dataset groups whose results need to be
            averaged out. For example, mmlu. Each item is a dict with
            'name' (str) and 'subsets' (list of dataset abbrs), and optionally
            'weights' if a weighted average is needed.
        prompt_db: A deprecated field.
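
    Example:
        A minimal usage sketch. It assumes ``cfg_a`` and ``cfg_b`` are evaluation
        configs whose results have already been written to their work dirs; the
        names and the 'mmlu' group are illustrative::

            summarizer = MultiModelSummarizer(cfg_a)
            summarizer.merge(MultiModelSummarizer(cfg_b))
            summarizer.summarize()          # print the merged comparison table
            summarizer.show_group('mmlu')   # inspect one summary group in detail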
    """

    def __init__(self,
                 config: ConfigDict,
                 dataset_abbrs: Optional[List[str]] = None,
                 summary_groups: List = [],
                 prompt_db=None) -> None:
        self.tasks = []
        self.cfg = config
        self.logger = get_logger()
        self.summary_groups = summary_groups
        self.dataset_abbrs = dataset_abbrs
        if prompt_db:
            self.logger.warning('prompt_db is deprecated and no longer used. '
                                'Please remove it from your config.')
        self.models_summary_group_metrics = {}
        self.table = self.load()

    def load(self):
        model_cfgs = self.cfg['models']
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']

        # {model abbr: {dataset abbr: ...}} for the raw result dicts and the
        # parsed, numeric-only view.
        raw_results = {}
        parsed_results = {}
        dataset_metrics = {}

        model_abbrs = [model_abbr_from_cfg(model) for model in model_cfgs]
        for model in model_cfgs:
            model_abbr = model_abbr_from_cfg(model)
            parsed_results[model_abbr] = {}
            raw_results[model_abbr] = {}
            for dataset in dataset_cfgs:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
                filepath = get_infer_output_path(model, dataset, osp.join(work_dir, 'results'))
                if not osp.exists(filepath):
                    continue
                result = mmengine.load(filepath)
                raw_results[model_abbr][dataset_abbr] = result
                if 'error' in result:
                    self.logger.debug(f'error in {model_abbr} {dataset_abbr} {result["error"]}')
                    continue
                else:
                    parsed_results[model_abbr][dataset_abbr] = []
                    dataset_metrics[dataset_abbr] = []
                    for metric, score in result.items():
                        if metric not in METRIC_BLACKLIST and isinstance(score, (int, float)):
                            parsed_results[model_abbr][dataset_abbr].append(score)
                            dataset_metrics[dataset_abbr].append(metric)
                        else:
                            continue
                    if len(parsed_results[model_abbr][dataset_abbr]) == 0:
                        self.logger.warning(f'unknown result format: {result}, continue')
                        del parsed_results[model_abbr][dataset_abbr]
                        del dataset_metrics[dataset_abbr]
                        continue
                    # Put whitelisted metrics first so index 0 is the primary metric.
                    indices = sorted(
                        list(range(len(dataset_metrics[dataset_abbr]))),
                        key=lambda i: (
                            METRIC_WHITELIST.index(dataset_metrics[dataset_abbr][i])
                            if dataset_metrics[dataset_abbr][i] in METRIC_WHITELIST
                            else len(METRIC_WHITELIST)
                        )
                    )
                    parsed_results[model_abbr][dataset_abbr] = [parsed_results[model_abbr][dataset_abbr][i] for i in indices]
                    dataset_metrics[dataset_abbr] = [dataset_metrics[dataset_abbr][i] for i in indices]

        # Decide each dataset's evaluation mode from its inferencer type.
        dataset_eval_mode = {}
        for dataset in dataset_cfgs:
            inferencer = dataset.get('infer_cfg', {}).get('inferencer', {}).get('type', '')
            inferencer = inferencer if isinstance(inferencer, str) else inferencer.__name__
            dataset_abbr = dataset_abbr_from_cfg(dataset)
            if 'GenInferencer' in inferencer:
                dataset_eval_mode[dataset_abbr] = 'gen'
            elif 'PPLInferencer' in inferencer:
                dataset_eval_mode[dataset_abbr] = 'ppl'
            else:
                dataset_eval_mode[dataset_abbr] = 'unknown'
                self.logger.warning(f'unknown inferencer: {inferencer} - {dataset_abbr}')
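
        # Shape of a `summary_groups` entry, for reference (names are illustrative,
        # not taken from a real config):
        #   {'name': 'mmlu', 'subsets': ['mmlu_stem', 'mmlu_social_science'],
        #    'weights': {'mmlu_stem': 2, 'mmlu_social_science': 1}}
        # With 'weights' the group score is sum(score * weight) / sum(weights);
        # without it, a plain mean over the subsets is used.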

        # Calculate group metrics.
        summary_groups = self.summary_groups
        summary_group_metrics = {}
        for sg in summary_groups:
            for model_abbr in model_abbrs:
                results = {}
                eval_modes = []
                for dataset_abbr in sg['subsets']:
                    if dataset_abbr in parsed_results[model_abbr]:
                        results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0]
                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                summary_group_metrics[sg['name']] = results
                if len(results) == len(sg['subsets']):
                    if 'weights' in sg:
                        numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
                        denominator = sum(sg['weights'].values())
                        metric = 'weighted_average'
                    else:
                        numerator = sum(results[k] for k in results)
                        denominator = len(results)
                        metric = 'naive_average'
                    results[metric] = numerator / denominator
                    eval_modes = list(set(eval_modes))
                    eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'

                    raw_results[model_abbr][sg['name']] = results
                    parsed_results[model_abbr][sg['name']] = [numerator / denominator]
                    dataset_metrics[sg['name']] = [metric]
                    dataset_eval_mode[sg['name']] = eval_mode
                elif len(results) == 0:
                    # None of the subsets were evaluated for this model; skip silently.
                    continue
                else:
                    raw_results[model_abbr][sg['name']] = {
                        'error': 'missing datasets: {}'.format(set(sg['subsets']) - set(results.keys()))}

        prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in dataset_cfgs}

        # Decide which (dataset, metric) pairs to display and in what order.
        summarizer_dataset_abbrs = []
        if self.dataset_abbrs is None:
            # Follow the dataset order in the config, then append anything left over.
            for dataset in dataset_cfgs:
                dataset_abbr = dataset_abbr_from_cfg(dataset)
                if dataset_abbr in dataset_metrics:
                    for metric in dataset_metrics[dataset_abbr]:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
                else:
                    summarizer_dataset_abbrs.append((dataset_abbr, None))
            for dataset_abbr in dataset_metrics:
                for metric in dataset_metrics[dataset_abbr]:
                    if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
        else:
            for item in self.dataset_abbrs:
                if isinstance(item, str):
                    summarizer_dataset_abbrs.append((item, None))
                elif isinstance(item, (list, tuple)):
                    summarizer_dataset_abbrs.append((item[0], item[1]))

        # Format table rows: one row per (dataset, metric), one column per model.
        table = []
        header = ['dataset', 'version', 'metric', 'mode'] + model_abbrs
        table.append(header)
        for dataset_abbr, metric in summarizer_dataset_abbrs:
            if dataset_abbr not in dataset_metrics:
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
                continue
            if metric is None:
                index = 0
                metric = dataset_metrics[dataset_abbr][0]
            elif metric in dataset_metrics[dataset_abbr]:
                index = dataset_metrics[dataset_abbr].index(metric)
            else:
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(model_abbrs))
                continue

            row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]
            for model_abbr in model_abbrs:
                if dataset_abbr in parsed_results[model_abbr]:
                    row.append('{:.02f}'.format(parsed_results[model_abbr][dataset_abbr][index]))
                else:
                    row.append('-')
            table.append(row)

        # Remember the summary-group metrics under the last model column's name.
        self.models_summary_group_metrics[table[0][-1]] = summary_group_metrics
        return table

    def merge(self, summarizer: 'MultiModelSummarizer'):
        """Concatenate another summarizer's model columns onto this table."""
        assert len(self.table) == len(summarizer.table)
        for row_i, row in enumerate(summarizer.table):
            base_row = self.table[row_i]
            if base_row[:3] != row[:3]:
                self.logger.warning(f'cannot merge tables with different headers: {base_row} vs {row}')
            base_row.extend(row[META_COL_COUNT:])
        new_model_name = summarizer.table[0][-1]
        assert new_model_name not in self.models_summary_group_metrics
        self.models_summary_group_metrics[new_model_name] = summarizer.models_summary_group_metrics[new_model_name]

    def summarize(self):
        """Print a rich table comparing all models.

        Expected format of ``self.table``:

        [
            ['dataset', 'version', 'metric', 'mode', 'model_name'],
            ['--------- 考试 Exam ---------', '-', '-', '-', '-'],
            ['ARC-c', '1e0de5', 'accuracy', 'gen', '79.32'],
            ['ARC-e', '1e0de5', 'accuracy', 'gen', '85.36'],
            ['--------- 语言 Language ---------', '-', '-', '-', '-'],
            ['WiC', 'd06864', 'accuracy', 'gen', '55.64'],
            ['chid-dev', '211ee7', 'accuracy', 'gen', '52.97'],
            ['--------- 知识 Knowledge ---------', '-', '-', '-', '-'],
            ['BoolQ', '883d50', 'accuracy', 'gen', '86.06'],
            ['--------- 理解 Understanding ---------', '-', '-', '-', '-'],
            ['C3', '8c358f', 'accuracy', 'gen', '88.33'],
            ['race-middle', '9a54b6', 'accuracy', 'gen', '90.32'],
            ['--------- 推理 Reasoning ---------', '-', '-', '-', '-'],
            ['cmnli', '1abf97', 'accuracy', 'gen', '38.26'],
            ['ocnli', 'c4cb6c', 'accuracy', 'gen', '32.92'],
        ]
        """

        table = Table()
        for i, col_name in enumerate(self.table[0]):
            table.add_column(col_name, overflow='fold', max_width=20 if i >= META_COL_COUNT else None)

        section_rows = []
        all_rows = []
        for row_i, row in enumerate(self.table):
            if row_i == 0:
                continue
            if is_section_row(row):
                table.add_section()
                new_row = create_section_row(row_i, row, self.table)
                section_rows.append(new_row)
            else:
                new_row = row
                all_rows.append(new_row)

            table.add_row(*highlight(new_row))

        if section_rows:
            table.add_section()
            average_row = average_rows('Naive Average', section_rows)
            average_row = highlight(average_row)
            table.add_row(*average_row)

        table.add_section()
        table.add_row(*highlight(create_win_row(all_rows)))
        print(table)

    def show_group(self, group: str):
        """Print the per-subset scores of one summary group for every model."""
        table = Table(title=group)
        table.add_column('Dataset', overflow='fold')

        group_metrics = None
        for model_name, summary_group_metrics in self.models_summary_group_metrics.items():
            if group not in summary_group_metrics:
                self.logger.warning(f'group {group} not found in {model_name}')
                return
            table.add_column(model_name, overflow='fold')
            group_metrics = summary_group_metrics[group]

        for subset_name in group_metrics.keys():
            if subset_name == 'naive_average':
                continue

            row = [subset_name]
            for summary_group_metrics in self.models_summary_group_metrics.values():
                metric = summary_group_metrics[group][subset_name]
                row.append(format_float(metric))
            table.add_row(*highlight(row, meta_col_count=1))

        print(table)