import copy as cp
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Union
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd
from loguru import logger

from judgerbench.meta_data import (
    DATADIR,
    LEADERBOARD_FILE_MAPPING,
    DEFAULT_BENCH,
    FIELD_MAPPING,
    STYLE_CLASS_MAPPING,
    META_FIELDS,
    URL,
)


def listinstr(lst, s):
    """Return True if any item in `lst` is a substring of `s`."""
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results_from_url():
    data = json.loads(urlopen(URL).read())
    return data


def nth_large(val, vals):
    """1-based rank of `val` in `vals`: the number of strictly larger values plus one."""
    return sum([1 for v in vals if v > val]) + 1


def format_timestamp(timestamp):
    # 'YYMMDDhhmmss' -> 'YY.MM.DD hh:mm:ss'
    date = timestamp[:2] + '.' + timestamp[2:4] + '.' + timestamp[4:6]
    time = timestamp[6:8] + ':' + timestamp[8:10] + ':' + timestamp[10:12]
    return date + ' ' + time


def model_size_flag(sz, FIELDS):
    """Return True if parameter count `sz` (in billions) falls into one of the selected size buckets."""
    if pd.isna(sz) and 'Unknown' in FIELDS:
        return True
    if pd.isna(sz):
        return False
    if '<4B' in FIELDS and sz < 4:
        return True
    if '4B-10B' in FIELDS and 4 <= sz < 10:
        return True
    if '10B-20B' in FIELDS and 10 <= sz < 20:
        return True
    if '20B-40B' in FIELDS and 20 <= sz < 40:
        return True
    if '>40B' in FIELDS and sz >= 40:
        return True
    return False


def model_type_flag(line, FIELDS):
    """Return True if the model type of `line` (OpenSource / API / Proprietary) is among the selected types."""
    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
        return True
    if 'API' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'Yes':
        return True
    if 'Proprietary' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'No':
        return True
    return False


def build_l1_df(fields):
    check_box = {}
    check_box['essential'] = [
        # 'Method',
        # 'Param (B)',
        'Model',
    ]
    # revise here to set the default dataset
    check_box['default'] = DEFAULT_BENCH
    check_box['avg'] = ['Average']
    check_box['accuracy'] = ['Accuracy_CN', 'Accuracy_EN', 'Accuracy']
    check_box['all'] = fields
    type_map = defaultdict(lambda: 'number')
    # type_map['Method'] = 'html'
    type_map['Model'] = 'str'
    # type_map['Language Model'] = 'str'
    # type_map['Vision Model'] = 'str'
    # type_map['OpenSource'] = 'str'
    # type_map['Verified'] = 'str'
    check_box['type_map'] = type_map
    df = generate_table(fields)
    return df, check_box
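
# --- Illustrative usage (hypothetical; not part of the original module) ---
# A minimal sketch of how model_size_flag / model_type_flag can filter a
# leaderboard DataFrame. The column names 'Param (B)', 'OpenSource' and
# 'Verified' follow META_FIELDS and are assumptions here; adjust to the
# actual table schema.
def _example_filter(df: pd.DataFrame) -> pd.DataFrame:
    size_fields = ['<4B', '4B-10B', 'Unknown']  # keep small models and unknown sizes
    type_fields = ['OpenSource']                # keep open-source models only
    mask = df.apply(
        lambda row: model_size_flag(row['Param (B)'], size_fields)
        and model_type_flag(row, type_fields),
        axis=1,
    )
    return df[mask]
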
def build_l2_df(results, dataset):
    res = defaultdict(list)
    sub = [v for v in results.values() if dataset in v]
    assert len(sub)
    fields = list(sub[0][dataset].keys())
    non_overall_fields = [x for x in fields if 'Overall' not in x]
    overall_fields = [x for x in fields if 'Overall' in x]
    if dataset == 'MME':
        non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
        overall_fields = overall_fields + ['Perception', 'Cognition']
    if dataset == 'OCRBench':
        non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
        overall_fields = ['Final Score']

    for m in results:
        item = results[m]
        if dataset not in item:
            continue
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'{name}')
            else:
                res[k].append(meta[k])
        for d in non_overall_fields:
            res[d].append(item[dataset][d])
        for d in overall_fields:
            res[d].append(item[dataset][d])

    df = pd.DataFrame(res)
    all_fields = overall_fields + non_overall_fields
    # Use the overall fields as required fields if any exist;
    # otherwise fall back to the first 5 non-overall fields
    required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]

    if dataset == 'OCRBench':
        df = df.sort_values('Final Score')
    elif dataset == 'COCO_VAL':
        df = df.sort_values('CIDEr')
    else:
        df = df.sort_values('Overall')
    df = df.iloc[::-1]

    check_box = {}
    check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box


def generate_table1(results, fields):

    def get_mmbench_v11(item):
        assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
        val = (item['MMBench_TEST_CN_V11']['Overall'] + item['MMBench_TEST_EN_V11']['Overall']) / 2
        val = float(f'{val:.1f}')
        return val

    res = defaultdict(list)
    for m in results:
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'{name}')
                res['name'].append(name)
            else:
                res[k].append(meta[k])
        scores, ranks = [], []
        for d in fields:
            key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
            # Every model should have MMBench_V11 results
            if d == 'MMBench_V11':
                val = get_mmbench_v11(item)
                res[d].append(val)
                scores.append(val)
                ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
            elif d in item:
                res[d].append(item[d][key_name])
                if d == 'MME':
                    # MME Overall has a maximum of 2800; rescale to roughly 0-100
                    scores.append(item[d][key_name] / 28)
                elif d == 'OCRBench':
                    # OCRBench Final Score has a maximum of 1000; rescale to 0-100
                    scores.append(item[d][key_name] / 10)
                else:
                    scores.append(item[d][key_name])
                ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
            else:
                res[d].append(None)
                scores.append(None)
                ranks.append(None)

        res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
        res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)

    df = pd.DataFrame(res)
    valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    valid = valid.sort_values('Avg Score')
    valid = valid.iloc[::-1]
    if len(fields):
        missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
        missing = missing.iloc[::-1]
    df = pd.concat([valid, missing])
    return df
def generate_table(
    fields: List[str] = None,
    filename: str = None,
    path: Union[str, Path] = DATADIR / "overall.csv",
):
    if filename in LEADERBOARD_FILE_MAPPING:
        path = DATADIR / LEADERBOARD_FILE_MAPPING[filename]
    if filename is None and path is None:
        raise ValueError("filename and path cannot both be None.")

    REQUIRED_FIELDS = META_FIELDS + [
        # 'Average'
    ]

    df = pd.read_csv(path)
    # df_reshaped = (
    #     df
    #     .drop(columns=["dataset", "mode", "version"])
    #     .melt(
    #         id_vars=["metric"],
    #         var_name="model",
    #         value_name="value"
    #     )
    #     .pivot(index=["model"], columns=["metric"], values='value')
    # )
    # df_reshaped.columns.name = None
    # df_reshaped.reset_index(inplace=True)
    # df_reshaped.rename(columns=FIELD_MAPPING, inplace=True)

    # if fields is not None:
    #     for field in fields:
    #         if field not in df_reshaped.columns:
    #             raise ValueError(f"{field} is not a valid field in leaderboard table.")
    #     new_fields = [field for field in FIELD_MAPPING.values() if field in REQUIRED_FIELDS + fields]
    #     logger.info(f"{new_fields=}")
    #     df_reshaped = df_reshaped.loc[:, new_fields].copy()

    # valid, missing = df_reshaped[~pd.isna(df_reshaped['Average'])], df_reshaped[pd.isna(df_reshaped['Average'])]
    # valid = valid.sort_values('Average', ascending=False)
    # if len(fields):
    #     missing = missing.sort_values(
    #         'Accuracy' if 'Accuracy' in fields else fields[0],
    #         ascending=False,
    #     )
    # df_sorted = pd.concat([valid, missing])
    df_sorted = df
    return df_sorted
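
# --- Illustrative usage (hypothetical; not part of the original module) ---
# A minimal sketch of wiring the L1 table into a Gradio view. It assumes that
# DEFAULT_BENCH names columns present in overall.csv; columns missing from the
# loaded table are simply skipped.
if __name__ == '__main__':
    demo_df, demo_check_box = build_l1_df(DEFAULT_BENCH)
    demo_cols = [
        c for c in demo_check_box['essential'] + demo_check_box['default']
        if c in demo_df.columns
    ]
    with gr.Blocks() as demo:
        gr.Dataframe(
            value=demo_df[demo_cols],
            datatype=[demo_check_box['type_map'][c] for c in demo_cols],
            interactive=False,
        )
    demo.launch()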