SeaLLM-7B-v2.5-simple / performance_plot.py
nxphi47's picture
Create performance_plot.py
5919bed
raw
history blame
8.04 kB
import os
import gradio as gr
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset
from plotly.subplots import make_subplots
CATEGORIES = ["task-solving", "math-reasoning", "general-instruction", "natural-question", "safety"]
LANGS = ['en', 'vi', 'th', 'id', 'km', 'lo', 'ms', 'my', 'tl']
FORCE_DOWNLOAD = bool(int(os.environ.get("FORCE_DOWNLOAD", "0")))
HF_TOKEN = str(os.environ.get("HF_TOKEN", ""))
DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
PERFORMANCE_FILENAME = str(os.environ.get("PERFORMANCE_FILENAME", "gpt4_single_json.csv"))
rename_map = {
"seallm13b10L6k_a_5a1R1_seaall_sft4x_1_5a1_r2_0_dpo_8_40000s": "SeaLLM-13b",
# "seallm13b10L4k_a_sft4xdpo_5a": "SeaLLM-13b-10L",
"polylm": "PolyLM-13b",
"qwen": "Qwen-14b",
"gpt-3.5-turbo": "GPT-3.5-turbo",
"gpt-4-1106-preview": "GPT-4-turbo",
}
CATEGORIES = [ "task-solving", "math-reasoning", "general-instruction", "natural-question", "safety", ]
CATEGORIES_NAMES = {
"task-solving": 'Task-solving',
"math-reasoning": 'Math',
"general-instruction": 'General-instruction',
"natural-question": 'NaturalQA',
"safety": 'Safety',
}
# LANGS = ['en', 'vi', 'th', 'id', 'km', 'lo', 'ms', 'my', 'tl']
LANGS = ['en', 'vi', 'id', 'ms', 'tl', 'th', 'km', 'lo', 'my']
LANG_NAMES = {
'en': 'eng',
'vi': 'vie',
'th': 'tha',
'id': 'ind',
'km': 'khm',
'lo': 'lao',
'ms': 'msa',
'my': 'mya',
'tl': 'tgl',
}
MODEL_DFRAME = None
def get_model_df():
# global MODEL_DFRAME
# if isinstance(MODEL_DFRAME, pd.DataFrame):
# print(f'Load cache data frame')
# return MODEL_DFRAME
from huggingface_hub import hf_hub_download
assert DATA_SET_REPO_PATH != ''
assert HF_TOKEN != ''
repo_id = DATA_SET_REPO_PATH
filename = PERFORMANCE_FILENAME
# data_path = f"{DATA_SET_REPO_PATH}/{PERFORMANCE_FILENAME}"
file_path = hf_hub_download(
repo_id=repo_id,
filename=filename,
force_download=FORCE_DOWNLOAD,
local_dir='./hf_cache',
repo_type="dataset",
token=HF_TOKEN
)
print(f'Downloaded file at {file_path} from {DATA_SET_REPO_PATH} / {PERFORMANCE_FILENAME}')
df = pd.read_csv(file_path)
return df
def aggregate_df(df, model_dict, category_name, categories):
scores_all = []
all_models = df["model"].unique()
for model in all_models:
for i, cat in enumerate(categories):
# filter category/model, and score format error (<1% case)
res = df[(df[category_name]==cat) & (df["model"]==model) & (df["score"] >= 0)]
score = res["score"].mean()
cat_name = cat
scores_all.append({"model": model, category_name: cat_name, "score": score})
target_models = list(model_dict.keys())
scores_target = [scores_all[i] for i in range(len(scores_all)) if scores_all[i]["model"] in target_models]
scores_target = sorted(scores_target, key=lambda x: target_models.index(x["model"]), reverse=True)
df_score = pd.DataFrame(scores_target)
df_score = df_score[df_score["model"].isin(target_models)]
rename_map = model_dict
for k, v in rename_map.items():
df_score.replace(k, v, inplace=True)
return df_score
def polar_subplot(fig, dframe, model_names, category_label, category_names, row, col, showlegend=True):
# cat category
colors = px.colors.qualitative.Plotly
for i, (model, model_name) in enumerate(model_names):
cat_list = dframe[dframe['model'] == model_name][category_label].tolist()
score_list = dframe[dframe['model'] == model_name]['score'].tolist()
cat_list += [cat_list[0]]
cat_list = [category_names[x] for x in cat_list]
score_list += [score_list[0]]
polar = go.Scatterpolar(
name = model_name,
r = score_list,
theta = cat_list,
legendgroup=f'{i}',
marker=dict(color=colors[i]),
hovertemplate="""Score: %{r:.2f}""",
showlegend=showlegend,
)
fig.add_trace(polar, row, col)
def plot_agg_fn():
df = get_model_df()
all_models = df["model"].unique()
model_names = list(rename_map.items())
colors = px.colors.qualitative.Plotly
cat_df = aggregate_df(df, rename_map, "category", CATEGORIES, )
lang_df = aggregate_df(df, rename_map, "lang", LANGS, )
fig = make_subplots(
rows=1, cols=2,
specs=[[{'type': 'polar'}]*2],
subplot_titles=("By Category", "By Language"),
)
fig.layout.annotations[0].y = 1.05
fig.layout.annotations[1].y = 1.05
# cat category
for i, (model, model_name) in enumerate(model_names):
cat_list = cat_df[cat_df['model'] == model_name]['category'].tolist()
score_list = cat_df[cat_df['model'] == model_name]['score'].tolist()
cat_list += [cat_list[0]]
cat_list = [CATEGORIES_NAMES[x] for x in cat_list]
score_list += [score_list[0]]
polar = go.Scatterpolar(
name = model_name,
r = score_list,
theta = cat_list,
legendgroup=f'{i}',
marker=dict(color=colors[i]),
hovertemplate="""Score: %{r:.2f}""",
)
fig.add_trace(polar, 1, 1)
# cat langs
for i, (model, model_name) in enumerate(model_names):
cat_list = lang_df[lang_df['model'] == model_name]['lang'].tolist()
score_list = lang_df[lang_df['model'] == model_name]['score'].tolist()
cat_list += [cat_list[0]]
score_list += [score_list[0]]
cat_list = [LANG_NAMES[x] for x in cat_list]
polar = go.Scatterpolar(
name = model_name,
r = score_list,
theta = cat_list,
legendgroup=f'{i}',
marker=dict(color=colors[i]),
hovertemplate="""Score: %{r:.2f}""",
showlegend=False,
)
fig.add_trace(polar, 1, 2)
polar_config = dict(
angularaxis = dict(
rotation=90, # start position of angular axis
),
radialaxis = dict(
range=[0, 10],
),
)
fig.update_layout(
polar = polar_config,
polar2 = polar_config,
title='Sea-Bench (rated by GPT-4)',
)
return fig
def plot_by_lang_fn():
df = get_model_df()
model_names = list(rename_map.items())
fig = make_subplots(
rows=3, cols=3,
specs=[[{'type': 'polar'}]*3] * 3,
subplot_titles=list(LANG_NAMES.values()),
# vertical_spacing=1
)
# print(fig.layout.annotations)
for ano in fig.layout.annotations:
ano.y = ano.y + 0.02
has_safety = ['vi', 'id', 'th']
for lang_id, lang in enumerate(LANGS):
cat_names = CATEGORIES if lang in has_safety else [x for x in CATEGORIES if x != 'safety']
cat_lang_df = aggregate_df(df[df['lang'] == lang], rename_map, "category", cat_names, )
row = lang_id // 3 + 1
col = lang_id % 3 + 1
polar_subplot(fig, cat_lang_df, model_names, 'category', CATEGORIES_NAMES, row, col, showlegend=lang_id == 0)
polar_config = dict(
angularaxis = dict(
rotation=90, # start position of angular axis
),
radialaxis = dict(
range=[0, 10],
),
)
layer_kwargs = {f"polar{i}": polar_config for i in range(1, 10)}
fig.update_layout(
title='Sea-Bench - By language (rated by GPT-4)',
height=1000,
# width=1200,
**layer_kwargs
)
return fig
def attach_plot_to_demo(demo):
with gr.Accordion("Psst... wanna see some performance benchmarks?", open=False):
gr_plot_agg = gr.Plot(label="Aggregated")
gr_plot_bylang = gr.Plot(label='By language')
# def callback():
demo.load(plot_agg_fn, [], gr_plot_agg)
demo.load(plot_by_lang_fn, [], gr_plot_bylang)
# return callback