import glob
import json

import datasets  # type: ignore
import pandas as pd  # type: ignore
from huggingface_hub import snapshot_download  # type: ignore

from backend.envs import EVAL_DATASET, TRACES_DATASET, TOKEN, EVAL_RESULTS_PATH

SUBSETS = ["base", "cot", "orig"]
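# NOTE (assumption, inferred from usage in load_cot_data below): "base" holds
# runs without chain-of-thought, "cot" holds runs with chain-of-thought
# traces, and "orig" holds the original harness results; only "base" and
# "cot" feed the accuracy-gain computation.
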
def load_cot_data():

    ####
    # Load the evaluation results data
    ####

    # download raw data
    print("Downloading evaluation results...")
    snapshot_download(
        repo_id=EVAL_DATASET,
        revision="main",
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        max_workers=8,
        token=TOKEN,
    )
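    # Expected on-disk layout after the snapshot download (inferred from the
    # glob patterns below, not documented elsewhere):
    #   {EVAL_RESULTS_PATH}/data/<org>/<model>/<subset>/**/*.json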
    # get all models for which results are stored
    models = []
    for path in glob.glob(f"{EVAL_RESULTS_PATH}/data/*/*", recursive=False):
        models.append(path.replace(f"{EVAL_RESULTS_PATH}/data/", ""))
    # load the evaluation results and create a dataframe
    results = []
    for model in models:
        for subset in SUBSETS:
            result_files = glob.glob(f"{EVAL_RESULTS_PATH}/data/{model}/{subset}/**/*.json", recursive=True)
            for json_filepath in result_files:
                with open(json_filepath) as fp:
                    data = json.load(fp)
                if "results" in data:
                    for v in data["results"].values():
                        record = v.copy()
                        record["model"] = model
                        record["subset"] = subset
                        results.append(record)
    df_results = pd.DataFrame(results)
    del results
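    # df_results now holds one row per (model, subset, task alias) with the
    # metric columns emitted by the eval harness; the "acc,none" and "alias"
    # columns are relied on below.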
    # postprocess task/config data
    def split_alias(alias: str) -> pd.Series:
        # aliases are expected to look like "<config>_<task>", optionally
        # suffixed with "_base" or "_cot"; strip the subset suffix first
        # (assumes at most one underscore remains after stripping)
        if alias[-5:] == "_base":
            alias = alias[:-5]
        elif alias[-4:] == "_cot":
            alias = alias[:-4]
        if "_" not in alias:
            task = alias
            config = ""
        else:
            config, task = alias.split("_")
        return pd.Series({"task": task, "config": config})
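    # For illustration (hypothetical alias, not taken from the dataset):
    #   split_alias("myconfig_logiqa_cot") -> task="logiqa", config="myconfig"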
    df_results = pd.concat([df_results, df_results.alias.apply(split_alias)], axis=1)
    # baseline accuracies in separate df
    df_baseline = df_results[df_results.subset.eq("base")].groupby(["model", "task"])[["acc,none"]].mean()

    # build cot eval df with baseline accuracies in separate column
    df_tmp1 = df_results[df_results.subset.eq("cot")].sort_values(by=["model", "task", "config"])
    df_tmp1.reset_index(inplace=True, drop=True)
    df_cot = df_tmp1[["model", "task", "config"]].copy()
    df_cot["acc_cot"] = df_tmp1["acc,none"]
    df_cot["acc_base"] = df_cot.apply(lambda row: df_baseline.loc[(row.model, row.task)]["acc,none"], axis=1)
    df_cot["acc_gain"] = df_cot.acc_cot - df_cot.acc_base
    df_cot["delta_rel"] = (df_cot.acc_cot - df_cot.acc_base) / df_cot.acc_base
    # average eval results for all tasks in extra df
    df_cot_avg = df_cot.groupby(["model", "config"]).mean(numeric_only=True).reset_index()
    df_cot_avg["task"] = "all"

    # add average results to cot df
    df_cot = pd.concat([df_cot_avg, df_cot], ignore_index=True)
    ####
    # Load the traces data
    ####

    # load traces data and extract configs
    print("Loading traces data...")
    dataset = datasets.load_dataset(TRACES_DATASET, split="test", token=TOKEN, num_proc=8)
    dataset = dataset.select_columns(["config_data"])
    df_cottraces = pd.DataFrame({"config_data": dataset["config_data"]})
    del dataset
    config_data = []
    for data in df_cottraces.config_data.to_list():
        if data is not None:
            config_data.append(dict(data))
    del df_cottraces
    df_cotconfigs = pd.DataFrame(config_data)
    df_cotconfigs.drop_duplicates(inplace=True, ignore_index=True)
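    # df_cotconfigs is expected to hold one row per chain-of-thought
    # generation config; besides the config parameters it must contain
    # "name", "model" and "task" columns (matched on and dropped below).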
    # add cot configs data to df_cot
    def select_config_data(row):
        df_selected = df_cotconfigs[df_cotconfigs.name.eq(row.config) & df_cotconfigs.model.eq(row.model)]
        if len(df_selected) == 0:
            print(f"Config {row.config} not found for model {row.model}")
            return None
        return df_selected.drop(columns=["name", "model", "task"]).iloc[0]
    df_cot = pd.concat(
        [
            df_cot,
            df_cot.apply(select_config_data, axis=1),
        ],
        axis=1,
    )
    # accuracy values in percent
    for col in ["acc_base", "acc_cot", "acc_gain"]:
        df_cot[col] = 100 * df_cot[col]

    print("Regimes dataframe created:")
    print(df_cot.head(3))
    ####
    # Create error dataframe
    ####

    df_cot_err = df_cot.groupby(["model", "task"]).agg(
        {"acc_gain": ["mean", "min", "max"], "acc_base": "mean", "acc_cot": "mean"}
    )
    df_cot_err.columns = ["-".join(col).strip() for col in df_cot_err.columns.values]
    df_cot_err["acc_gain-err"] = 0.5 * (df_cot_err["acc_gain-max"] - df_cot_err["acc_gain-min"])
    df_cot_err.reset_index(inplace=True)
    df_cot_err.rename(
        columns={"acc_base-mean": "base accuracy", "acc_cot-mean": "cot accuracy", "acc_gain-mean": "marginal acc. gain"},
        inplace=True,
    )
    print("Error dataframe created:")
    print(df_cot_err.head(3))

    return df_cot_err, df_cot
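

# Minimal smoke test; a sketch only (assumes backend.envs provides valid
# dataset ids and a usable HF token, and that network access is available):
if __name__ == "__main__":
    df_err, df_regimes = load_cot_data()
    print(f"Regimes dataframe: {df_regimes.shape}, error dataframe: {df_err.shape}")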