In [2]:
import pandas as pd
from matplotlib.figure import Figure

# Load the per-checkpoint evaluation scores for the dedup-attempt runs.
dedup_csv_path = "../src_data/diff_dedup_attempts.csv"
df = pd.read_csv(dedup_csv_path)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,runname,seed,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,...,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,sciq/acc,sciq/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
0,big-run-refinedweb,6,0,0.330893,0.186,0.233,0.272,0.258,0.166,0.286,...,0.367,0.362,0.516,0.497,0.208,0.202,0.2195,0.2510,0.230294,0.250147
1,big-run-refinedweb,6,1000,0.353481,0.233,0.253,0.288,0.276,0.120,0.256,...,0.365,0.398,0.502,0.500,0.582,0.528,0.2650,0.2900,0.240583,0.252852
2,big-run-refinedweb,6,2000,0.376461,0.282,0.280,0.315,0.328,0.154,0.284,...,0.368,0.390,0.511,0.498,0.683,0.590,0.3055,0.3170,0.245067,0.261686
3,big-run-refinedweb,6,3000,0.387825,0.282,0.287,0.331,0.350,0.152,0.306,...,0.376,0.386,0.512,0.495,0.748,0.646,0.3210,0.3410,0.250268,0.266600
4,big-run-refinedweb,6,4000,0.398105,0.310,0.318,0.340,0.389,0.168,0.306,...,0.371,0.392,0.513,0.495,0.736,0.634,0.3305,0.3425,0.250732,0.268341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,big-run-url_dedups_lowercase_char_length,6,163000,0.477694,0.396,0.375,0.477,0.578,0.226,0.354,...,0.408,0.415,0.562,0.548,0.879,0.817,0.4655,0.4540,0.303672,0.325554
1340,big-run-url_dedups_lowercase_char_length,6,164000,0.476591,0.396,0.375,0.478,0.581,0.228,0.342,...,0.417,0.414,0.555,0.544,0.883,0.827,0.4600,0.4570,0.306406,0.329724
1341,big-run-url_dedups_lowercase_char_length,6,165000,0.478964,0.405,0.388,0.474,0.583,0.230,0.362,...,0.414,0.412,0.562,0.541,0.881,0.826,0.4545,0.4465,0.304121,0.327213
1342,big-run-url_dedups_lowercase_char_length,6,166000,0.477467,0.398,0.381,0.470,0.579,0.234,0.354,...,0.413,0.411,0.554,0.544,0.887,0.831,0.4625,0.4565,0.305855,0.328240


In [3]:
# Distinct run names in order of first appearance; these are the keys a label mapping must cover.
df["runname"].unique().tolist()

['big-run-refinedweb',
 'big-run-sampled_cross_minhash_dump',
 'big-run-sampled_full_filtered_no_dedup',
 'big-run-sampled_full_imh_linededup',
 'big-run-sampled_full_ind_minhash',
 'big-run-sampled_line_dedup_3lines2',
 'big-run-sampled_line_dedup_min_words',
 'big-run-url_dedups_lowercase_char_length']

In [4]:
# Raw run name (as stored in the CSV "runname" column) -> human-readable
# label written into the exported plot data.
runs_mapping = dict(
    [
        ("big-run-refinedweb", "RefinedWeb"),
        ("big-run-sampled_cross_minhash_dump", "FineWeb full MinHash"),
        ("big-run-sampled_full_filtered_no_dedup", "FineWeb filtered only"),
        ("big-run-sampled_full_ind_minhash", "FineWeb independent MinHash"),
        ("big-run-sampled_full_imh_linededup", "FineWeb line dedup"),
        ("big-run-sampled_line_dedup_3lines2", "FineWeb 3-line dedup"),
        ("big-run-sampled_line_dedup_min_words", "FineWeb line dedup w/ min words"),
        ("big-run-url_dedups_lowercase_char_length", "FineWeb URL dedup"),
    ]
)

In [5]:
import json
import os
from matplotlib import pyplot as plt
# Columns exported for plotting: the aggregate score followed by the
# length-normalised accuracy of each listed benchmark.
# NOTE(review): the CSV also has sciq/* columns that are not listed here — confirm this is intentional.
metrics = [
    "agg_score",
    "commonsense_qa/acc_norm",
    "hellaswag/acc_norm",
    "openbookqa/acc_norm",
    "piqa/acc_norm",
    "siqa/acc_norm",
    "winogrande/acc_norm",
    "arc/acc_norm",
    "mmlu/acc_norm",
]

def normalize_runname(runname):
    """Return ``runname`` with every "/" replaced by "_".

    The result is used as a file-name stem, so a name such as
    "hellaswag/acc_norm" does not turn into a nested path.
    """
    pieces = runname.split("/")
    return "_".join(pieces)

# Collapse all rows sharing a (runname, steps) pair into one row holding the
# mean of every exported metric; the group keys are restored as plain columns.
run_step_groups = df.groupby(["runname", "steps"])
grouped = run_step_groups[metrics].mean().reset_index()

# Export the averaged curves as JSON: one "<metric>.json" file per metric
# (every run's curve for that metric) plus an "index.json" manifest that lists
# the per-metric files and the plot settings.
file_id = "../assets/data/plots/dedup_attempts"

# Create the output folder once, up front. It does not depend on the metric,
# and doing it here guarantees the folder exists for index.json even when
# there are no metrics to export.
os.makedirs(file_id, exist_ok=True)

files = {}
for metric in metrics:
    datas = {}
    for name, group in grouped.groupby("runname"):
        # Curve of this metric for one run, ordered by training step.
        curve = group[["steps", metric]].sort_values(by="steps").set_index("steps")
        datas[name] = {
            # Step count scaled by 2048 * 1024 * 1e-9.
            # NOTE(review): presumably 2048 * 1024 tokens per step, giving
            # billions of tokens on the x axis — confirm against the training config.
            "x": (curve.index * 2048 * 1024 * 1e-9).tolist(),
            "y": curve[metric].tolist(),
            "label": runs_mapping[name],
        }
    # Order the runs by their final value of this metric, highest first.
    datas = dict(sorted(datas.items(), key=lambda item: -item[1]["y"][-1]))
    with open(f"{file_id}/{normalize_runname(metric)}.json", "w") as f:
        json.dump({
            "data": datas,
            "layout": {
                "title": {
                    "text": "Attempting to further globally dedup worsened perf"
                },
            }
        }, f)
    files[metric] = {"file": f"{normalize_runname(metric)}.json"}

# Write the manifest: metric name -> data file, plus settings for the plot.
# NOTE(review): "slider" presumably configures a smoothing control in the
# consumer of these files — confirm against the front-end code.
with open(f"{file_id}/index.json", "w") as f:
    json.dump({
        "files": files,
        "settings": {
            "defaultMetric": "agg_score",
            "slider": {"min": 0, "max": 30, "default": 5}
        }
    }, f)
        
    