{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "138889b92720ce2e", "metadata": { "ExecuteTime": { "end_time": "2024-05-14T09:02:09.162993Z", "start_time": "2024-05-14T09:02:09.134625Z" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
runnameseedstepsagg_scorecommonsense_qa/acccommonsense_qa/acc_normhellaswag/acchellaswag/acc_normopenbookqa/accopenbookqa/acc_norm...siqa/accsiqa/acc_normwinogrande/accwinogrande/acc_normsciq/accsciq/acc_normarc/accarc/acc_normmmlu/accmmlu/acc_norm
0big-run-sampled-fineweb-c4-filters600.3308930.1860.2330.2720.2580.1660.286...0.3670.3620.5160.4970.2080.2020.21950.25100.2302940.250147
1big-run-sampled-fineweb-c4-filters610000.3593030.2500.2630.2930.2850.1400.276...0.3760.4010.4970.4790.5940.5240.27400.29850.2416170.251920
2big-run-sampled-fineweb-c4-filters620000.3753930.2680.2770.3190.3240.1500.274...0.3720.4110.5070.4840.6880.6060.30150.32700.2465770.259146
3big-run-sampled-fineweb-c4-filters630000.3896550.3030.3050.3240.3580.1520.280...0.3830.3890.5200.5060.7410.6470.33950.34050.2550010.268740
4big-run-sampled-fineweb-c4-filters640000.4011950.3090.3100.3530.3930.1380.288...0.3780.4020.5340.5110.7660.6520.33950.34950.2562030.269056
..................................................................
667big-run-sampled_full_filtered_no_dedup61630000.4662550.4260.3720.4690.5550.2420.354...0.3890.3940.5630.5440.8690.8080.44600.44350.2971250.317543
668big-run-sampled_full_filtered_no_dedup61640000.4697430.4310.3760.4670.5560.2320.356...0.3910.3970.5680.5520.8610.8000.44500.45150.3027060.318447
669big-run-sampled_full_filtered_no_dedup61650000.4698470.4260.3750.4720.5490.2340.364...0.3890.4010.5620.5480.8670.7950.44350.44750.2975860.319279
670big-run-sampled_full_filtered_no_dedup61660000.4676510.4230.3650.4700.5550.2260.356...0.3920.3990.5640.5450.8720.8120.43650.44750.2972560.319704
671big-run-sampled_full_filtered_no_dedup61670000.4696520.4160.3730.4690.5600.2340.356...0.3920.3940.5650.5570.8670.8030.44300.44550.2974090.317717
\n", "

672 rows × 22 columns

\n", "
" ], "text/plain": [ " runname seed steps agg_score \\\n", "0 big-run-sampled-fineweb-c4-filters 6 0 0.330893 \n", "1 big-run-sampled-fineweb-c4-filters 6 1000 0.359303 \n", "2 big-run-sampled-fineweb-c4-filters 6 2000 0.375393 \n", "3 big-run-sampled-fineweb-c4-filters 6 3000 0.389655 \n", "4 big-run-sampled-fineweb-c4-filters 6 4000 0.401195 \n", ".. ... ... ... ... \n", "667 big-run-sampled_full_filtered_no_dedup 6 163000 0.466255 \n", "668 big-run-sampled_full_filtered_no_dedup 6 164000 0.469743 \n", "669 big-run-sampled_full_filtered_no_dedup 6 165000 0.469847 \n", "670 big-run-sampled_full_filtered_no_dedup 6 166000 0.467651 \n", "671 big-run-sampled_full_filtered_no_dedup 6 167000 0.469652 \n", "\n", " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n", "0 0.186 0.233 0.272 \n", "1 0.250 0.263 0.293 \n", "2 0.268 0.277 0.319 \n", "3 0.303 0.305 0.324 \n", "4 0.309 0.310 0.353 \n", ".. ... ... ... \n", "667 0.426 0.372 0.469 \n", "668 0.431 0.376 0.467 \n", "669 0.426 0.375 0.472 \n", "670 0.423 0.365 0.470 \n", "671 0.416 0.373 0.469 \n", "\n", " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n", "0 0.258 0.166 0.286 ... 0.367 \n", "1 0.285 0.140 0.276 ... 0.376 \n", "2 0.324 0.150 0.274 ... 0.372 \n", "3 0.358 0.152 0.280 ... 0.383 \n", "4 0.393 0.138 0.288 ... 0.378 \n", ".. ... ... ... ... ... \n", "667 0.555 0.242 0.354 ... 0.389 \n", "668 0.556 0.232 0.356 ... 0.391 \n", "669 0.549 0.234 0.364 ... 0.389 \n", "670 0.555 0.226 0.356 ... 0.392 \n", "671 0.560 0.234 0.356 ... 0.392 \n", "\n", " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n", "0 0.362 0.516 0.497 0.208 \n", "1 0.401 0.497 0.479 0.594 \n", "2 0.411 0.507 0.484 0.688 \n", "3 0.389 0.520 0.506 0.741 \n", "4 0.402 0.534 0.511 0.766 \n", ".. ... ... ... ... 
\n", "667 0.394 0.563 0.544 0.869 \n", "668 0.397 0.568 0.552 0.861 \n", "669 0.401 0.562 0.548 0.867 \n", "670 0.399 0.564 0.545 0.872 \n", "671 0.394 0.565 0.557 0.867 \n", "\n", " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n", "0 0.202 0.2195 0.2510 0.230294 0.250147 \n", "1 0.524 0.2740 0.2985 0.241617 0.251920 \n", "2 0.606 0.3015 0.3270 0.246577 0.259146 \n", "3 0.647 0.3395 0.3405 0.255001 0.268740 \n", "4 0.652 0.3395 0.3495 0.256203 0.269056 \n", ".. ... ... ... ... ... \n", "667 0.808 0.4460 0.4435 0.297125 0.317543 \n", "668 0.800 0.4450 0.4515 0.302706 0.318447 \n", "669 0.795 0.4435 0.4475 0.297586 0.319279 \n", "670 0.812 0.4365 0.4475 0.297256 0.319704 \n", "671 0.803 0.4430 0.4455 0.297409 0.317717 \n", "\n", "[672 rows x 22 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "from matplotlib.figure import Figure\n",
    "\n",
    "df = pd.read_csv(\"../src_data/all-filters-big-runs.csv\")\n",
    "df"
   ] },
  { "cell_type": "code", "execution_count": 2, "id": "839a06a71d9183e5", "metadata": { "ExecuteTime": { "end_time": "2024-05-14T09:02:10.094329Z", "start_time": "2024-05-14T09:02:10.081683Z" } }, "outputs": [ { "data": { "text/plain": [ "['big-run-sampled-fineweb-c4-filters',\n", " 'big-run-sampled_full_ind_minhash',\n", " 'big-run-fineweb-v1-all-dumps',\n", " 'big-run-sampled_full_filtered_no_dedup']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "pd.unique(df[\"runname\"]).tolist()"
   ] },
  { "cell_type": "code", "execution_count": 3, "id": "b610f43caefdf01", "metadata": { "ExecuteTime": { "end_time": "2024-05-14T09:03:06.294766Z", "start_time": "2024-05-14T09:03:06.291388Z" }, "collapsed": false }, "outputs": [],
   "source": [
    "# Display label for each run; the export cell looks every run up here,\n",
    "# so a run present in the CSV but missing from this mapping raises KeyError.\n",
    "runs_mapping = {\n",
    "    # \"big-run-refinedweb\": \"RefinedWeb\",\n",
    "    # \"big-run-c4\": \"C4\",\n",
    "    \"big-run-sampled_full_filtered_no_dedup\": \"FineWeb: base filtering only\",\n",
    "    \"big-run-sampled_full_ind_minhash\": \"FineWeb: independent MinHash (id mh)\",\n",
    "    \"big-run-sampled-fineweb-c4-filters\": \"FineWeb: id mh + C4 filters\",\n",
    "    \"big-run-fineweb-v1-all-dumps\": \"FineWeb: id mh + C4 + custom filters\",\n",
    "}"
   ] },
  { "cell_type": "code", "execution_count": null, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2024-05-14T09:03:08.298110Z", "start_time": "2024-05-14T09:03:08.024839Z" }, "collapsed": true }, "outputs": [],
   "source": [
    "# Metrics to export; one JSON file is written per entry.\n",
    "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
    "           'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
    "\n",
    "# Factor applied to a step count before scaling by 1e-9 for the x axis.\n",
    "# NOTE(review): presumably sequence length (2048) x batch size (1024), i.e. tokens per step,\n",
    "# which would make the x axis billions of tokens -- confirm against the training config.\n",
    "TOKENS_PER_STEP = 2048 * 1024\n",
    "\n",
    "# Folder that receives the per-metric JSON files and index.json.\n",
    "file_id = \"../assets/data/plots/all_filtering_steps\"\n",
    "\n",
    "\n",
    "def normalize_runname(runname):\n",
    "    \"\"\"Return `runname` with every '/' replaced by '_' so it can serve as a file name.\n",
    "\n",
    "    Despite its name, this notebook applies it to metric names\n",
    "    (e.g. 'arc/acc_norm' -> 'arc_acc_norm').\n",
    "    \"\"\"\n",
    "    return runname.replace(\"/\", \"_\")\n",
    "\n",
    "\n",
    "# Mean of every metric per (run, step); rows sharing a run and a step are averaged together.\n",
    "grouped = df.groupby([\"runname\", \"steps\"])[metrics].mean().reset_index()\n",
    "\n",
    "# The output folder does not depend on the metric, so create it once before the loop.\n",
    "os.makedirs(file_id, exist_ok=True)\n",
    "\n",
    "files = {}\n",
    "for metric in metrics:\n",
    "    datas = {}\n",
    "    for name, group in grouped.groupby(\"runname\"):\n",
    "        series = group.sort_values(by=\"steps\").set_index(\"steps\")[metric]\n",
    "        datas[name] = {\n",
    "            \"x\": (series.index * TOKENS_PER_STEP * 1e-9).tolist(),\n",
    "            \"y\": series.tolist(),\n",
    "            \"label\": runs_mapping[name],\n",
    "        }\n",
    "    # Order the runs by their final value of this metric, highest first.\n",
    "    datas = dict(sorted(datas.items(), key=lambda item: -item[1][\"y\"][-1]))\n",
    "    with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
    "        json.dump({\n",
    "            \"data\": datas,\n",
    "            \"layout\": {\n",
    "                \"title\": {\n",
    "                    \"text\": \"The different FineWeb processing steps\"\n",
    "                },\n",
    "            }\n",
    "        }, f)\n",
    "    files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
    "\n",
    "# Write the index that lists every per-metric file together with the viewer settings.\n",
    "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
    "    json.dump({\n",
    "        \"files\": files,\n",
    "        \"settings\": {\n",
    "            \"defaultMetric\": \"agg_score\",\n",
    "            \"slider\": {\"min\": 0, \"max\": 30, \"default\": 5}\n",
    "        }\n",
    "    }, f)"
   ] }
 ],
 "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }