{ "cells": [ { "cell_type": "code", "execution_count": 11, "id": "dc6eb3bd-f5f1-4995-9f42-c86d9de320dc", "metadata": {}, "outputs": [], "source": [ "# Score order in each list: ARC, HellaSwag, MMLU, TruthfulQA, Winogrande, GSM8K\n", "DPO_v4 = [62.54, 79.73, 68.08, 53.94, 75.61, 71.04]\n", "DPO_v3 = [62.46, 79.5, 68.21, 53.27, 75.93, 70.81]\n", "DPO_v2 = [62.63, 79.2, 68.33, 53.29, 75.37, 70.58]\n", "DPO_v1 = [61.52, 79.06, 67.09, 51.85, 74.66, 69.29]\n", "meta = [60.75,78.55,67.07,51.65,74.51,68.69]\n", "\n", "# Scatter / bar chart -> average score of each model (score vs. category name)\n", "# Line chart -> per-task scores for each model (score vs. epoch)" ] }, { "cell_type": "code", "execution_count": 14, "id": "523ddb4b-4cf7-4c83-98c7-e46ea8df4e5d", "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'matplotlib'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyplot\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'matplotlib'" ] } ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 13, "id": "86bfc73e-2f02-4fa4-a2dd-24d998c1123b", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | ARC | \n", "HellaSwag | \n", "MMLU | \n", "TruthfulQA | \n", "Winogrande | \n", "GSM8K | \n", "AVG | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "62.54 | \n", "79.73 | \n", "68.08 | \n", "53.94 | \n", "75.61 | \n", "71.04 | \n", "68.490000 | \n", "
1 | \n", "62.46 | \n", "79.50 | \n", "68.21 | \n", "53.27 | \n", "75.93 | \n", "70.81 | \n", "68.363333 | \n", "
2 | \n", "62.63 | \n", "79.20 | \n", "68.33 | \n", "53.29 | \n", "75.37 | \n", "70.58 | \n", "68.233333 | \n", "
3 | \n", "61.52 | \n", "79.06 | \n", "67.09 | \n", "51.85 | \n", "74.66 | \n", "69.29 | \n", "67.245000 | \n", "
4 | \n", "60.75 | \n", "78.55 | \n", "67.07 | \n", "51.65 | \n", "74.51 | \n", "68.69 | \n", "66.870000 | \n", "