{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "523ddb4b-4cf7-4c83-98c7-e46ea8df4e5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Third-party imports for the whole notebook, kept in a single cell at the top\n",
    "# so a fresh kernel fails fast if a dependency (e.g. matplotlib) is missing.\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc6eb3bd-f5f1-4995-9f42-c86d9de320dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Benchmark names, in the order the scores appear in every row below.\n",
    "BENCHMARKS = ['ARC', 'HellaSwag', 'MMLU', 'TruthfulQA', 'Winogrande', 'GSM8K']\n",
    "\n",
    "# One list of scores per model, aligned position-by-position with BENCHMARKS.\n",
    "DPO_v4 = [62.54, 79.73, 68.08, 53.94, 75.61, 71.04]\n",
    "DPO_v3 = [62.46, 79.5, 68.21, 53.27, 75.93, 70.81]\n",
    "DPO_v2 = [62.63, 79.2, 68.33, 53.29, 75.37, 70.58]\n",
    "DPO_v1 = [61.52, 79.06, 67.09, 51.85, 74.66, 69.29]\n",
    "# NOTE(review): presumably the baseline model the DPO runs are compared against - confirm.\n",
    "meta = [60.75, 78.55, 67.07, 51.65, 74.51, 68.69]\n",
    "\n",
    "# Display labels for the models, in the same order the rows are stacked\n",
    "# into the DataFrame in the next cell.\n",
    "MODEL_NAMES = ['DPO_v4', 'DPO_v3', 'DPO_v2', 'DPO_v1', 'meta']\n",
    "\n",
    "# Planned figures:\n",
    "#   Scatter / bar -> AVG score of each model (score vs. category name)\n",
    "#   Line          -> task scores for each model (score vs. epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86bfc73e-2f02-4fa4-a2dd-24d998c1123b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Build the score table: one row per model (MODEL_NAMES order), one column\n",
    "# per benchmark, plus an AVG column holding each model's mean score.\n",
    "df = pd.DataFrame([DPO_v4, DPO_v3, DPO_v2, DPO_v1, meta], columns=BENCHMARKS)\n",
    "\n",
    "# Average over the benchmark columns only, so re-deriving the mean later\n",
    "# never folds the AVG column into itself.\n",
    "df['AVG'] = df[BENCHMARKS].mean(axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89874d9c-2544-439a-a0ef-85acb74e4cc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: per-model mean across the benchmark columns (AVG excluded).\n",
    "# The values should match the AVG column shown above.\n",
    "df[BENCHMARKS].mean(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d5278ea-0254-4881-82a3-360b551a9a1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plot for AVG scores.\n",
    "# x needs one label per model (one per row of df). Using the benchmark\n",
    "# column names here would give 6 x-values for 5 averages, and matplotlib\n",
    "# rejects x and y of different sizes.\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.scatter(MODEL_NAMES, df['AVG'], color='blue', label='Average Score')\n",
    "plt.xlabel('Model Name')\n",
    "plt.ylabel('Average Score')\n",
    "plt.title('Average Scores of Models')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.savefig('avg_scores_scatter.png')  # Save the plot as PNG\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}