diff --git "a/notebooks/v2_analysis.ipynb" "b/notebooks/v2_analysis.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/v2_analysis.ipynb" @@ -0,0 +1,1376 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8745f6ee-a9b0-4f68-9f2c-2e27ea86c2a3", + "metadata": {}, + "source": [ + "# Load Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "20508587-c46c-4645-a3d5-845cd55f1512", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import huggingface_hub\n", + "import datasets\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import matplotlib.patches as mpatches\n", + "\n", + "from datetime import datetime, timezone" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ffbf9842-cf52-4989-9de1-91f108b1b146", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5c4cb1e2-a571-4f8b-98d0-61848c833ac9", + "metadata": {}, + "outputs": [], + "source": [ + "# HF Colours\n", + "colors = [\"#FF9D00\", \"#FFD21E\", \"#32343D\", '#FF323D']" + ] + }, + { + "cell_type": "markdown", + "id": "d37bd88b-f89d-440d-9541-6b6e589376e9", + "metadata": {}, + "source": [ + "# Data Loading and Preprocessing" + ] + }, + { + "cell_type": "markdown", + "id": "e1befb4e-f1a3-4c47-a46f-724660e08f31", + "metadata": {}, + "source": [ + "## Load V2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e398b673-20c7-4a83-8230-221829078cb2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(203, 32)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load the v2 JSONL file\n", + "ds = datasets.load_dataset(\"open-llm-leaderboard/contents_v2\", split=\"train\")\n", + "data_v2 = ds.to_pandas()\n", + "data_v2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "464175a6-0034-4c6d-b7c8-51cb69be9db4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | eval_name | \n", + "Precision | \n", + "Type | \n", + "T | \n", + "Weight type | \n", + "Architecture | \n", + "Model | \n", + "fullname | \n", + "Model sha | \n", + "Average ⬆️ | \n", + "Hub License | \n", + "Hub ❤️ | \n", + "#Params (B) | \n", + "Available on the hub | \n", + "Merged | \n", + "MoE | \n", + "Flagged | \n", + "date | \n", + "Chat Template | \n", + "IFEval Raw | \n", + "IFEval | \n", + "BBH Raw | \n", + "BBH | \n", + "MATH Lvl 5 Raw | \n", + "MATH Lvl 5 | \n", + "GPQA Raw | \n", + "GPQA | \n", + "MUSR Raw | \n", + "MUSR | \n", + "MMLU-PRO Raw | \n", + "MMLU-PRO | \n", + "Maintainer's Highlight | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "upstage_SOLAR-10.7B-v1.0_float16 | \n", + "float16 | \n", + "🟢 pretrained | \n", + "🟢 | \n", + "Original | \n", + "LlamaForCausalLM | \n", + "<a target=\"_blank\" href=\"https://huggingface.c... | \n", + "upstage/SOLAR-10.7B-v1.0 | \n", + "a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa5 | \n", + "17.072003 | \n", + "apache-2.0 | \n", + "248 | \n", + "10 | \n", + "True | \n", + "True | \n", + "True | \n", + "False | \n", + "2024-06-12T12:27:42Z | \n", + "False | \n", + "0.242126 | \n", + "24.212645 | \n", + "0.509387 | \n", + "29.789358 | \n", + "0.021148 | \n", + "2.114804 | \n", + "0.281040 | \n", + "4.138702 | \n", + "0.437156 | \n", + "13.677865 | \n", + "0.340010 | \n", + "26.667775 | \n", + "True | \n", + "
1 | \n", + "upstage_SOLAR-10.7B-Instruct-v1.0_float16 | \n", + "float16 | \n", + "💬 chat models (RLHF, DPO, IFT, ...) | \n", + "💬 | \n", + "Original | \n", + "LlamaForCausalLM | \n", + "<a target=\"_blank\" href=\"https://huggingface.c... | \n", + "upstage/SOLAR-10.7B-Instruct-v1.0 | \n", + "c08c25ed66414a878fe0401a3596d536c083606c | \n", + "19.961989 | \n", + "cc-by-nc-4.0 | \n", + "591 | \n", + "10 | \n", + "True | \n", + "True | \n", + "True | \n", + "False | \n", + "2024-06-12T12:06:58Z | \n", + "True | \n", + "0.473661 | \n", + "47.366100 | \n", + "0.516249 | \n", + "31.872402 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.308725 | \n", + "7.829978 | \n", + "0.389937 | \n", + "6.942188 | \n", + "0.313830 | \n", + "23.758865 | \n", + "True | \n", + "
2 | \n", + "togethercomputer_RedPajama-INCITE-Instruct-3B-... | \n", + "float16 | \n", + "🔶 fine-tuned on domain-specific datasets | \n", + "🔶 | \n", + "Original | \n", + "GPTNeoXForCausalLM | \n", + "<a target=\"_blank\" href=\"https://huggingface.c... | \n", + "togethercomputer/RedPajama-INCITE-Instruct-3B-v1 | \n", + "0c66778ee09a036886741707733620b91057909a | \n", + "5.877290 | \n", + "apache-2.0 | \n", + "91 | \n", + "3 | \n", + "True | \n", + "True | \n", + "True | \n", + "False | \n", + "2024-06-12T12:07:46Z | \n", + "False | \n", + "0.212426 | \n", + "21.242636 | \n", + "0.314602 | \n", + "4.510786 | \n", + "0.006042 | \n", + "0.604230 | \n", + "0.247483 | \n", + "0.000000 | \n", + "0.388604 | \n", + "6.408854 | \n", + "0.110954 | \n", + "1.217125 | \n", + "True | \n", + "
3 | \n", + "togethercomputer_RedPajama-INCITE-Chat-3B-v1_f... | \n", + "float16 | \n", + "🔶 fine-tuned on domain-specific datasets | \n", + "🔶 | \n", + "Original | \n", + "GPTNeoXForCausalLM | \n", + "<a target=\"_blank\" href=\"https://huggingface.c... | \n", + "togethercomputer/RedPajama-INCITE-Chat-3B-v1 | \n", + "f0e0995eba801096ed04cb87931d96a8316871af | \n", + "4.950649 | \n", + "apache-2.0 | \n", + "147 | \n", + "3 | \n", + "True | \n", + "True | \n", + "True | \n", + "False | \n", + "2024-06-13T17:58:59Z | \n", + "False | \n", + "0.165215 | \n", + "16.521496 | \n", + "0.321669 | \n", + "5.164728 | \n", + "0.003021 | \n", + "0.302115 | \n", + "0.244128 | \n", + "0.000000 | \n", + "0.368448 | \n", + "5.089323 | \n", + "0.112699 | \n", + "1.411052 | \n", + "True | \n", + "
4 | \n", + "togethercomputer_RedPajama-INCITE-Base-3B-v1_f... | \n", + "float16 | \n", + "🟢 pretrained | \n", + "🟢 | \n", + "Original | \n", + "GPTNeoXForCausalLM | \n", + "<a target=\"_blank\" href=\"https://huggingface.c... | \n", + "togethercomputer/RedPajama-INCITE-Base-3B-v1 | \n", + "094fbdd0c911feb485ce55de1952ab2e75277e1e | \n", + "5.645099 | \n", + "apache-2.0 | \n", + "90 | \n", + "3 | \n", + "True | \n", + "True | \n", + "True | \n", + "False | \n", + "2024-06-12T12:28:23Z | \n", + "False | \n", + "0.229363 | \n", + "22.936254 | \n", + "0.306040 | \n", + "3.518608 | \n", + "0.009063 | \n", + "0.906344 | \n", + "0.243289 | \n", + "0.000000 | \n", + "0.373875 | \n", + "4.001042 | \n", + "0.111120 | \n", + "1.235594 | \n", + "True | \n", + "
\n", + " | eval_name | \n", + "Precision | \n", + "Type | \n", + "T | \n", + "Weight type | \n", + "Architecture | \n", + "Model | \n", + "fullname | \n", + "Model sha | \n", + "Average ⬆️ | \n", + "Hub License | \n", + "Hub ❤️ | \n", + "#Params (B) | \n", + "Available on the hub | \n", + "Merged | \n", + "MoE | \n", + "Flagged | \n", + "date | \n", + "Chat Template | \n", + "IFEval Raw | \n", + "IFEval | \n", + "BBH Raw | \n", + "BBH | \n", + "MATH Lvl 5 Raw | \n", + "MATH Lvl 5 | \n", + "GPQA Raw | \n", + "GPQA | \n", + "MUSR Raw | \n", + "MUSR | \n", + "MMLU-PRO Raw | \n", + "MMLU-PRO | \n", + "Maintainer's Highlight | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "upstage_SOLAR-10.7B-v1.0_float16 | \n", + "float16 | \n", + "🟢 pretrained | \n", + "🟢 | \n", + "Original | \n", + "LlamaForCausalLM | \n", + "<a target=\"_blank\" href=\"https://huggingface.c... | \n", + "upstage/SOLAR-10.7B-v1.0 | \n", + "a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa5 | \n", + "17.072003 | \n", + "apache-2.0 | \n", + "248 | \n", + "10 | \n", + "True | \n", + "True | \n", + "True | \n", + "False | \n", + "2023-12-12 14:57:41+00:00 | \n", + "False | \n", + "0.242126 | \n", + "24.212645 | \n", + "0.509387 | \n", + "29.789358 | \n", + "0.021148 | \n", + "2.114804 | \n", + "0.281040 | \n", + "4.138702 | \n", + "0.437156 | \n", + "13.677865 | \n", + "0.34001 | \n", + "26.667775 | \n", + "True | \n", + "
1 | \n", + "upstage_SOLAR-10.7B-Instruct-v1.0_float16 | \n", + "float16 | \n", + "💬 chat models (RLHF, DPO, IFT, ...) | \n", + "💬 | \n", + "Original | \n", + "LlamaForCausalLM | \n", + "<a target=\"_blank\" href=\"https://huggingface.c... | \n", + "upstage/SOLAR-10.7B-Instruct-v1.0 | \n", + "c08c25ed66414a878fe0401a3596d536c083606c | \n", + "19.961989 | \n", + "cc-by-nc-4.0 | \n", + "591 | \n", + "10 | \n", + "True | \n", + "True | \n", + "True | \n", + "False | \n", + "2023-12-12 12:39:22+00:00 | \n", + "True | \n", + "0.473661 | \n", + "47.366100 | \n", + "0.516249 | \n", + "31.872402 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.308725 | \n", + "7.829978 | \n", + "0.389937 | \n", + "6.942188 | \n", + "0.31383 | \n", + "23.758865 | \n", + "True | \n", + "
\n", + " | Type | \n", + "IFEval | \n", + "MATH Lvl 5 | \n", + "Average ⬆️ | \n", + "
---|---|---|---|---|
0 | \n", + "Chat Models | \n", + "47.014196 | \n", + "4.729041 | \n", + "19.054433 | \n", + "
1 | \n", + "Fine-tuned Models | \n", + "26.169263 | \n", + "2.531290 | \n", + "12.222311 | \n", + "
2 | \n", + "Pretrained Models | \n", + "21.958824 | \n", + "3.236317 | \n", + "11.288183 | \n", + "
\n", + " | eval_name | \n", + "median_score | \n", + "IFEval | \n", + "BBH | \n", + "MATH Lvl 5 | \n", + "GPQA | \n", + "MUSR | \n", + "MMLU-PRO | \n", + "
---|---|---|---|---|---|---|---|---|
124 | \n", + "Qwen_Qwen2-72B-Instruct_bfloat16 | \n", + "42.022044 | \n", + "79.891687 | \n", + "57.483009 | \n", + "35.120846 | \n", + "16.331096 | \n", + "17.167969 | \n", + "48.923242 | \n", + "
66 | \n", + "meta-llama_Meta-Llama-3-70B-Instruct_bfloat16 | \n", + "35.041118 | \n", + "80.990771 | \n", + "50.185133 | \n", + "23.338369 | \n", + "4.921700 | \n", + "10.920573 | \n", + "46.743868 | \n", + "
62 | \n", + "microsoft_Orca-2-13b_bfloat16 | \n", + "22.612409 | \n", + "31.279339 | \n", + "27.308019 | \n", + "0.981873 | \n", + "4.026846 | \n", + "25.787760 | \n", + "19.437057 | \n", + "
67 | \n", + "meta-llama_Meta-Llama-3-70B_bfloat16 | \n", + "18.113793 | \n", + "16.031906 | \n", + "48.709813 | \n", + "16.540785 | \n", + "19.686801 | \n", + "16.011198 | \n", + "41.212323 | \n", + "