Adding llama_3.2 and updating links

Files changed:

- static/leaderboard.csv +29 -27
- static/models_data/Mistral-7B-Instruct-v0.1/model_detail.html +3 -3
- static/models_data/Mistral-7B-Instruct-v0.2/model_detail.html +3 -3
- static/models_data/Mistral-7B-Instruct-v0.3/model_detail.html +2 -2
- static/models_data/Mistral-Large-Instruct-2407/model_detail.html +3 -3
- static/models_data/Mistral-Small-Instruct-2409/model_detail.html +2 -2
- static/models_data/Mixtral-8x22B-Instruct-v0.1/model_detail.html +3 -3
- static/models_data/Mixtral-8x7B-Instruct-v0.1/model_detail.html +3 -3
- static/models_data/Qwen2-72B-Instruct/model_detail.html +3 -3
- static/models_data/Qwen2-7B-Instruct/model_detail.html +3 -3
- static/models_data/Qwen2.5-0.5B-Instruct/model_detail.html +3 -3
- static/models_data/Qwen2.5-32B-Instruct/model_detail.html +3 -3
- static/models_data/Qwen2.5-72B-Instruct/model_detail.html +3 -3
- static/models_data/Qwen2.5-7B-Instruct/model_detail.html +3 -3
- static/models_data/cardinal.svg +331 -249
- static/models_data/command_r_plus/model_detail.html +2 -2
- static/models_data/gpt-3.5-turbo-0125/model_detail.html +2 -2
- static/models_data/gpt-4o-0513/model_detail.html +2 -2
- static/models_data/gpt-4o-mini-2024-07-18/model_detail.html +2 -2
- static/models_data/llama_3.1_405b_instruct_4bit/model_detail.html +5 -5
- static/models_data/llama_3.1_70b_instruct/model_detail.html +3 -3
- static/models_data/llama_3.1_8b_instruct/model_detail.html +3 -3
- static/models_data/llama_3.2_1b_instruct/cfa_metrics.csv +10 -0
- static/models_data/llama_3.2_1b_instruct/matrix.svg +1959 -0
- static/models_data/llama_3.2_1b_instruct/ranks.svg +0 -0
- static/models_data/llama_3.2_1b_instruct/structure.svg +0 -0
- static/models_data/llama_3.2_3b_instruct/cfa_metrics.csv +10 -0
- static/models_data/llama_3.2_3b_instruct/matrix.svg +1911 -0
- static/models_data/llama_3.2_3b_instruct/ranks.svg +0 -0
- static/models_data/llama_3.2_3b_instruct/structure.svg +0 -0
- static/models_data/llama_3_70b_instruct/model_detail.html +3 -3
- static/models_data/llama_3_8b_instruct/model_detail.html +3 -3
- static/models_data/ordinal.svg +332 -250
- static/models_data/phi-3-medium-128k-instruct/model_detail.html +3 -3
- static/models_data/phi-3-mini-128k-instruct/model_detail.html +2 -2
- static/models_data/phi-3.5-MoE-instruct/cfa_metrics.csv +9 -9
- static/models_data/phi-3.5-MoE-instruct/matrix.svg +356 -395
- static/models_data/phi-3.5-MoE-instruct/model_detail.html +2 -2
- static/models_data/phi-3.5-MoE-instruct/ranks.svg +0 -0
- static/models_data/phi-3.5-MoE-instruct/structure.svg +0 -0
- static/models_data/phi-3.5-mini-instruct/cfa_metrics.csv +9 -9
- static/models_data/phi-3.5-mini-instruct/matrix.svg +364 -370
- static/models_data/phi-3.5-mini-instruct/model_detail.html +2 -2
- static/models_data/phi-3.5-mini-instruct/ranks.svg +0 -0
- static/models_data/phi-3.5-mini-instruct/structure.svg +0 -0
- templates/about.html +11 -11
- templates/index.html +5 -5
- templates/model_detail.html +1 -1
- templates/new_model.html +4 -4
static/leaderboard.csv
CHANGED

```diff
@@ -1,28 +1,30 @@
 Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Stress,CFI,SRMR,RMSEA
-phi-3-mini-128k-instruct,0.…
-phi-3-medium-128k-instruct,0.…
-phi-3.5-mini-instruct,0.…
-phi-3.5-MoE-instruct,0.…
-Mistral-7B-Instruct-v0.1,0.…
-Mistral-7B-Instruct-v0.2,0.…
-Mistral-7B-Instruct-v0.3,0.…
-Mixtral-8x7B-Instruct-v0.1,0.…
-Mixtral-8x22B-Instruct-v0.1,0.…
-command_r_plus,0.…
-llama_3_8b_instruct,0.…
-llama_3_70b_instruct,0.…
-llama_3.1_8b_instruct,0.…
-llama_3.1_70b_instruct,0.…
-llama_3.1_405b_instruct_4bit,0.…
-…
-…
-Qwen2…
-Qwen2…
-Qwen2.5-…
-Qwen2.5-…
-…
-…
-gpt-…
-…
-…
-…
+phi-3-mini-128k-instruct,0.34424603174603174,0.32984992817164005,0.039299993295009855,0.281800547806919,0.5861361111111111,0.42524166666666674,0.3974944444444444
+phi-3-medium-128k-instruct,0.3516865079365079,0.30802986933853177,0.09692037989916814,0.2651981204439735,0.43025555555555556,0.5503277777777777,0.5381722222222222
+phi-3.5-mini-instruct,0.25744047619047616,0.2680653144619754,0.0361229186530762,0.28422749224983457,0.40715555555555555,0.5721138888888888,0.5507833333333333
+phi-3.5-MoE-instruct,0.41617063492063494,0.36128192067041315,0.10985291697837646,0.2739229692168671,0.5530944444444444,0.4248777777777778,0.40345
+Mistral-7B-Instruct-v0.1,0.23214285714285715,0.26609566354811315,0.027216280472015988,0.2829498135031582,0.38917777777777773,0.5561138888888888,0.530213888888889
+Mistral-7B-Instruct-v0.2,0.36904761904761907,0.32133832899241477,0.14417876497818388,0.265188983528973,0.3802722222222222,0.5727305555555555,0.5483611111111111
+Mistral-7B-Instruct-v0.3,0.27132936507936506,0.26572479479146804,0.07960539866974455,0.2742399030139009,0.31385,0.6241,0.6081333333333333
+Mixtral-8x7B-Instruct-v0.1,0.4667658730158731,0.3819009850972602,0.21473356319081474,0.2624402608740656,0.45275,0.5034666666666667,0.4905694444444444
+Mixtral-8x22B-Instruct-v0.1,0.3625992063492063,0.31529864972153404,0.1414001940345544,0.2548838005881672,0.3772361111111111,0.5810888888888889,0.5844750000000001
+command_r_plus,0.5922619047619047,0.4995356672762356,0.3429686514651868,0.23811982320641845,0.6033000000000001,0.3740166666666668,0.3667527777777777
+llama_3_8b_instruct,0.5153769841269842,0.4295836112681494,0.24527785038654715,0.245806400289881,0.5498222222222222,0.42656388888888896,0.42189444444444446
+llama_3_70b_instruct,0.7876984126984127,0.6839540364836003,0.607020698814379,0.18525883672204868,0.7210055555555557,0.2346083333333333,0.25758888888888887
+llama_3.1_8b_instruct,0.5773809523809523,0.4786874422110324,0.4295080949846363,0.22060228669473025,0.4305722222222223,0.5455027777777777,0.553
+llama_3.1_70b_instruct,0.8253968253968255,0.7172545013390067,0.691365862744007,0.1709718847084183,0.6979472222222223,0.2636777777777777,0.2907250000000001
+llama_3.1_405b_instruct_4bit,0.7405753968253967,0.6490864350383405,0.7232098126552619,0.1702199925365422,0.4875722222222223,0.4963444444444445,0.5211555555555556
+llama_3.2_1b_instruct,0.22718253968253965,0.2522036562381785,0.027192115495770382,0.29255310096654275,0.37450000000000006,0.5990222222222223,0.5740638888888888
+llama_3.2_3b_instruct,0.4221230158730159,0.3615804465210719,0.13450325180647235,0.27485276839064654,0.5017,0.44956666666666667,0.4226500000000001
+Qwen2-7B-Instruct,0.42757936507936506,0.36370005127542027,0.25108519506513916,0.25776537005719313,0.3560861111111111,0.6009722222222222,0.5920888888888889
+Qwen2-72B-Instruct,0.5823412698412699,0.5461212335522644,0.6465993243020925,0.20297742879025626,0.3045,0.6543138888888889,0.6646361111111111
+Qwen2.5-0.5B-Instruct,0.30406746031746035,0.3005554090516966,0.002970456550606876,0.2928913315666324,0.5371250000000001,0.44709722222222226,0.404575
+Qwen2.5-7B-Instruct,0.632440476190476,0.5163098181421168,0.333554494486959,0.2505866550331236,0.6473694444444444,0.30400277777777773,0.29651944444444434
+Qwen2.5-32B-Instruct,0.7395833333333334,0.656917654644944,0.6724190751477237,0.1806656189868978,0.5603222222222223,0.40237500000000004,0.41161666666666663
+Qwen2.5-72B-Instruct,0.8298611111111112,0.7104489147495714,0.6974116787371809,0.16176650806326276,0.6734583333333333,0.2993,0.3184472222222223
+gpt-3.5-turbo-0125,0.26190476190476186,0.28218378886707396,0.08240359836763214,0.28728574920060357,0.3873055555555555,0.599925,0.572238888888889
+gpt-4o-0513,0.6944444444444444,0.5989532974661671,0.5122163952167618,0.19201420113771173,0.6235416666666667,0.34458611111111115,0.3441805555555555
+gpt-4o-mini-2024-07-18,0.3968253968253968,0.3418785071827972,0.13575309046266867,0.2707065266105181,0.44214722222222214,0.5004583333333332,0.47896666666666665
+Mistral-Large-Instruct-2407,0.8501984126984127,0.7374229691535793,0.7644582301049158,0.16944638941325085,0.6510750000000001,0.31028611111111104,0.3297916666666667
+Mistral-Small-Instruct-2409,0.7842261904761906,0.6890378862258165,0.6416815833333804,0.1894343546381,0.6840472222222221,0.2601583333333335,0.2888777777777778
+dummy,0.1929563492063492,0.2291015386716794,-0.009004148398032956,0.2928877637010999,0.3755222222222222,0.622275,0.5915305555555557
```
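The leaderboard ranking follows directly from this CSV: sort the rows by the chosen column, best first (higher is better for win rate and score). A minimal sketch, using a subset of rows copied from the updated file:

```python
import csv
import io

# A subset of rows from the updated static/leaderboard.csv
LEADERBOARD = """\
Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Stress,CFI,SRMR,RMSEA
llama_3.2_1b_instruct,0.22718253968253965,0.2522036562381785,0.027192115495770382,0.29255310096654275,0.37450000000000006,0.5990222222222223,0.5740638888888888
llama_3.2_3b_instruct,0.4221230158730159,0.3615804465210719,0.13450325180647235,0.27485276839064654,0.5017,0.44956666666666667,0.4226500000000001
Mistral-Large-Instruct-2407,0.8501984126984127,0.7374229691535793,0.7644582301049158,0.16944638941325085,0.6510750000000001,0.31028611111111104,0.3297916666666667
Qwen2.5-72B-Instruct,0.8298611111111112,0.7104489147495714,0.6974116787371809,0.16176650806326276,0.6734583333333333,0.2993,0.3184472222222223
"""

def rank_by(rows, column):
    """Sort leaderboard rows by a numeric column, best (highest) first."""
    return sorted(rows, key=lambda r: float(r[column]), reverse=True)

rows = list(csv.DictReader(io.StringIO(LEADERBOARD)))
best = rank_by(rows, "Ordinal (Win rate)")[0]["Model"]
print(best)  # Mistral-Large-Instruct-2407
```

Among these rows, Mistral-Large-Instruct-2407 leads on ordinal win rate, with Qwen2.5-72B-Instruct second; the new llama_3.2 models sit near the bottom of the table.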
static/models_data/Mistral-7B-Instruct-v0.1/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
-You can find the release blog post <a href="https://mistral.ai/news/announcing-mistral-7b/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1</a>.
+This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI<a>.
+You can find the release blog post <a target="_blank" href="https://mistral.ai/news/announcing-mistral-7b/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1</a>.
 The model has 7.3B parameters, and supports up to 8K token contexts.
 </p>
```
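The link changes in this and the following model_detail.html diffs all apply one pattern: `target="_blank"` is inserted into each `<a href=...>` anchor. A hypothetical helper that would apply the same rewrite (this is a sketch for illustration, not the script actually used for this commit); because the regex only matches anchors without the attribute, re-running it leaves already-updated files unchanged:

```python
import re

def open_links_in_new_tab(html: str) -> str:
    """Insert target="_blank" into <a href=...> anchors that lack it."""
    return re.sub(r'<a href=', '<a target="_blank" href=', html)

before = 'created by <a href="https://mistral.ai/">Mistral AI</a>.'
after = open_links_in_new_tab(before)
print(after)  # created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.

# Idempotent: updated anchors no longer contain the literal '<a href=' pattern.
assert open_links_in_new_tab(after) == after
```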
static/models_data/Mistral-7B-Instruct-v0.2/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
-You can find the release blog post <a href="https://mistral.ai/news/la-plateforme/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2</a>.
+This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI<a>.
+You can find the release blog post <a target="_blank" href="https://mistral.ai/news/la-plateforme/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2</a>.
 The model has 7.3B parameters, and supports up to 8K token contexts.
 </p>
```
static/models_data/Mistral-7B-Instruct-v0.3/model_detail.html
CHANGED

```diff
@@ -1,5 +1,5 @@
 <p>
-This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3</a>.
+This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI<a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3</a>.
 The model has 7.3B parameters, and supports up to 8K token contexts.
 </p>
```
static/models_data/Mistral-Large-Instruct-2407/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
-You can find the release blog post <a href="https://mistral.ai/news/mistral-large-2407/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-Large-Instruct-2407">https://huggingface.co/mistralai/Mistral-Large-Instruct-2407</a>.
+This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI<a>.
+You can find the release blog post <a target="_blank" href="https://mistral.ai/news/mistral-large-2407/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-Large-Instruct-2407">https://huggingface.co/mistralai/Mistral-Large-Instruct-2407</a>.
 The 123B model supports up to 128K token context windows.
 </p>
```
static/models_data/Mistral-Small-Instruct-2409/model_detail.html
CHANGED

```diff
@@ -1,5 +1,5 @@
 <p>
-This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-Small-Instruct-2409">https://huggingface.co/mistralai/Mistral-Small-Instruct-2409</a>.
+This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI<a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-Small-Instruct-2409">https://huggingface.co/mistralai/Mistral-Small-Instruct-2409</a>.
 The 22B model supports up to 32K token sequences.
 </p>
```
static/models_data/Mixtral-8x22B-Instruct-v0.1/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
-You can find the release blog post <a href="https://mistral.ai/news/mixtral-8x22b/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1</a>.
+This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI<a>.
+You can find the release blog post <a target="_blank" href="https://mistral.ai/news/mixtral-8x22b/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1</a>.
 The model has 141B total and 39B active parameters. It supports up to 64K token contexts.
 </p>
```
static/models_data/Mixtral-8x7B-Instruct-v0.1/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
-You can find the release blog post <a href="https://mistral.ai/news/mixtral-of-experts/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1</a>.
+This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI<a>.
+You can find the release blog post <a target="_blank" href="https://mistral.ai/news/mixtral-of-experts/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1</a>.
 The model has 46.7B total and 12.9B active parameters. It supports up to 32K token contexts.
 </p>
```
static/models_data/Qwen2-72B-Instruct/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
-You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2-72B-Instruct">https://huggingface.co/Qwen/Qwen2-72B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
+You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2-72B-Instruct">https://huggingface.co/Qwen/Qwen2-72B-Instruct</a>.
 The 72B model was pretrained on 29 different languages, and supports up to 128K tokens.
 </p>
```
static/models_data/Qwen2-7B-Instruct/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
-You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2-7B-Instruct">https://huggingface.co/Qwen/Qwen2-7B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
+You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2-7B-Instruct">https://huggingface.co/Qwen/Qwen2-7B-Instruct</a>.
 The 7B model was pretrained on 29 different languages, and supports up to 128K tokens.
 </p>
```
static/models_data/Qwen2.5-0.5B-Instruct/model_detail.html
CHANGED

```diff
@@ -1,7 +1,7 @@
 <p>
-This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
-You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct">https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
+You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct">https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct</a>.
 The 0.5B model was pretrained on 18 trillion tokens spanning 29 languages.
 It supports up to 128K tokens and can generate up to 8K tokens.
 </p>
```
static/models_data/Qwen2.5-32B-Instruct/model_detail.html
CHANGED

```diff
@@ -1,7 +1,7 @@
 <p>
-This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
-You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-32B-Instruct">https://huggingface.co/Qwen/Qwen2.5-32B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
+You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-32B-Instruct">https://huggingface.co/Qwen/Qwen2.5-32B-Instruct</a>.
 The 32B model was pretrained on 18 trillion tokens spanning 29 languages.
 It supports up to 128K tokens and can generate up to 8K tokens.
 </p>
```
static/models_data/Qwen2.5-72B-Instruct/model_detail.html
CHANGED

```diff
@@ -1,7 +1,7 @@
 <p>
-This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
-You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-72B-Instruct">https://huggingface.co/Qwen/Qwen2.5-72B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
+You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-72B-Instruct">https://huggingface.co/Qwen/Qwen2.5-72B-Instruct</a>.
 The 72B model was pretrained on 18 trillion tokens spanning 29 languages.
 It supports up to 128K tokens and can generate up to 8K tokens.
 </p>
```
static/models_data/Qwen2.5-7B-Instruct/model_detail.html
CHANGED

```diff
@@ -1,7 +1,7 @@
 <p>
-This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
-You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-7B-Instruct">https://huggingface.co/Qwen/Qwen2.5-7B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
+You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-7B-Instruct">https://huggingface.co/Qwen/Qwen2.5-7B-Instruct</a>.
 The 7B model was pretrained on 18 trillion tokens spanning 29 languages.
 It supports up to 128K tokens and can generate up to 8K tokens.
 </p>
```
static/models_data/cardinal.svg
CHANGED
static/models_data/command_r_plus/model_detail.html
CHANGED

```diff
@@ -1,5 +1,5 @@
 <p>
-This open-source model was created by <a target="_blank" href="https://cohere.com/">Cohere<AI<a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/CohereForAI/c4ai-command-r-plus">https://huggingface.co/CohereForAI/c4ai-command-r-plus</a>.
+This open-source model was created by <a target="_blank" target="_blank" href="https://cohere.com/">Cohere<AI<a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/CohereForAI/c4ai-command-r-plus">https://huggingface.co/CohereForAI/c4ai-command-r-plus</a>.
 The model has 104B parameters, and supports up to 128K token contexts.
 </p>
```
static/models_data/gpt-3.5-turbo-0125/model_detail.html
CHANGED

```diff
@@ -1,4 +1,4 @@
 <p>
-This proprietary model was created by <a href="https://openai.com/">OpenAI<a>.
-You can find the release blog post <a href="https://openai.com/index/chatgpt/">here</a>.
+This proprietary model was created by <a target="_blank" href="https://openai.com/">OpenAI<a>.
+You can find the release blog post <a target="_blank" href="https://openai.com/index/chatgpt/">here</a>.
 </p>
```
static/models_data/gpt-4o-0513/model_detail.html
CHANGED

```diff
@@ -1,4 +1,4 @@
 <p>
-This proprietary model was created by <a href="https://openai.com/">OpenAI<a>.
-You can find the release blog post <a href="https://openai.com/index/hello-gpt-4o/">here</a>.
+This proprietary model was created by <a target="_blank" href="https://openai.com/">OpenAI<a>.
+You can find the release blog post <a target="_blank" href="https://openai.com/index/hello-gpt-4o/">here</a>.
 </p>
```
static/models_data/gpt-4o-mini-2024-07-18/model_detail.html
CHANGED

```diff
@@ -1,4 +1,4 @@
 <p>
-This proprietary model was created by <a href="https://openai.com/">OpenAI<a>.
-You can find the release blog post <a href="https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/">here</a>.
+This proprietary model was created by <a target="_blank" href="https://openai.com/">OpenAI<a>.
+You can find the release blog post <a target="_blank" href="https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/">here</a>.
 </p>
```
static/models_data/llama_3.1_405b_instruct_4bit/model_detail.html
CHANGED

```diff
@@ -1,8 +1,8 @@
 <p>
-This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
-You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
-The 16bit precision model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct</a>.
-Due to computational constrains we use the 4bit quantized version, which is also available on the huggingfacehub: <a href="unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit">unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit</a>.
-It is relevant to note that we compared with a 16bit version hosted by <a href="https://www.together.ai/">TogetherAI</a> on a subset of problems that fall in the 4k tokens limit defined by the TogetherAI API, and we did not see drastic changes in performance.
+This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
+You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
+The 16bit precision model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct</a>.
+Due to computational constrains we use the 4bit quantized version, which is also available on the huggingfacehub: <a target="_blank" href="unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit">unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit</a>.
+It is relevant to note that we compared with a 16bit version hosted by <a target="_blank" href="https://www.together.ai/">TogetherAI</a> on a subset of problems that fall in the 4k tokens limit defined by the TogetherAI API, and we did not see drastic changes in performance.
 The 405B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
 </p>
```
static/models_data/llama_3.1_70b_instruct/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
-You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
+You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct</a>.
 The 70B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
 </p>
```
static/models_data/llama_3.1_8b_instruct/model_detail.html
CHANGED

```diff
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
-You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
+You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct</a>.
 The 70B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
 </p>
```
static/models_data/llama_3.2_1b_instruct/cfa_metrics.csv
ADDED

```diff
@@ -0,0 +1,10 @@
+Context chunk,CFI,TLI,SRMR,RMSEA
+chunk_0,0.3755,0.32637499999999997,0.549175,0.541125
+chunk_1,0.162625,0.13435,0.780375,0.77835
+chunk_2,0.387375,0.38245,0.551025,0.525875
+chunk_3,0.167275,0.116375,0.774475,0.77235
+chunk_4,0.4379,0.504325,0.54175,0.5132
+chunk_chess_0,1.0,1.4561,0.09875,0.0
+chunk_grammar_1,0.36235,0.5313,0.55545,0.519675
+chunk_no_conv,0.227475,0.213625,0.7701,0.766
+chunk_svs_no_conv,0.25,-0.47565,0.7701,0.75
```
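The per-model fit metrics on the leaderboard line up with the per-chunk values in these cfa_metrics.csv files: the mean CFI over the nine chunks above is 0.3745, which matches the leaderboard CFI reported for llama_3.2_1b_instruct, suggesting the leaderboard value is the average across context chunks. A sketch of that check:

```python
import csv
import io
from statistics import mean

# The nine per-chunk rows added in cfa_metrics.csv above
CFA_METRICS = """\
Context chunk,CFI,TLI,SRMR,RMSEA
chunk_0,0.3755,0.32637499999999997,0.549175,0.541125
chunk_1,0.162625,0.13435,0.780375,0.77835
chunk_2,0.387375,0.38245,0.551025,0.525875
chunk_3,0.167275,0.116375,0.774475,0.77235
chunk_4,0.4379,0.504325,0.54175,0.5132
chunk_chess_0,1.0,1.4561,0.09875,0.0
chunk_grammar_1,0.36235,0.5313,0.55545,0.519675
chunk_no_conv,0.227475,0.213625,0.7701,0.766
chunk_svs_no_conv,0.25,-0.47565,0.7701,0.75
"""

rows = list(csv.DictReader(io.StringIO(CFA_METRICS)))
avg_cfi = mean(float(r["CFI"]) for r in rows)
print(round(avg_cfi, 4))  # 0.3745, the CFI shown for llama_3.2_1b_instruct on the leaderboard
```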
static/models_data/llama_3.2_1b_instruct/matrix.svg
ADDED
static/models_data/llama_3.2_1b_instruct/ranks.svg
ADDED
static/models_data/llama_3.2_1b_instruct/structure.svg
ADDED
static/models_data/llama_3.2_3b_instruct/cfa_metrics.csv
ADDED

```diff
@@ -0,0 +1,10 @@
+Context chunk,CFI,TLI,SRMR,RMSEA
+chunk_0,0.6424,0.5847249999999999,0.32084999999999997,0.30565
+chunk_1,0.5,0.549275,0.54585,0.5
+chunk_2,0.709275,0.808125,0.32205,0.266925
+chunk_3,0.401425,0.34225,0.551375,0.537925
+chunk_4,0.8504499999999999,0.8442249999999999,0.09325,0.047325
+chunk_chess_0,0.598325,0.546125,0.32780000000000004,0.31285
+chunk_grammar_1,0.423075,-0.325425,0.326675,0.28650000000000003
+chunk_no_conv,0.39035,0.35895,0.55825,0.546675
+chunk_svs_no_conv,0.0,0.0,1.0,1.0
```
static/models_data/llama_3.2_3b_instruct/matrix.svg
ADDED
static/models_data/llama_3.2_3b_instruct/ranks.svg
ADDED
static/models_data/llama_3.2_3b_instruct/structure.svg
ADDED
static/models_data/llama_3_70b_instruct/model_detail.html
CHANGED
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
-You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
+You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct</a>.
 The 70B model was pretrained on 15 trillion tokens spanning 30 different languages in sequences of 8,192 tokens.
 </p>
static/models_data/llama_3_8b_instruct/model_detail.html
CHANGED
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
-You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct</a>.
+This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
+You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct</a>.
 The 8B model was pretrained on 15 trillion tokens spanning 30 different languages in sequences of 8,192 tokens.
 </p>
static/models_data/ordinal.svg
CHANGED
static/models_data/phi-3-medium-128k-instruct/model_detail.html
CHANGED
@@ -1,6 +1,6 @@
 <p>
-This open-source model was created by <a href="https://www.microsoft.com/">Microsoft</a>.
-You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct">https://huggingface.co/microsoft/Phi-3-medium-128k-instruct</a>.
+This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
+You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct">https://huggingface.co/microsoft/Phi-3-medium-128k-instruct</a>.
 The model has 14B parameters, and supports up to 128K token contexts.
 </p>
static/models_data/phi-3-mini-128k-instruct/model_detail.html
CHANGED
@@ -1,6 +1,6 @@
 <p>
 This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
-You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
+You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
 The model has 3.8B parameters, and supports up to 128K token contexts.
 </p>
static/models_data/phi-3.5-MoE-instruct/cfa_metrics.csv
CHANGED
@@ -1,10 +1,10 @@
 Context chunk,CFI,TLI,SRMR,RMSEA
-chunk_0,0.
-chunk_1,0.
-chunk_2,0.
-chunk_3,0.
-chunk_4,0.
-chunk_chess_0,0.
-chunk_grammar_1,0.
-chunk_no_conv,0.
-chunk_svs_no_conv,0.
+chunk_0,0.5877,0.55345,0.3251,0.28752500000000003
+chunk_1,0.239025,0.235475,0.773975,0.75525
+chunk_2,0.140675,0.11245,0.776725,0.7737499999999999
+chunk_3,0.191575,0.1765,0.778475,0.7643
+chunk_4,0.708925,1.3283,0.3235,0.267325
+chunk_chess_0,0.7886500000000001,3.1239,0.103125,0.0695
+chunk_grammar_1,0.405325,0.458975,0.558325,0.519225
+chunk_no_conv,0.6516,0.614875,0.3338,0.32622500000000004
+chunk_svs_no_conv,0.45072500000000004,0.43657500000000005,0.57665,0.57805
static/models_data/phi-3.5-MoE-instruct/matrix.svg
CHANGED
static/models_data/phi-3.5-MoE-instruct/model_detail.html
CHANGED
@@ -1,7 +1,7 @@
 <p>
 This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
-You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
+You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
 The model has 16x3.8B parameters with 6.6B active parameters, and supports up to 128K token contexts.
 Even though this model supports system messages, we evaluate this model as a user-message-only model
 (the persona is induced by sending the user message "You are <persona>" followed by a manually set "OK" as the assistant's response)
static/models_data/phi-3.5-MoE-instruct/ranks.svg
CHANGED
static/models_data/phi-3.5-MoE-instruct/structure.svg
CHANGED
static/models_data/phi-3.5-mini-instruct/cfa_metrics.csv
CHANGED
@@ -1,10 +1,10 @@
 Context chunk,CFI,TLI,SRMR,RMSEA
-chunk_0,0.
-chunk_1,0.
-chunk_2,0.
-chunk_3,0.
-chunk_4,0.25,0.
-chunk_chess_0,0.
-chunk_grammar_1,0.
-chunk_no_conv,0.
-chunk_svs_no_conv,0.
+chunk_0,0.211175,0.1873,0.773425,0.77085
+chunk_1,0.52475,0.47522499999999995,0.32894999999999996,0.280075
+chunk_2,0.128625,0.08935,0.778575,0.765325
+chunk_3,0.651475,-0.39820000000000005,0.32237499999999997,0.26990000000000003
+chunk_4,0.25,0.679925,0.768375,0.75
+chunk_chess_0,0.0,0.0,1.0,1.0
+chunk_grammar_1,0.198325,0.181625,0.777725,0.761875
+chunk_no_conv,0.25,0.6973,0.76295,0.75
+chunk_svs_no_conv,0.478275,-1.2948250000000001,0.542625,0.5085500000000001
static/models_data/phi-3.5-mini-instruct/matrix.svg
CHANGED
static/models_data/phi-3.5-mini-instruct/model_detail.html
CHANGED
@@ -1,7 +1,7 @@
 <p>
 This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
-You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
-The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
+You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
+The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
 The model has 3.8B parameters, and supports up to 128K token contexts.
 Even though this model supports system messages, we evaluate this model as a user-message-only model
 (the persona is induced by sending the user message "You are <persona>" followed by a manually set "OK" as the assistant's response)
static/models_data/phi-3.5-mini-instruct/ranks.svg
CHANGED
static/models_data/phi-3.5-mini-instruct/structure.svg
CHANGED
templates/about.html
CHANGED
@@ -254,7 +254,7 @@
 <ul>
 <li> <b> no_conv </b>: no conversation is simulated; the questions from the PVQ-40 questionnaire are given directly </li>
 <li> <b> no_conv_svs </b>: no conversation is simulated; the questions from the SVS questionnaire are given directly </li>
-<li> <b> chunk_0-chunk_4 </b>: <a href="https://gitlab.inria.fr/gkovac/value_stability/-/tree/master/contexts/leaderboard_reddit_chunks?ref_type=heads">50 reddit posts</a> used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest. </li>
+<li> <b> chunk_0-chunk_4 </b>: <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/tree/master/contexts/leaderboard_reddit_chunks?ref_type=heads">50 reddit posts</a> used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest. </li>
 <li> <b> chess </b>: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
 <li> <b> grammar </b>: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message.
 </ul>
@@ -265,14 +265,14 @@
 <p>
 Validity refers to the extent the questionnaire measures what it purports to measure.
 It can be seen as the questionnaire's accuracy in measuring the intended factors, i.e. values.
-Following the recommendations in <a href="https://pubmed.ncbi.nlm.nih.gov/22329443/">this paper</a>,
+Following the recommendations in <a target="_blank" href="https://pubmed.ncbi.nlm.nih.gov/22329443/">this paper</a>,
 the validation consists of two phases: Theory-Based Multidimensional Scaling (MDS) and Confirmatory Factor Analysis (CFA).
 </p>
 <p>
 <b>Theory-Based Multidimensional Scaling (MDS)</b> tests that the expressed values are organized in a circular structure as predicted by the theory.
 Values should be ordered in a circle in the same order as shown on the figure below (Tradition and Conformity should be on the same angle, with Tradition closer to the center).
 To compute the structure in our data, we calculate the intercorrelations between different items (questions).
-This provides us with 40 points in a 40D space (for PVQ-40), which is then reduced to 2D by <a href="https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html">MDS</a>.
+This provides us with 40 points in a 40D space (for PVQ-40), which is then reduced to 2D by <a target="_blank" href="https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html">MDS</a>.
 Crucially, MDS is initialized with the theoretical circular value structure, i.e. items corresponding to the same value are assigned the same angle.
 When MDS is fit, it provides the <b>Stress (↓)</b> metric ('Stress-1 index') indicating the goodness of the fit.
 A value of 0 indicates 'perfect' fit, 0.025 excellent, 0.05 good, 0.1 fair, and 0.2 poor.
@@ -297,7 +297,7 @@
 The model is defined according to the theory
 and the fit of this model is used as a metric.
 Due to the circular structure of basic personal values,
-it is <a href="https://pubmed.ncbi.nlm.nih.gov/22329443/">recommended</a> to employ a Magnifying glass CFA strategy.
+it is <a target="_blank" href="https://pubmed.ncbi.nlm.nih.gov/22329443/">recommended</a> to employ a Magnifying glass CFA strategy.
 Four separate models are fit, one for each of the high-level values (consisting of several low-level values):
 Conservation (security, conformity, tradition),
 Openness to Change (self-direction, stimulation, hedonism),
@@ -324,7 +324,7 @@ their expression of that value).
 Intuitively, this can be seen as addressing the following question:
 <b>"Does Jack always (in every context) value Tradition more than Jane does?"</b>.
 As shown below, instead of comparing two points in time, we compare
-<a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/personas/real_world_people/personas.json?ref_type=heads">the simulated population</a>
+<a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/personas/real_world_people/personas.json?ref_type=heads">the simulated population</a>
 in different contexts (simulated conversations of different topics).
 We then average over different context pairs and values to obtain the final estimate.
 </p>
@@ -346,7 +346,7 @@ their expression of that value).
 i.e. the average of \( (n\_models-1) * ( \binom{n\_context\_chunks}{2} + n\_validity\_metrics*n\_context\_chunks) \).</li>
 </ul>
 <p>
-Following this <a href="https://arxiv.org/abs/2405.01719">paper</a> and the associated <a href="https://github.com/socialfoundations/benchbench">benchbench</a> library,
+Following this <a target="_blank" href="https://arxiv.org/abs/2405.01719">paper</a> and the associated <a target="_blank" href="https://github.com/socialfoundations/benchbench">benchbench</a> library,
 we can compute the diversity and the sensitivity of the two ranking methods.
 A benchmark is considered <b>diverse</b> if different tasks order models in different ways.
 We use the reversed Kendall’s coefficient of concordance (W) as the diversity metric.
@@ -359,7 +359,7 @@ their expression of that value).
 <div class="section" id="paper">
 <div class="section-title">Differences with the paper</div>
 <p>
-This leaderboard is grounded in the methodology presented in our <a href="https://arxiv.org/abs/2402.14846">research paper</a>.
+This leaderboard is grounded in the methodology presented in our <a target="_blank" href="https://arxiv.org/abs/2402.14846">research paper</a>.
 The paper contains various experiments which are not included in the leaderboard, such as:
 multiple populations,
 within-person stability,
@@ -371,7 +371,7 @@ their expression of that value).
 <ol>
 <li>a new population was created and balanced with respect to gender</li>
 <li>context chunks - instead of evaluating the stability of a population between pairs of contexts, where all personas are given the same topic (e.g. chess), we evaluate it between pairs of context chunks, where each participant is given a different random context</li>
-<li>more diverse and longer contexts (up to 6k tokens) were created with reddit posts from the <a href="https://webis.de/data/webis-tldr-17.html">webis dataset</a> (the dataset was cleaned to exclude posts from NSFW subreddits)</li>
+<li>more diverse and longer contexts (up to 6k tokens) were created with reddit posts from the <a target="_blank" href="https://webis.de/data/webis-tldr-17.html">webis dataset</a> (the dataset was cleaned to exclude posts from NSFW subreddits)</li>
 <li>different interlocutors - chess and grammar topics were still introduced as in the paper (same context for all participants), but the interlocutor model was instructed to simulate a random persona from the same population (as opposed to a human user in other settings)</li>
 <li>in the paper, multiple seeds for the order of suggested answers were used; given that the results didn't vary much between seeds, a single seed was used here, facilitating the analysis with more and longer contexts</li>
 <li>evaluations were also done without simulating conversations (no_conv setting)</li>
@@ -416,9 +416,9 @@ their expression of that value).
 </div>
 <ul>
 <li>Contact: <a href="mailto: [email protected]">[email protected]</a></li>
-<li>See the <a href="https://sites.google.com/view/llmvaluestability">Project website</a></li>
-<li>See the Flowers team <a href="http://developmentalsystems.org">blog</a> and <a href="https://flowers.inria.fr/">website</a></li>
-<li>See Grgur's website and other projects: <a href="https://grgkovac.github.io">https://grgkovac.github.io</a></li>
+<li>See the <a target="_blank" href="https://sites.google.com/view/llmvaluestability">Project website</a></li>
+<li>See the Flowers team <a target="_blank" href="http://developmentalsystems.org">blog</a> and <a target="_blank" href="https://flowers.inria.fr/">website</a></li>
+<li>See Grgur's website and other projects: <a target="_blank" href="https://grgkovac.github.io">https://grgkovac.github.io</a></li>
 </ul>
 </div>
 </div>
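The Rank-Order stability metric described in templates/about.html boils down to a rank correlation between persona orderings in two context chunks. A minimal sketch for a single value, with made-up persona scores and assuming no tied ranks (the leaderboard additionally averages over the ten values and all context-chunk pairs):

```python
# Rank-Order stability sketch: rank personas by one value's score in two
# contexts, then correlate the rankings (Spearman's rho, no-ties formula).

def ranks(scores):
    # Rank positions (1 = lowest score); assumes no tied scores.
    order = sorted(range(len(scores)), key=lambda i: scores[i])
    r = [0] * len(scores)
    for rank, i in enumerate(order):
        r[i] = rank + 1
    return r

def spearman(x, y):
    n = len(x)
    rx, ry = ranks(x), ranks(y)
    d2 = sum((a - b) ** 2 for a, b in zip(rx, ry))
    return 1 - 6 * d2 / (n * (n * n - 1))

# Hypothetical "Tradition" scores for four personas in two context chunks.
chunk_a = [4.1, 2.0, 3.3, 1.2]
chunk_b = [3.9, 2.5, 3.0, 1.0]  # same persona ordering as chunk_a

print(spearman(chunk_a, chunk_b))  # -> 1.0 (perfectly stable ordering)
```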
templates/index.html
CHANGED
@@ -272,14 +272,14 @@
 </a>
 </div>
 <p>
-We leverage Schwartz's theory of <a href="https://www.sciencedirect.com/science/article/abs/pii/S0065260108602816">Basic Personal Values</a>,
+We leverage Schwartz's theory of <a target="_blank" href="https://www.sciencedirect.com/science/article/abs/pii/S0065260108602816">Basic Personal Values</a>,
 which defines 10 values (Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, Universalism),
-and the associated PVQ-40 and SVS questionnaires (available <a href="https://www.researchgate.net/publication/354384463_A_Repository_of_Schwartz_Value_Scales_with_Instructions_and_an_Introduction">here</a>).
+and the associated PVQ-40 and SVS questionnaires (available <a target="_blank" href="https://www.researchgate.net/publication/354384463_A_Repository_of_Schwartz_Value_Scales_with_Instructions_and_an_Introduction">here</a>).
 </p>
 <p>
-Using the <a href="https://pubmed.ncbi.nlm.nih.gov/31402448/">methodology from psychology</a>, we focus on population-level (interpersonal) value stability, i.e. <b>Rank-Order stability (RO stability)</b>.
+Using the <a target="_blank" href="https://pubmed.ncbi.nlm.nih.gov/31402448/">methodology from psychology</a>, we focus on population-level (interpersonal) value stability, i.e. <b>Rank-Order stability (RO stability)</b>.
 Rank-Order stability refers to the extent to which the order of different personas (in terms of expression of some value) remains the same across different contexts.
-Refer <a href="{{ url_for('about', _anchor='rank_order_stability') }}">here</a> or to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for more details.
+Refer <a href="{{ url_for('about', _anchor='rank_order_stability') }}">here</a> or to our <a target="_blank" href="https://arxiv.org/abs/2402.14846">paper</a> for more details.
 </p>
 <p>
 In addition to Rank-Order stability, we compute <b>validity metrics (Stress, CFI, SRMR, RMSEA)</b>, which are common practice in psychology.
@@ -290,7 +290,7 @@
 </p>
 <p>
 We <b>aggregate</b> Rank-Order stability and validation metrics to rank the models. We do so in two ways: <b>Cardinal</b> and <b>Ordinal</b>.
-Following <a href="https://arxiv.org/abs/2405.01719">this paper</a>, we compute the stability and diversity of those rankings. See <a href="{{ url_for('about', _anchor='aggregate_metrics') }}">here</a> for more details.
+Following <a target="_blank" href="https://arxiv.org/abs/2405.01719">this paper</a>, we compute the stability and diversity of those rankings. See <a href="{{ url_for('about', _anchor='aggregate_metrics') }}">here</a> for more details.
 </p>
 <p>
 To sum up, here are the metrics used:
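The diversity of the Cardinal and Ordinal rankings mentioned above is measured with the reversed Kendall's coefficient of concordance (W). A minimal sketch of that computation; the rank matrices are invented for illustration and tied ranks are not handled:

```python
# Kendall's W measures agreement between k rankings of n models
# (W = 1: identical rankings, W = 0: no agreement), so 1 - W can
# serve as a diversity score: higher means tasks disagree more.

def kendalls_w(rankings):
    k = len(rankings)        # number of tasks (raters)
    n = len(rankings[0])     # number of models being ranked
    totals = [sum(r[i] for r in rankings) for i in range(n)]
    mean = sum(totals) / n
    s = sum((t - mean) ** 2 for t in totals)  # deviation of rank sums
    return 12 * s / (k ** 2 * (n ** 3 - n))

# Three tasks ranking four models identically: perfect concordance.
agree = [[1, 2, 3, 4]] * 3
print(1 - kendalls_w(agree))  # diversity 0.0
```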
templates/model_detail.html
CHANGED
@@ -255,7 +255,7 @@
 Rank-Order stability is computed by ordering the personas based on their expression of some value,
 and then computing the correlation between their orders in two different context chunks.
 The stability estimates for the ten values are then averaged to get the final Rank-Order stability measure.
-Refer to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for details.
+Refer to our <a target="_blank" href="https://arxiv.org/abs/2402.14846">paper</a> for details.
 </p>
 <div class="matrix-image-container">
 <a href="{{ url_for('static', filename='models_data/' + model_name + '/matrix.svg') }}" target="_blank">
templates/new_model.html
CHANGED
@@ -184,11 +184,11 @@
 <div class="section">
 <div id="evaluate_custom_model" class="section-title">Evaluate a custom model</div>
 <p>
-To evaluate a custom model, you can use our <a href="https://gitlab.inria.fr/gkovac/value_stability">open-source code</a>.
+To evaluate a custom model, you can use our <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability">open-source code</a>.
 If a model is in the huggingface transformers format (saved either locally or on the hub),
 it can be added simply by adding a config file.
 The model can then be evaluated like any other model.
-To do so, follow the <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/README.md?ref_type=heads#adding-a-new-model">instructions</a> in the README.md file.
+To do so, follow the <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/README.md?ref_type=heads#adding-a-new-model">instructions</a> in the README.md file.
 </p>
 </div>
 <div class="section" id="paper">
@@ -205,9 +205,9 @@
 <code>`Leaderboard/results/stability_leaderboard/<your_model_name>/chunk_0_<timestamp>/results.json`</code>
 </li>
 <li>
-<b> Submit the config file </b> - Create a pull request to our <a href="https://gitlab.inria.fr/gkovac/value_stability">repository</a> from a branch <code>"unofficial_model/<your_model_name>"</code>.
+<b> Submit the config file </b> - Create a pull request to our <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability">repository</a> from a branch <code>"unofficial_model/<your_model_name>"</code>.
 The pull request should ideally only add the config file in <code>`./models/leaderboard_configs`</code>.
-If additional changes are needed, they should ideally be constrained to a new model class (see <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/models/huggingfacemodel.py?ref_type=heads">huggingfacemodel.py</a> for reference).
+If additional changes are needed, they should ideally be constrained to a new model class (see <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/models/huggingfacemodel.py?ref_type=heads">huggingfacemodel.py</a> for reference).
 <li>
 <b> Submit the model results </b> - submit the *.json files as a ZIP using the form below.
 We will integrate the model's results on our side, and rerank the models with yours included.
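The "Submit the model results" step above asks for the results.json files bundled as a ZIP. A minimal sketch of that bundling; the directory layout follows the path shown in the instructions, while the model name, glob pattern, and archive name here are assumptions, not an official procedure:

```python
import pathlib
import zipfile

# Hypothetical model name; the real path contains your model's name and
# timestamped per-chunk directories, e.g. chunk_0_<timestamp>/results.json.
results_root = pathlib.Path("Leaderboard/results/stability_leaderboard/my_model")

with zipfile.ZipFile("my_model_results.zip", "w") as zf:
    # Add each chunk's results.json, keeping paths relative to the model dir.
    for path in sorted(results_root.glob("*/results.json")):
        zf.write(path, path.relative_to(results_root.parent))
```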