grg committed on
Commit
abb889e
1 Parent(s): 961db60

Adding llama_3.2 and updating links

Browse files
Files changed (49) hide show
  1. static/leaderboard.csv +29 -27
  2. static/models_data/Mistral-7B-Instruct-v0.1/model_detail.html +3 -3
  3. static/models_data/Mistral-7B-Instruct-v0.2/model_detail.html +3 -3
  4. static/models_data/Mistral-7B-Instruct-v0.3/model_detail.html +2 -2
  5. static/models_data/Mistral-Large-Instruct-2407/model_detail.html +3 -3
  6. static/models_data/Mistral-Small-Instruct-2409/model_detail.html +2 -2
  7. static/models_data/Mixtral-8x22B-Instruct-v0.1/model_detail.html +3 -3
  8. static/models_data/Mixtral-8x7B-Instruct-v0.1/model_detail.html +3 -3
  9. static/models_data/Qwen2-72B-Instruct/model_detail.html +3 -3
  10. static/models_data/Qwen2-7B-Instruct/model_detail.html +3 -3
  11. static/models_data/Qwen2.5-0.5B-Instruct/model_detail.html +3 -3
  12. static/models_data/Qwen2.5-32B-Instruct/model_detail.html +3 -3
  13. static/models_data/Qwen2.5-72B-Instruct/model_detail.html +3 -3
  14. static/models_data/Qwen2.5-7B-Instruct/model_detail.html +3 -3
  15. static/models_data/cardinal.svg +331 -249
  16. static/models_data/command_r_plus/model_detail.html +2 -2
  17. static/models_data/gpt-3.5-turbo-0125/model_detail.html +2 -2
  18. static/models_data/gpt-4o-0513/model_detail.html +2 -2
  19. static/models_data/gpt-4o-mini-2024-07-18/model_detail.html +2 -2
  20. static/models_data/llama_3.1_405b_instruct_4bit/model_detail.html +5 -5
  21. static/models_data/llama_3.1_70b_instruct/model_detail.html +3 -3
  22. static/models_data/llama_3.1_8b_instruct/model_detail.html +3 -3
  23. static/models_data/llama_3.2_1b_instruct/cfa_metrics.csv +10 -0
  24. static/models_data/llama_3.2_1b_instruct/matrix.svg +1959 -0
  25. static/models_data/llama_3.2_1b_instruct/ranks.svg +0 -0
  26. static/models_data/llama_3.2_1b_instruct/structure.svg +0 -0
  27. static/models_data/llama_3.2_3b_instruct/cfa_metrics.csv +10 -0
  28. static/models_data/llama_3.2_3b_instruct/matrix.svg +1911 -0
  29. static/models_data/llama_3.2_3b_instruct/ranks.svg +0 -0
  30. static/models_data/llama_3.2_3b_instruct/structure.svg +0 -0
  31. static/models_data/llama_3_70b_instruct/model_detail.html +3 -3
  32. static/models_data/llama_3_8b_instruct/model_detail.html +3 -3
  33. static/models_data/ordinal.svg +332 -250
  34. static/models_data/phi-3-medium-128k-instruct/model_detail.html +3 -3
  35. static/models_data/phi-3-mini-128k-instruct/model_detail.html +2 -2
  36. static/models_data/phi-3.5-MoE-instruct/cfa_metrics.csv +9 -9
  37. static/models_data/phi-3.5-MoE-instruct/matrix.svg +356 -395
  38. static/models_data/phi-3.5-MoE-instruct/model_detail.html +2 -2
  39. static/models_data/phi-3.5-MoE-instruct/ranks.svg +0 -0
  40. static/models_data/phi-3.5-MoE-instruct/structure.svg +0 -0
  41. static/models_data/phi-3.5-mini-instruct/cfa_metrics.csv +9 -9
  42. static/models_data/phi-3.5-mini-instruct/matrix.svg +364 -370
  43. static/models_data/phi-3.5-mini-instruct/model_detail.html +2 -2
  44. static/models_data/phi-3.5-mini-instruct/ranks.svg +0 -0
  45. static/models_data/phi-3.5-mini-instruct/structure.svg +0 -0
  46. templates/about.html +11 -11
  47. templates/index.html +5 -5
  48. templates/model_detail.html +1 -1
  49. templates/new_model.html +4 -4
static/leaderboard.csv CHANGED
@@ -1,28 +1,30 @@
1
  Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Stress,CFI,SRMR,RMSEA
2
- phi-3-mini-128k-instruct,0.33279914529914534,0.32984992817164005,0.039299993295009855,0.281800547806919,0.5861361111111111,0.42524166666666674,0.3974944444444444
3
- phi-3-medium-128k-instruct,0.3333333333333333,0.30802986933853177,0.09692037989916814,0.2651981204439735,0.43025555555555556,0.5503277777777777,0.5381722222222222
4
- phi-3.5-mini-instruct,0.2435897435897436,0.2680653144619754,0.0361229186530762,0.28422749224983457,0.40715555555555555,0.5721138888888888,0.5507833333333333
5
- phi-3.5-MoE-instruct,0.40010683760683763,0.36128192067041315,0.10985291697837646,0.2739229692168671,0.5530944444444444,0.4248777777777778,0.40345
6
- Mistral-7B-Instruct-v0.1,0.22168803418803418,0.26609566354811315,0.027216280472015988,0.2829498135031582,0.38917777777777773,0.5561138888888888,0.530213888888889
7
- Mistral-7B-Instruct-v0.2,0.35683760683760685,0.32133832899241477,0.14417876497818388,0.265188983528973,0.3802722222222222,0.5727305555555555,0.5483611111111111
8
- Mistral-7B-Instruct-v0.3,0.25961538461538464,0.26572479479146804,0.07960539866974455,0.2742399030139009,0.31385,0.6241,0.6081333333333333
9
- Mixtral-8x7B-Instruct-v0.1,0.44925213675213665,0.3819009850972602,0.21473356319081474,0.2624402608740656,0.45275,0.5034666666666667,0.4905694444444444
10
- Mixtral-8x22B-Instruct-v0.1,0.3477564102564103,0.31529864972153404,0.1414001940345544,0.2548838005881672,0.3772361111111111,0.5810888888888889,0.5844750000000001
11
- command_r_plus,0.5726495726495726,0.4995356672762356,0.3429686514651868,0.23811982320641845,0.6033000000000001,0.3740166666666668,0.3667527777777777
12
- llama_3_8b_instruct,0.4983974358974359,0.4295836112681494,0.24527785038654715,0.245806400289881,0.5498222222222222,0.42656388888888896,0.42189444444444446
13
- llama_3_70b_instruct,0.7777777777777778,0.6839540364836003,0.607020698814379,0.18525883672204868,0.7210055555555557,0.2346083333333333,0.25758888888888887
14
- llama_3.1_8b_instruct,0.5592948717948718,0.4786874422110324,0.4295080949846363,0.22060228669473025,0.4305722222222223,0.5455027777777777,0.553
15
- llama_3.1_70b_instruct,0.8215811965811967,0.7172545013390067,0.691365862744007,0.1709718847084183,0.6979472222222223,0.2636777777777777,0.2907250000000001
16
- llama_3.1_405b_instruct_4bit,0.7329059829059829,0.6490864350383405,0.7232098126552619,0.1702199925365422,0.4875722222222223,0.4963444444444445,0.5211555555555556
17
- Qwen2-7B-Instruct,0.40651709401709396,0.36370005127542027,0.25108519506513916,0.25776537005719313,0.3560861111111111,0.6009722222222222,0.5920888888888889
18
- Qwen2-72B-Instruct,0.5721153846153846,0.5461212335522644,0.6465993243020925,0.20297742879025626,0.3045,0.6543138888888889,0.6646361111111111
19
- Qwen2.5-0.5B-Instruct,0.2954059829059829,0.3005554090516966,0.002970456550606876,0.2928913315666324,0.5371250000000001,0.44709722222222226,0.404575
20
- Qwen2.5-7B-Instruct,0.6132478632478633,0.5163098181421168,0.333554494486959,0.2505866550331236,0.6473694444444444,0.30400277777777773,0.29651944444444434
21
- Qwen2.5-32B-Instruct,0.7323717948717948,0.656917654644944,0.6724190751477237,0.1806656189868978,0.5603222222222223,0.40237500000000004,0.41161666666666663
22
- Qwen2.5-72B-Instruct,0.8253205128205129,0.7104489147495714,0.6974116787371809,0.16176650806326276,0.6734583333333333,0.2993,0.3184472222222223
23
- gpt-3.5-turbo-0125,0.24626068376068375,0.28218378886707396,0.08240359836763214,0.28728574920060357,0.3873055555555555,0.599925,0.572238888888889
24
- gpt-4o-0513,0.6810897435897435,0.5989532974661671,0.5122163952167618,0.19201420113771173,0.6235416666666667,0.34458611111111115,0.3441805555555555
25
- gpt-4o-mini-2024-07-18,0.3782051282051282,0.3418785071827972,0.13575309046266867,0.2707065266105181,0.44214722222222214,0.5004583333333332,0.47896666666666665
26
- Mistral-Large-Instruct-2407,0.8472222222222222,0.7374229691535793,0.7644582301049158,0.16944638941325085,0.6510750000000001,0.31028611111111104,0.3297916666666667
27
- Mistral-Small-Instruct-2409,0.7745726495726496,0.6890378862258165,0.6416815833333804,0.1894343546381,0.6840472222222221,0.2601583333333335,0.2888777777777778
28
- dummy,0.18269230769230768,0.2291015386716794,-0.009004148398032956,0.2928877637010999,0.3755222222222222,0.622275,0.5915305555555557
 
 
 
1
  Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Stress,CFI,SRMR,RMSEA
2
+ phi-3-mini-128k-instruct,0.34424603174603174,0.32984992817164005,0.039299993295009855,0.281800547806919,0.5861361111111111,0.42524166666666674,0.3974944444444444
3
+ phi-3-medium-128k-instruct,0.3516865079365079,0.30802986933853177,0.09692037989916814,0.2651981204439735,0.43025555555555556,0.5503277777777777,0.5381722222222222
4
+ phi-3.5-mini-instruct,0.25744047619047616,0.2680653144619754,0.0361229186530762,0.28422749224983457,0.40715555555555555,0.5721138888888888,0.5507833333333333
5
+ phi-3.5-MoE-instruct,0.41617063492063494,0.36128192067041315,0.10985291697837646,0.2739229692168671,0.5530944444444444,0.4248777777777778,0.40345
6
+ Mistral-7B-Instruct-v0.1,0.23214285714285715,0.26609566354811315,0.027216280472015988,0.2829498135031582,0.38917777777777773,0.5561138888888888,0.530213888888889
7
+ Mistral-7B-Instruct-v0.2,0.36904761904761907,0.32133832899241477,0.14417876497818388,0.265188983528973,0.3802722222222222,0.5727305555555555,0.5483611111111111
8
+ Mistral-7B-Instruct-v0.3,0.27132936507936506,0.26572479479146804,0.07960539866974455,0.2742399030139009,0.31385,0.6241,0.6081333333333333
9
+ Mixtral-8x7B-Instruct-v0.1,0.4667658730158731,0.3819009850972602,0.21473356319081474,0.2624402608740656,0.45275,0.5034666666666667,0.4905694444444444
10
+ Mixtral-8x22B-Instruct-v0.1,0.3625992063492063,0.31529864972153404,0.1414001940345544,0.2548838005881672,0.3772361111111111,0.5810888888888889,0.5844750000000001
11
+ command_r_plus,0.5922619047619047,0.4995356672762356,0.3429686514651868,0.23811982320641845,0.6033000000000001,0.3740166666666668,0.3667527777777777
12
+ llama_3_8b_instruct,0.5153769841269842,0.4295836112681494,0.24527785038654715,0.245806400289881,0.5498222222222222,0.42656388888888896,0.42189444444444446
13
+ llama_3_70b_instruct,0.7876984126984127,0.6839540364836003,0.607020698814379,0.18525883672204868,0.7210055555555557,0.2346083333333333,0.25758888888888887
14
+ llama_3.1_8b_instruct,0.5773809523809523,0.4786874422110324,0.4295080949846363,0.22060228669473025,0.4305722222222223,0.5455027777777777,0.553
15
+ llama_3.1_70b_instruct,0.8253968253968255,0.7172545013390067,0.691365862744007,0.1709718847084183,0.6979472222222223,0.2636777777777777,0.2907250000000001
16
+ llama_3.1_405b_instruct_4bit,0.7405753968253967,0.6490864350383405,0.7232098126552619,0.1702199925365422,0.4875722222222223,0.4963444444444445,0.5211555555555556
17
+ llama_3.2_1b_instruct,0.22718253968253965,0.2522036562381785,0.027192115495770382,0.29255310096654275,0.37450000000000006,0.5990222222222223,0.5740638888888888
18
+ llama_3.2_3b_instruct,0.4221230158730159,0.3615804465210719,0.13450325180647235,0.27485276839064654,0.5017,0.44956666666666667,0.4226500000000001
19
+ Qwen2-7B-Instruct,0.42757936507936506,0.36370005127542027,0.25108519506513916,0.25776537005719313,0.3560861111111111,0.6009722222222222,0.5920888888888889
20
+ Qwen2-72B-Instruct,0.5823412698412699,0.5461212335522644,0.6465993243020925,0.20297742879025626,0.3045,0.6543138888888889,0.6646361111111111
21
+ Qwen2.5-0.5B-Instruct,0.30406746031746035,0.3005554090516966,0.002970456550606876,0.2928913315666324,0.5371250000000001,0.44709722222222226,0.404575
22
+ Qwen2.5-7B-Instruct,0.632440476190476,0.5163098181421168,0.333554494486959,0.2505866550331236,0.6473694444444444,0.30400277777777773,0.29651944444444434
23
+ Qwen2.5-32B-Instruct,0.7395833333333334,0.656917654644944,0.6724190751477237,0.1806656189868978,0.5603222222222223,0.40237500000000004,0.41161666666666663
24
+ Qwen2.5-72B-Instruct,0.8298611111111112,0.7104489147495714,0.6974116787371809,0.16176650806326276,0.6734583333333333,0.2993,0.3184472222222223
25
+ gpt-3.5-turbo-0125,0.26190476190476186,0.28218378886707396,0.08240359836763214,0.28728574920060357,0.3873055555555555,0.599925,0.572238888888889
26
+ gpt-4o-0513,0.6944444444444444,0.5989532974661671,0.5122163952167618,0.19201420113771173,0.6235416666666667,0.34458611111111115,0.3441805555555555
27
+ gpt-4o-mini-2024-07-18,0.3968253968253968,0.3418785071827972,0.13575309046266867,0.2707065266105181,0.44214722222222214,0.5004583333333332,0.47896666666666665
28
+ Mistral-Large-Instruct-2407,0.8501984126984127,0.7374229691535793,0.7644582301049158,0.16944638941325085,0.6510750000000001,0.31028611111111104,0.3297916666666667
29
+ Mistral-Small-Instruct-2409,0.7842261904761906,0.6890378862258165,0.6416815833333804,0.1894343546381,0.6840472222222221,0.2601583333333335,0.2888777777777778
30
+ dummy,0.1929563492063492,0.2291015386716794,-0.009004148398032956,0.2928877637010999,0.3755222222222222,0.622275,0.5915305555555557
static/models_data/Mistral-7B-Instruct-v0.1/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
3
- You can find the release blog post <a href="https://mistral.ai/news/announcing-mistral-7b/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1</a>.
5
  The model has 7.3B parameters, and supports up to 8K token contexts.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://mistral.ai/news/announcing-mistral-7b/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1</a>.
5
  The model has 7.3B parameters, and supports up to 8K token contexts.
6
  </p>
static/models_data/Mistral-7B-Instruct-v0.2/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
3
- You can find the release blog post <a href="https://mistral.ai/news/la-plateforme/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2</a>.
5
  The model has 7.3B parameters, and supports up to 8K token contexts.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://mistral.ai/news/la-plateforme/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2</a>.
5
  The model has 7.3B parameters, and supports up to 8K token contexts.
6
  </p>
static/models_data/Mistral-7B-Instruct-v0.3/model_detail.html CHANGED
@@ -1,5 +1,5 @@
1
  <p>
2
- This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
3
- The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3</a>.
4
  The model has 7.3B parameters, and supports up to 8K token contexts.
5
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.
3
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3</a>.
4
  The model has 7.3B parameters, and supports up to 8K token contexts.
5
  </p>
static/models_data/Mistral-Large-Instruct-2407/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
3
- You can find the release blog post <a href="https://mistral.ai/news/mistral-large-2407/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-Large-Instruct-2407">https://huggingface.co/mistralai/Mistral-Large-Instruct-2407</a>.
5
  The 123B model supports up to 128K token context windows.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://mistral.ai/news/mistral-large-2407/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-Large-Instruct-2407">https://huggingface.co/mistralai/Mistral-Large-Instruct-2407</a>.
5
  The 123B model supports up to 128K token context windows.
6
  </p>
static/models_data/Mistral-Small-Instruct-2409/model_detail.html CHANGED
@@ -1,5 +1,5 @@
1
  <p>
2
- This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
3
- The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mistral-Small-Instruct-2409">https://huggingface.co/mistralai/Mistral-Small-Instruct-2409</a>.
4
  The 22B model supports up to 32K token sequences.
5
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.
3
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mistral-Small-Instruct-2409">https://huggingface.co/mistralai/Mistral-Small-Instruct-2409</a>.
4
  The 22B model supports up to 32K token sequences.
5
  </p>
static/models_data/Mixtral-8x22B-Instruct-v0.1/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
3
- You can find the release blog post <a href="https://mistral.ai/news/mixtral-8x22b/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1</a>.
5
  The model has 141B total and 39B active parameters. It supports up to 64K token contexts.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://mistral.ai/news/mixtral-8x22b/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1</a>.
5
  The model has 141B total and 39B active parameters. It supports up to 64K token contexts.
6
  </p>
static/models_data/Mixtral-8x7B-Instruct-v0.1/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://mistral.ai/">Mistral AI<a>.
3
- You can find the release blog post <a href="https://mistral.ai/news/mixtral-of-experts/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1</a>.
5
  The model has 46.7B total and 12.9B active parameters. It supports up to 32K token contexts.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://mistral.ai/">Mistral AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://mistral.ai/news/mixtral-of-experts/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1</a>.
5
  The model has 46.7B total and 12.9B active parameters. It supports up to 32K token contexts.
6
  </p>
static/models_data/Qwen2-72B-Instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
3
- You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2-72B-Instruct">https://huggingface.co/Qwen/Qwen2-72B-Instruct</a>.
5
  The 72B model was pretrained on 29 different languages, and supports up to 128K tokens.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba Cloud</a>.
3
+ You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2-72B-Instruct">https://huggingface.co/Qwen/Qwen2-72B-Instruct</a>.
5
  The 72B model was pretrained on 29 different languages, and supports up to 128K tokens.
6
  </p>
static/models_data/Qwen2-7B-Instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
3
- You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2-7B-Instruct">https://huggingface.co/Qwen/Qwen2-7B-Instruct</a>.
5
  The 7B model was pretrained on 29 different languages, and supports up to 128K tokens.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba Cloud</a>.
3
+ You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2-7B-Instruct">https://huggingface.co/Qwen/Qwen2-7B-Instruct</a>.
5
  The 7B model was pretrained on 29 different languages, and supports up to 128K tokens.
6
  </p>
static/models_data/Qwen2.5-0.5B-Instruct/model_detail.html CHANGED
@@ -1,7 +1,7 @@
1
  <p>
2
- This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
3
- You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct">https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct</a>.
5
  The 0.5B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba Cloud</a>.
3
+ You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct">https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct</a>.
5
  The 0.5B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
static/models_data/Qwen2.5-32B-Instruct/model_detail.html CHANGED
@@ -1,7 +1,7 @@
1
  <p>
2
- This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
3
- You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-32B-Instruct">https://huggingface.co/Qwen/Qwen2.5-32B-Instruct</a>.
5
  The 32B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba Cloud</a>.
3
+ You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-32B-Instruct">https://huggingface.co/Qwen/Qwen2.5-32B-Instruct</a>.
5
  The 32B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
static/models_data/Qwen2.5-72B-Instruct/model_detail.html CHANGED
@@ -1,7 +1,7 @@
1
  <p>
2
- This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
3
- You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-72B-Instruct">https://huggingface.co/Qwen/Qwen2.5-72B-Instruct</a>.
5
  The 72B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba Cloud</a>.
3
+ You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-72B-Instruct">https://huggingface.co/Qwen/Qwen2.5-72B-Instruct</a>.
5
  The 72B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
static/models_data/Qwen2.5-7B-Instruct/model_detail.html CHANGED
@@ -1,7 +1,7 @@
1
  <p>
2
- This open-source model was created by <a href="https://qwenlm.github.io/">The Qwen Team of Alibaba cloud <a>.
3
- You can find the release blog post <a href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/Qwen/Qwen2.5-7B-Instruct">https://huggingface.co/Qwen/Qwen2.5-7B-Instruct</a>.
5
  The 7B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://qwenlm.github.io/">The Qwen Team of Alibaba Cloud</a>.
3
+ You can find the release blog post <a target="_blank" href="https://qwenlm.github.io/blog/qwen2.5/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/Qwen/Qwen2.5-7B-Instruct">https://huggingface.co/Qwen/Qwen2.5-7B-Instruct</a>.
5
  The 7B model was pretrained on 18 trillion tokens spanning 29 languages.
6
  It supports up to 128K tokens and can generate up to 8K tokens.
7
  </p>
static/models_data/cardinal.svg CHANGED
static/models_data/command_r_plus/model_detail.html CHANGED
@@ -1,5 +1,5 @@
1
  <p>
2
- This open-source model was created by <a target="_blank" href="https://cohere.com/">Cohere<AI<a>.
3
- The model is available on the huggingface hub: <a href="https://huggingface.co/CohereForAI/c4ai-command-r-plus">https://huggingface.co/CohereForAI/c4ai-command-r-plus</a>.
4
  The model has 104B parameters, and supports up to 128K token contexts.
5
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://cohere.com/">Cohere AI</a>.
3
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/CohereForAI/c4ai-command-r-plus">https://huggingface.co/CohereForAI/c4ai-command-r-plus</a>.
4
  The model has 104B parameters, and supports up to 128K token contexts.
5
  </p>
static/models_data/gpt-3.5-turbo-0125/model_detail.html CHANGED
@@ -1,4 +1,4 @@
1
  <p>
2
- This proprietary model was created by <a href="https://openai.com/">OpenAI<a>.
3
- You can find the release blog post <a href="https://openai.com/index/chatgpt/">here</a>.
4
  </p>
 
1
  <p>
2
+ This proprietary model was created by <a target="_blank" href="https://openai.com/">OpenAI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://openai.com/index/chatgpt/">here</a>.
4
  </p>
static/models_data/gpt-4o-0513/model_detail.html CHANGED
@@ -1,4 +1,4 @@
1
  <p>
2
- This proprietary model was created by <a href="https://openai.com/">OpenAI<a>.
3
- You can find the release blog post <a href="https://openai.com/index/hello-gpt-4o/">here</a>.
4
  </p>
 
1
  <p>
2
+ This proprietary model was created by <a target="_blank" href="https://openai.com/">OpenAI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://openai.com/index/hello-gpt-4o/">here</a>.
4
  </p>
static/models_data/gpt-4o-mini-2024-07-18/model_detail.html CHANGED
@@ -1,4 +1,4 @@
1
  <p>
2
- This proprietary model was created by <a href="https://openai.com/">OpenAI<a>.
3
- You can find the release blog post <a href="https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/">here</a>.
4
  </p>
 
1
  <p>
2
+ This proprietary model was created by <a target="_blank" href="https://openai.com/">OpenAI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/">here</a>.
4
  </p>
static/models_data/llama_3.1_405b_instruct_4bit/model_detail.html CHANGED
@@ -1,8 +1,8 @@
1
  <p>
2
- This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
3
- You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
4
- The 16bit precision model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct</a>.
5
- Due to computational constrains we use the 4bit quantized version, which is also available on the huggingfacehub: <a href="unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit">unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit</a>.
6
- It is relevant to note that we compared with a 16bit version hosted by <a href="https://www.together.ai/">TogetherAI</a> on a subset of problems that fall in the 4k tokens limit defined by the TogetherAI API, and we did not see drastic changes in performance.
7
  The 405B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
8
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
4
+ The 16bit precision model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct</a>.
5
+ Due to computational constraints we use the 4bit quantized version, which is also available on the huggingface hub: <a target="_blank" href="https://huggingface.co/unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit">https://huggingface.co/unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit</a>.
6
+ Note that we also compared against a 16bit version hosted by <a target="_blank" href="https://www.together.ai/">TogetherAI</a> on the subset of problems that fit within the 4k-token limit of the TogetherAI API, and we did not see drastic changes in performance.
7
  The 405B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
8
  </p>
static/models_data/llama_3.1_70b_instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
3
- You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct</a>.
5
  The 70B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct</a>.
5
  The 70B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
6
  </p>
static/models_data/llama_3.1_8b_instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
3
- You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct</a>.
5
  The 70B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3-1/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct</a>.
5
  The 8B model was pretrained on 15 trillion tokens spanning 8 different languages, and supports up to 128K token contexts.
6
  </p>
static/models_data/llama_3.2_1b_instruct/cfa_metrics.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Context chunk,CFI,TLI,SRMR,RMSEA
2
+ chunk_0,0.3755,0.32637499999999997,0.549175,0.541125
3
+ chunk_1,0.162625,0.13435,0.780375,0.77835
4
+ chunk_2,0.387375,0.38245,0.551025,0.525875
5
+ chunk_3,0.167275,0.116375,0.774475,0.77235
6
+ chunk_4,0.4379,0.504325,0.54175,0.5132
7
+ chunk_chess_0,1.0,1.4561,0.09875,0.0
8
+ chunk_grammar_1,0.36235,0.5313,0.55545,0.519675
9
+ chunk_no_conv,0.227475,0.213625,0.7701,0.766
10
+ chunk_svs_no_conv,0.25,-0.47565,0.7701,0.75
static/models_data/llama_3.2_1b_instruct/matrix.svg ADDED
static/models_data/llama_3.2_1b_instruct/ranks.svg ADDED
static/models_data/llama_3.2_1b_instruct/structure.svg ADDED
static/models_data/llama_3.2_3b_instruct/cfa_metrics.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Context chunk,CFI,TLI,SRMR,RMSEA
2
+ chunk_0,0.6424,0.5847249999999999,0.32084999999999997,0.30565
3
+ chunk_1,0.5,0.549275,0.54585,0.5
4
+ chunk_2,0.709275,0.808125,0.32205,0.266925
5
+ chunk_3,0.401425,0.34225,0.551375,0.537925
6
+ chunk_4,0.8504499999999999,0.8442249999999999,0.09325,0.047325
7
+ chunk_chess_0,0.598325,0.546125,0.32780000000000004,0.31285
8
+ chunk_grammar_1,0.423075,-0.325425,0.326675,0.28650000000000003
9
+ chunk_no_conv,0.39035,0.35895,0.55825,0.546675
10
+ chunk_svs_no_conv,0.0,0.0,1.0,1.0
static/models_data/llama_3.2_3b_instruct/matrix.svg ADDED
static/models_data/llama_3.2_3b_instruct/ranks.svg ADDED
static/models_data/llama_3.2_3b_instruct/structure.svg ADDED
static/models_data/llama_3_70b_instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
3
- You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct</a>.
5
  The 70B model was pretrained on 15 trillion tokens spanning 30 different languages in sequences of 8,192 tokens.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct</a>.
5
  The 70B model was pretrained on 15 trillion tokens spanning 30 different languages in sequences of 8,192 tokens.
6
  </p>
static/models_data/llama_3_8b_instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://ai.meta.com/">Meta AI</a>.
3
- You can find the release blog post <a href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct</a>.
5
  The 8B model was pretrained on 15 trillion tokens spanning 30 different languages in sequences of 8,192 tokens.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://ai.meta.com/">Meta AI</a>.
3
+ You can find the release blog post <a target="_blank" href="https://ai.meta.com/blog/meta-llama-3/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct">https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct</a>.
5
  The 8B model was pretrained on 15 trillion tokens spanning 30 different languages in sequences of 8,192 tokens.
6
  </p>
static/models_data/ordinal.svg CHANGED
static/models_data/phi-3-medium-128k-instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
- This open-source model was created by <a href="https://www.microsoft.com/">Microsoft<a>.
3
- You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct">https://huggingface.co/microsoft/Phi-3-medium-128k-instruct</a>.
5
  The model has 14B parameters, and supports up to 128K token contexts.
6
  </p>
 
1
  <p>
2
+ This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
3
+ You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct">https://huggingface.co/microsoft/Phi-3-medium-128k-instruct</a>.
5
  The model has 14B parameters, and supports up to 128K token contexts.
6
  </p>
static/models_data/phi-3-mini-128k-instruct/model_detail.html CHANGED
@@ -1,6 +1,6 @@
1
  <p>
2
  This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft<a>.
3
- You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
5
  The model has 3.8B parameters, and supports up to 128K token contexts.
6
  </p>
 
1
  <p>
2
  This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
3
+ You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
5
  The model has 3.8B parameters, and supports up to 128K token contexts.
6
  </p>
static/models_data/phi-3.5-MoE-instruct/cfa_metrics.csv CHANGED
@@ -1,10 +1,10 @@
1
  Context chunk,CFI,TLI,SRMR,RMSEA
2
- chunk_0,0.60585,0.56655,0.3221,0.292425
3
- chunk_1,0.5,-5.229425,0.54015,0.5
4
- chunk_2,0.5,0.740525,0.53725,0.5
5
- chunk_3,0.382075,0.342425,0.5569,0.5377000000000001
6
- chunk_4,0.6503,0.7807,0.32289999999999996,0.27775
7
- chunk_chess_0,0.6083500000000001,0.5570999999999999,0.326175,0.322725
8
- chunk_grammar_1,0.6076,2.2525749999999998,0.314825,0.275225
9
- chunk_no_conv,0.70345,0.68145,0.30300000000000005,0.328425
10
- chunk_svs_no_conv,0.42022499999999996,0.4001,0.6006,0.5968
 
1
  Context chunk,CFI,TLI,SRMR,RMSEA
2
+ chunk_0,0.5877,0.55345,0.3251,0.28752500000000003
3
+ chunk_1,0.239025,0.235475,0.773975,0.75525
4
+ chunk_2,0.140675,0.11245,0.776725,0.7737499999999999
5
+ chunk_3,0.191575,0.1765,0.778475,0.7643
6
+ chunk_4,0.708925,1.3283,0.3235,0.267325
7
+ chunk_chess_0,0.7886500000000001,3.1239,0.103125,0.0695
8
+ chunk_grammar_1,0.405325,0.458975,0.558325,0.519225
9
+ chunk_no_conv,0.6516,0.614875,0.3338,0.32622500000000004
10
+ chunk_svs_no_conv,0.45072500000000004,0.43657500000000005,0.57665,0.57805
static/models_data/phi-3.5-MoE-instruct/matrix.svg CHANGED
static/models_data/phi-3.5-MoE-instruct/model_detail.html CHANGED
@@ -1,7 +1,7 @@
1
  <p>
2
  This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft<a>.
3
- You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
5
  The model has 16x3.8B parameters with 6.6B active parameters, and supports up to 128K token contexts.
6
  Even though this model supports system messages, we evaluate this model as user-message-only model
7
  (the persona is induced by sending the user message "You are &lt;persona&gt;" followed by a manually set "OK" as the assistant's response)
 
1
  <p>
2
  This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
3
+ You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3.5-MoE-instruct">https://huggingface.co/microsoft/Phi-3.5-MoE-instruct</a>.
5
  The model has 16x3.8B parameters with 6.6B active parameters, and supports up to 128K token contexts.
6
  Even though this model supports system messages, we evaluate this model as user-message-only model
7
  (the persona is induced by sending the user message "You are &lt;persona&gt;" followed by a manually set "OK" as the assistant's response)
static/models_data/phi-3.5-MoE-instruct/ranks.svg CHANGED
static/models_data/phi-3.5-MoE-instruct/structure.svg CHANGED
static/models_data/phi-3.5-mini-instruct/cfa_metrics.csv CHANGED
@@ -1,10 +1,10 @@
1
  Context chunk,CFI,TLI,SRMR,RMSEA
2
- chunk_0,0.45045,0.432475,0.55265,0.523725
3
- chunk_1,0.444025,0.950775,0.54795,0.522375
4
- chunk_2,0.5,-0.08234999999999998,0.5404249999999999,0.5
5
- chunk_3,0.66345,-0.36625,0.31935,0.267325
6
- chunk_4,0.25,0.312175,0.767975,0.75
7
- chunk_chess_0,0.107725,0.02015,0.783025,0.78245
8
- chunk_grammar_1,0.36365000000000003,-0.143875,0.54325,0.518725
9
- chunk_no_conv,0.6351,0.59805,0.324775,0.34245
10
- chunk_svs_no_conv,0.25,4.3912,0.769625,0.75
 
1
  Context chunk,CFI,TLI,SRMR,RMSEA
2
+ chunk_0,0.211175,0.1873,0.773425,0.77085
3
+ chunk_1,0.52475,0.47522499999999995,0.32894999999999996,0.280075
4
+ chunk_2,0.128625,0.08935,0.778575,0.765325
5
+ chunk_3,0.651475,-0.39820000000000005,0.32237499999999997,0.26990000000000003
6
+ chunk_4,0.25,0.679925,0.768375,0.75
7
+ chunk_chess_0,0.0,0.0,1.0,1.0
8
+ chunk_grammar_1,0.198325,0.181625,0.777725,0.761875
9
+ chunk_no_conv,0.25,0.6973,0.76295,0.75
10
+ chunk_svs_no_conv,0.478275,-1.2948250000000001,0.542625,0.5085500000000001
static/models_data/phi-3.5-mini-instruct/matrix.svg CHANGED
static/models_data/phi-3.5-mini-instruct/model_detail.html CHANGED
@@ -1,7 +1,7 @@
1
  <p>
2
  This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft<a>.
3
- You can find the release blog post <a href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
- The model is available on the huggingface hub: <a href="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct">https://huggingface.co/microsoft/Phi-3-mini-128k-instruct</a>.
5
  The model has 3.8B parameters, and supports up to 128K token contexts.
6
  Even though this model supports system messages, we evaluate this model as user-message-only model
7
  (the persona is induced by sending the user message "You are &lt;persona&gt;" followed by a manually set "OK" as the assistant's response)
 
1
  <p>
2
  This open-source model was created by <a target="_blank" href="https://www.microsoft.com/">Microsoft</a>.
3
+ You can find the release blog post <a target="_blank" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">here</a>.
4
+ The model is available on the huggingface hub: <a target="_blank" href="https://huggingface.co/microsoft/Phi-3.5-mini-instruct">https://huggingface.co/microsoft/Phi-3.5-mini-instruct</a>.
5
  The model has 3.8B parameters, and supports up to 128K token contexts.
6
  Even though this model supports system messages, we evaluate this model as user-message-only model
7
  (the persona is induced by sending the user message "You are &lt;persona&gt;" followed by a manually set "OK" as the assistant's response)
static/models_data/phi-3.5-mini-instruct/ranks.svg CHANGED
static/models_data/phi-3.5-mini-instruct/structure.svg CHANGED
templates/about.html CHANGED
@@ -254,7 +254,7 @@
254
  <ul>
255
  <li> <b> no_conv </b>: no conversation is simulated the questions from the PVQ-40 questionnaire are given directly </li>
256
  <li> <b> no_conv_svs </b>: no conversation is simulated the questions from the SVS questionnaire are given directly </li>
257
- <li> <b> chunk_0-chunk-4 </b>: <a href="https://gitlab.inria.fr/gkovac/value_stability/-/tree/master/contexts/leaderboard_reddit_chunks?ref_type=heads">50 reddit posts</a> used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest. </li>
258
  <li> <b> chess </b>: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
259
  <li> <b> grammar </b>: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message.
260
  </ul>
@@ -265,14 +265,14 @@
265
  <p>
266
  Validity refers to the extent the questionnaire measures what it purports to measure.
267
  It can be seen as the questionnaire's accuracy in measuring the intended factors, i.e. values.
268
- Following the recommendations in <a href="https://pubmed.ncbi.nlm.nih.gov/22329443/">this paper</a>,
269
  the validation consists of two phases: Theory-Based Multidimensional Scaling (MDS) and Confirmatory Factor Analysis (CFA).
270
  </p>
271
  <p>
272
  <b>Theory-Based Multidimensional Scaling (MDS)</b> tests that the expressed values are organized in a circular structure as predicted by the theory.
273
  Values should be ordered in a circle in the same order as shown on the figure below (Tradition and Conformity should be on the same angle with Tradition closer to the center).
274
  To compute the structure in our data, we calculate the intercorrelations between different items (questions).
275
- This provides us with 40 points in a 40D space (for PVQ-40), which is space is then reduced to 2D by <a href="https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html">MDS</a>.
276
  Crucially, MDS is initialized with the theoretical circular value structure, i.e. items corresponding to the same value are assigned the same angle.
277
  When MDS is fit, it provides the <b>Stress (&darr;) </b> metric ('Stress-1 index') indicating the goodness of the fit.
278
  A value of 0 indicates 'perfect' fit, 0.025 excellent, 0.05 good, 0.1 fair, and 0.2 poor.
@@ -297,7 +297,7 @@
297
  The model is defined according to the theory
298
  and the fit of this model is used as a metric.
299
  Due to the circular structure of basic personal values,
300
- it is <a href="https://pubmed.ncbi.nlm.nih.gov/22329443/">recommended</a> to employ a Magnifying glass CFA strategy.
301
  Four separate models are fit, one for each of the high level values (consisting of several low-level values):
302
  Conservation (security, conformity, tradition),
303
  Openness to Change (self-direction, stimulation, hedonism),
@@ -324,7 +324,7 @@ their expression of that value).
324
  Intuitively, this can be seen as addressing the following question:
325
  <b>"Does Jack always (in every context) value Tradition more than Jane does?"</b>.
326
  As shown below, instead of comparing two points in time, we compare
327
- <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/personas/real_world_people/personas.json?ref_type=heads">the simulated population</a>
328
  in different contexts (simulated conversations of different topics).
329
  We then average over different context pairs and values to obtain the final estimate.
330
  </p>
@@ -346,7 +346,7 @@ their expression of that value).
346
  i.e. the average of \( (n\_models-1) * ( \binom{n\_context\_chunks}{2} + n\_validity\_metrics*n\_context\_chunks) \).</li>
347
  </ul>
348
  <p>
349
- Following this <a href="https://arxiv.org/abs/2405.01719">paper</a> and associated <a href="https://github.com/socialfoundations/benchbench">benchbench</a> library,
350
  we can compute the diversity and the sensitivity of the two ranking methods.
351
  A benchmark is considered <b>diverse</b> if different tasks order models in different ways.
352
  We use the reversed Kendall’s coefficient of concordance (W) diversity metric.
@@ -359,7 +359,7 @@ their expression of that value).
359
  <div class="section" id="paper">
360
  <div class="section-title">Differences with the paper</div>
361
  <p>
362
- This leaderboard is grounded in the methodology presented in our <a href="https://arxiv.org/abs/2402.14846">research paper</a>.
363
  The paper contains various experiments which are not included in the leaderboard such as:
364
  multiple populations,
365
  within-person stability,
@@ -371,7 +371,7 @@ their expression of that value).
371
  <ol>
372
  <li>a new population was created and was balanced with respect to gender</li>
373
  <li>context chunks - instead of evaluating the stability of a population between pairs of contexts, where all personas are given the same topic (e.g. chess), we evaluate it between pairs of context chunks, where each participant is given a different random context</li>
374
- <li>more diverse and longer contexts (up to 6k tokens) were created with reddit posts from the <a href="https://webis.de/data/webis-tldr-17.html">webis dataset</a> (the dataset was cleaned to exclude posts from NSFW subreddits)</li>
375
  <li>different interlocutors - chess and grammar topic were still introduced as in the paper (same context for all participants), but the interlocutor model was instructed to simulate a random persona from the same population (as opposed to a human user in other settings)</li>
376
  <li>in the paper, multiple seeds for the order of suggested answers were used, given that the results didn't vary much between seeds, here, a single seed was used facilitating the analysis with more longer contexts</li>
377
  <li>evaluations were also done without simulating conversations (no_conv setting)</li>
@@ -416,9 +416,9 @@ their expression of that value).
416
  </div>
417
  <ul>
418
  <li>Contact: <a href="mailto: [email protected]">[email protected]</a></li>
419
- <li>See the <a href="https://sites.google.com/view/llmvaluestability">Project website</a></li>
420
- <li>See the Flowers team <a href="http://developmentalsystems.org">blog</a> and <a href="https://flowers.inria.fr/">website</a></li>
421
- <li>See Grgur's website and other projects: <a href="https://grgkovac.github.io">https://grgkovac.github.io</a></li>
422
  </ul>
423
  </div>
424
  </div>
 
254
  <ul>
255
  <li> <b> no_conv </b>: no conversation is simulated; the questions from the PVQ-40 questionnaire are given directly </li>
256
  <li> <b> no_conv_svs </b>: no conversation is simulated; the questions from the SVS questionnaire are given directly </li>
257
+ <li> <b> chunk_0-chunk_4 </b>: <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/tree/master/contexts/leaderboard_reddit_chunks?ref_type=heads">50 reddit posts</a> used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest. </li>
258
  <li> <b> chess </b>: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
259
  <li> <b> grammar </b>: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message. </li>
260
  </ul>
 
265
  <p>
266
  Validity refers to the extent the questionnaire measures what it purports to measure.
267
  It can be seen as the questionnaire's accuracy in measuring the intended factors, i.e. values.
268
+ Following the recommendations in <a target="_blank" href="https://pubmed.ncbi.nlm.nih.gov/22329443/">this paper</a>,
269
  the validation consists of two phases: Theory-Based Multidimensional Scaling (MDS) and Confirmatory Factor Analysis (CFA).
270
  </p>
271
  <p>
272
  <b>Theory-Based Multidimensional Scaling (MDS)</b> tests that the expressed values are organized in a circular structure as predicted by the theory.
273
  Values should be ordered in a circle in the same order as shown on the figure below (Tradition and Conformity should be on the same angle with Tradition closer to the center).
274
  To compute the structure in our data, we calculate the intercorrelations between different items (questions).
275
+ This provides us with 40 points in a 40D space (for PVQ-40), which is then reduced to 2D by <a target="_blank" href="https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html">MDS</a>.
276
  Crucially, MDS is initialized with the theoretical circular value structure, i.e. items corresponding to the same value are assigned the same angle.
277
  When MDS is fit, it provides the <b>Stress (&darr;) </b> metric ('Stress-1 index') indicating the goodness of the fit.
278
  A value of 0 indicates 'perfect' fit, 0.025 excellent, 0.05 good, 0.1 fair, and 0.2 poor.
 
297
  The model is defined according to the theory
298
  and the fit of this model is used as a metric.
299
  Due to the circular structure of basic personal values,
300
+ it is <a target="_blank" href="https://pubmed.ncbi.nlm.nih.gov/22329443/">recommended</a> to employ a Magnifying glass CFA strategy.
301
  Four separate models are fit, one for each of the high level values (consisting of several low-level values):
302
  Conservation (security, conformity, tradition),
303
  Openness to Change (self-direction, stimulation, hedonism),
 
324
  Intuitively, this can be seen as addressing the following question:
325
  <b>"Does Jack always (in every context) value Tradition more than Jane does?"</b>.
326
  As shown below, instead of comparing two points in time, we compare
327
+ <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/personas/real_world_people/personas.json?ref_type=heads">the simulated population</a>
328
  in different contexts (simulated conversations of different topics).
329
  We then average over different context pairs and values to obtain the final estimate.
330
  </p>
 
346
  i.e. the average of \( (n\_models-1) * ( \binom{n\_context\_chunks}{2} + n\_validity\_metrics*n\_context\_chunks) \).</li>
347
  </ul>
348
  <p>
349
+ Following this <a target="_blank" href="https://arxiv.org/abs/2405.01719">paper</a> and associated <a target="_blank" href="https://github.com/socialfoundations/benchbench">benchbench</a> library,
350
  we can compute the diversity and the sensitivity of the two ranking methods.
351
  A benchmark is considered <b>diverse</b> if different tasks order models in different ways.
352
  We use the reversed Kendall’s coefficient of concordance (W) diversity metric.
 
359
  <div class="section" id="paper">
360
  <div class="section-title">Differences with the paper</div>
361
  <p>
362
+ This leaderboard is grounded in the methodology presented in our <a target="_blank" href="https://arxiv.org/abs/2402.14846">research paper</a>.
363
  The paper contains various experiments which are not included in the leaderboard such as:
364
  multiple populations,
365
  within-person stability,
 
371
  <ol>
372
  <li>a new population was created and was balanced with respect to gender</li>
373
  <li>context chunks - instead of evaluating the stability of a population between pairs of contexts, where all personas are given the same topic (e.g. chess), we evaluate it between pairs of context chunks, where each participant is given a different random context</li>
374
+ <li>more diverse and longer contexts (up to 6k tokens) were created with reddit posts from the <a target="_blank" href="https://webis.de/data/webis-tldr-17.html">webis dataset</a> (the dataset was cleaned to exclude posts from NSFW subreddits)</li>
375
  <li>different interlocutors - chess and grammar topic were still introduced as in the paper (same context for all participants), but the interlocutor model was instructed to simulate a random persona from the same population (as opposed to a human user in other settings)</li>
376
  <li>in the paper, multiple seeds for the order of suggested answers were used; given that the results didn't vary much between seeds, a single seed was used here, facilitating the analysis with more and longer contexts</li>
377
  <li>evaluations were also done without simulating conversations (no_conv setting)</li>
 
416
  </div>
417
  <ul>
418
  <li>Contact: <a href="mailto: [email protected]">[email protected]</a></li>
419
+ <li>See the <a target="_blank" href="https://sites.google.com/view/llmvaluestability">Project website</a></li>
420
+ <li>See the Flowers team <a target="_blank" href="http://developmentalsystems.org">blog</a> and <a target="_blank" href="https://flowers.inria.fr/">website</a></li>
421
+ <li>See Grgur's website and other projects: <a target="_blank" href="https://grgkovac.github.io">https://grgkovac.github.io</a></li>
422
  </ul>
423
  </div>
424
  </div>
templates/index.html CHANGED
@@ -272,14 +272,14 @@
272
  </a>
273
  </div>
274
  <p>
275
- We leverage Schwartz's theory of <a href="https://www.sciencedirect.com/science/article/abs/pii/S0065260108602816">Basic Personal Values</a>,
276
  which defines 10 values Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, Universalism),
277
- and the associated PVQ-40 and SVS questionnaires (available <a href="https://www.researchgate.net/publication/354384463_A_Repository_of_Schwartz_Value_Scales_with_Instructions_and_an_Introduction">here</a>).
278
  </p>
279
  <p>
280
- Using the <a href="https://pubmed.ncbi.nlm.nih.gov/31402448/">methodology from psychology</a>, we focus on population-level (interpersonal) value stability, i.e. <b>Rank-Order stability (RO stability)</b>.
281
  Rank-Order stability refers to the extent to which the order of different personas (in terms of expression of some value) remains the same along different contexts.
282
- Refer <a href="{{ url_for('about', _anchor='rank_order_stability') }}">here</a> or to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for more details.
283
  </p>
284
  <p>
285
  In addition to Rank-Order stability we compute <b>validity metrics (Stress, CFI, SRMR, RMSEA)</b>, which are a common practice in psychology.
@@ -290,7 +290,7 @@
290
  </p>
291
  <p>
292
  We <b>aggregate</b> Rank-Order stability and validation metrics to rank the models. We do so in two ways: <b>Cardinal</b> and <b>Ordinal</b>.
293
- Following <a href="https://arxiv.org/abs/2405.01719">this paper</a>, we compute the stability and diversity of those rankings. See <a href="{{ url_for('about', _anchor='aggregate_metrics') }}">here</a> for more details.
294
  </p>
295
  <p>
296
  To sum up here are the metrics used:
 
272
  </a>
273
  </div>
274
  <p>
275
+ We leverage Schwartz's theory of <a target="_blank" href="https://www.sciencedirect.com/science/article/abs/pii/S0065260108602816">Basic Personal Values</a>,
276
  which defines 10 values (Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, Universalism),
277
+ and the associated PVQ-40 and SVS questionnaires (available <a target="_blank" href="https://www.researchgate.net/publication/354384463_A_Repository_of_Schwartz_Value_Scales_with_Instructions_and_an_Introduction">here</a>).
278
  </p>
279
  <p>
280
+ Using the <a target="_blank" href="https://pubmed.ncbi.nlm.nih.gov/31402448/">methodology from psychology</a>, we focus on population-level (interpersonal) value stability, i.e. <b>Rank-Order stability (RO stability)</b>.
281
  Rank-Order stability refers to the extent to which the order of different personas (in terms of expression of some value) remains the same along different contexts.
282
+ Refer <a href="{{ url_for('about', _anchor='rank_order_stability') }}">here</a> or to our <a target="_blank" href="https://arxiv.org/abs/2402.14846">paper</a> for more details.
283
  </p>
284
  <p>
285
  In addition to Rank-Order stability we compute <b>validity metrics (Stress, CFI, SRMR, RMSEA)</b>, which are a common practice in psychology.
 
290
  </p>
291
  <p>
292
  We <b>aggregate</b> Rank-Order stability and validation metrics to rank the models. We do so in two ways: <b>Cardinal</b> and <b>Ordinal</b>.
293
+ Following <a target="_blank" href="https://arxiv.org/abs/2405.01719">this paper</a>, we compute the stability and diversity of those rankings. See <a href="{{ url_for('about', _anchor='aggregate_metrics') }}">here</a> for more details.
294
  </p>
295
  <p>
296
  To sum up here are the metrics used:
templates/model_detail.html CHANGED
@@ -255,7 +255,7 @@
255
  Rank-Order stability is computed by ordering the personas based on their expression of some value,
256
  and then computing the correlation between their orders in two different context chunks.
257
  The stability estimates for the ten values are then averaged to get the final Rank-Order stability measure.
258
- Refer to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for details.
259
  </p>
260
  <div class="matrix-image-container">
261
  <a href="{{ url_for('static', filename='models_data/' + model_name + '/matrix.svg') }}" target="_blank">
 
255
  Rank-Order stability is computed by ordering the personas based on their expression of some value,
256
  and then computing the correlation between their orders in two different context chunks.
257
  The stability estimates for the ten values are then averaged to get the final Rank-Order stability measure.
258
+ Refer to our <a target="_blank" href="https://arxiv.org/abs/2402.14846">paper</a> for details.
259
  </p>
260
  <div class="matrix-image-container">
261
  <a href="{{ url_for('static', filename='models_data/' + model_name + '/matrix.svg') }}" target="_blank">
templates/new_model.html CHANGED
@@ -184,11 +184,11 @@
184
  <div class="section">
185
  <div id="evaluate_custom_model" class="section-title">Evaluate a custom model</div>
186
  <p>
187
- To evaluate a custom model you can use our <a href="https://gitlab.inria.fr/gkovac/value_stability">open-source code</a>.
188
  If a model is in the huggingface transformers format (saved either localy or on the hub),
189
  it can be simply added by adding a config file.
190
  The model can then be evaluated as any other model.
191
- To do so, follow the <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/README.md?ref_type=heads#adding-a-new-model">instructions</a> in the README.md file.
192
  </p>
193
  </div>
194
  <div class="section" id="paper">
@@ -205,9 +205,9 @@
205
  <code>`Leaderboard/results/stability_leaderboard/&lt;your_model_name&gt;/chunk_0_&lt;timestamp&gt;/results.json`</code>
206
  </li>
207
  <li>
208
- <b> Submit the config file </b> - Create a pull request to our <a href="https://gitlab.inria.fr/gkovac/value_stability">repository</a> from a branch <code>"unofficial_model/&lt;your_model_name&gt;"</code>.
209
  The pull request should ideally only add the config file in <code>`./models/leaderboard_configs`</code>.
210
- If additional changes are needed, they should ideally be constrained to a new model class (see <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/models/huggingfacemodel.py?ref_type=heads">huggingfacemodel.py</a> for reference).
211
  <li>
212
  <b> Submit the model results </b> - submit the *json files as a ZIP using the form below.
213
  We will integrate the model's results on our side, and rerank models with yours included.
 
184
  <div class="section">
185
  <div id="evaluate_custom_model" class="section-title">Evaluate a custom model</div>
186
  <p>
187
+ To evaluate a custom model you can use our <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability">open-source code</a>.
188
  If a model is in the huggingface transformers format (saved either locally or on the hub),
189
  it can be simply added by adding a config file.
190
  The model can then be evaluated as any other model.
191
+ To do so, follow the <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/README.md?ref_type=heads#adding-a-new-model">instructions</a> in the README.md file.
192
  </p>
193
  </div>
194
  <div class="section" id="paper">
 
205
  <code>`Leaderboard/results/stability_leaderboard/&lt;your_model_name&gt;/chunk_0_&lt;timestamp&gt;/results.json`</code>
206
  </li>
207
  <li>
208
+ <b> Submit the config file </b> - Create a pull request to our <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability">repository</a> from a branch <code>"unofficial_model/&lt;your_model_name&gt;"</code>.
209
  The pull request should ideally only add the config file in <code>`./models/leaderboard_configs`</code>.
210
+ If additional changes are needed, they should ideally be constrained to a new model class (see <a target="_blank" href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/models/huggingfacemodel.py?ref_type=heads">huggingfacemodel.py</a> for reference).
211
  <li>
212
  <b> Submit the model results </b> - submit the *json files as a ZIP using the form below.
213
  We will integrate the model's results on our side, and rerank models with yours included.