Spaces:

zhuqiming
/

DomainEval

Running

App Files Files Community

zhuqiming committed on Aug 26

Commit

77e4a24

•

1 Parent(s): d05cebd

更新展示

Browse files

Files changed (4) hide show

css_html.py +64 -0
results.json +966 -0
text_content.py +37 -0
utils.py +27 -0

css_html.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# CSS rules kept as one string under the name `custom_css`. The selectors
+# target element ids/classes such as #citation-button, #leaderboard-table,
+# #search-bar and #scale-logo, and cap the width of the first table column.
+# NOTE(review): presumably handed to the UI framework as its custom stylesheet
+# -- confirm against the app entry point, which is not part of this diff.
+custom_css = """
+#changelog-text {
+    font-size: 16px !important;
+}
+#changelog-text h2 {
+    font-size: 18px !important;
+}
+.markdown-text {
+    font-size: 16px !important;
+}
+#models-to-add-text {
+    font-size: 18px !important;
+}
+#citation-button span {
+    font-size: 16px !important;
+}
+#citation-button textarea {
+    font-size: 16px !important;
+}
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+#leaderboard-table {
+    margin-top: 15px
+}
+#leaderboard-table-lite {
+    margin-top: 15px
+}
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+#search-bar {
+    padding: 0px;
+}
+/* Hides the final AutoEvalColumn */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
+    display: none;
+}
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+.tab-buttons button {
+    font-size: 20px;
+}
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+#scale-logo .download {
+    display: none;
+}
+"""

results.json ADDED Viewed

	@@ -0,0 +1,966 @@

+{
+    "pass_1": [
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Computation",
+            "Pass_at_k": 0.9038123167155425
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8340175953079179
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8686217008797654
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8392961876832845
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8604105571847507
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8351906158357771
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.7607038123167156
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8029325513196481
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.7712609970674487
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8516129032258064
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.7554252199413489
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8093841642228738
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Network",
+            "Pass_at_k": 0.703125
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Network",
+            "Pass_at_k": 0.58984375
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Network",
+            "Pass_at_k": 0.66796875
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.64453125
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.62109375
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.58984375
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.6015625
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.62109375
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.60546875
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Network",
+            "Pass_at_k": 0.609375
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.6015625
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.53125
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.5967741935483871
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.489247311827957
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.4946236559139785
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.5053763440860215
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.5
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.45698924731182794
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.41935483870967744
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.42473118279569894
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.43548387096774194
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.478494623655914
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.45161290322580644
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.34946236559139787
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6915887850467289
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Basic",
+            "Pass_at_k": 0.5607476635514018
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6915887850467289
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.5981308411214953
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6542056074766355
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.5794392523364486
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.5514018691588785
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.5887850467289719
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.5233644859813084
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6074766355140186
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.616822429906542
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.4485981308411215
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "System",
+            "Pass_at_k": 0.51
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "System",
+            "Pass_at_k": 0.32
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "System",
+            "Pass_at_k": 0.41
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.46
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.41
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.36
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.35
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.34
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.36
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "System",
+            "Pass_at_k": 0.37
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.42
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.19
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.43
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.31
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.36
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.35
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.38
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.4
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.31
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.27
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.32
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.37
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.35
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.12
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Mean",
+            "Pass_at_k": 0.6392167158851098
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5173093867812127
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5821338153067455
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5662224371484669
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5709516524435644
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5369104775806756
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.49883717003087863
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5079237551407199
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5025963506694164
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5478265270659565
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5325705088456162
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.4081157767758989
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Std",
+            "Pass_at_k": 0.16679801914758088
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Std",
+            "Pass_at_k": 0.1950117243115276
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Std",
+            "Pass_at_k": 0.19393547652062595
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.1693855278154664
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.17923951210025596
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.17321135521991954
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.17089125215414938
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.19904938629943747
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.16815110445446094
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Std",
+            "Pass_at_k": 0.18313053955353828
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.15105015549350911
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.24973689844845592
+        }
+    ],
+    "pass_5": [
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Computation",
+            "Pass_at_k": 0.9126099706744868
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8733137829912023
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Computation",
+            "Pass_at_k": 0.9014662756598241
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8979472140762463
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8891495601173021
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8979472140762463
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8510263929618769
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.898533724340176
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8680351906158358
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Computation",
+            "Pass_at_k": 0.9102639296187683
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8510263929618769
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Computation",
+            "Pass_at_k": 0.8768328445747801
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Network",
+            "Pass_at_k": 0.7265625
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Network",
+            "Pass_at_k": 0.62890625
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Network",
+            "Pass_at_k": 0.70703125
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.70703125
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.65625
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.63671875
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.6328125
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.65625
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.63671875
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Network",
+            "Pass_at_k": 0.640625
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Network",
+            "Pass_at_k": 0.67578125
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Network",
+            "Pass_at_k": 0.55859375
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.6182795698924731
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.521505376344086
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.5483870967741935
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.553763440860215
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.5376344086021505
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.553763440860215
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.4838709677419355
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.5161290322580645
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.5161290322580645
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.553763440860215
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.543010752688172
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Visualization",
+            "Pass_at_k": 0.3978494623655914
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Basic",
+            "Pass_at_k": 0.7102803738317757
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6074766355140186
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Basic",
+            "Pass_at_k": 0.7383177570093458
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6822429906542056
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6822429906542056
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6728971962616822
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6261682242990654
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6635514018691588
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6448598130841121
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6822429906542056
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Basic",
+            "Pass_at_k": 0.6728971962616822
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Basic",
+            "Pass_at_k": 0.48598130841121495
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "System",
+            "Pass_at_k": 0.57
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "System",
+            "Pass_at_k": 0.36
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "System",
+            "Pass_at_k": 0.5
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.57
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.49
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.49
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.41
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.38
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.43
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "System",
+            "Pass_at_k": 0.45
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "System",
+            "Pass_at_k": 0.47
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "System",
+            "Pass_at_k": 0.26
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.49
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.34
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.46
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.42
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.44
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.44
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.42
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.35
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.4
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.42
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.44
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Cryptography",
+            "Pass_at_k": 0.21
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Mean",
+            "Pass_at_k": 0.6712887357331225
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5552003408082177
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Mean",
+            "Pass_at_k": 0.6425337299072272
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.6384974825984445
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.615879493228943
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.6152211001996906
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5706463475004796
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5774106930778998
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.5826237976596688
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Mean",
+            "Pass_at_k": 0.6094825601888648
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Mean",
+            "Pass_at_k": 0.6087859319852885
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Mean",
+            "Pass_at_k": 0.46487622755859775
+        },
+        {
+            "Model": "gpt-4o-mini",
+            "Domain": "Std",
+            "Pass_at_k": 0.14747641211035856
+        },
+        {
+            "Model": "gpt-3.5-turbo",
+            "Domain": "Std",
+            "Pass_at_k": 0.19743922837233668
+        },
+        {
+            "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
+            "Domain": "Std",
+            "Pass_at_k": 0.169043537848292
+        },
+        {
+            "Model": "deepseek-coder-33b-instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.1634243695210041
+        },
+        {
+            "Model": "DeepSeek-Coder-V2-Lite-Instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.16346984877152868
+        },
+        {
+            "Model": "deepseek-coder-6.7b-instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.16363528852513812
+        },
+        {
+            "Model": "CodeLlama-34b-Instruct-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.16828060893964333
+        },
+        {
+            "Model": "CodeLlama-13b-Instruct-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.2055227025195004
+        },
+        {
+            "Model": "CodeLlama-7b-Instruct-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.17281566921046676
+        },
+        {
+            "Model": "CodeQwen1.5-7B-Chat",
+            "Domain": "Std",
+            "Pass_at_k": 0.17954181233010988
+        },
+        {
+            "Model": "Phi-3-medium-4k-instruct",
+            "Domain": "Std",
+            "Pass_at_k": 0.15450285935340832
+        },
+        {
+            "Model": "Llama-2-13b-chat-hf",
+            "Domain": "Std",
+            "Pass_at_k": 0.2409835679041833
+        }
+    ]
+}

text_content.py ADDED Viewed

	@@ -0,0 +1,37 @@

+HEAD_TEXT = """
+Based on the DomainEval benchmark, we evaluate the code generation ability of different LLMs across multiple domains.
+More details about how to evaluate the LLM are available in the [DomainEval GitHub repository](https://github.com/domaineval/DomainEval).
+For a complete description of DomainEval benchmark and related experimental analysis, please refer to the paper: [DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation](https://arxiv.org/abs/2408.13204). [![](https://img.shields.io/badge/arXiv-2408.13204-b31b1b.svg)](https://arxiv.org/abs/2408.13204)
+**_Latest News_** 🔥
+- [24/08/26] We release our DomainEval benchmark, leaderboard and paper.
+"""
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@misc{zhu2024domainevalautoconstructedbenchmarkmultidomain,
+      title={DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation},
+      author={Qiming Zhu and Jialun Cao and Yaojie Lu and Hongyu Lin and Xianpei Han and Le Sun and Shing-Chi Cheung},
+      year={2024},
+      eprint={2408.13204},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2408.13204},
+}
+"""
+ACKNOWLEDGEMENT_TEXT = """
+Inspired by the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+"""
+NOTES_TEXT = """
+**Notes:**
+- Evaluate using pass@k as the evaluation metric.
+- `Mean` denotes the macro average results of pass@k across 6 different domains.
+- `Std` denotes the standard deviation of pass@k across 6 different domains.
+- You can choose a different pass@k in `⏬ Pass@k`.
+- Use `⏬ Domains` to choose the domains you want to show in the leaderboard.
+"""

utils.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from dataclasses import dataclass
+# Describes one leaderboard column: its header text, its value type, and two
+# visibility flags. Instances are created positionally in AutoEvalColumn,
+# e.g. ColumnContent("Model", "markdown", True).
+@dataclass
+class ColumnContent:
+    name: str  # column header text, e.g. "Model", "Mean", "Computation"
+    type: str  # value kind; this file only uses "markdown" and "number"
+    displayed_by_default: bool  # NOTE(review): presumably "shown initially" -- confirm in the UI code
+    hidden: bool = False  # defaults to False; its effect is decided by code outside this file
+def fields(raw_class):
+    return [
+        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
+    ]
+# Declares the leaderboard columns as class attributes, one ColumnContent each.
+# None of the attributes has a type annotation, so @dataclass does not register
+# them as dataclass fields; they remain plain class attributes stored in the
+# class __dict__, which is the mapping the fields() helper in this file reads.
+@dataclass(frozen=True)
+class AutoEvalColumn:  # Auto evals column
+    model = ColumnContent("Model", "markdown", True)  # the only "markdown" column
+    # Aggregate columns; the names match the "Mean" / "Std" Domain values in results.json.
+    average = ColumnContent("Mean", "number", True)
+    std = ColumnContent("Std", "number", True)
+    # Per-domain columns; the names match the six Domain values in results.json.
+    l_0 = ColumnContent("Computation", "number", True)
+    l_1 = ColumnContent("Network", "number", True)
+    l_2 = ColumnContent("Visualization", "number", True)
+    l_3 = ColumnContent("Basic", "number", True)
+    l_4 = ColumnContent("System", "number", True)
+    l_5 = ColumnContent("Cryptography", "number", True)