zhuqiming committed on
Commit
77e4a24
1 Parent(s): d05cebd

更新展示

Browse files
Files changed (4) hide show
  1. css_html.py +64 -0
  2. results.json +966 -0
  3. text_content.py +37 -0
  4. utils.py +27 -0
css_html.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ custom_css = """
2
+ #changelog-text {
3
+ font-size: 16px !important;
4
+ }
5
+ #changelog-text h2 {
6
+ font-size: 18px !important;
7
+ }
8
+ .markdown-text {
9
+ font-size: 16px !important;
10
+ }
11
+ #models-to-add-text {
12
+ font-size: 18px !important;
13
+ }
14
+ #citation-button span {
15
+ font-size: 16px !important;
16
+ }
17
+ #citation-button textarea {
18
+ font-size: 16px !important;
19
+ }
20
+ #citation-button > label > button {
21
+ margin: 6px;
22
+ transform: scale(1.3);
23
+ }
24
+ #leaderboard-table {
25
+ margin-top: 15px
26
+ }
27
+ #leaderboard-table-lite {
28
+ margin-top: 15px
29
+ }
30
+ #search-bar-table-box > div:first-child {
31
+ background: none;
32
+ border: none;
33
+ }
34
+
35
+ #search-bar {
36
+ padding: 0px;
37
+ }
38
+ /* Hides the final AutoEvalColumn */
39
+ #llm-benchmark-tab-table table td:last-child,
40
+ #llm-benchmark-tab-table table th:last-child {
41
+ display: none;
42
+ }
43
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
44
+ table td:first-child,
45
+ table th:first-child {
46
+ max-width: 400px;
47
+ overflow: auto;
48
+ white-space: nowrap;
49
+ }
50
+ .tab-buttons button {
51
+ font-size: 20px;
52
+ }
53
+ #scale-logo {
54
+ border-style: none !important;
55
+ box-shadow: none;
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ max-width: 600px;
60
+ }
61
+ #scale-logo .download {
62
+ display: none;
63
+ }
64
+ """
results.json ADDED
@@ -0,0 +1,966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pass_1": [
3
+ {
4
+ "Model": "gpt-4o-mini",
5
+ "Domain": "Computation",
6
+ "Pass_at_k": 0.9038123167155425
7
+ },
8
+ {
9
+ "Model": "gpt-3.5-turbo",
10
+ "Domain": "Computation",
11
+ "Pass_at_k": 0.8340175953079179
12
+ },
13
+ {
14
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
15
+ "Domain": "Computation",
16
+ "Pass_at_k": 0.8686217008797654
17
+ },
18
+ {
19
+ "Model": "deepseek-coder-33b-instruct",
20
+ "Domain": "Computation",
21
+ "Pass_at_k": 0.8392961876832845
22
+ },
23
+ {
24
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
25
+ "Domain": "Computation",
26
+ "Pass_at_k": 0.8604105571847507
27
+ },
28
+ {
29
+ "Model": "deepseek-coder-6.7b-instruct",
30
+ "Domain": "Computation",
31
+ "Pass_at_k": 0.8351906158357771
32
+ },
33
+ {
34
+ "Model": "CodeLlama-34b-Instruct-hf",
35
+ "Domain": "Computation",
36
+ "Pass_at_k": 0.7607038123167156
37
+ },
38
+ {
39
+ "Model": "CodeLlama-13b-Instruct-hf",
40
+ "Domain": "Computation",
41
+ "Pass_at_k": 0.8029325513196481
42
+ },
43
+ {
44
+ "Model": "CodeLlama-7b-Instruct-hf",
45
+ "Domain": "Computation",
46
+ "Pass_at_k": 0.7712609970674487
47
+ },
48
+ {
49
+ "Model": "CodeQwen1.5-7B-Chat",
50
+ "Domain": "Computation",
51
+ "Pass_at_k": 0.8516129032258064
52
+ },
53
+ {
54
+ "Model": "Phi-3-medium-4k-instruct",
55
+ "Domain": "Computation",
56
+ "Pass_at_k": 0.7554252199413489
57
+ },
58
+ {
59
+ "Model": "Llama-2-13b-chat-hf",
60
+ "Domain": "Computation",
61
+ "Pass_at_k": 0.8093841642228738
62
+ },
63
+ {
64
+ "Model": "gpt-4o-mini",
65
+ "Domain": "Network",
66
+ "Pass_at_k": 0.703125
67
+ },
68
+ {
69
+ "Model": "gpt-3.5-turbo",
70
+ "Domain": "Network",
71
+ "Pass_at_k": 0.58984375
72
+ },
73
+ {
74
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
75
+ "Domain": "Network",
76
+ "Pass_at_k": 0.66796875
77
+ },
78
+ {
79
+ "Model": "deepseek-coder-33b-instruct",
80
+ "Domain": "Network",
81
+ "Pass_at_k": 0.64453125
82
+ },
83
+ {
84
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
85
+ "Domain": "Network",
86
+ "Pass_at_k": 0.62109375
87
+ },
88
+ {
89
+ "Model": "deepseek-coder-6.7b-instruct",
90
+ "Domain": "Network",
91
+ "Pass_at_k": 0.58984375
92
+ },
93
+ {
94
+ "Model": "CodeLlama-34b-Instruct-hf",
95
+ "Domain": "Network",
96
+ "Pass_at_k": 0.6015625
97
+ },
98
+ {
99
+ "Model": "CodeLlama-13b-Instruct-hf",
100
+ "Domain": "Network",
101
+ "Pass_at_k": 0.62109375
102
+ },
103
+ {
104
+ "Model": "CodeLlama-7b-Instruct-hf",
105
+ "Domain": "Network",
106
+ "Pass_at_k": 0.60546875
107
+ },
108
+ {
109
+ "Model": "CodeQwen1.5-7B-Chat",
110
+ "Domain": "Network",
111
+ "Pass_at_k": 0.609375
112
+ },
113
+ {
114
+ "Model": "Phi-3-medium-4k-instruct",
115
+ "Domain": "Network",
116
+ "Pass_at_k": 0.6015625
117
+ },
118
+ {
119
+ "Model": "Llama-2-13b-chat-hf",
120
+ "Domain": "Network",
121
+ "Pass_at_k": 0.53125
122
+ },
123
+ {
124
+ "Model": "gpt-4o-mini",
125
+ "Domain": "Visualization",
126
+ "Pass_at_k": 0.5967741935483871
127
+ },
128
+ {
129
+ "Model": "gpt-3.5-turbo",
130
+ "Domain": "Visualization",
131
+ "Pass_at_k": 0.489247311827957
132
+ },
133
+ {
134
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
135
+ "Domain": "Visualization",
136
+ "Pass_at_k": 0.4946236559139785
137
+ },
138
+ {
139
+ "Model": "deepseek-coder-33b-instruct",
140
+ "Domain": "Visualization",
141
+ "Pass_at_k": 0.5053763440860215
142
+ },
143
+ {
144
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
145
+ "Domain": "Visualization",
146
+ "Pass_at_k": 0.5
147
+ },
148
+ {
149
+ "Model": "deepseek-coder-6.7b-instruct",
150
+ "Domain": "Visualization",
151
+ "Pass_at_k": 0.45698924731182794
152
+ },
153
+ {
154
+ "Model": "CodeLlama-34b-Instruct-hf",
155
+ "Domain": "Visualization",
156
+ "Pass_at_k": 0.41935483870967744
157
+ },
158
+ {
159
+ "Model": "CodeLlama-13b-Instruct-hf",
160
+ "Domain": "Visualization",
161
+ "Pass_at_k": 0.42473118279569894
162
+ },
163
+ {
164
+ "Model": "CodeLlama-7b-Instruct-hf",
165
+ "Domain": "Visualization",
166
+ "Pass_at_k": 0.43548387096774194
167
+ },
168
+ {
169
+ "Model": "CodeQwen1.5-7B-Chat",
170
+ "Domain": "Visualization",
171
+ "Pass_at_k": 0.478494623655914
172
+ },
173
+ {
174
+ "Model": "Phi-3-medium-4k-instruct",
175
+ "Domain": "Visualization",
176
+ "Pass_at_k": 0.45161290322580644
177
+ },
178
+ {
179
+ "Model": "Llama-2-13b-chat-hf",
180
+ "Domain": "Visualization",
181
+ "Pass_at_k": 0.34946236559139787
182
+ },
183
+ {
184
+ "Model": "gpt-4o-mini",
185
+ "Domain": "Basic",
186
+ "Pass_at_k": 0.6915887850467289
187
+ },
188
+ {
189
+ "Model": "gpt-3.5-turbo",
190
+ "Domain": "Basic",
191
+ "Pass_at_k": 0.5607476635514018
192
+ },
193
+ {
194
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
195
+ "Domain": "Basic",
196
+ "Pass_at_k": 0.6915887850467289
197
+ },
198
+ {
199
+ "Model": "deepseek-coder-33b-instruct",
200
+ "Domain": "Basic",
201
+ "Pass_at_k": 0.5981308411214953
202
+ },
203
+ {
204
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
205
+ "Domain": "Basic",
206
+ "Pass_at_k": 0.6542056074766355
207
+ },
208
+ {
209
+ "Model": "deepseek-coder-6.7b-instruct",
210
+ "Domain": "Basic",
211
+ "Pass_at_k": 0.5794392523364486
212
+ },
213
+ {
214
+ "Model": "CodeLlama-34b-Instruct-hf",
215
+ "Domain": "Basic",
216
+ "Pass_at_k": 0.5514018691588785
217
+ },
218
+ {
219
+ "Model": "CodeLlama-13b-Instruct-hf",
220
+ "Domain": "Basic",
221
+ "Pass_at_k": 0.5887850467289719
222
+ },
223
+ {
224
+ "Model": "CodeLlama-7b-Instruct-hf",
225
+ "Domain": "Basic",
226
+ "Pass_at_k": 0.5233644859813084
227
+ },
228
+ {
229
+ "Model": "CodeQwen1.5-7B-Chat",
230
+ "Domain": "Basic",
231
+ "Pass_at_k": 0.6074766355140186
232
+ },
233
+ {
234
+ "Model": "Phi-3-medium-4k-instruct",
235
+ "Domain": "Basic",
236
+ "Pass_at_k": 0.616822429906542
237
+ },
238
+ {
239
+ "Model": "Llama-2-13b-chat-hf",
240
+ "Domain": "Basic",
241
+ "Pass_at_k": 0.4485981308411215
242
+ },
243
+ {
244
+ "Model": "gpt-4o-mini",
245
+ "Domain": "System",
246
+ "Pass_at_k": 0.51
247
+ },
248
+ {
249
+ "Model": "gpt-3.5-turbo",
250
+ "Domain": "System",
251
+ "Pass_at_k": 0.32
252
+ },
253
+ {
254
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
255
+ "Domain": "System",
256
+ "Pass_at_k": 0.41
257
+ },
258
+ {
259
+ "Model": "deepseek-coder-33b-instruct",
260
+ "Domain": "System",
261
+ "Pass_at_k": 0.46
262
+ },
263
+ {
264
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
265
+ "Domain": "System",
266
+ "Pass_at_k": 0.41
267
+ },
268
+ {
269
+ "Model": "deepseek-coder-6.7b-instruct",
270
+ "Domain": "System",
271
+ "Pass_at_k": 0.36
272
+ },
273
+ {
274
+ "Model": "CodeLlama-34b-Instruct-hf",
275
+ "Domain": "System",
276
+ "Pass_at_k": 0.35
277
+ },
278
+ {
279
+ "Model": "CodeLlama-13b-Instruct-hf",
280
+ "Domain": "System",
281
+ "Pass_at_k": 0.34
282
+ },
283
+ {
284
+ "Model": "CodeLlama-7b-Instruct-hf",
285
+ "Domain": "System",
286
+ "Pass_at_k": 0.36
287
+ },
288
+ {
289
+ "Model": "CodeQwen1.5-7B-Chat",
290
+ "Domain": "System",
291
+ "Pass_at_k": 0.37
292
+ },
293
+ {
294
+ "Model": "Phi-3-medium-4k-instruct",
295
+ "Domain": "System",
296
+ "Pass_at_k": 0.42
297
+ },
298
+ {
299
+ "Model": "Llama-2-13b-chat-hf",
300
+ "Domain": "System",
301
+ "Pass_at_k": 0.19
302
+ },
303
+ {
304
+ "Model": "gpt-4o-mini",
305
+ "Domain": "Cryptography",
306
+ "Pass_at_k": 0.43
307
+ },
308
+ {
309
+ "Model": "gpt-3.5-turbo",
310
+ "Domain": "Cryptography",
311
+ "Pass_at_k": 0.31
312
+ },
313
+ {
314
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
315
+ "Domain": "Cryptography",
316
+ "Pass_at_k": 0.36
317
+ },
318
+ {
319
+ "Model": "deepseek-coder-33b-instruct",
320
+ "Domain": "Cryptography",
321
+ "Pass_at_k": 0.35
322
+ },
323
+ {
324
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
325
+ "Domain": "Cryptography",
326
+ "Pass_at_k": 0.38
327
+ },
328
+ {
329
+ "Model": "deepseek-coder-6.7b-instruct",
330
+ "Domain": "Cryptography",
331
+ "Pass_at_k": 0.4
332
+ },
333
+ {
334
+ "Model": "CodeLlama-34b-Instruct-hf",
335
+ "Domain": "Cryptography",
336
+ "Pass_at_k": 0.31
337
+ },
338
+ {
339
+ "Model": "CodeLlama-13b-Instruct-hf",
340
+ "Domain": "Cryptography",
341
+ "Pass_at_k": 0.27
342
+ },
343
+ {
344
+ "Model": "CodeLlama-7b-Instruct-hf",
345
+ "Domain": "Cryptography",
346
+ "Pass_at_k": 0.32
347
+ },
348
+ {
349
+ "Model": "CodeQwen1.5-7B-Chat",
350
+ "Domain": "Cryptography",
351
+ "Pass_at_k": 0.37
352
+ },
353
+ {
354
+ "Model": "Phi-3-medium-4k-instruct",
355
+ "Domain": "Cryptography",
356
+ "Pass_at_k": 0.35
357
+ },
358
+ {
359
+ "Model": "Llama-2-13b-chat-hf",
360
+ "Domain": "Cryptography",
361
+ "Pass_at_k": 0.12
362
+ },
363
+ {
364
+ "Model": "gpt-4o-mini",
365
+ "Domain": "Mean",
366
+ "Pass_at_k": 0.6392167158851098
367
+ },
368
+ {
369
+ "Model": "gpt-3.5-turbo",
370
+ "Domain": "Mean",
371
+ "Pass_at_k": 0.5173093867812127
372
+ },
373
+ {
374
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
375
+ "Domain": "Mean",
376
+ "Pass_at_k": 0.5821338153067455
377
+ },
378
+ {
379
+ "Model": "deepseek-coder-33b-instruct",
380
+ "Domain": "Mean",
381
+ "Pass_at_k": 0.5662224371484669
382
+ },
383
+ {
384
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
385
+ "Domain": "Mean",
386
+ "Pass_at_k": 0.5709516524435644
387
+ },
388
+ {
389
+ "Model": "deepseek-coder-6.7b-instruct",
390
+ "Domain": "Mean",
391
+ "Pass_at_k": 0.5369104775806756
392
+ },
393
+ {
394
+ "Model": "CodeLlama-34b-Instruct-hf",
395
+ "Domain": "Mean",
396
+ "Pass_at_k": 0.49883717003087863
397
+ },
398
+ {
399
+ "Model": "CodeLlama-13b-Instruct-hf",
400
+ "Domain": "Mean",
401
+ "Pass_at_k": 0.5079237551407199
402
+ },
403
+ {
404
+ "Model": "CodeLlama-7b-Instruct-hf",
405
+ "Domain": "Mean",
406
+ "Pass_at_k": 0.5025963506694164
407
+ },
408
+ {
409
+ "Model": "CodeQwen1.5-7B-Chat",
410
+ "Domain": "Mean",
411
+ "Pass_at_k": 0.5478265270659565
412
+ },
413
+ {
414
+ "Model": "Phi-3-medium-4k-instruct",
415
+ "Domain": "Mean",
416
+ "Pass_at_k": 0.5325705088456162
417
+ },
418
+ {
419
+ "Model": "Llama-2-13b-chat-hf",
420
+ "Domain": "Mean",
421
+ "Pass_at_k": 0.4081157767758989
422
+ },
423
+ {
424
+ "Model": "gpt-4o-mini",
425
+ "Domain": "Std",
426
+ "Pass_at_k": 0.16679801914758088
427
+ },
428
+ {
429
+ "Model": "gpt-3.5-turbo",
430
+ "Domain": "Std",
431
+ "Pass_at_k": 0.1950117243115276
432
+ },
433
+ {
434
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
435
+ "Domain": "Std",
436
+ "Pass_at_k": 0.19393547652062595
437
+ },
438
+ {
439
+ "Model": "deepseek-coder-33b-instruct",
440
+ "Domain": "Std",
441
+ "Pass_at_k": 0.1693855278154664
442
+ },
443
+ {
444
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
445
+ "Domain": "Std",
446
+ "Pass_at_k": 0.17923951210025596
447
+ },
448
+ {
449
+ "Model": "deepseek-coder-6.7b-instruct",
450
+ "Domain": "Std",
451
+ "Pass_at_k": 0.17321135521991954
452
+ },
453
+ {
454
+ "Model": "CodeLlama-34b-Instruct-hf",
455
+ "Domain": "Std",
456
+ "Pass_at_k": 0.17089125215414938
457
+ },
458
+ {
459
+ "Model": "CodeLlama-13b-Instruct-hf",
460
+ "Domain": "Std",
461
+ "Pass_at_k": 0.19904938629943747
462
+ },
463
+ {
464
+ "Model": "CodeLlama-7b-Instruct-hf",
465
+ "Domain": "Std",
466
+ "Pass_at_k": 0.16815110445446094
467
+ },
468
+ {
469
+ "Model": "CodeQwen1.5-7B-Chat",
470
+ "Domain": "Std",
471
+ "Pass_at_k": 0.18313053955353828
472
+ },
473
+ {
474
+ "Model": "Phi-3-medium-4k-instruct",
475
+ "Domain": "Std",
476
+ "Pass_at_k": 0.15105015549350911
477
+ },
478
+ {
479
+ "Model": "Llama-2-13b-chat-hf",
480
+ "Domain": "Std",
481
+ "Pass_at_k": 0.24973689844845592
482
+ }
483
+ ],
484
+ "pass_5": [
485
+ {
486
+ "Model": "gpt-4o-mini",
487
+ "Domain": "Computation",
488
+ "Pass_at_k": 0.9126099706744868
489
+ },
490
+ {
491
+ "Model": "gpt-3.5-turbo",
492
+ "Domain": "Computation",
493
+ "Pass_at_k": 0.8733137829912023
494
+ },
495
+ {
496
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
497
+ "Domain": "Computation",
498
+ "Pass_at_k": 0.9014662756598241
499
+ },
500
+ {
501
+ "Model": "deepseek-coder-33b-instruct",
502
+ "Domain": "Computation",
503
+ "Pass_at_k": 0.8979472140762463
504
+ },
505
+ {
506
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
507
+ "Domain": "Computation",
508
+ "Pass_at_k": 0.8891495601173021
509
+ },
510
+ {
511
+ "Model": "deepseek-coder-6.7b-instruct",
512
+ "Domain": "Computation",
513
+ "Pass_at_k": 0.8979472140762463
514
+ },
515
+ {
516
+ "Model": "CodeLlama-34b-Instruct-hf",
517
+ "Domain": "Computation",
518
+ "Pass_at_k": 0.8510263929618769
519
+ },
520
+ {
521
+ "Model": "CodeLlama-13b-Instruct-hf",
522
+ "Domain": "Computation",
523
+ "Pass_at_k": 0.898533724340176
524
+ },
525
+ {
526
+ "Model": "CodeLlama-7b-Instruct-hf",
527
+ "Domain": "Computation",
528
+ "Pass_at_k": 0.8680351906158358
529
+ },
530
+ {
531
+ "Model": "CodeQwen1.5-7B-Chat",
532
+ "Domain": "Computation",
533
+ "Pass_at_k": 0.9102639296187683
534
+ },
535
+ {
536
+ "Model": "Phi-3-medium-4k-instruct",
537
+ "Domain": "Computation",
538
+ "Pass_at_k": 0.8510263929618769
539
+ },
540
+ {
541
+ "Model": "Llama-2-13b-chat-hf",
542
+ "Domain": "Computation",
543
+ "Pass_at_k": 0.8768328445747801
544
+ },
545
+ {
546
+ "Model": "gpt-4o-mini",
547
+ "Domain": "Network",
548
+ "Pass_at_k": 0.7265625
549
+ },
550
+ {
551
+ "Model": "gpt-3.5-turbo",
552
+ "Domain": "Network",
553
+ "Pass_at_k": 0.62890625
554
+ },
555
+ {
556
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
557
+ "Domain": "Network",
558
+ "Pass_at_k": 0.70703125
559
+ },
560
+ {
561
+ "Model": "deepseek-coder-33b-instruct",
562
+ "Domain": "Network",
563
+ "Pass_at_k": 0.70703125
564
+ },
565
+ {
566
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
567
+ "Domain": "Network",
568
+ "Pass_at_k": 0.65625
569
+ },
570
+ {
571
+ "Model": "deepseek-coder-6.7b-instruct",
572
+ "Domain": "Network",
573
+ "Pass_at_k": 0.63671875
574
+ },
575
+ {
576
+ "Model": "CodeLlama-34b-Instruct-hf",
577
+ "Domain": "Network",
578
+ "Pass_at_k": 0.6328125
579
+ },
580
+ {
581
+ "Model": "CodeLlama-13b-Instruct-hf",
582
+ "Domain": "Network",
583
+ "Pass_at_k": 0.65625
584
+ },
585
+ {
586
+ "Model": "CodeLlama-7b-Instruct-hf",
587
+ "Domain": "Network",
588
+ "Pass_at_k": 0.63671875
589
+ },
590
+ {
591
+ "Model": "CodeQwen1.5-7B-Chat",
592
+ "Domain": "Network",
593
+ "Pass_at_k": 0.640625
594
+ },
595
+ {
596
+ "Model": "Phi-3-medium-4k-instruct",
597
+ "Domain": "Network",
598
+ "Pass_at_k": 0.67578125
599
+ },
600
+ {
601
+ "Model": "Llama-2-13b-chat-hf",
602
+ "Domain": "Network",
603
+ "Pass_at_k": 0.55859375
604
+ },
605
+ {
606
+ "Model": "gpt-4o-mini",
607
+ "Domain": "Visualization",
608
+ "Pass_at_k": 0.6182795698924731
609
+ },
610
+ {
611
+ "Model": "gpt-3.5-turbo",
612
+ "Domain": "Visualization",
613
+ "Pass_at_k": 0.521505376344086
614
+ },
615
+ {
616
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
617
+ "Domain": "Visualization",
618
+ "Pass_at_k": 0.5483870967741935
619
+ },
620
+ {
621
+ "Model": "deepseek-coder-33b-instruct",
622
+ "Domain": "Visualization",
623
+ "Pass_at_k": 0.553763440860215
624
+ },
625
+ {
626
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
627
+ "Domain": "Visualization",
628
+ "Pass_at_k": 0.5376344086021505
629
+ },
630
+ {
631
+ "Model": "deepseek-coder-6.7b-instruct",
632
+ "Domain": "Visualization",
633
+ "Pass_at_k": 0.553763440860215
634
+ },
635
+ {
636
+ "Model": "CodeLlama-34b-Instruct-hf",
637
+ "Domain": "Visualization",
638
+ "Pass_at_k": 0.4838709677419355
639
+ },
640
+ {
641
+ "Model": "CodeLlama-13b-Instruct-hf",
642
+ "Domain": "Visualization",
643
+ "Pass_at_k": 0.5161290322580645
644
+ },
645
+ {
646
+ "Model": "CodeLlama-7b-Instruct-hf",
647
+ "Domain": "Visualization",
648
+ "Pass_at_k": 0.5161290322580645
649
+ },
650
+ {
651
+ "Model": "CodeQwen1.5-7B-Chat",
652
+ "Domain": "Visualization",
653
+ "Pass_at_k": 0.553763440860215
654
+ },
655
+ {
656
+ "Model": "Phi-3-medium-4k-instruct",
657
+ "Domain": "Visualization",
658
+ "Pass_at_k": 0.543010752688172
659
+ },
660
+ {
661
+ "Model": "Llama-2-13b-chat-hf",
662
+ "Domain": "Visualization",
663
+ "Pass_at_k": 0.3978494623655914
664
+ },
665
+ {
666
+ "Model": "gpt-4o-mini",
667
+ "Domain": "Basic",
668
+ "Pass_at_k": 0.7102803738317757
669
+ },
670
+ {
671
+ "Model": "gpt-3.5-turbo",
672
+ "Domain": "Basic",
673
+ "Pass_at_k": 0.6074766355140186
674
+ },
675
+ {
676
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
677
+ "Domain": "Basic",
678
+ "Pass_at_k": 0.7383177570093458
679
+ },
680
+ {
681
+ "Model": "deepseek-coder-33b-instruct",
682
+ "Domain": "Basic",
683
+ "Pass_at_k": 0.6822429906542056
684
+ },
685
+ {
686
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
687
+ "Domain": "Basic",
688
+ "Pass_at_k": 0.6822429906542056
689
+ },
690
+ {
691
+ "Model": "deepseek-coder-6.7b-instruct",
692
+ "Domain": "Basic",
693
+ "Pass_at_k": 0.6728971962616822
694
+ },
695
+ {
696
+ "Model": "CodeLlama-34b-Instruct-hf",
697
+ "Domain": "Basic",
698
+ "Pass_at_k": 0.6261682242990654
699
+ },
700
+ {
701
+ "Model": "CodeLlama-13b-Instruct-hf",
702
+ "Domain": "Basic",
703
+ "Pass_at_k": 0.6635514018691588
704
+ },
705
+ {
706
+ "Model": "CodeLlama-7b-Instruct-hf",
707
+ "Domain": "Basic",
708
+ "Pass_at_k": 0.6448598130841121
709
+ },
710
+ {
711
+ "Model": "CodeQwen1.5-7B-Chat",
712
+ "Domain": "Basic",
713
+ "Pass_at_k": 0.6822429906542056
714
+ },
715
+ {
716
+ "Model": "Phi-3-medium-4k-instruct",
717
+ "Domain": "Basic",
718
+ "Pass_at_k": 0.6728971962616822
719
+ },
720
+ {
721
+ "Model": "Llama-2-13b-chat-hf",
722
+ "Domain": "Basic",
723
+ "Pass_at_k": 0.48598130841121495
724
+ },
725
+ {
726
+ "Model": "gpt-4o-mini",
727
+ "Domain": "System",
728
+ "Pass_at_k": 0.57
729
+ },
730
+ {
731
+ "Model": "gpt-3.5-turbo",
732
+ "Domain": "System",
733
+ "Pass_at_k": 0.36
734
+ },
735
+ {
736
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
737
+ "Domain": "System",
738
+ "Pass_at_k": 0.5
739
+ },
740
+ {
741
+ "Model": "deepseek-coder-33b-instruct",
742
+ "Domain": "System",
743
+ "Pass_at_k": 0.57
744
+ },
745
+ {
746
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
747
+ "Domain": "System",
748
+ "Pass_at_k": 0.49
749
+ },
750
+ {
751
+ "Model": "deepseek-coder-6.7b-instruct",
752
+ "Domain": "System",
753
+ "Pass_at_k": 0.49
754
+ },
755
+ {
756
+ "Model": "CodeLlama-34b-Instruct-hf",
757
+ "Domain": "System",
758
+ "Pass_at_k": 0.41
759
+ },
760
+ {
761
+ "Model": "CodeLlama-13b-Instruct-hf",
762
+ "Domain": "System",
763
+ "Pass_at_k": 0.38
764
+ },
765
+ {
766
+ "Model": "CodeLlama-7b-Instruct-hf",
767
+ "Domain": "System",
768
+ "Pass_at_k": 0.43
769
+ },
770
+ {
771
+ "Model": "CodeQwen1.5-7B-Chat",
772
+ "Domain": "System",
773
+ "Pass_at_k": 0.45
774
+ },
775
+ {
776
+ "Model": "Phi-3-medium-4k-instruct",
777
+ "Domain": "System",
778
+ "Pass_at_k": 0.47
779
+ },
780
+ {
781
+ "Model": "Llama-2-13b-chat-hf",
782
+ "Domain": "System",
783
+ "Pass_at_k": 0.26
784
+ },
785
+ {
786
+ "Model": "gpt-4o-mini",
787
+ "Domain": "Cryptography",
788
+ "Pass_at_k": 0.49
789
+ },
790
+ {
791
+ "Model": "gpt-3.5-turbo",
792
+ "Domain": "Cryptography",
793
+ "Pass_at_k": 0.34
794
+ },
795
+ {
796
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
797
+ "Domain": "Cryptography",
798
+ "Pass_at_k": 0.46
799
+ },
800
+ {
801
+ "Model": "deepseek-coder-33b-instruct",
802
+ "Domain": "Cryptography",
803
+ "Pass_at_k": 0.42
804
+ },
805
+ {
806
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
807
+ "Domain": "Cryptography",
808
+ "Pass_at_k": 0.44
809
+ },
810
+ {
811
+ "Model": "deepseek-coder-6.7b-instruct",
812
+ "Domain": "Cryptography",
813
+ "Pass_at_k": 0.44
814
+ },
815
+ {
816
+ "Model": "CodeLlama-34b-Instruct-hf",
817
+ "Domain": "Cryptography",
818
+ "Pass_at_k": 0.42
819
+ },
820
+ {
821
+ "Model": "CodeLlama-13b-Instruct-hf",
822
+ "Domain": "Cryptography",
823
+ "Pass_at_k": 0.35
824
+ },
825
+ {
826
+ "Model": "CodeLlama-7b-Instruct-hf",
827
+ "Domain": "Cryptography",
828
+ "Pass_at_k": 0.4
829
+ },
830
+ {
831
+ "Model": "CodeQwen1.5-7B-Chat",
832
+ "Domain": "Cryptography",
833
+ "Pass_at_k": 0.42
834
+ },
835
+ {
836
+ "Model": "Phi-3-medium-4k-instruct",
837
+ "Domain": "Cryptography",
838
+ "Pass_at_k": 0.44
839
+ },
840
+ {
841
+ "Model": "Llama-2-13b-chat-hf",
842
+ "Domain": "Cryptography",
843
+ "Pass_at_k": 0.21
844
+ },
845
+ {
846
+ "Model": "gpt-4o-mini",
847
+ "Domain": "Mean",
848
+ "Pass_at_k": 0.6712887357331225
849
+ },
850
+ {
851
+ "Model": "gpt-3.5-turbo",
852
+ "Domain": "Mean",
853
+ "Pass_at_k": 0.5552003408082177
854
+ },
855
+ {
856
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
857
+ "Domain": "Mean",
858
+ "Pass_at_k": 0.6425337299072272
859
+ },
860
+ {
861
+ "Model": "deepseek-coder-33b-instruct",
862
+ "Domain": "Mean",
863
+ "Pass_at_k": 0.6384974825984445
864
+ },
865
+ {
866
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
867
+ "Domain": "Mean",
868
+ "Pass_at_k": 0.615879493228943
869
+ },
870
+ {
871
+ "Model": "deepseek-coder-6.7b-instruct",
872
+ "Domain": "Mean",
873
+ "Pass_at_k": 0.6152211001996906
874
+ },
875
+ {
876
+ "Model": "CodeLlama-34b-Instruct-hf",
877
+ "Domain": "Mean",
878
+ "Pass_at_k": 0.5706463475004796
879
+ },
880
+ {
881
+ "Model": "CodeLlama-13b-Instruct-hf",
882
+ "Domain": "Mean",
883
+ "Pass_at_k": 0.5774106930778998
884
+ },
885
+ {
886
+ "Model": "CodeLlama-7b-Instruct-hf",
887
+ "Domain": "Mean",
888
+ "Pass_at_k": 0.5826237976596688
889
+ },
890
+ {
891
+ "Model": "CodeQwen1.5-7B-Chat",
892
+ "Domain": "Mean",
893
+ "Pass_at_k": 0.6094825601888648
894
+ },
895
+ {
896
+ "Model": "Phi-3-medium-4k-instruct",
897
+ "Domain": "Mean",
898
+ "Pass_at_k": 0.6087859319852885
899
+ },
900
+ {
901
+ "Model": "Llama-2-13b-chat-hf",
902
+ "Domain": "Mean",
903
+ "Pass_at_k": 0.46487622755859775
904
+ },
905
+ {
906
+ "Model": "gpt-4o-mini",
907
+ "Domain": "Std",
908
+ "Pass_at_k": 0.14747641211035856
909
+ },
910
+ {
911
+ "Model": "gpt-3.5-turbo",
912
+ "Domain": "Std",
913
+ "Pass_at_k": 0.19743922837233668
914
+ },
915
+ {
916
+ "Model": "Qwen2-72B-Instruct-GPTQ-Int4",
917
+ "Domain": "Std",
918
+ "Pass_at_k": 0.169043537848292
919
+ },
920
+ {
921
+ "Model": "deepseek-coder-33b-instruct",
922
+ "Domain": "Std",
923
+ "Pass_at_k": 0.1634243695210041
924
+ },
925
+ {
926
+ "Model": "DeepSeek-Coder-V2-Lite-Instruct",
927
+ "Domain": "Std",
928
+ "Pass_at_k": 0.16346984877152868
929
+ },
930
+ {
931
+ "Model": "deepseek-coder-6.7b-instruct",
932
+ "Domain": "Std",
933
+ "Pass_at_k": 0.16363528852513812
934
+ },
935
+ {
936
+ "Model": "CodeLlama-34b-Instruct-hf",
937
+ "Domain": "Std",
938
+ "Pass_at_k": 0.16828060893964333
939
+ },
940
+ {
941
+ "Model": "CodeLlama-13b-Instruct-hf",
942
+ "Domain": "Std",
943
+ "Pass_at_k": 0.2055227025195004
944
+ },
945
+ {
946
+ "Model": "CodeLlama-7b-Instruct-hf",
947
+ "Domain": "Std",
948
+ "Pass_at_k": 0.17281566921046676
949
+ },
950
+ {
951
+ "Model": "CodeQwen1.5-7B-Chat",
952
+ "Domain": "Std",
953
+ "Pass_at_k": 0.17954181233010988
954
+ },
955
+ {
956
+ "Model": "Phi-3-medium-4k-instruct",
957
+ "Domain": "Std",
958
+ "Pass_at_k": 0.15450285935340832
959
+ },
960
+ {
961
+ "Model": "Llama-2-13b-chat-hf",
962
+ "Domain": "Std",
963
+ "Pass_at_k": 0.2409835679041833
964
+ }
965
+ ]
966
+ }
text_content.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ HEAD_TEXT = """
2
+ Based on the DomainEval benchmark, we evaluate the code generation ability of different LLMs across multiple domains.
3
+
4
+ More details about how to evaluate the LLM are available in the [DomainEval GitHub repository](https://github.com/domaineval/DomainEval).
5
+
6
+ For a complete description of the DomainEval benchmark and related experimental analysis, please refer to the paper: [DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation](https://arxiv.org/abs/2408.13204). [![](https://img.shields.io/badge/arXiv-2408.13204-b31b1b.svg)](https://arxiv.org/abs/2408.13204)
7
+
8
+ **_Latest News_** 🔥
9
+ - [24/08/26] We release our DomainEval benchmark, leaderboard and paper.
10
+ """
11
+
12
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
13
+
14
+ CITATION_BUTTON_TEXT = r"""@misc{zhu2024domainevalautoconstructedbenchmarkmultidomain,
15
+ title={DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation},
16
+ author={Qiming Zhu and Jialun Cao and Yaojie Lu and Hongyu Lin and Xianpei Han and Le Sun and Shing-Chi Cheung},
17
+ year={2024},
18
+ eprint={2408.13204},
19
+ archivePrefix={arXiv},
20
+ primaryClass={cs.AI},
21
+ url={https://arxiv.org/abs/2408.13204},
22
+ }
23
+ """
24
+
25
+ ACKNOWLEDGEMENT_TEXT = """
26
+ Inspired by the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
27
+ """
28
+
29
+
30
+ NOTES_TEXT = """
31
+ **Notes:**
32
+ - We use pass@k as the evaluation metric.
33
+ - `Mean` denotes the macro-average of pass@k across the 6 different domains.
34
+ - `Std` denotes the standard deviation of pass@k across 6 different domains.
35
+ - You can choose a different pass@k in `⏬ Pass@k`.
36
+ - Use `⏬ Domains` to choose the domains you want to show in the leaderboard.
37
+ """
utils.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ @dataclass
4
+ class ColumnContent:
5
+ name: str
6
+ type: str
7
+ displayed_by_default: bool
8
+ hidden: bool = False
9
+
10
+
11
+ def fields(raw_class):
12
+ return [
13
+ v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
14
+ ]
15
+
16
+
17
+ @dataclass(frozen=True)
18
+ class AutoEvalColumn: # Auto evals column
19
+ model = ColumnContent("Model", "markdown", True)
20
+ average = ColumnContent("Mean", "number", True)
21
+ std = ColumnContent("Std", "number", True)
22
+ l_0 = ColumnContent("Computation", "number", True)
23
+ l_1 = ColumnContent("Network", "number", True)
24
+ l_2 = ColumnContent("Visualization", "number", True)
25
+ l_3 = ColumnContent("Basic", "number", True)
26
+ l_4 = ColumnContent("System", "number", True)
27
+ l_5 = ColumnContent("Cryptography", "number", True)