Jason Zheng committed on
Commit
6906870
β€’
1 Parent(s): 76f2cf6

first commit

Browse files
Files changed (5) hide show
  1. RESULTS.json +686 -0
  2. app.py +144 -0
  3. css_html.py +64 -0
  4. text_content.py +63 -0
  5. utils.py +37 -0
RESULTS.json ADDED
@@ -0,0 +1,686 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "gpt-4o-2024-05-13": {
3
+ "readability": {
4
+ "R*": 80.5,
5
+ "RN_p": 81.1,
6
+ "RN_if": 91.8,
7
+ "RN": 75.3,
8
+ "RL_p": 78.9,
9
+ "RL_if": 78.9,
10
+ "RL": 63.2,
11
+ "RC_p": 79.8,
12
+ "RC_if": 78.7,
13
+ "RC": 64.3,
14
+ "MBPP*": 64.6,
15
+ "Readability": 67.6
16
+ },
17
+ "maintainability": {
18
+ "MI*": 38.0,
19
+ "MI_p": 35.0,
20
+ "MI": 75.1,
21
+ "MC*": 57.2,
22
+ "MC_p": 56.3,
23
+ "MC": 35.2,
24
+ "Maintainability": 55.1
25
+ },
26
+ "efficiency": {
27
+ "E*": 59.4,
28
+ "E_p": 58.4,
29
+ "E_NI_T": 44.8,
30
+ "E_NI_S": 42.0,
31
+ "Efficiency": 43.4
32
+ },
33
+ "correctness": {
34
+ "Correctness": 59.9
35
+ },
36
+ "overall": {
37
+ "RACE Score": 56.5
38
+ }
39
+ },
40
+ "gpt-3.5-turbo-0125": {
41
+ "readability": {
42
+ "R*": 62.8,
43
+ "RN_p": 63.2,
44
+ "RN_if": 74.4,
45
+ "RN": 48.3,
46
+ "RL_p": 60.4,
47
+ "RL_if": 76.8,
48
+ "RL": 46.1,
49
+ "RC_p": 65.8,
50
+ "RC_if": 60.0,
51
+ "RC": 41.5,
52
+ "MBPP*": 62.2,
53
+ "Readability": 45.3
54
+ },
55
+ "maintainability": {
56
+ "MI*": 28.0,
57
+ "MI_p": 24.0,
58
+ "MI": 80.2,
59
+ "MC*": 31.1,
60
+ "MC_p": 28.1,
61
+ "MC": 18.5,
62
+ "Maintainability": 49.4
63
+ },
64
+ "efficiency": {
65
+ "E*": 39.6,
66
+ "E_p": 32.7,
67
+ "E_NI_T": 27.5,
68
+ "E_NI_S": 36.5,
69
+ "Efficiency": 32.0
70
+ },
71
+ "correctness": {
72
+ "Correctness": 44.7
73
+ },
74
+ "overall": {
75
+ "RACE Score": 42.8
76
+ }
77
+ },
78
+ "CodeLlama-7b-Instruct-hf": {
79
+ "readability": {
80
+ "R*": 32.3,
81
+ "RN_p": 31.5,
82
+ "RN_if": 55.5,
83
+ "RN": 17.0,
84
+ "RL_p": 31.7,
85
+ "RL_if": 59.7,
86
+ "RL": 23.4,
87
+ "RC_p": 30.2,
88
+ "RC_if": 67.4,
89
+ "RC": 18.3,
90
+ "MBPP*": 43.1,
91
+ "Readability": 19.6
92
+ },
93
+ "maintainability": {
94
+ "MI*": 16.0,
95
+ "MI_p": 15.0,
96
+ "MI": 71.8,
97
+ "MC*": 12.2,
98
+ "MC_p": 10.9,
99
+ "MC": 7.2,
100
+ "Maintainability": 39.5
101
+ },
102
+ "efficiency": {
103
+ "E*": 15.8,
104
+ "E_p": 13.9,
105
+ "E_NI_T": 8.2,
106
+ "E_NI_S": 8.8,
107
+ "Efficiency": 8.5
108
+ },
109
+ "correctness": {
110
+ "Correctness": 23.9
111
+ },
112
+ "overall": {
113
+ "RACE Score": 22.9
114
+ }
115
+ },
116
+ "CodeLlama-7b-Python-hf": {
117
+ "readability": {
118
+ "R*": 29.3,
119
+ "RN_p": 29.5,
120
+ "RN_if": 66.4,
121
+ "RN": 20.4,
122
+ "RL_p": 30.1,
123
+ "RL_if": 76.6,
124
+ "RL": 25.8,
125
+ "RC_p": 24.7,
126
+ "RC_if": 42.1,
127
+ "RC": 11.6,
128
+ "MBPP*": 41.3,
129
+ "Readability": 19.3
130
+ },
131
+ "maintainability": {
132
+ "MI*": 11.0,
133
+ "MI_p": 10.0,
134
+ "MI": 79.4,
135
+ "MC*": 5.6,
136
+ "MC_p": 6.5,
137
+ "MC": 3.7,
138
+ "Maintainability": 41.6
139
+ },
140
+ "efficiency": {
141
+ "E*": 14.9,
142
+ "E_p": 15.8,
143
+ "E_NI_T": 14.3,
144
+ "E_NI_S": 14.4,
145
+ "Efficiency": 14.4
146
+ },
147
+ "correctness": {
148
+ "Correctness": 20.4
149
+ },
150
+ "overall": {
151
+ "RACE Score": 23.9
152
+ }
153
+ },
154
+ "CodeLlama-13b-Instruct-hf": {
155
+ "readability": {
156
+ "R*": 36.0,
157
+ "RN_p": 37.7,
158
+ "RN_if": 57.8,
159
+ "RN": 22.0,
160
+ "RL_p": 35.0,
161
+ "RL_if": 59.9,
162
+ "RL": 23.6,
163
+ "RC_p": 35.7,
164
+ "RC_if": 64.3,
165
+ "RC": 23.2,
166
+ "MBPP*": 40.7,
167
+ "Readability": 22.9
168
+ },
169
+ "maintainability": {
170
+ "MI*": 17.0,
171
+ "MI_p": 19.0,
172
+ "MI": 82.1,
173
+ "MC*": 10.6,
174
+ "MC_p": 13.1,
175
+ "MC": 7.6,
176
+ "Maintainability": 44.8
177
+ },
178
+ "efficiency": {
179
+ "E*": 17.8,
180
+ "E_p": 17.8,
181
+ "E_NI_T": 10.4,
182
+ "E_NI_S": 16.1,
183
+ "Efficiency": 13.2
184
+ },
185
+ "correctness": {
186
+ "Correctness": 24.4
187
+ },
188
+ "overall": {
189
+ "RACE Score": 26.4
190
+ }
191
+ },
192
+ "CodeLlama-13b-Python-hf": {
193
+ "readability": {
194
+ "R*": 40.2,
195
+ "RN_p": 35.0,
196
+ "RN_if": 61.3,
197
+ "RN": 22.4,
198
+ "RL_p": 34.8,
199
+ "RL_if": 83.5,
200
+ "RL": 30.9,
201
+ "RC_p": 30.2,
202
+ "RC_if": 60.7,
203
+ "RC": 20.4,
204
+ "MBPP*": 29.4,
205
+ "Readability": 24.6
206
+ },
207
+ "maintainability": {
208
+ "MI*": 16.0,
209
+ "MI_p": 15.0,
210
+ "MI": 78.6,
211
+ "MC*": 6.1,
212
+ "MC_p": 4.8,
213
+ "MC": 2.4,
214
+ "Maintainability": 40.5
215
+ },
216
+ "efficiency": {
217
+ "E*": 16.8,
218
+ "E_p": 17.8,
219
+ "E_NI_T": 13.8,
220
+ "E_NI_S": 14.7,
221
+ "Efficiency": 14.2
222
+ },
223
+ "correctness": {
224
+ "Correctness": 21.7
225
+ },
226
+ "overall": {
227
+ "RACE Score": 25.3
228
+ }
229
+ },
230
+ "CodeLlama-34b-Instruct-hf": {
231
+ "readability": {
232
+ "R*": 36.0,
233
+ "RN_p": 36.5,
234
+ "RN_if": 54.3,
235
+ "RN": 21.1,
236
+ "RL_p": 35.8,
237
+ "RL_if": 41.7,
238
+ "RL": 17.5,
239
+ "RC_p": 36.3,
240
+ "RC_if": 32.0,
241
+ "RC": 9.4,
242
+ "MBPP*": 45.8,
243
+ "Readability": 16.0
244
+ },
245
+ "maintainability": {
246
+ "MI*": 12.0,
247
+ "MI_p": 18.0,
248
+ "MI": 73.2,
249
+ "MC*": 15.6,
250
+ "MC_p": 14.2,
251
+ "MC": 8.5,
252
+ "Maintainability": 40.9
253
+ },
254
+ "efficiency": {
255
+ "E*": 20.8,
256
+ "E_p": 15.8,
257
+ "E_NI_T": 14.4,
258
+ "E_NI_S": 13.8,
259
+ "Efficiency": 14.1
260
+ },
261
+ "correctness": {
262
+ "Correctness": 26.0
263
+ },
264
+ "overall": {
265
+ "RACE Score": 24.2
266
+ }
267
+ },
268
+ "CodeLlama-34b-Python-hf": {
269
+ "readability": {
270
+ "R*": 31.7,
271
+ "RN_p": 27.2,
272
+ "RN_if": 66.9,
273
+ "RN": 18.6,
274
+ "RL_p": 32.5,
275
+ "RL_if": 73.2,
276
+ "RL": 26.7,
277
+ "RC_p": 27.8,
278
+ "RC_if": 39.4,
279
+ "RC": 6.7,
280
+ "MBPP*": 36.2,
281
+ "Readability": 17.3
282
+ },
283
+ "maintainability": {
284
+ "MI*": 3.0,
285
+ "MI_p": 2.0,
286
+ "MI": 85.3,
287
+ "MC*": 7.2,
288
+ "MC_p": 5.4,
289
+ "MC": 2.2,
290
+ "Maintainability": 43.8
291
+ },
292
+ "efficiency": {
293
+ "E*": 17.8,
294
+ "E_p": 11.9,
295
+ "E_NI_T": 12.0,
296
+ "E_NI_S": 14.4,
297
+ "Efficiency": 13.2
298
+ },
299
+ "correctness": {
300
+ "Correctness": 19.2
301
+ },
302
+ "overall": {
303
+ "RACE Score": 23.4
304
+ }
305
+ },
306
+ "deepseek-coder-6.7b-instruct": {
307
+ "readability": {
308
+ "R*": 65.2,
309
+ "RN_p": 65.5,
310
+ "RN_if": 67.2,
311
+ "RN": 44.4,
312
+ "RL_p": 61.2,
313
+ "RL_if": 73.6,
314
+ "RL": 46.6,
315
+ "RC_p": 61.2,
316
+ "RC_if": 65.5,
317
+ "RC": 42.0,
318
+ "MBPP*": 57.1,
319
+ "Readability": 44.3
320
+ },
321
+ "maintainability": {
322
+ "MI*": 26.0,
323
+ "MI_p": 25.0,
324
+ "MI": 79.3,
325
+ "MC*": 18.9,
326
+ "MC_p": 18.7,
327
+ "MC": 8.2,
328
+ "Maintainability": 43.8
329
+ },
330
+ "efficiency": {
331
+ "E*": 28.7,
332
+ "E_p": 30.7,
333
+ "E_NI_T": 27.1,
334
+ "E_NI_S": 30.0,
335
+ "Efficiency": 28.6
336
+ },
337
+ "correctness": {
338
+ "Correctness": 39.2
339
+ },
340
+ "overall": {
341
+ "RACE Score": 39.0
342
+ }
343
+ },
344
+ "deepseek-coder-7b-instruct-v1.5": {
345
+ "readability": {
346
+ "R*": 61.0,
347
+ "RN_p": 61.5,
348
+ "RN_if": 57.8,
349
+ "RN": 35.2,
350
+ "RL_p": 62.6,
351
+ "RL_if": 70.9,
352
+ "RL": 46.0,
353
+ "RC_p": 62.8,
354
+ "RC_if": 70.2,
355
+ "RC": 46.0,
356
+ "MBPP*": 59.3,
357
+ "Readability": 42.4
358
+ },
359
+ "maintainability": {
360
+ "MI*": 23.0,
361
+ "MI_p": 24.0,
362
+ "MI": 79.6,
363
+ "MC*": 23.3,
364
+ "MC_p": 20.9,
365
+ "MC": 8.9,
366
+ "Maintainability": 44.2
367
+ },
368
+ "efficiency": {
369
+ "E*": 32.7,
370
+ "E_p": 27.7,
371
+ "E_NI_T": 25.1,
372
+ "E_NI_S": 26.8,
373
+ "Efficiency": 26.0
374
+ },
375
+ "correctness": {
376
+ "Correctness": 39.9
377
+ },
378
+ "overall": {
379
+ "RACE Score": 38.1
380
+ }
381
+ },
382
+ "deepseek-coder-33b-instruct": {
383
+ "readability": {
384
+ "R*": 65.9,
385
+ "RN_p": 64.6,
386
+ "RN_if": 86.8,
387
+ "RN": 57.7,
388
+ "RL_p": 65.0,
389
+ "RL_if": 82.7,
390
+ "RL": 53.5,
391
+ "RC_p": 66.5,
392
+ "RC_if": 70.8,
393
+ "RC": 46.4,
394
+ "MBPP*": 61.9,
395
+ "Readability": 52.5
396
+ },
397
+ "maintainability": {
398
+ "MI*": 28.0,
399
+ "MI_p": 30.0,
400
+ "MI": 75.7,
401
+ "MC*": 22.2,
402
+ "MC_p": 27.6,
403
+ "MC": 11.3,
404
+ "Maintainability": 43.5
405
+ },
406
+ "efficiency": {
407
+ "E*": 45.5,
408
+ "E_p": 38.6,
409
+ "E_NI_T": 35.3,
410
+ "E_NI_S": 36.1,
411
+ "Efficiency": 35.7
412
+ },
413
+ "correctness": {
414
+ "Correctness": 44.7
415
+ },
416
+ "overall": {
417
+ "RACE Score": 44.1
418
+ }
419
+ },
420
+ "DeepSeek-Coder-V2-Lite-Instruct": {
421
+ "readability": {
422
+ "R*": 72.0,
423
+ "RN_p": 71.2,
424
+ "RN_if": 55.3,
425
+ "RN": 40.2,
426
+ "RL_p": 66.5,
427
+ "RL_if": 83.7,
428
+ "RL": 57.7,
429
+ "RC_p": 67.1,
430
+ "RC_if": 63.5,
431
+ "RC": 42.7,
432
+ "MBPP*": 62.7,
433
+ "Readability": 46.9
434
+ },
435
+ "maintainability": {
436
+ "MI*": 26.0,
437
+ "MI_p": 30.0,
438
+ "MI": 78.2,
439
+ "MC*": 44.4,
440
+ "MC_p": 44.3,
441
+ "MC": 19.8,
442
+ "Maintainability": 49.0
443
+ },
444
+ "efficiency": {
445
+ "E*": 49.5,
446
+ "E_p": 55.4,
447
+ "E_NI_T": 40.2,
448
+ "E_NI_S": 47.7,
449
+ "Efficiency": 44.0
450
+ },
451
+ "correctness": {
452
+ "Correctness": 50.9
453
+ },
454
+ "overall": {
455
+ "RACE Score": 47.7
456
+ }
457
+ },
458
+ "deepseek-coder": {
459
+ "readability": {
460
+ "R*": 73.8,
461
+ "RN_p": 75.3,
462
+ "RN_if": 91.8,
463
+ "RN": 70.0,
464
+ "RL_p": 75.2,
465
+ "RL_if": 88.4,
466
+ "RL": 67.1,
467
+ "RC_p": 76.5,
468
+ "RC_if": 74.1,
469
+ "RC": 58.5,
470
+ "MBPP*": 68.5,
471
+ "Readability": 65.2
472
+ },
473
+ "maintainability": {
474
+ "MI*": 35.0,
475
+ "MI_p": 38.0,
476
+ "MI": 77.3,
477
+ "MC*": 58.9,
478
+ "MC_p": 58.9,
479
+ "MC": 35.0,
480
+ "Maintainability": 56.1
481
+ },
482
+ "efficiency": {
483
+ "E*": 57.3,
484
+ "E_p": 53.5,
485
+ "E_NI_T": 41.1,
486
+ "E_NI_S": 49.4,
487
+ "Efficiency": 45.2
488
+ },
489
+ "correctness": {
490
+ "Correctness": 58.7
491
+ },
492
+ "overall": {
493
+ "RACE Score": 56.3
494
+ }
495
+ },
496
+ "WizardCoder-Python-7B-V1.0": {
497
+ "readability": {
498
+ "R*": 34.8,
499
+ "RN_p": 35.8,
500
+ "RN_if": 58.3,
501
+ "RN": 22.4,
502
+ "RL_p": 34.3,
503
+ "RL_if": 79.7,
504
+ "RL": 28.0,
505
+ "RC_p": 35.4,
506
+ "RC_if": 25.0,
507
+ "RC": 8.6,
508
+ "MBPP*": 41.8,
509
+ "Readability": 19.7
510
+ },
511
+ "maintainability": {
512
+ "MI*": 19.0,
513
+ "MI_p": 23.0,
514
+ "MI": 79.3,
515
+ "MC*": 10.6,
516
+ "MC_p": 9.8,
517
+ "MC": 7.2,
518
+ "Maintainability": 43.2
519
+ },
520
+ "efficiency": {
521
+ "E*": 19.8,
522
+ "E_p": 19.8,
523
+ "E_NI_T": 15.3,
524
+ "E_NI_S": 16.7,
525
+ "Efficiency": 16.0
526
+ },
527
+ "correctness": {
528
+ "Correctness": 25.2
529
+ },
530
+ "overall": {
531
+ "RACE Score": 26.0
532
+ }
533
+ },
534
+ "WizardCoder-Python-13B-V1.0": {
535
+ "readability": {
536
+ "R*": 36.0,
537
+ "RN_p": 38.2,
538
+ "RN_if": 58.4,
539
+ "RN": 23.1,
540
+ "RL_p": 38.4,
541
+ "RL_if": 83.1,
542
+ "RL": 33.1,
543
+ "RC_p": 43.6,
544
+ "RC_if": 59.8,
545
+ "RC": 27.4,
546
+ "MBPP*": 42.1,
547
+ "Readability": 27.9
548
+ },
549
+ "maintainability": {
550
+ "MI*": 20.0,
551
+ "MI_p": 21.0,
552
+ "MI": 78.8,
553
+ "MC*": 12.8,
554
+ "MC_p": 12.8,
555
+ "MC": 8.5,
556
+ "Maintainability": 43.6
557
+ },
558
+ "efficiency": {
559
+ "E*": 20.8,
560
+ "E_p": 18.8,
561
+ "E_NI_T": 16.2,
562
+ "E_NI_S": 19.8,
563
+ "Efficiency": 18.0
564
+ },
565
+ "correctness": {
566
+ "Correctness": 26.3
567
+ },
568
+ "overall": {
569
+ "RACE Score": 29.0
570
+ }
571
+ },
572
+ "WizardCoder-15B-V1.0": {
573
+ "readability": {
574
+ "R*": 38.4,
575
+ "RN_p": 38.7,
576
+ "RN_if": 59.0,
577
+ "RN": 23.2,
578
+ "RL_p": 41.9,
579
+ "RL_if": 64.8,
580
+ "RL": 27.8,
581
+ "RC_p": 40.0,
582
+ "RC_if": 57.3,
583
+ "RC": 24.4,
584
+ "MBPP*": 46.3,
585
+ "Readability": 25.1
586
+ },
587
+ "maintainability": {
588
+ "MI*": 22.0,
589
+ "MI_p": 21.0,
590
+ "MI": 80.0,
591
+ "MC*": 11.7,
592
+ "MC_p": 11.5,
593
+ "MC": 7.8,
594
+ "Maintainability": 43.9
595
+ },
596
+ "efficiency": {
597
+ "E*": 21.8,
598
+ "E_p": 22.8,
599
+ "E_NI_T": 21.8,
600
+ "E_NI_S": 24.2,
601
+ "Efficiency": 23.0
602
+ },
603
+ "correctness": {
604
+ "Correctness": 28.0
605
+ },
606
+ "overall": {
607
+ "RACE Score": 30.0
608
+ }
609
+ },
610
+ "WizardCoder-33B-V1.1": {
611
+ "readability": {
612
+ "R*": 58.5,
613
+ "RN_p": 58.8,
614
+ "RN_if": 65.4,
615
+ "RN": 39.9,
616
+ "RL_p": 62.2,
617
+ "RL_if": 76.0,
618
+ "RL": 47.6,
619
+ "RC_p": 58.8,
620
+ "RC_if": 61.0,
621
+ "RC": 37.2,
622
+ "MBPP*": 64.6,
623
+ "Readability": 41.6
624
+ },
625
+ "maintainability": {
626
+ "MI*": 34.0,
627
+ "MI_p": 34.0,
628
+ "MI": 71.2,
629
+ "MC*": 26.1,
630
+ "MC_p": 25.0,
631
+ "MC": 9.3,
632
+ "Maintainability": 40.2
633
+ },
634
+ "efficiency": {
635
+ "E*": 38.6,
636
+ "E_p": 35.6,
637
+ "E_NI_T": 33.9,
638
+ "E_NI_S": 34.9,
639
+ "Efficiency": 34.4
640
+ },
641
+ "correctness": {
642
+ "Correctness": 44.4
643
+ },
644
+ "overall": {
645
+ "RACE Score": 40.1
646
+ }
647
+ },
648
+ "CodeQwen1.5-7B-Chat": {
649
+ "readability": {
650
+ "R*": 76.2,
651
+ "RN_p": 76.8,
652
+ "RN_if": 60.8,
653
+ "RN": 47.0,
654
+ "RL_p": 73.4,
655
+ "RL_if": 60.8,
656
+ "RL": 47.0,
657
+ "RC_p": 74.7,
658
+ "RC_if": 71.3,
659
+ "RC": 54.2,
660
+ "MBPP*": 60.3,
661
+ "Readability": 49.4
662
+ },
663
+ "maintainability": {
664
+ "MI*": 22.0,
665
+ "MI_p": 22.0,
666
+ "MI": 82.3,
667
+ "MC*": 33.3,
668
+ "MC_p": 32.6,
669
+ "MC": 13.0,
670
+ "Maintainability": 47.6
671
+ },
672
+ "efficiency": {
673
+ "E*": 39.6,
674
+ "E_p": 38.6,
675
+ "E_NI_T": 30.7,
676
+ "E_NI_S": 37.7,
677
+ "Efficiency": 34.2
678
+ },
679
+ "correctness": {
680
+ "Correctness": 46.3
681
+ },
682
+ "overall": {
683
+ "RACE Score": 44.4
684
+ }
685
+ }
686
+ }
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+
6
+ from css_html import custom_css
7
+ from text_content import ABOUT_TEXT, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL, ACKNOWLEDGEMENT_TEXT, NOTES_TEXT
8
+ from utils import (
9
+ AutoEvalColumn,
10
+ fields,
11
+ )
12
+
13
# Load the per-model RACE evaluation results and flatten them into one
# leaderboard row per model.  Every metric column is prefixed with the emoji
# of its evaluation dimension (e.g. "πŸ’― RACE Score").
result_path = './RESULTS.json'

# Emoji prefix for each evaluation dimension.  A mapping avoids the
# stale/undefined-variable bug of the previous if/elif chain: an unknown
# dimension key now fails loudly instead of silently reusing the last symbol.
_DIMENSION_SYMBOLS = {
    'readability': 'πŸ“–',
    'maintainability': 'πŸ”¨',
    'efficiency': 'πŸš€',
    'correctness': 'βœ…',
    'overall': 'πŸ’―',
}

with open(result_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

rows = []
for model_name, dimensions in data.items():
    row = {"model": model_name}
    for dimension, metrics in dimensions.items():
        try:
            symbol = _DIMENSION_SYMBOLS[dimension]
        except KeyError:
            raise ValueError(
                f"Unknown evaluation dimension {dimension!r} in {result_path}"
            ) from None
        for metric_name, value in metrics.items():
            row[f'{symbol} {metric_name}'] = value
    rows.append(row)

# Leaderboard is ordered by the overall RACE score, best model first.
df = pd.DataFrame(rows)
df = df.sort_values(by='πŸ’― RACE Score', ascending=False)
39
+
40
# Column metadata derived from AutoEvalColumn: the names/types of every
# visible column, plus the subset that is displayed by default.
_visible_fields = [c for c in fields(AutoEvalColumn) if not c.hidden]
_default_fields = [c for c in _visible_fields if c.displayed_by_default]

COLS = [c.name for c in _visible_fields]
TYPES = [c.type for c in _visible_fields]
COLS_LITE = [c.name for c in _default_fields]
TYPES_LITE = [c.type for c in _default_fields]
48
+
49
def select_columns(df, columns):
    """Return *df* restricted to the model column plus the requested columns.

    Iterating over COLS (rather than over *columns*) preserves the
    leaderboard's canonical column ordering no matter which order the
    checkboxes were ticked in.
    """
    selected = [AutoEvalColumn.model.name]
    for col in COLS:
        if col in df.columns and col in columns:
            selected.append(col)
    return df[selected]
58
+
59
# Build the Gradio UI: a leaderboard tab with a column-toggle accordion, an
# About tab, and citation/acknowledgement accordions at the bottom.
demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> 🏎️RACE Leaderboard</h1></div>\
            <br>\
            <p>Based on the 🏎️RACE benchmark, we demonstrated the ability of different LLMs to generate code that is <b><i>correct</i></b> and <b><i>meets the requirements of real-world development scenarios</i></b>.</p>
            <p>Model details about how to evaluate the LLM are available in the <a href="https://github.com/test/test">🏎️RACE GitHub repository</a>.</p>
            """,
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("πŸ” Evaluation Table", id=0):
                    with gr.Column():
                        # Checkboxes controlling which optional metric columns
                        # are shown; the model column is always displayed.
                        with gr.Accordion("⏬ Hidden Columns", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[
                                    c
                                    for c in COLS
                                    if c
                                    not in [
                                        AutoEvalColumn.model.name,
                                    ]
                                ],
                                value=[
                                    c
                                    for c in COLS_LITE
                                    if c
                                    not in [
                                        AutoEvalColumn.model.name,
                                    ]
                                ],
                                label="",
                                elem_id="column-select",
                                interactive=True,
                            )

                        leaderboard_df = gr.components.Dataframe(
                            value=df[
                                [
                                    AutoEvalColumn.model.name,
                                ]
                                + shown_columns.value
                            ],
                            headers=COLS,
                            datatype=TYPES,
                            elem_id="leaderboard-table",
                            interactive=False,
                        )

                        # Invisible copy of the full dataframe; select_columns
                        # filters from it so toggling columns never loses data.
                        hidden_leaderboard_df = gr.components.Dataframe(
                            value=df,
                            headers=COLS,
                            datatype=["str" for _ in range(len(COLS))],
                            visible=False,
                        )

                        shown_columns.change(
                            select_columns,
                            [hidden_leaderboard_df, shown_columns],
                            leaderboard_df,
                        )

                    gr.Markdown(NOTES_TEXT, elem_classes="markdown-text")

                with gr.TabItem("πŸ“ About", id=1):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("πŸ“™ Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )

    with gr.Row():
        with gr.Accordion("πŸ™ Acknowledgement", open=False):
            gr.Markdown(ACKNOWLEDGEMENT_TEXT)

demo.launch()
css_html.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom CSS injected into the leaderboard app via gr.Blocks(css=custom_css).
# Selectors reference the elem_id / elem_classes values set in app.py
# (e.g. #citation-button, #leaderboard-table, .markdown-text, .tab-buttons).
custom_css = """
#changelog-text {
    font-size: 16px !important;
}
#changelog-text h2 {
    font-size: 18px !important;
}
.markdown-text {
    font-size: 16px !important;
}
#models-to-add-text {
    font-size: 18px !important;
}
#citation-button span {
    font-size: 16px !important;
}
#citation-button textarea {
    font-size: 16px !important;
}
#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}
#leaderboard-table {
    margin-top: 15px
}
#leaderboard-table-lite {
    margin-top: 15px
}
#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}
/* Hides the final AutoEvalColumn */
#llm-benchmark-tab-table table td:last-child,
#llm-benchmark-tab-table table th:last-child {
    display: none;
}
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
table td:first-child,
table th:first-child {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}
.tab-buttons button {
    font-size: 20px;
}
#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}
#scale-logo .download {
    display: none;
}
"""
text_content.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Markdown shown in the "πŸ“ About" tab of the leaderboard.
# NOTE(review): the contact email below looks obfuscated ("[email protected]")
# and the BibTeX block is empty — confirm/fill in before release.
ABOUT_TEXT = """# What is RACE benchmark?
RACE is a multi-dimensional benchmark for code generation that focuses on **R**eadability, m**A**intainability, **C**orrectness, and **E**fficiency.
Its goal is to evaluate LLM's ability to generate code that is correct and meets the requirements of real-world development scenarios.
The benchmark is designed with various real-world demands across different **_demand-dependent_** dimensions, making it more applicable to practical scenarios.

# What are the specific aspects to be evaluated?
We have summarized representative influencing factors in real-world scenarios for different dimensions and designed various requirements for each factor.
These have been incorporated into the task description to prompt the LLM to generate code that is correct and meets the specified requirements.
The specific factors are as follows:
- **Readability**: The code should be easy to read and understand.
- `Comment`
- `Naming Convention`
- `Code Length`
- **Maintainability**: The code should be easy to maintain and extend.
- `MI Metric`
- `Modularity`
- **Efficiency**: The code should be efficient in terms of time and space complexity.
- `Time Complexity`
- `Space Complexity`

# How to evaluate?
To facilitate evaluation on the RACE benchmark, we provide the evaluation data and easy-to-use evaluation scripts in our 🏎️RACE GitHub repository.
Additionally, factors involving execution-based evaluation are conducted in a virtual environment to ensure evaluation security.

# Contact
If you have any questions, feel free to reach out to us at [[email protected]](mailto:[email protected]).

# Citation Information
```bibtex

```
"""

# Label on the citation textbox in app.py.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# Contents of the citation textbox (currently empty pending publication).
CITATION_BUTTON_TEXT = r"""

"""
39
+
40
+ ACKNOWLEDGEMENT_TEXT = """
41
+ Inspired from the [πŸ€— Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
42
+ """
43
+
44
+
45
+ NOTES_TEXT = """
46
+ **Notes:**
47
+ - `πŸ’― RACE Score` denotes the final evaluation result based on 🏎️RACE benchmark, which is the average of the scores in the four dimensions: `βœ… Correctness`, `πŸ“– Readability`, `πŸ”¨ Maintainability`, and `πŸš€ Efficiency`.
48
+ - All fine-grained evaluation results are provided in `⏬ Hidden Columns`. `πŸ“– R` denotes code **R**eadability, `πŸ”¨ M` denotes code **M**aintainability, and `πŸš€ E` denotes code **E**fficiency. `*` denotes the correctness of the code in the corresponding dimension. More details about the abbreviations are as follows:
49
+ - `πŸ“– R*`: The code accuracy (baesline).
50
+ - `πŸ“– RN`: The proportion of code that is both functionally correct and follows customized instructions related to `Naming Convention`.
51
+ - `πŸ“– RL`: The proportion of code that is both functionally correct and follows customized instructions related to `Code Length`.
52
+ - `πŸ“– RC`: The proportion of code that is both functionally correct and follows customized instructions related to `Comment`.
53
+ - `πŸ”¨ MI*`: The code accuracy related to `Maintainability Index` (baesline).
54
+ - `πŸ”¨ MI`: The proportion of code that is both functionally correct and follows customized instructions related to `MI Metric`.
55
+ - `πŸ”¨ MC*`: The code accuracy related to `Modularity` (baesline).
56
+ - `πŸ”¨ MC`: The proportion of code that is both functionally correct and follows customized instructions related to `Modularity`.
57
+ - `πŸš€ E*`: The code accuracy (baesline).
58
+ - `πŸš€ E_NI_T`: The proportion of code that is both functionally correct and follows customized instructions related to `Time Complexity`.
59
+ - `πŸš€ E_NI_S`: The proportion of code that is both functionally correct and follows customized instructions related to `Space Complexity`.
60
+
61
+ - Regarding the types of evaluation results, `πŸ”¨ MI`, `πŸš€ E_NI_T`, and `πŸš€ E_NI_S` are scalar values ranging from 0 to 100, while the remaining metrics are percentages.
62
+ - For more explanation check the πŸ“ About section.
63
+ """
utils.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class ColumnContent:
    """Metadata for a single leaderboard column."""

    name: str  # column header shown in the table (may carry an emoji prefix)
    type: str  # gradio Dataframe datatype, e.g. "markdown" or "number"
    displayed_by_default: bool  # True if visible before any column toggling
    hidden: bool = False  # hidden columns are excluded from the table entirely
10
+
11
+
12
def fields(raw_class):
    """Collect the non-dunder class-attribute values of *raw_class*.

    Values are returned in class-body definition order; any attribute whose
    name starts or ends with a double underscore is skipped.
    """
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if not attr_name.startswith("__") and not attr_name.endswith("__"):
            collected.append(attr_value)
    return collected
16
+
17
+
18
@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    """Namespace declaring every leaderboard column as a ColumnContent.

    The attributes carry no type annotations, so they are plain class
    attributes rather than dataclass fields; the `fields()` helper in this
    module gathers their values by scanning the class ``__dict__``.
    ColumnContent arguments are (name, type, displayed_by_default[, hidden]).
    """

    # Columns displayed by default.
    model = ColumnContent("model", "markdown", True)
    score = ColumnContent("πŸ’― RACE Score", "number", True)
    c_0 = ColumnContent("βœ… Correctness", "number", True)
    r_0 = ColumnContent("πŸ“– Readability", "number", True)
    # Fine-grained readability metrics (not displayed by default).
    r_1 = ColumnContent("πŸ“– R*", "number", False)
    r_2 = ColumnContent("πŸ“– RN", "number", False)
    r_3 = ColumnContent("πŸ“– RL", "number", False)
    r_4 = ColumnContent("πŸ“– RC", "number", False)
    m_0 = ColumnContent("πŸ”¨ Maintainability", "number", True)
    # Fine-grained maintainability metrics (not displayed by default).
    m_1 = ColumnContent("πŸ”¨ MI*", "number", False)
    m_2 = ColumnContent("πŸ”¨ MI", "number", False)
    m_3 = ColumnContent("πŸ”¨ MC*", "number", False)
    m_4 = ColumnContent("πŸ”¨ MC", "number", False)
    e_0 = ColumnContent("πŸš€ Efficiency", "number", True)
    # Fine-grained efficiency metrics (not displayed by default).
    e_1 = ColumnContent("πŸš€ E*", "number", False)
    e_2 = ColumnContent("πŸš€ E_NI_T", "number", False)
    e_3 = ColumnContent("πŸš€ E_NI_S", "number", False)
37
+