Spaces:
Running
Running
Jason Zheng
committed on
Commit
β’
6906870
1
Parent(s):
76f2cf6
first commit
Browse files- RESULTS.json +686 -0
- app.py +144 -0
- css_html.py +64 -0
- text_content.py +63 -0
- utils.py +37 -0
RESULTS.json
ADDED
@@ -0,0 +1,686 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"gpt-4o-2024-05-13": {
|
3 |
+
"readability": {
|
4 |
+
"R*": 80.5,
|
5 |
+
"RN_p": 81.1,
|
6 |
+
"RN_if": 91.8,
|
7 |
+
"RN": 75.3,
|
8 |
+
"RL_p": 78.9,
|
9 |
+
"RL_if": 78.9,
|
10 |
+
"RL": 63.2,
|
11 |
+
"RC_p": 79.8,
|
12 |
+
"RC_if": 78.7,
|
13 |
+
"RC": 64.3,
|
14 |
+
"MBPP*": 64.6,
|
15 |
+
"Readability": 67.6
|
16 |
+
},
|
17 |
+
"maintainability": {
|
18 |
+
"MI*": 38.0,
|
19 |
+
"MI_p": 35.0,
|
20 |
+
"MI": 75.1,
|
21 |
+
"MC*": 57.2,
|
22 |
+
"MC_p": 56.3,
|
23 |
+
"MC": 35.2,
|
24 |
+
"Maintainability": 55.1
|
25 |
+
},
|
26 |
+
"efficiency": {
|
27 |
+
"E*": 59.4,
|
28 |
+
"E_p": 58.4,
|
29 |
+
"E_NI_T": 44.8,
|
30 |
+
"E_NI_S": 42.0,
|
31 |
+
"Efficiency": 43.4
|
32 |
+
},
|
33 |
+
"correctness": {
|
34 |
+
"Correctness": 59.9
|
35 |
+
},
|
36 |
+
"overall": {
|
37 |
+
"RACE Score": 56.5
|
38 |
+
}
|
39 |
+
},
|
40 |
+
"gpt-3.5-turbo-0125": {
|
41 |
+
"readability": {
|
42 |
+
"R*": 62.8,
|
43 |
+
"RN_p": 63.2,
|
44 |
+
"RN_if": 74.4,
|
45 |
+
"RN": 48.3,
|
46 |
+
"RL_p": 60.4,
|
47 |
+
"RL_if": 76.8,
|
48 |
+
"RL": 46.1,
|
49 |
+
"RC_p": 65.8,
|
50 |
+
"RC_if": 60.0,
|
51 |
+
"RC": 41.5,
|
52 |
+
"MBPP*": 62.2,
|
53 |
+
"Readability": 45.3
|
54 |
+
},
|
55 |
+
"maintainability": {
|
56 |
+
"MI*": 28.0,
|
57 |
+
"MI_p": 24.0,
|
58 |
+
"MI": 80.2,
|
59 |
+
"MC*": 31.1,
|
60 |
+
"MC_p": 28.1,
|
61 |
+
"MC": 18.5,
|
62 |
+
"Maintainability": 49.4
|
63 |
+
},
|
64 |
+
"efficiency": {
|
65 |
+
"E*": 39.6,
|
66 |
+
"E_p": 32.7,
|
67 |
+
"E_NI_T": 27.5,
|
68 |
+
"E_NI_S": 36.5,
|
69 |
+
"Efficiency": 32.0
|
70 |
+
},
|
71 |
+
"correctness": {
|
72 |
+
"Correctness": 44.7
|
73 |
+
},
|
74 |
+
"overall": {
|
75 |
+
"RACE Score": 42.8
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"CodeLlama-7b-Instruct-hf": {
|
79 |
+
"readability": {
|
80 |
+
"R*": 32.3,
|
81 |
+
"RN_p": 31.5,
|
82 |
+
"RN_if": 55.5,
|
83 |
+
"RN": 17.0,
|
84 |
+
"RL_p": 31.7,
|
85 |
+
"RL_if": 59.7,
|
86 |
+
"RL": 23.4,
|
87 |
+
"RC_p": 30.2,
|
88 |
+
"RC_if": 67.4,
|
89 |
+
"RC": 18.3,
|
90 |
+
"MBPP*": 43.1,
|
91 |
+
"Readability": 19.6
|
92 |
+
},
|
93 |
+
"maintainability": {
|
94 |
+
"MI*": 16.0,
|
95 |
+
"MI_p": 15.0,
|
96 |
+
"MI": 71.8,
|
97 |
+
"MC*": 12.2,
|
98 |
+
"MC_p": 10.9,
|
99 |
+
"MC": 7.2,
|
100 |
+
"Maintainability": 39.5
|
101 |
+
},
|
102 |
+
"efficiency": {
|
103 |
+
"E*": 15.8,
|
104 |
+
"E_p": 13.9,
|
105 |
+
"E_NI_T": 8.2,
|
106 |
+
"E_NI_S": 8.8,
|
107 |
+
"Efficiency": 8.5
|
108 |
+
},
|
109 |
+
"correctness": {
|
110 |
+
"Correctness": 23.9
|
111 |
+
},
|
112 |
+
"overall": {
|
113 |
+
"RACE Score": 22.9
|
114 |
+
}
|
115 |
+
},
|
116 |
+
"CodeLlama-7b-Python-hf": {
|
117 |
+
"readability": {
|
118 |
+
"R*": 29.3,
|
119 |
+
"RN_p": 29.5,
|
120 |
+
"RN_if": 66.4,
|
121 |
+
"RN": 20.4,
|
122 |
+
"RL_p": 30.1,
|
123 |
+
"RL_if": 76.6,
|
124 |
+
"RL": 25.8,
|
125 |
+
"RC_p": 24.7,
|
126 |
+
"RC_if": 42.1,
|
127 |
+
"RC": 11.6,
|
128 |
+
"MBPP*": 41.3,
|
129 |
+
"Readability": 19.3
|
130 |
+
},
|
131 |
+
"maintainability": {
|
132 |
+
"MI*": 11.0,
|
133 |
+
"MI_p": 10.0,
|
134 |
+
"MI": 79.4,
|
135 |
+
"MC*": 5.6,
|
136 |
+
"MC_p": 6.5,
|
137 |
+
"MC": 3.7,
|
138 |
+
"Maintainability": 41.6
|
139 |
+
},
|
140 |
+
"efficiency": {
|
141 |
+
"E*": 14.9,
|
142 |
+
"E_p": 15.8,
|
143 |
+
"E_NI_T": 14.3,
|
144 |
+
"E_NI_S": 14.4,
|
145 |
+
"Efficiency": 14.4
|
146 |
+
},
|
147 |
+
"correctness": {
|
148 |
+
"Correctness": 20.4
|
149 |
+
},
|
150 |
+
"overall": {
|
151 |
+
"RACE Score": 23.9
|
152 |
+
}
|
153 |
+
},
|
154 |
+
"CodeLlama-13b-Instruct-hf": {
|
155 |
+
"readability": {
|
156 |
+
"R*": 36.0,
|
157 |
+
"RN_p": 37.7,
|
158 |
+
"RN_if": 57.8,
|
159 |
+
"RN": 22.0,
|
160 |
+
"RL_p": 35.0,
|
161 |
+
"RL_if": 59.9,
|
162 |
+
"RL": 23.6,
|
163 |
+
"RC_p": 35.7,
|
164 |
+
"RC_if": 64.3,
|
165 |
+
"RC": 23.2,
|
166 |
+
"MBPP*": 40.7,
|
167 |
+
"Readability": 22.9
|
168 |
+
},
|
169 |
+
"maintainability": {
|
170 |
+
"MI*": 17.0,
|
171 |
+
"MI_p": 19.0,
|
172 |
+
"MI": 82.1,
|
173 |
+
"MC*": 10.6,
|
174 |
+
"MC_p": 13.1,
|
175 |
+
"MC": 7.6,
|
176 |
+
"Maintainability": 44.8
|
177 |
+
},
|
178 |
+
"efficiency": {
|
179 |
+
"E*": 17.8,
|
180 |
+
"E_p": 17.8,
|
181 |
+
"E_NI_T": 10.4,
|
182 |
+
"E_NI_S": 16.1,
|
183 |
+
"Efficiency": 13.2
|
184 |
+
},
|
185 |
+
"correctness": {
|
186 |
+
"Correctness": 24.4
|
187 |
+
},
|
188 |
+
"overall": {
|
189 |
+
"RACE Score": 26.4
|
190 |
+
}
|
191 |
+
},
|
192 |
+
"CodeLlama-13b-Python-hf": {
|
193 |
+
"readability": {
|
194 |
+
"R*": 40.2,
|
195 |
+
"RN_p": 35.0,
|
196 |
+
"RN_if": 61.3,
|
197 |
+
"RN": 22.4,
|
198 |
+
"RL_p": 34.8,
|
199 |
+
"RL_if": 83.5,
|
200 |
+
"RL": 30.9,
|
201 |
+
"RC_p": 30.2,
|
202 |
+
"RC_if": 60.7,
|
203 |
+
"RC": 20.4,
|
204 |
+
"MBPP*": 29.4,
|
205 |
+
"Readability": 24.6
|
206 |
+
},
|
207 |
+
"maintainability": {
|
208 |
+
"MI*": 16.0,
|
209 |
+
"MI_p": 15.0,
|
210 |
+
"MI": 78.6,
|
211 |
+
"MC*": 6.1,
|
212 |
+
"MC_p": 4.8,
|
213 |
+
"MC": 2.4,
|
214 |
+
"Maintainability": 40.5
|
215 |
+
},
|
216 |
+
"efficiency": {
|
217 |
+
"E*": 16.8,
|
218 |
+
"E_p": 17.8,
|
219 |
+
"E_NI_T": 13.8,
|
220 |
+
"E_NI_S": 14.7,
|
221 |
+
"Efficiency": 14.2
|
222 |
+
},
|
223 |
+
"correctness": {
|
224 |
+
"Correctness": 21.7
|
225 |
+
},
|
226 |
+
"overall": {
|
227 |
+
"RACE Score": 25.3
|
228 |
+
}
|
229 |
+
},
|
230 |
+
"CodeLlama-34b-Instruct-hf": {
|
231 |
+
"readability": {
|
232 |
+
"R*": 36.0,
|
233 |
+
"RN_p": 36.5,
|
234 |
+
"RN_if": 54.3,
|
235 |
+
"RN": 21.1,
|
236 |
+
"RL_p": 35.8,
|
237 |
+
"RL_if": 41.7,
|
238 |
+
"RL": 17.5,
|
239 |
+
"RC_p": 36.3,
|
240 |
+
"RC_if": 32.0,
|
241 |
+
"RC": 9.4,
|
242 |
+
"MBPP*": 45.8,
|
243 |
+
"Readability": 16.0
|
244 |
+
},
|
245 |
+
"maintainability": {
|
246 |
+
"MI*": 12.0,
|
247 |
+
"MI_p": 18.0,
|
248 |
+
"MI": 73.2,
|
249 |
+
"MC*": 15.6,
|
250 |
+
"MC_p": 14.2,
|
251 |
+
"MC": 8.5,
|
252 |
+
"Maintainability": 40.9
|
253 |
+
},
|
254 |
+
"efficiency": {
|
255 |
+
"E*": 20.8,
|
256 |
+
"E_p": 15.8,
|
257 |
+
"E_NI_T": 14.4,
|
258 |
+
"E_NI_S": 13.8,
|
259 |
+
"Efficiency": 14.1
|
260 |
+
},
|
261 |
+
"correctness": {
|
262 |
+
"Correctness": 26.0
|
263 |
+
},
|
264 |
+
"overall": {
|
265 |
+
"RACE Score": 24.2
|
266 |
+
}
|
267 |
+
},
|
268 |
+
"CodeLlama-34b-Python-hf": {
|
269 |
+
"readability": {
|
270 |
+
"R*": 31.7,
|
271 |
+
"RN_p": 27.2,
|
272 |
+
"RN_if": 66.9,
|
273 |
+
"RN": 18.6,
|
274 |
+
"RL_p": 32.5,
|
275 |
+
"RL_if": 73.2,
|
276 |
+
"RL": 26.7,
|
277 |
+
"RC_p": 27.8,
|
278 |
+
"RC_if": 39.4,
|
279 |
+
"RC": 6.7,
|
280 |
+
"MBPP*": 36.2,
|
281 |
+
"Readability": 17.3
|
282 |
+
},
|
283 |
+
"maintainability": {
|
284 |
+
"MI*": 3.0,
|
285 |
+
"MI_p": 2.0,
|
286 |
+
"MI": 85.3,
|
287 |
+
"MC*": 7.2,
|
288 |
+
"MC_p": 5.4,
|
289 |
+
"MC": 2.2,
|
290 |
+
"Maintainability": 43.8
|
291 |
+
},
|
292 |
+
"efficiency": {
|
293 |
+
"E*": 17.8,
|
294 |
+
"E_p": 11.9,
|
295 |
+
"E_NI_T": 12.0,
|
296 |
+
"E_NI_S": 14.4,
|
297 |
+
"Efficiency": 13.2
|
298 |
+
},
|
299 |
+
"correctness": {
|
300 |
+
"Correctness": 19.2
|
301 |
+
},
|
302 |
+
"overall": {
|
303 |
+
"RACE Score": 23.4
|
304 |
+
}
|
305 |
+
},
|
306 |
+
"deepseek-coder-6.7b-instruct": {
|
307 |
+
"readability": {
|
308 |
+
"R*": 65.2,
|
309 |
+
"RN_p": 65.5,
|
310 |
+
"RN_if": 67.2,
|
311 |
+
"RN": 44.4,
|
312 |
+
"RL_p": 61.2,
|
313 |
+
"RL_if": 73.6,
|
314 |
+
"RL": 46.6,
|
315 |
+
"RC_p": 61.2,
|
316 |
+
"RC_if": 65.5,
|
317 |
+
"RC": 42.0,
|
318 |
+
"MBPP*": 57.1,
|
319 |
+
"Readability": 44.3
|
320 |
+
},
|
321 |
+
"maintainability": {
|
322 |
+
"MI*": 26.0,
|
323 |
+
"MI_p": 25.0,
|
324 |
+
"MI": 79.3,
|
325 |
+
"MC*": 18.9,
|
326 |
+
"MC_p": 18.7,
|
327 |
+
"MC": 8.2,
|
328 |
+
"Maintainability": 43.8
|
329 |
+
},
|
330 |
+
"efficiency": {
|
331 |
+
"E*": 28.7,
|
332 |
+
"E_p": 30.7,
|
333 |
+
"E_NI_T": 27.1,
|
334 |
+
"E_NI_S": 30.0,
|
335 |
+
"Efficiency": 28.6
|
336 |
+
},
|
337 |
+
"correctness": {
|
338 |
+
"Correctness": 39.2
|
339 |
+
},
|
340 |
+
"overall": {
|
341 |
+
"RACE Score": 39.0
|
342 |
+
}
|
343 |
+
},
|
344 |
+
"deepseek-coder-7b-instruct-v1.5": {
|
345 |
+
"readability": {
|
346 |
+
"R*": 61.0,
|
347 |
+
"RN_p": 61.5,
|
348 |
+
"RN_if": 57.8,
|
349 |
+
"RN": 35.2,
|
350 |
+
"RL_p": 62.6,
|
351 |
+
"RL_if": 70.9,
|
352 |
+
"RL": 46.0,
|
353 |
+
"RC_p": 62.8,
|
354 |
+
"RC_if": 70.2,
|
355 |
+
"RC": 46.0,
|
356 |
+
"MBPP*": 59.3,
|
357 |
+
"Readability": 42.4
|
358 |
+
},
|
359 |
+
"maintainability": {
|
360 |
+
"MI*": 23.0,
|
361 |
+
"MI_p": 24.0,
|
362 |
+
"MI": 79.6,
|
363 |
+
"MC*": 23.3,
|
364 |
+
"MC_p": 20.9,
|
365 |
+
"MC": 8.9,
|
366 |
+
"Maintainability": 44.2
|
367 |
+
},
|
368 |
+
"efficiency": {
|
369 |
+
"E*": 32.7,
|
370 |
+
"E_p": 27.7,
|
371 |
+
"E_NI_T": 25.1,
|
372 |
+
"E_NI_S": 26.8,
|
373 |
+
"Efficiency": 26.0
|
374 |
+
},
|
375 |
+
"correctness": {
|
376 |
+
"Correctness": 39.9
|
377 |
+
},
|
378 |
+
"overall": {
|
379 |
+
"RACE Score": 38.1
|
380 |
+
}
|
381 |
+
},
|
382 |
+
"deepseek-coder-33b-instruct": {
|
383 |
+
"readability": {
|
384 |
+
"R*": 65.9,
|
385 |
+
"RN_p": 64.6,
|
386 |
+
"RN_if": 86.8,
|
387 |
+
"RN": 57.7,
|
388 |
+
"RL_p": 65.0,
|
389 |
+
"RL_if": 82.7,
|
390 |
+
"RL": 53.5,
|
391 |
+
"RC_p": 66.5,
|
392 |
+
"RC_if": 70.8,
|
393 |
+
"RC": 46.4,
|
394 |
+
"MBPP*": 61.9,
|
395 |
+
"Readability": 52.5
|
396 |
+
},
|
397 |
+
"maintainability": {
|
398 |
+
"MI*": 28.0,
|
399 |
+
"MI_p": 30.0,
|
400 |
+
"MI": 75.7,
|
401 |
+
"MC*": 22.2,
|
402 |
+
"MC_p": 27.6,
|
403 |
+
"MC": 11.3,
|
404 |
+
"Maintainability": 43.5
|
405 |
+
},
|
406 |
+
"efficiency": {
|
407 |
+
"E*": 45.5,
|
408 |
+
"E_p": 38.6,
|
409 |
+
"E_NI_T": 35.3,
|
410 |
+
"E_NI_S": 36.1,
|
411 |
+
"Efficiency": 35.7
|
412 |
+
},
|
413 |
+
"correctness": {
|
414 |
+
"Correctness": 44.7
|
415 |
+
},
|
416 |
+
"overall": {
|
417 |
+
"RACE Score": 44.1
|
418 |
+
}
|
419 |
+
},
|
420 |
+
"DeepSeek-Coder-V2-Lite-Instruct": {
|
421 |
+
"readability": {
|
422 |
+
"R*": 72.0,
|
423 |
+
"RN_p": 71.2,
|
424 |
+
"RN_if": 55.3,
|
425 |
+
"RN": 40.2,
|
426 |
+
"RL_p": 66.5,
|
427 |
+
"RL_if": 83.7,
|
428 |
+
"RL": 57.7,
|
429 |
+
"RC_p": 67.1,
|
430 |
+
"RC_if": 63.5,
|
431 |
+
"RC": 42.7,
|
432 |
+
"MBPP*": 62.7,
|
433 |
+
"Readability": 46.9
|
434 |
+
},
|
435 |
+
"maintainability": {
|
436 |
+
"MI*": 26.0,
|
437 |
+
"MI_p": 30.0,
|
438 |
+
"MI": 78.2,
|
439 |
+
"MC*": 44.4,
|
440 |
+
"MC_p": 44.3,
|
441 |
+
"MC": 19.8,
|
442 |
+
"Maintainability": 49.0
|
443 |
+
},
|
444 |
+
"efficiency": {
|
445 |
+
"E*": 49.5,
|
446 |
+
"E_p": 55.4,
|
447 |
+
"E_NI_T": 40.2,
|
448 |
+
"E_NI_S": 47.7,
|
449 |
+
"Efficiency": 44.0
|
450 |
+
},
|
451 |
+
"correctness": {
|
452 |
+
"Correctness": 50.9
|
453 |
+
},
|
454 |
+
"overall": {
|
455 |
+
"RACE Score": 47.7
|
456 |
+
}
|
457 |
+
},
|
458 |
+
"deepseek-coder": {
|
459 |
+
"readability": {
|
460 |
+
"R*": 73.8,
|
461 |
+
"RN_p": 75.3,
|
462 |
+
"RN_if": 91.8,
|
463 |
+
"RN": 70.0,
|
464 |
+
"RL_p": 75.2,
|
465 |
+
"RL_if": 88.4,
|
466 |
+
"RL": 67.1,
|
467 |
+
"RC_p": 76.5,
|
468 |
+
"RC_if": 74.1,
|
469 |
+
"RC": 58.5,
|
470 |
+
"MBPP*": 68.5,
|
471 |
+
"Readability": 65.2
|
472 |
+
},
|
473 |
+
"maintainability": {
|
474 |
+
"MI*": 35.0,
|
475 |
+
"MI_p": 38.0,
|
476 |
+
"MI": 77.3,
|
477 |
+
"MC*": 58.9,
|
478 |
+
"MC_p": 58.9,
|
479 |
+
"MC": 35.0,
|
480 |
+
"Maintainability": 56.1
|
481 |
+
},
|
482 |
+
"efficiency": {
|
483 |
+
"E*": 57.3,
|
484 |
+
"E_p": 53.5,
|
485 |
+
"E_NI_T": 41.1,
|
486 |
+
"E_NI_S": 49.4,
|
487 |
+
"Efficiency": 45.2
|
488 |
+
},
|
489 |
+
"correctness": {
|
490 |
+
"Correctness": 58.7
|
491 |
+
},
|
492 |
+
"overall": {
|
493 |
+
"RACE Score": 56.3
|
494 |
+
}
|
495 |
+
},
|
496 |
+
"WizardCoder-Python-7B-V1.0": {
|
497 |
+
"readability": {
|
498 |
+
"R*": 34.8,
|
499 |
+
"RN_p": 35.8,
|
500 |
+
"RN_if": 58.3,
|
501 |
+
"RN": 22.4,
|
502 |
+
"RL_p": 34.3,
|
503 |
+
"RL_if": 79.7,
|
504 |
+
"RL": 28.0,
|
505 |
+
"RC_p": 35.4,
|
506 |
+
"RC_if": 25.0,
|
507 |
+
"RC": 8.6,
|
508 |
+
"MBPP*": 41.8,
|
509 |
+
"Readability": 19.7
|
510 |
+
},
|
511 |
+
"maintainability": {
|
512 |
+
"MI*": 19.0,
|
513 |
+
"MI_p": 23.0,
|
514 |
+
"MI": 79.3,
|
515 |
+
"MC*": 10.6,
|
516 |
+
"MC_p": 9.8,
|
517 |
+
"MC": 7.2,
|
518 |
+
"Maintainability": 43.2
|
519 |
+
},
|
520 |
+
"efficiency": {
|
521 |
+
"E*": 19.8,
|
522 |
+
"E_p": 19.8,
|
523 |
+
"E_NI_T": 15.3,
|
524 |
+
"E_NI_S": 16.7,
|
525 |
+
"Efficiency": 16.0
|
526 |
+
},
|
527 |
+
"correctness": {
|
528 |
+
"Correctness": 25.2
|
529 |
+
},
|
530 |
+
"overall": {
|
531 |
+
"RACE Score": 26.0
|
532 |
+
}
|
533 |
+
},
|
534 |
+
"WizardCoder-Python-13B-V1.0": {
|
535 |
+
"readability": {
|
536 |
+
"R*": 36.0,
|
537 |
+
"RN_p": 38.2,
|
538 |
+
"RN_if": 58.4,
|
539 |
+
"RN": 23.1,
|
540 |
+
"RL_p": 38.4,
|
541 |
+
"RL_if": 83.1,
|
542 |
+
"RL": 33.1,
|
543 |
+
"RC_p": 43.6,
|
544 |
+
"RC_if": 59.8,
|
545 |
+
"RC": 27.4,
|
546 |
+
"MBPP*": 42.1,
|
547 |
+
"Readability": 27.9
|
548 |
+
},
|
549 |
+
"maintainability": {
|
550 |
+
"MI*": 20.0,
|
551 |
+
"MI_p": 21.0,
|
552 |
+
"MI": 78.8,
|
553 |
+
"MC*": 12.8,
|
554 |
+
"MC_p": 12.8,
|
555 |
+
"MC": 8.5,
|
556 |
+
"Maintainability": 43.6
|
557 |
+
},
|
558 |
+
"efficiency": {
|
559 |
+
"E*": 20.8,
|
560 |
+
"E_p": 18.8,
|
561 |
+
"E_NI_T": 16.2,
|
562 |
+
"E_NI_S": 19.8,
|
563 |
+
"Efficiency": 18.0
|
564 |
+
},
|
565 |
+
"correctness": {
|
566 |
+
"Correctness": 26.3
|
567 |
+
},
|
568 |
+
"overall": {
|
569 |
+
"RACE Score": 29.0
|
570 |
+
}
|
571 |
+
},
|
572 |
+
"WizardCoder-15B-V1.0": {
|
573 |
+
"readability": {
|
574 |
+
"R*": 38.4,
|
575 |
+
"RN_p": 38.7,
|
576 |
+
"RN_if": 59.0,
|
577 |
+
"RN": 23.2,
|
578 |
+
"RL_p": 41.9,
|
579 |
+
"RL_if": 64.8,
|
580 |
+
"RL": 27.8,
|
581 |
+
"RC_p": 40.0,
|
582 |
+
"RC_if": 57.3,
|
583 |
+
"RC": 24.4,
|
584 |
+
"MBPP*": 46.3,
|
585 |
+
"Readability": 25.1
|
586 |
+
},
|
587 |
+
"maintainability": {
|
588 |
+
"MI*": 22.0,
|
589 |
+
"MI_p": 21.0,
|
590 |
+
"MI": 80.0,
|
591 |
+
"MC*": 11.7,
|
592 |
+
"MC_p": 11.5,
|
593 |
+
"MC": 7.8,
|
594 |
+
"Maintainability": 43.9
|
595 |
+
},
|
596 |
+
"efficiency": {
|
597 |
+
"E*": 21.8,
|
598 |
+
"E_p": 22.8,
|
599 |
+
"E_NI_T": 21.8,
|
600 |
+
"E_NI_S": 24.2,
|
601 |
+
"Efficiency": 23.0
|
602 |
+
},
|
603 |
+
"correctness": {
|
604 |
+
"Correctness": 28.0
|
605 |
+
},
|
606 |
+
"overall": {
|
607 |
+
"RACE Score": 30.0
|
608 |
+
}
|
609 |
+
},
|
610 |
+
"WizardCoder-33B-V1.1": {
|
611 |
+
"readability": {
|
612 |
+
"R*": 58.5,
|
613 |
+
"RN_p": 58.8,
|
614 |
+
"RN_if": 65.4,
|
615 |
+
"RN": 39.9,
|
616 |
+
"RL_p": 62.2,
|
617 |
+
"RL_if": 76.0,
|
618 |
+
"RL": 47.6,
|
619 |
+
"RC_p": 58.8,
|
620 |
+
"RC_if": 61.0,
|
621 |
+
"RC": 37.2,
|
622 |
+
"MBPP*": 64.6,
|
623 |
+
"Readability": 41.6
|
624 |
+
},
|
625 |
+
"maintainability": {
|
626 |
+
"MI*": 34.0,
|
627 |
+
"MI_p": 34.0,
|
628 |
+
"MI": 71.2,
|
629 |
+
"MC*": 26.1,
|
630 |
+
"MC_p": 25.0,
|
631 |
+
"MC": 9.3,
|
632 |
+
"Maintainability": 40.2
|
633 |
+
},
|
634 |
+
"efficiency": {
|
635 |
+
"E*": 38.6,
|
636 |
+
"E_p": 35.6,
|
637 |
+
"E_NI_T": 33.9,
|
638 |
+
"E_NI_S": 34.9,
|
639 |
+
"Efficiency": 34.4
|
640 |
+
},
|
641 |
+
"correctness": {
|
642 |
+
"Correctness": 44.4
|
643 |
+
},
|
644 |
+
"overall": {
|
645 |
+
"RACE Score": 40.1
|
646 |
+
}
|
647 |
+
},
|
648 |
+
"CodeQwen1.5-7B-Chat": {
|
649 |
+
"readability": {
|
650 |
+
"R*": 76.2,
|
651 |
+
"RN_p": 76.8,
|
652 |
+
"RN_if": 60.8,
|
653 |
+
"RN": 47.0,
|
654 |
+
"RL_p": 73.4,
|
655 |
+
"RL_if": 60.8,
|
656 |
+
"RL": 47.0,
|
657 |
+
"RC_p": 74.7,
|
658 |
+
"RC_if": 71.3,
|
659 |
+
"RC": 54.2,
|
660 |
+
"MBPP*": 60.3,
|
661 |
+
"Readability": 49.4
|
662 |
+
},
|
663 |
+
"maintainability": {
|
664 |
+
"MI*": 22.0,
|
665 |
+
"MI_p": 22.0,
|
666 |
+
"MI": 82.3,
|
667 |
+
"MC*": 33.3,
|
668 |
+
"MC_p": 32.6,
|
669 |
+
"MC": 13.0,
|
670 |
+
"Maintainability": 47.6
|
671 |
+
},
|
672 |
+
"efficiency": {
|
673 |
+
"E*": 39.6,
|
674 |
+
"E_p": 38.6,
|
675 |
+
"E_NI_T": 30.7,
|
676 |
+
"E_NI_S": 37.7,
|
677 |
+
"Efficiency": 34.2
|
678 |
+
},
|
679 |
+
"correctness": {
|
680 |
+
"Correctness": 46.3
|
681 |
+
},
|
682 |
+
"overall": {
|
683 |
+
"RACE Score": 44.4
|
684 |
+
}
|
685 |
+
}
|
686 |
+
}
|
app.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
from css_html import custom_css
|
7 |
+
from text_content import ABOUT_TEXT, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL, ACKNOWLEDGEMENT_TEXT, NOTES_TEXT
|
8 |
+
from utils import (
|
9 |
+
AutoEvalColumn,
|
10 |
+
fields,
|
11 |
+
)
|
12 |
+
|
13 |
+
# Location of the per-model evaluation results produced by the RACE benchmark.
result_path = './RESULTS.json'
with open(result_path, 'r') as f:
    data = json.load(f)

# Emoji prefix for each evaluation dimension.  The prefixed metric names must
# match the column names declared in utils.AutoEvalColumn, which app.py uses
# to build the leaderboard columns.
_DIMENSION_SYMBOLS = {
    'readability': 'π',
    'maintainability': 'π¨',
    'efficiency': 'π',
    'correctness': 'β…',
    'overall': 'π―',
}

# Flatten the nested JSON ({model: {dimension: {metric: value}}}) into one
# row per model, with every metric keyed as "<symbol> <metric>".
rows = []
for model_name, dimensions in data.items():
    row = {"model": model_name}
    for dimension, metrics in dimensions.items():
        # Dict lookup instead of an if/elif chain: an unknown dimension now
        # raises KeyError instead of silently reusing the previous symbol.
        symbol = _DIMENSION_SYMBOLS[dimension]
        for key, value in metrics.items():
            row[f'{symbol} {key}'] = value
    rows.append(row)

# Rank models by their overall RACE score, best first.
df = pd.DataFrame(rows)
df = df.sort_values(by='π― RACE Score', ascending=False)
|
39 |
+
|
40 |
+
# Column metadata derived from the AutoEvalColumn declarations.
# "Visible" columns are everything not marked hidden; the LITE variants are
# the subset shown before the user ticks any extra checkboxes.
_visible_columns = [c for c in fields(AutoEvalColumn) if not c.hidden]
COLS = [c.name for c in _visible_columns]
TYPES = [c.type for c in _visible_columns]

_default_columns = [c for c in _visible_columns if c.displayed_by_default]
COLS_LITE = [c.name for c in _default_columns]
TYPES_LITE = [c.type for c in _default_columns]
|
48 |
+
|
49 |
+
def select_columns(df, columns):
    """Return *df* restricted to the model column plus the requested columns.

    Iterating over the module-level COLS (rather than *columns*) preserves
    the canonical column ordering no matter the order in which checkboxes
    were ticked in the UI.
    """
    selected = [AutoEvalColumn.model.name]
    for candidate in COLS:
        if candidate in df.columns and candidate in columns:
            selected.append(candidate)
    return df[selected]
|
58 |
+
|
59 |
+
# ---- Gradio UI -----------------------------------------------------------
demo = gr.Blocks(css=custom_css)
with demo:
    # Page header: title plus a short description of the benchmark.
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> ποΈRACE Leaderboard</h1></div>\
<br>\
<p>Based on the ποΈRACE benchmark, we demonstrated the ability of different LLMs to generate code that is <b><i>correct</i></b> and <b><i>meets the requirements of real-world development scenarios</i></b>.</p>
<p>Model details about how to evalute the LLM are available in the <a href="https://github.com/test/test">ποΈRACE GitHub repository</a>.</p>
""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("π Evaluation Table", id=0):
                    with gr.Column():
                        # Checkbox group controlling which optional columns
                        # are displayed; the model column is never selectable
                        # because it is always shown.
                        with gr.Accordion("β¬ Hidden Columns", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[
                                    c
                                    for c in COLS
                                    if c
                                    not in [
                                        AutoEvalColumn.model.name,
                                    ]
                                ],
                                value=[
                                    c
                                    for c in COLS_LITE
                                    if c
                                    not in [
                                        AutoEvalColumn.model.name,
                                    ]
                                ],
                                label="",
                                elem_id="column-select",
                                interactive=True,
                            )

                        # The table actually rendered to the user: model column
                        # plus whichever columns are currently ticked.
                        leaderboard_df = gr.components.Dataframe(
                            value=df[
                                [
                                    AutoEvalColumn.model.name,
                                ]
                                + shown_columns.value
                            ],
                            headers=COLS,
                            datatype=TYPES,
                            elem_id="leaderboard-table",
                            interactive=False,
                        )

                        # Invisible, unfiltered copy of the full dataframe used
                        # as the stable source when re-filtering columns.
                        hidden_leaderboard_df = gr.components.Dataframe(
                            value=df,
                            headers=COLS,
                            datatype=["str" for _ in range(len(COLS))],
                            visible=False,
                        )

                        # Re-filter the visible table whenever the checkbox
                        # selection changes.
                        shown_columns.change(
                            select_columns,
                            [hidden_leaderboard_df, shown_columns],
                            leaderboard_df,
                        )

                        gr.Markdown(NOTES_TEXT, elem_classes="markdown-text")

                with gr.TabItem("π About", id=1):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

    # Collapsible citation snippet with a copy button.
    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )

    with gr.Row():
        with gr.Accordion("π Acknowledgement", open=False):
            gr.Markdown(ACKNOWLEDGEMENT_TEXT)

demo.launch()
|
css_html.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
custom_css = """
|
2 |
+
#changelog-text {
|
3 |
+
font-size: 16px !important;
|
4 |
+
}
|
5 |
+
#changelog-text h2 {
|
6 |
+
font-size: 18px !important;
|
7 |
+
}
|
8 |
+
.markdown-text {
|
9 |
+
font-size: 16px !important;
|
10 |
+
}
|
11 |
+
#models-to-add-text {
|
12 |
+
font-size: 18px !important;
|
13 |
+
}
|
14 |
+
#citation-button span {
|
15 |
+
font-size: 16px !important;
|
16 |
+
}
|
17 |
+
#citation-button textarea {
|
18 |
+
font-size: 16px !important;
|
19 |
+
}
|
20 |
+
#citation-button > label > button {
|
21 |
+
margin: 6px;
|
22 |
+
transform: scale(1.3);
|
23 |
+
}
|
24 |
+
#leaderboard-table {
|
25 |
+
margin-top: 15px
|
26 |
+
}
|
27 |
+
#leaderboard-table-lite {
|
28 |
+
margin-top: 15px
|
29 |
+
}
|
30 |
+
#search-bar-table-box > div:first-child {
|
31 |
+
background: none;
|
32 |
+
border: none;
|
33 |
+
}
|
34 |
+
|
35 |
+
#search-bar {
|
36 |
+
padding: 0px;
|
37 |
+
}
|
38 |
+
/* Hides the final AutoEvalColumn */
|
39 |
+
#llm-benchmark-tab-table table td:last-child,
|
40 |
+
#llm-benchmark-tab-table table th:last-child {
|
41 |
+
display: none;
|
42 |
+
}
|
43 |
+
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
44 |
+
table td:first-child,
|
45 |
+
table th:first-child {
|
46 |
+
max-width: 400px;
|
47 |
+
overflow: auto;
|
48 |
+
white-space: nowrap;
|
49 |
+
}
|
50 |
+
.tab-buttons button {
|
51 |
+
font-size: 20px;
|
52 |
+
}
|
53 |
+
#scale-logo {
|
54 |
+
border-style: none !important;
|
55 |
+
box-shadow: none;
|
56 |
+
display: block;
|
57 |
+
margin-left: auto;
|
58 |
+
margin-right: auto;
|
59 |
+
max-width: 600px;
|
60 |
+
}
|
61 |
+
#scale-logo .download {
|
62 |
+
display: none;
|
63 |
+
}
|
64 |
+
"""
|
text_content.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# User-facing markdown/text constants rendered by app.py.
# Fixes: "baesline" -> "baseline" (x4) and "Inspired from" -> "Inspired by".

ABOUT_TEXT = """# What is RACE benchmark?
RACE is a multi-dimensional benchmark for code generation that focuses on **R**eadability, m**A**intainability, **C**orrectness, and **E**fficiency.
Its goal is to evaluate LLM's ability to generate code that is correct and meets the requirements of real-world development scenarios.
The benchmark is designed with various real-world demands across different **_demand-dependent_** dimensions, making it more applicable to practical scenarios.

# What are the specific aspects to be evaluated?
We have summarized representative influencing factors in real-world scenarios for different dimensions and designed various requirements for each factor.
These have been incorporated into the task description to prompt the LLM to generate code that is correct and meets the specified requirements.
The specific factors are as follows:
- **Readability**: The code should be easy to read and understand.
    - `Comment`
    - `Naming Convention`
    - `Code Length`
- **Maintainability**: The code should be easy to maintain and extend.
    - `MI Metric`
    - `Modularity`
- **Efficiency**: The code should be efficient in terms of time and space complexity.
    - `Time Complexity`
    - `Space Complexity`

# How to evaluate?
To facilitate evaluation on the RACE benchmark, we provide the evaluation data and easy-to-use evaluation scripts in our ποΈRACE GitHub repository.
Additionally, factors involving execution-based evaluation are conducted in a virtual environment to ensure evaluation security.

# Contact
If you have any questions, feel free to reach out to us at [[email protected]](mailto:[email protected]).

# Citation Information
```bibtex

```
"""

# Label shown above the citation textbox in app.py.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX snippet shown in the citation textbox (currently empty).
CITATION_BUTTON_TEXT = r"""

"""

ACKNOWLEDGEMENT_TEXT = """
Inspired by the [π€ Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
"""


# Explanatory notes rendered below the leaderboard table.
NOTES_TEXT = """
**Notes:**
- `π― RACE Score` denotes the final evaluation result based on ποΈRACE benchmark, which is the average of the scores in the four dimensions: `β… Correctness`, `π Readability`, `π¨ Maintainability`, and `π Efficiency`.
- All fine-grained evaluation results are provided in `β¬ Hidden Columns`. `π R` denotes code **R**eadability, `π¨ M` denotes code **M**aintainability, and `π E` denotes code **E**fficiency. `*` denotes the correctness of the code in the corresponding dimension. More details about the abbreviations are as follows:
    - `π R*`: The code accuracy (baseline).
    - `π RN`: The proportion of code that is both functionally correct and follows customized instructions related to `Naming Convention`.
    - `π RL`: The proportion of code that is both functionally correct and follows customized instructions related to `Code Length`.
    - `π RC`: The proportion of code that is both functionally correct and follows customized instructions related to `Comment`.
    - `π¨ MI*`: The code accuracy related to `Maintainability Index` (baseline).
    - `π¨ MI`: The proportion of code that is both functionally correct and follows customized instructions related to `MI Metric`.
    - `π¨ MC*`: The code accuracy related to `Modularity` (baseline).
    - `π¨ MC`: The proportion of code that is both functionally correct and follows customized instructions related to `Modularity`.
    - `π E*`: The code accuracy (baseline).
    - `π E_NI_T`: The proportion of code that is both functionally correct and follows customized instructions related to `Time Complexity`.
    - `π E_NI_S`: The proportion of code that is both functionally correct and follows customized instructions related to `Space Complexity`.

- Regarding the types of evaluation results, `π¨ MI`, `π E_NI_T`, and `π E_NI_S` are scalar values ranging from 0 to 100, while the remaining metrics are percentages.
- For more explanation check the π About section.
"""
|
utils.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
|
3 |
+
|
4 |
+
@dataclass
class ColumnContent:
    """Declarative description of one leaderboard column.

    Instances are declared as class attributes of AutoEvalColumn and
    collected via ``fields()`` to build the column lists in app.py.
    """

    name: str  # column header shown in the leaderboard table
    type: str  # Gradio datatype for the column (e.g. "markdown", "number")
    displayed_by_default: bool  # True if shown before the user ticks any checkbox
    hidden: bool = False  # True if never shown nor selectable
|
10 |
+
|
11 |
+
|
12 |
+
def fields(raw_class):
    """Return the values of all non-dunder class attributes of *raw_class*.

    Mirrors dataclasses.fields in spirit, but works on plain class
    attributes, in declaration order.
    """
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
|
16 |
+
|
17 |
+
|
18 |
+
@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    """Registry of every leaderboard column, consumed via ``fields()``.

    NOTE(review): the attributes below carry no type annotations, so the
    @dataclass decorator creates no instance fields; the class acts purely
    as a namespace of ColumnContent declarations.
    """

    # Always-visible columns (displayed_by_default=True).
    model = ColumnContent("model", "markdown", True)
    score = ColumnContent("π― RACE Score", "number", True)
    c_0 = ColumnContent("β… Correctness", "number", True)
    r_0 = ColumnContent("π Readability", "number", True)
    # Fine-grained readability metrics, hidden until selected.
    r_1 = ColumnContent("π R*", "number", False)
    r_2 = ColumnContent("π RN", "number", False)
    r_3 = ColumnContent("π RL", "number", False)
    r_4 = ColumnContent("π RC", "number", False)
    m_0 = ColumnContent("π¨ Maintainability", "number", True)
    # Fine-grained maintainability metrics, hidden until selected.
    m_1 = ColumnContent("π¨ MI*", "number", False)
    m_2 = ColumnContent("π¨ MI", "number", False)
    m_3 = ColumnContent("π¨ MC*", "number", False)
    m_4 = ColumnContent("π¨ MC", "number", False)
    e_0 = ColumnContent("π Efficiency", "number", True)
    # Fine-grained efficiency metrics, hidden until selected.
    e_1 = ColumnContent("π E*", "number", False)
    e_2 = ColumnContent("π E_NI_T", "number", False)
    e_3 = ColumnContent("π E_NI_S", "number", False)
|
37 |
+
|