陈俊杰 committed on
Commit
167bc7e
1 Parent(s): 4a235a0
Files changed (1) hide show
  1. app.py +75 -81
app.py CHANGED
@@ -198,98 +198,71 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
198
  <p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
199
  </p>
200
  """, unsafe_allow_html=True)
201
- index = pd.MultiIndex.from_tuples([
202
- ('', 'teamId'),
203
- ('', 'methods'),
204
- ('', 'overall'),
205
- ('Dialogue Generation', 'accuracy'),
206
- ('Dialogue Generation', 'kendall'),
207
- ('Dialogue Generation', 'spearman'),
208
- # ('Text Expansion', 'accuracy'),
209
- # ('Text Expansion', 'kendall\'s tau'),
210
- # ('Text Expansion', 'spearman'),
211
- # ('Summary Generation', 'accuracy'),
212
- # ('Summary Generation', 'kendall\'s tau'),
213
- # ('Summary Generation', 'spearman'),
214
- # ('Non-Factoid QA', 'accuracy'),
215
- # ('Non-Factoid QA', 'kendall\'s tau'),
216
- # ('Non-Factoid QA', 'spearman')
217
- ])
218
-
219
- data = {
220
- ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
221
- ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
222
- ('', 'overall'): [0,0,0,0],
223
- ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
224
- ('Dialogue Generation', 'kendall'): [0.3243, 0.1739, 0.3042, 0.4167],
225
- ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
226
- # ('Text Expansion', 'accuracy'): [0.5107, 0.5050, 0.5461, 0.5581],
227
- # ('Text Expansion', 'kendall\'s tau'): [0.1281, 0.0635, 0.2716, 0.3864],
228
- # ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
229
- # ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
230
- # ('Summary Generation', 'kendall\'s tau'): [0.3957, 0.2688, 0.5092, 0.5001],
231
- # ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
232
- # ('Non-Factoid QA', 'accuracy'): [0.5935, 0.5817, 0.7000, 0.7203],
233
- # ('Non-Factoid QA', 'kendall\'s tau'): [0.2332, 0.2389, 0.4440, 0.4235],
234
- # ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
235
  }
236
- # overall = [0, 0, 0, 0]
237
- # for d in data:
238
- # if d != ('', 'teamId') and d != ('', 'methods') and d != ('', 'overall'):
239
- # for i in range(4):
240
- # overall[i] += data[d][i]
241
- # overall = [i / (3*4) for i in overall]
242
- # data[('', 'overall')] = overall
243
- # for d in data:
244
- # if d != ('', 'teamId') and d != ('', 'methods'):
245
- # for col in range(len(data[d])):
246
- # data[d][col] = "{:.4f}".format(data[d][col])
247
- # print(data)
248
- st.dataframe(data, use_container_width=True)
249
- # # teamId 唯一标识码
250
- # DG = {
251
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
252
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
253
- # "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
254
- # "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
255
- # "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
256
- # }
257
 
258
- # df1 = pd.DataFrame(DG)
 
 
 
 
 
 
 
 
259
 
260
- # TE = {
261
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
262
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
263
- # "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
264
- # "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
265
- # "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
266
- # }
267
- # df2 = pd.DataFrame(TE)
268
 
269
- # SG = {
270
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
271
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
272
- # "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
273
- # "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
274
- # "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
275
- # }
276
- # df3 = pd.DataFrame(SG)
277
 
278
- # NFQA = {
279
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
280
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
281
- # "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
282
- # "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
283
- # "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
284
- # }
285
- # df4 = pd.DataFrame(NFQA)
286
 
287
  # df = [df1, df2, df3, df4]
288
  # for d in df:
289
  # for col in d.select_dtypes(include=['float64', 'int64']).columns:
290
  # d[col] = d[col].apply(lambda x: f"{x:.4f}")
291
 
292
- # 创建标签页
293
  # tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
294
 
295
  # with tab1:
@@ -307,6 +280,27 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
307
  # with tab4:
308
  # st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
309
  # st.dataframe(df4, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
311
  # 获取北京时间
312
  time_placeholder = st.empty()
 
198
  <p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
199
  </p>
200
  """, unsafe_allow_html=True)
201
+ df = {
202
+ "TeamId": ["baseline", "baseline", "baseline", "baseline"],
203
+ "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
204
+ "Average (all 4 datatsets)": [],
205
+ "Average (Dialogue Generation)": [],
206
+ "Accuracy (Dialogue Generation)": [],
207
+ "Kendall's Tau (Dialogue Generation)": [],
208
+ "Spearman (Dialogue Generation)": [],
209
+ "Average (Text Expansion)": [],
210
+ "Accuracy (Text Expansion)": [],
211
+ "Kendall's Tau (Text Expansion)": [],
212
+ "Spearman (Text Expansion)": [],
213
+ "Average (Summary Generation)": [],
214
+ "Accuracy (Summary Generation)": [],
215
+ "Kendall's Tau (Summary Generation)": [],
216
+ "Spearman (Summary Generation)": [],
217
+ "Average (Non-Factoid QA)": [],
218
+ "Accuracy (Non-Factoid QA)": [],
219
+ "Kendall's Tau (Non-Factoid QA)": [],
220
+ "Spearman (Non-Factoid QA)": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
+ # teamId 唯一标识码
224
+ DG = {
225
+ "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
226
+ "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
227
+ "Accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
228
+ "Kendall's Tau": [0.3243, 0.1739, 0.3042, 0.4167],
229
+ "Spearman": [0.3505, 0.1857, 0.3264, 0.4512]
230
+ }
231
+ df1 = pd.DataFrame(DG)
232
 
233
+ TE = {
234
+ "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
235
+ "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
236
+ "Accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
237
+ "Kendall's Tau": [0.1281, 0.0635, 0.2716, 0.3864],
238
+ "Spearman": [0.1352, 0.0667, 0.2867, 0.4157]
239
+ }
240
+ df2 = pd.DataFrame(TE)
241
 
242
+ SG = {
243
+ "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
244
+ "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
245
+ "Accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
246
+ "Kendall's Tau": [0.3957, 0.2688, 0.5092, 0.5001],
247
+ "Spearman": [0.4188, 0.2817, 0.5403, 0.5405],
248
+ }
249
+ df3 = pd.DataFrame(SG)
250
 
251
+ NFQA = {
252
+ "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
253
+ "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
254
+ "Accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
255
+ "Kendall's Tau": [0.2332, 0.2389, 0.4440, 0.4235],
256
+ "Spearman": [0.2443, 0.2492, 0.4630, 0.4511]
257
+ }
258
+ df4 = pd.DataFrame(NFQA)
259
 
260
  # df = [df1, df2, df3, df4]
261
  # for d in df:
262
  # for col in d.select_dtypes(include=['float64', 'int64']).columns:
263
  # d[col] = d[col].apply(lambda x: f"{x:.4f}")
264
 
265
+ # # 创建标签页
266
  # tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
267
 
268
  # with tab1:
 
280
  # with tab4:
281
  # st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
282
  # st.dataframe(df4, use_container_width=True)
283
+
284
+ data = [DG, NFQA, SG, TE]
285
+ task = ["Dialogue Generation", "Non-Factoid QA", "Summary Generation", "Text Expansion"]
286
+ metric = ["Accuracy", "Kendall's Tau", "Spearman"]
287
+
288
+ overall_total = [0] * len(df["TeamId"])
289
+ for i, d in enumerate(data): # 每种数据集
290
+ total = [0] * len(df["TeamId"]) # 长度初始化为方法数
291
+ for j in range(len(metric)): # 每种指标
292
+ index = f"{metric[j]} ({task[i]})"
293
+ df[index] = d[metric[j]]
294
+ for k in range(len[df["TeamId"]]):
295
+ total[k] += d[metric[j]][k]
296
+ average_index = f"Average ({task[i]})"
297
+ df[average_index] = [k / len(metric) for k in total]
298
+ for k in range(len[df["TeamId"]]):
299
+ overall_total[k] += df[average_index][k]
300
+
301
+ df["Average (all 4 datatsets)"] = [k / len(task) for k in overall_total]
302
+
303
+ st.dataframe(pd.DataFrame(df), use_container_width=True)
304
  st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
305
  # 获取北京时间
306
  time_placeholder = st.empty()