陈俊杰
committed on
Commit
•
167bc7e
1
Parent(s):
4a235a0
cjj-table
Browse files
app.py
CHANGED
@@ -198,98 +198,71 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
|
|
198 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
199 |
</p>
|
200 |
""", unsafe_allow_html=True)
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
(
|
205 |
-
(
|
206 |
-
(
|
207 |
-
(
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
(
|
221 |
-
('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
222 |
-
('', 'overall'): [0,0,0,0],
|
223 |
-
('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
|
224 |
-
('Dialogue Generation', 'kendall'): [0.3243, 0.1739, 0.3042, 0.4167],
|
225 |
-
('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
|
226 |
-
# ('Text Expansion', 'accuracy'): [0.5107, 0.5050, 0.5461, 0.5581],
|
227 |
-
# ('Text Expansion', 'kendall\'s tau'): [0.1281, 0.0635, 0.2716, 0.3864],
|
228 |
-
# ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
|
229 |
-
# ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
|
230 |
-
# ('Summary Generation', 'kendall\'s tau'): [0.3957, 0.2688, 0.5092, 0.5001],
|
231 |
-
# ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
|
232 |
-
# ('Non-Factoid QA', 'accuracy'): [0.5935, 0.5817, 0.7000, 0.7203],
|
233 |
-
# ('Non-Factoid QA', 'kendall\'s tau'): [0.2332, 0.2389, 0.4440, 0.4235],
|
234 |
-
# ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
|
235 |
}
|
236 |
-
# overall = [0, 0, 0, 0]
|
237 |
-
# for d in data:
|
238 |
-
# if d != ('', 'teamId') and d != ('', 'methods') and d != ('', 'overall'):
|
239 |
-
# for i in range(4):
|
240 |
-
# overall[i] += data[d][i]
|
241 |
-
# overall = [i / (3*4) for i in overall]
|
242 |
-
# data[('', 'overall')] = overall
|
243 |
-
# for d in data:
|
244 |
-
# if d != ('', 'teamId') and d != ('', 'methods'):
|
245 |
-
# for col in range(len(data[d])):
|
246 |
-
# data[d][col] = "{:.4f}".format(data[d][col])
|
247 |
-
# print(data)
|
248 |
-
st.dataframe(data, use_container_width=True)
|
249 |
-
# # teamId 唯一标识码
|
250 |
-
# DG = {
|
251 |
-
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
252 |
-
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
253 |
-
# "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
|
254 |
-
# "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
|
255 |
-
# "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
|
256 |
-
# }
|
257 |
|
258 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
|
287 |
# df = [df1, df2, df3, df4]
|
288 |
# for d in df:
|
289 |
# for col in d.select_dtypes(include=['float64', 'int64']).columns:
|
290 |
# d[col] = d[col].apply(lambda x: f"{x:.4f}")
|
291 |
|
292 |
-
# 创建标签页
|
293 |
# tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
294 |
|
295 |
# with tab1:
|
@@ -307,6 +280,27 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
|
|
307 |
# with tab4:
|
308 |
# st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
|
309 |
# st.dataframe(df4, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
|
311 |
# 获取北京时间
|
312 |
time_placeholder = st.empty()
|
|
|
198 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
199 |
</p>
|
200 |
""", unsafe_allow_html=True)
|
201 |
+
# Leaderboard skeleton: two identity columns plus, for each task, an Average
# column and one column per metric. Score columns start empty and are filled
# in later from the per-task baseline tables.
# NOTE: the "datatsets" typo is preserved on purpose — downstream code writes
# to this exact key.
df = {
    "TeamId": ["baseline"] * 4,
    "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "Average (all 4 datatsets)": [],
}
for _task_name in ("Dialogue Generation", "Text Expansion",
                   "Summary Generation", "Non-Factoid QA"):
    for _col in ("Average", "Accuracy", "Kendall's Tau", "Spearman"):
        df[f"{_col} ({_task_name})"] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
+
# TeamId is the unique identifier of a submission; the four rows here are the
# organizer-provided baseline runs.
def _baseline_scores(accuracy, kendall, spearman):
    """Build one task's baseline score table (four baseline methods)."""
    return {
        "TeamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
        "Methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
        "Accuracy": accuracy,
        "Kendall's Tau": kendall,
        "Spearman": spearman,
    }

# Dialogue Generation
DG = _baseline_scores(
    [0.5806, 0.5483, 0.6001, 0.6472],
    [0.3243, 0.1739, 0.3042, 0.4167],
    [0.3505, 0.1857, 0.3264, 0.4512],
)
df1 = pd.DataFrame(DG)

# Text Expansion
TE = _baseline_scores(
    [0.5107, 0.5050, 0.5461, 0.5581],
    [0.1281, 0.0635, 0.2716, 0.3864],
    [0.1352, 0.0667, 0.2867, 0.4157],
)
df2 = pd.DataFrame(TE)

# Summary Generation
SG = _baseline_scores(
    [0.6504, 0.6014, 0.7162, 0.7441],
    [0.3957, 0.2688, 0.5092, 0.5001],
    [0.4188, 0.2817, 0.5403, 0.5405],
)
df3 = pd.DataFrame(SG)

# Non-Factoid QA
NFQA = _baseline_scores(
    [0.5935, 0.5817, 0.7000, 0.7203],
    [0.2332, 0.2389, 0.4440, 0.4235],
    [0.2443, 0.2492, 0.4630, 0.4511],
)
df4 = pd.DataFrame(NFQA)
|
259 |
|
260 |
# df = [df1, df2, df3, df4]
|
261 |
# for d in df:
|
262 |
# for col in d.select_dtypes(include=['float64', 'int64']).columns:
|
263 |
# d[col] = d[col].apply(lambda x: f"{x:.4f}")
|
264 |
|
265 |
+
# # 创建标签页
|
266 |
# tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
267 |
|
268 |
# with tab1:
|
|
|
280 |
# with tab4:
|
281 |
# st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
|
282 |
# st.dataframe(df4, use_container_width=True)
|
283 |
+
|
284 |
+
# Fill the leaderboard skeleton `df` with the per-task baseline scores, then
# compute a per-task average and the overall average across all four tasks,
# and render the result as a Streamlit dataframe.
data = [DG, NFQA, SG, TE]
task = ["Dialogue Generation", "Non-Factoid QA", "Summary Generation", "Text Expansion"]
metric = ["Accuracy", "Kendall's Tau", "Spearman"]

num_teams = len(df["TeamId"])  # number of baseline rows/methods
overall_total = [0] * num_teams
for i, d in enumerate(data):  # one iteration per dataset/task
    total = [0] * num_teams  # per-team running sum over this task's metrics
    for m in metric:
        df[f"{m} ({task[i]})"] = d[m]
        # BUG FIX: the original wrote len[df["TeamId"]] (square brackets),
        # which raises TypeError at runtime; len(...) is intended.
        for k in range(num_teams):
            total[k] += d[m][k]
    average_index = f"Average ({task[i]})"
    df[average_index] = [t / len(metric) for t in total]
    for k in range(num_teams):
        overall_total[k] += df[average_index][k]

# The "datatsets" typo is kept: the skeleton dict declares this exact key.
df["Average (all 4 datatsets)"] = [t / len(task) for t in overall_total]

st.dataframe(pd.DataFrame(df), use_container_width=True)
|
304 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
# Get Beijing time: reserve an empty slot that is presumably filled with the
# current Beijing time by code outside this view — confirm against the caller.
time_placeholder = st.empty()
|