Spaces:
Running
Running
abhinav-joshi
committed on
Commit
•
2b8f89d
1
Parent(s):
eb68762
add baseline results
Browse files- .DS_Store +0 -0
- app.py +313 -28
- dummy.py +15 -0
- submissions/.DS_Store +0 -0
- submissions/baseline/baseline -pre2.csv +12 -0
- submissions/baseline/baseline-pre.csv +7 -0
- submissions/baseline/baseline.csv +11 -7
- submissions/baseline/results-bacup.json +133 -0
- submissions/baseline/results.json +133 -0
- submissions/baseline/submission.json +16 -0
- uploads.py +53 -34
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
app.py
CHANGED
@@ -7,15 +7,14 @@ from uploads import add_new_eval
|
|
7 |
|
8 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
9 |
CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
}
|
18 |
-
}"""
|
19 |
|
20 |
api = HfApi()
|
21 |
TOKEN = os.environ.get("TOKEN", None)
|
@@ -27,7 +26,7 @@ def restart_space():
|
|
27 |
|
28 |
|
29 |
# Function to load data from a given CSV file
|
30 |
-
def baseline_load_data(tasks):
|
31 |
# version = version.replace("%", "p")
|
32 |
file_path = f"submissions/baseline/baseline.csv" # Replace with your file paths
|
33 |
df = pd.read_csv(file_path)
|
@@ -46,6 +45,20 @@ def baseline_load_data(tasks):
|
|
46 |
"SUMM",
|
47 |
"Average",
|
48 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
if tasks is None:
|
50 |
breakpoint()
|
51 |
# based on the tasks, remove the columns that are not needed
|
@@ -65,14 +78,77 @@ def baseline_load_data(tasks):
|
|
65 |
column_names.remove("SUMM")
|
66 |
|
67 |
df = df[column_names]
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
df = df.drop_duplicates(subset=["Method"], keep="first")
|
70 |
|
71 |
return df
|
72 |
|
73 |
|
74 |
-
def load_data(tasks):
|
75 |
-
baseline_df = baseline_load_data(tasks)
|
76 |
|
77 |
return baseline_df
|
78 |
|
@@ -86,8 +162,29 @@ def search_leaderboard(df, query):
|
|
86 |
|
87 |
|
88 |
# Function to change the version of the leaderboard
|
89 |
-
def change_version(
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
return new_df
|
92 |
|
93 |
|
@@ -120,6 +217,57 @@ with demo:
|
|
120 |
label="Select Tasks",
|
121 |
choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
122 |
value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
)
|
124 |
|
125 |
with gr.Row():
|
@@ -128,10 +276,22 @@ with demo:
|
|
128 |
show_label=False,
|
129 |
)
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
leaderboard_table = gr.components.Dataframe(
|
132 |
value=load_data(
|
133 |
# "baseline",
|
134 |
["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
|
|
135 |
),
|
136 |
interactive=True,
|
137 |
visible=True,
|
@@ -151,31 +311,156 @@ with demo:
|
|
151 |
|
152 |
search_bar.change(
|
153 |
search_leaderboard,
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
inputs=[
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
],
|
159 |
outputs=leaderboard_table,
|
160 |
)
|
161 |
-
|
162 |
tasks_checkbox.change(
|
163 |
change_version,
|
164 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
outputs=leaderboard_table,
|
166 |
)
|
167 |
|
168 |
-
with gr.Accordion("Submit
|
169 |
with gr.Row():
|
170 |
with gr.Column():
|
171 |
-
method_name_textbox = gr.Textbox(label="Method
|
172 |
-
url_textbox = gr.Textbox(label="
|
173 |
-
with gr.Column():
|
174 |
organisation = gr.Textbox(label="Organisation")
|
175 |
mail = gr.Textbox(label="Contact email")
|
|
|
176 |
file_output = gr.File()
|
177 |
-
|
178 |
-
submit_button = gr.Button("Submit Eval")
|
179 |
submission_result = gr.Markdown()
|
180 |
submit_button.click(
|
181 |
add_new_eval,
|
@@ -221,5 +506,5 @@ with demo:
|
|
221 |
scheduler = BackgroundScheduler()
|
222 |
scheduler.add_job(restart_space, "interval", seconds=3600)
|
223 |
scheduler.start()
|
224 |
-
|
225 |
-
demo.launch(share=True)
|
|
|
7 |
|
8 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
9 |
# BibTeX entry for the IL-TUR paper. Kept as a raw string so that nothing in
# the BibTeX body is interpreted as a Python escape sequence.
# Every field, including `author`, must end with a comma: BibTeX requires
# fields to be comma-separated, and a missing separator makes the copied
# entry fail to parse.
CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
title = "IL-TUR: Benchmark for Indian Legal Text Understanding and Reasoning",
author = "Joshi, Abhinav and Paul, Shounak and Sharma, Akshat and Goyal, Pawan and Ghosh, Saptarshi and Modi, Ashutosh",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
}"""
|
|
|
18 |
|
19 |
api = HfApi()
|
20 |
TOKEN = os.environ.get("TOKEN", None)
|
|
|
26 |
|
27 |
|
28 |
# Function to load data from a given CSV file
|
29 |
+
def baseline_load_data(tasks, task_metrics):
|
30 |
# version = version.replace("%", "p")
|
31 |
file_path = f"submissions/baseline/baseline.csv" # Replace with your file paths
|
32 |
df = pd.read_csv(file_path)
|
|
|
45 |
"SUMM",
|
46 |
"Average",
|
47 |
]
|
48 |
+
# Method,Submitted by,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
|
49 |
+
column_names = [
|
50 |
+
"Method",
|
51 |
+
"Submitted By",
|
52 |
+
"L-NER",
|
53 |
+
"RR",
|
54 |
+
"CJPE",
|
55 |
+
"BAIL",
|
56 |
+
"LSI",
|
57 |
+
"PCR",
|
58 |
+
"SUMM",
|
59 |
+
# "Average",
|
60 |
+
]
|
61 |
+
|
62 |
if tasks is None:
|
63 |
breakpoint()
|
64 |
# based on the tasks, remove the columns that are not needed
|
|
|
78 |
column_names.remove("SUMM")
|
79 |
|
80 |
df = df[column_names]
|
81 |
+
|
82 |
+
import json
|
83 |
+
|
84 |
+
# load the results json file
|
85 |
+
with open("submissions/baseline/results.json") as f:
|
86 |
+
results = json.load(f)
|
87 |
+
# add the results to the dataframe
|
88 |
+
# Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
|
89 |
+
# Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
|
90 |
+
# create a new df to display the results
|
91 |
+
results_df = pd.DataFrame(
|
92 |
+
columns=[
|
93 |
+
"Method",
|
94 |
+
"Submitted By",
|
95 |
+
"Github Link",
|
96 |
+
"L-NER",
|
97 |
+
"RR",
|
98 |
+
"CJPE",
|
99 |
+
"BAIL",
|
100 |
+
"LSI",
|
101 |
+
"PCR",
|
102 |
+
"SUMM",
|
103 |
+
"L-MT",
|
104 |
+
# "Average",
|
105 |
+
]
|
106 |
+
)
|
107 |
+
# breakpoint()
|
108 |
+
for entry in results:
|
109 |
+
results_df = results_df.append(
|
110 |
+
{
|
111 |
+
"Method": entry["Method"],
|
112 |
+
"Submitted By": entry["Submitted By"],
|
113 |
+
"Github Link": entry["Github Link"],
|
114 |
+
"L-NER": entry["L-NER"][task_metrics["L-NER"]],
|
115 |
+
"RR": entry["RR"][task_metrics["RR"]],
|
116 |
+
"CJPE": entry["CJPE"][task_metrics["CJPE"]],
|
117 |
+
"BAIL": entry["BAIL"][task_metrics["BAIL"]],
|
118 |
+
"LSI": entry["LSI"][task_metrics["LSI"]],
|
119 |
+
"PCR": entry["PCR"][task_metrics["PCR"]],
|
120 |
+
"SUMM": entry["SUMM"][task_metrics["SUMM"]],
|
121 |
+
"L-MT": entry["L-MT"][task_metrics["L-MT"]],
|
122 |
+
# "Average": ,
|
123 |
+
},
|
124 |
+
ignore_index=True,
|
125 |
+
)
|
126 |
+
|
127 |
+
# breakpoint()
|
128 |
+
# add the average column
|
129 |
+
# results_df["Average"] = results_df.mean(axis=1)
|
130 |
+
|
131 |
+
df = results_df
|
132 |
+
# df = df.sort_values(by="Average", ascending=False)
|
133 |
+
# remove the columns that are not in tasks
|
134 |
+
selected_columns = (
|
135 |
+
[
|
136 |
+
"Method",
|
137 |
+
"Submitted By",
|
138 |
+
]
|
139 |
+
+ tasks
|
140 |
+
+ ["Github Link"]
|
141 |
+
)
|
142 |
+
print(tasks)
|
143 |
+
df = df[selected_columns]
|
144 |
+
|
145 |
df = df.drop_duplicates(subset=["Method"], keep="first")
|
146 |
|
147 |
return df
|
148 |
|
149 |
|
150 |
+
def load_data(tasks, task_metrics):
    """Return the leaderboard table for the given tasks and per-task metrics.

    Thin wrapper: both arguments are forwarded unchanged to
    ``baseline_load_data`` and its result is returned as-is.
    """
    return baseline_load_data(tasks, task_metrics)
|
154 |
|
|
|
162 |
|
163 |
|
164 |
# Function to change the version of the leaderboard
|
165 |
+
def change_version(
    tasks,
    l_ner_metric,
    rr_metric,
    cjpe_metric,
    bail_metric,
    lsi_metric,
    pcr_metric,
    summ_metric,
    lmt_metric,
):
    """Reload the leaderboard table for the current task/metric selection.

    The eight per-task metric choices are gathered into a dict keyed by task
    name and handed to ``load_data`` together with ``tasks``; whatever
    ``load_data`` returns is returned unchanged.
    """
    # Task names and their selected metrics, paired positionally.
    task_names = ("L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM", "L-MT")
    selected = (
        l_ner_metric,
        rr_metric,
        cjpe_metric,
        bail_metric,
        lsi_metric,
        pcr_metric,
        summ_metric,
        lmt_metric,
    )
    return load_data(tasks, dict(zip(task_names, selected)))
|
189 |
|
190 |
|
|
|
217 |
label="Select Tasks",
|
218 |
choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
219 |
value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
220 |
+
interactive=True,
|
221 |
+
)
|
222 |
+
|
223 |
+
with gr.Row():
|
224 |
+
l_ner_metric = gr.Radio(
|
225 |
+
label="L-NER",
|
226 |
+
choices=["strict mF1"],
|
227 |
+
value="strict mF1",
|
228 |
+
interactive=True,
|
229 |
+
)
|
230 |
+
rr_metric = gr.Radio(
|
231 |
+
label="RR",
|
232 |
+
choices=["mF1"],
|
233 |
+
value="mF1",
|
234 |
+
interactive=True,
|
235 |
+
)
|
236 |
+
cjpe_metric = gr.Radio(
|
237 |
+
label="CJPE",
|
238 |
+
choices=["mF1", "ROUGE-L", "BLEU"],
|
239 |
+
value="mF1",
|
240 |
+
interactive=True,
|
241 |
+
)
|
242 |
+
bail_metric = gr.Radio(
|
243 |
+
label="BAIL",
|
244 |
+
choices=["mF1"],
|
245 |
+
value="mF1",
|
246 |
+
interactive=True,
|
247 |
+
)
|
248 |
+
lsi_metric = gr.Radio(
|
249 |
+
label="LSI",
|
250 |
+
choices=["mF1"],
|
251 |
+
value="mF1",
|
252 |
+
interactive=True,
|
253 |
+
)
|
254 |
+
pcr_metric = gr.Radio(
|
255 |
+
label="PCR",
|
256 |
+
choices=["muF1@K"],
|
257 |
+
value="muF1@K",
|
258 |
+
interactive=True,
|
259 |
+
)
|
260 |
+
summ_metric = gr.Radio(
|
261 |
+
label="SUMM",
|
262 |
+
choices=["ROUGE-L", "BERTSCORE"],
|
263 |
+
value="ROUGE-L",
|
264 |
+
interactive=True,
|
265 |
+
)
|
266 |
+
lmt_metric = gr.Radio(
|
267 |
+
label="L-MT",
|
268 |
+
choices=["BLEU", "GLEU", "chrF++"],
|
269 |
+
value="BLEU",
|
270 |
+
interactive=True,
|
271 |
)
|
272 |
|
273 |
with gr.Row():
|
|
|
276 |
show_label=False,
|
277 |
)
|
278 |
|
279 |
+
task_metrics = {
|
280 |
+
"L-NER": l_ner_metric.value,
|
281 |
+
"RR": rr_metric.value,
|
282 |
+
"CJPE": cjpe_metric.value,
|
283 |
+
"BAIL": bail_metric.value,
|
284 |
+
"LSI": lsi_metric.value,
|
285 |
+
"PCR": pcr_metric.value,
|
286 |
+
"SUMM": summ_metric.value,
|
287 |
+
"L-MT": lmt_metric.value,
|
288 |
+
}
|
289 |
+
|
290 |
leaderboard_table = gr.components.Dataframe(
|
291 |
value=load_data(
|
292 |
# "baseline",
|
293 |
["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
294 |
+
task_metrics=task_metrics,
|
295 |
),
|
296 |
interactive=True,
|
297 |
visible=True,
|
|
|
311 |
|
312 |
search_bar.change(
|
313 |
search_leaderboard,
|
314 |
+
inputs=[leaderboard_table, search_bar],
|
315 |
+
outputs=leaderboard_table,
|
316 |
+
)
|
317 |
+
# breakpoint()
|
318 |
+
l_ner_metric.change(
|
319 |
+
change_version,
|
320 |
inputs=[
|
321 |
+
tasks_checkbox,
|
322 |
+
l_ner_metric,
|
323 |
+
rr_metric,
|
324 |
+
cjpe_metric,
|
325 |
+
bail_metric,
|
326 |
+
lsi_metric,
|
327 |
+
pcr_metric,
|
328 |
+
summ_metric,
|
329 |
+
lmt_metric,
|
330 |
+
],
|
331 |
+
outputs=leaderboard_table,
|
332 |
+
)
|
333 |
+
rr_metric.change(
|
334 |
+
change_version,
|
335 |
+
inputs=[
|
336 |
+
tasks_checkbox,
|
337 |
+
l_ner_metric,
|
338 |
+
rr_metric,
|
339 |
+
cjpe_metric,
|
340 |
+
bail_metric,
|
341 |
+
lsi_metric,
|
342 |
+
pcr_metric,
|
343 |
+
summ_metric,
|
344 |
+
lmt_metric,
|
345 |
+
],
|
346 |
+
outputs=leaderboard_table,
|
347 |
+
)
|
348 |
+
cjpe_metric.change(
|
349 |
+
change_version,
|
350 |
+
inputs=[
|
351 |
+
tasks_checkbox,
|
352 |
+
l_ner_metric,
|
353 |
+
rr_metric,
|
354 |
+
cjpe_metric,
|
355 |
+
bail_metric,
|
356 |
+
lsi_metric,
|
357 |
+
pcr_metric,
|
358 |
+
summ_metric,
|
359 |
+
lmt_metric,
|
360 |
+
],
|
361 |
+
outputs=leaderboard_table,
|
362 |
+
)
|
363 |
+
bail_metric.change(
|
364 |
+
change_version,
|
365 |
+
inputs=[
|
366 |
+
tasks_checkbox,
|
367 |
+
l_ner_metric,
|
368 |
+
rr_metric,
|
369 |
+
cjpe_metric,
|
370 |
+
bail_metric,
|
371 |
+
lsi_metric,
|
372 |
+
pcr_metric,
|
373 |
+
summ_metric,
|
374 |
+
lmt_metric,
|
375 |
+
],
|
376 |
+
outputs=leaderboard_table,
|
377 |
+
)
|
378 |
+
lsi_metric.change(
|
379 |
+
change_version,
|
380 |
+
inputs=[
|
381 |
+
tasks_checkbox,
|
382 |
+
l_ner_metric,
|
383 |
+
rr_metric,
|
384 |
+
cjpe_metric,
|
385 |
+
bail_metric,
|
386 |
+
lsi_metric,
|
387 |
+
pcr_metric,
|
388 |
+
summ_metric,
|
389 |
+
lmt_metric,
|
390 |
+
],
|
391 |
+
outputs=leaderboard_table,
|
392 |
+
)
|
393 |
+
pcr_metric.change(
|
394 |
+
change_version,
|
395 |
+
inputs=[
|
396 |
+
tasks_checkbox,
|
397 |
+
l_ner_metric,
|
398 |
+
rr_metric,
|
399 |
+
cjpe_metric,
|
400 |
+
bail_metric,
|
401 |
+
lsi_metric,
|
402 |
+
pcr_metric,
|
403 |
+
summ_metric,
|
404 |
+
lmt_metric,
|
405 |
+
],
|
406 |
+
outputs=leaderboard_table,
|
407 |
+
)
|
408 |
+
summ_metric.change(
|
409 |
+
change_version,
|
410 |
+
inputs=[
|
411 |
+
tasks_checkbox,
|
412 |
+
l_ner_metric,
|
413 |
+
rr_metric,
|
414 |
+
cjpe_metric,
|
415 |
+
bail_metric,
|
416 |
+
lsi_metric,
|
417 |
+
pcr_metric,
|
418 |
+
summ_metric,
|
419 |
+
lmt_metric,
|
420 |
+
],
|
421 |
+
outputs=leaderboard_table,
|
422 |
+
)
|
423 |
+
lmt_metric.change(
|
424 |
+
change_version,
|
425 |
+
inputs=[
|
426 |
+
tasks_checkbox,
|
427 |
+
l_ner_metric,
|
428 |
+
rr_metric,
|
429 |
+
cjpe_metric,
|
430 |
+
bail_metric,
|
431 |
+
lsi_metric,
|
432 |
+
pcr_metric,
|
433 |
+
summ_metric,
|
434 |
+
lmt_metric,
|
435 |
],
|
436 |
outputs=leaderboard_table,
|
437 |
)
|
|
|
438 |
tasks_checkbox.change(
|
439 |
change_version,
|
440 |
+
inputs=[
|
441 |
+
tasks_checkbox,
|
442 |
+
l_ner_metric,
|
443 |
+
rr_metric,
|
444 |
+
cjpe_metric,
|
445 |
+
bail_metric,
|
446 |
+
lsi_metric,
|
447 |
+
pcr_metric,
|
448 |
+
summ_metric,
|
449 |
+
lmt_metric,
|
450 |
+
],
|
451 |
outputs=leaderboard_table,
|
452 |
)
|
453 |
|
454 |
+
with gr.Accordion("Submit the results of your Method"):
|
455 |
with gr.Row():
|
456 |
with gr.Column():
|
457 |
+
method_name_textbox = gr.Textbox(label="Method")
|
458 |
+
url_textbox = gr.Textbox(label="Github Link")
|
|
|
459 |
organisation = gr.Textbox(label="Organisation")
|
460 |
mail = gr.Textbox(label="Contact email")
|
461 |
+
with gr.Column():
|
462 |
file_output = gr.File()
|
463 |
+
submit_button = gr.Button("Submit Eval")
|
|
|
464 |
submission_result = gr.Markdown()
|
465 |
submit_button.click(
|
466 |
add_new_eval,
|
|
|
506 |
scheduler = BackgroundScheduler()
|
507 |
scheduler.add_job(restart_space, "interval", seconds=3600)
|
508 |
scheduler.start()
|
509 |
+
demo.launch(debug=True)
|
510 |
+
# demo.launch(share=True)
|
dummy.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json

# Load the current baseline results (a JSON list with one entry per method).
with open("submissions/baseline/results.json", encoding="utf-8") as f:
    results = json.load(f)

# Load the new submission (a JSON list; only its first entry is used below).
with open("submissions/baseline/submission.json", encoding="utf-8") as f:
    submission = json.load(f)

# Append the submitted entry to the loaded results.
# The leftover debugging `breakpoint()` was removed so the script can run
# non-interactively.
# NOTE: the merged list only lives in memory; nothing here writes it back
# to results.json.
results.append(submission[0])
|
submissions/.DS_Store
CHANGED
Binary files a/submissions/.DS_Store and b/submissions/.DS_Store differ
|
|
submissions/baseline/baseline -pre2.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
|
2 |
+
Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
|
3 |
+
SOTA,various,48.58,69.01,81.31|56.00|32.00,81,28.08,39.15,33.00|86.00,28.00|32.00|57.00
|
4 |
+
BERT,various,39.59,58,71.14|-|-,-,18.44,9.24,-|-,-|-|-
|
5 |
+
LegalBERT,various,45.58,54,78.21|-|-,-,21.74,8.67,-|-,-|-|-
|
6 |
+
InLegalBERT,various,48.58,58,81.31|-|-,-,26.23,7.57,-|-,-|-|-
|
7 |
+
GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17|30.00|8.00,51.04,21.55,-,21.00|85.00,23.00|28.00|42.00
|
8 |
+
GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46|29.00|15.00,46.35,22.61,-,20.00|84.00,25.00|28.00|43.00
|
9 |
+
GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74|30.00|11.00,61,21.4,-,22.00|84.00,26.00|29.00|43.00
|
10 |
+
GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29|40.00|14.00,51.46,23.99,-,23.00|85.00,33.00|36.00|50.00
|
11 |
+
GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26|39.00|16.00,56.9,22.26,-,16.00|81.00,35.00|38.00|52.00
|
12 |
+
GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44|43.00|18.00,66.67,20.53,-,17.00|81.00,36.00|39.00|53.00
|
submissions/baseline/baseline-pre.csv
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Unnamed: 0,index,Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,Average
|
2 |
+
,0,baseline,baseline,0,0,0,0,0,0,0,0
|
3 |
+
,0,baseline2,baseline2,0,0,0,0,0,0,0,0
|
4 |
+
,0,baseline,baseline,0,0,0,0,0,0,0,0
|
5 |
+
,0,random,random,0,0,0,0,0,0,0,0
|
6 |
+
,0,random2,random22,0,0,0,0,0,0,0,0
|
7 |
+
,0,random5,random55,0,0,0,0,0,0,0,0
|
submissions/baseline/baseline.csv
CHANGED
@@ -1,7 +1,11 @@
|
|
1 |
-
|
2 |
-
,
|
3 |
-
,
|
4 |
-
,
|
5 |
-
,
|
6 |
-
|
7 |
-
,
|
|
|
|
|
|
|
|
|
|
1 |
+
Method,Submitted By,L-NER strict mF1,RR mF1,CJPE mF1,CJPE ROUGE-L,CJPE BLEU,BAIL mF1,LSI mF1,PCR muF1@K,SUMM ROUGE-L,SUMM BERTSCORE,L-MT BLEU,L-MT GLEU,L-MT chrF++
|
2 |
+
SOTA,various,48.58,69.01,81.31,56.00,32.00,81,28.08,39.15,33.00,86.00,28.00,32.00,57.00
|
3 |
+
BERT,various,39.59,58,71.14,-,-,-,18.44,9.24,-,-,-,-,-
|
4 |
+
LegalBERT,various,45.58,54,78.21,-,-,-,21.74,8.67,-,-,-,-,-
|
5 |
+
InLegalBERT,various,48.58,58,81.31,-,-,-,26.23,7.57,-,-,-,-,-
|
6 |
+
GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17,30.00,8.00,51.04,21.55,-,21.00,85.00,23.00,28.00,42.00
|
7 |
+
GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46,29.00,15.00,46.35,22.61,-,20.00,84.00,25.00,28.00,43.00
|
8 |
+
GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74,30.00,11.00,61,21.4,-,22.00,84.00,26.00,29.00,43.00
|
9 |
+
GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29,40.00,14.00,51.46,23.99,-,23.00,85.00,33.00,36.00,50.00
|
10 |
+
GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26,39.00,16.00,56.9,22.26,-,16.00,81.00,35.00,38.00,52.00
|
11 |
+
GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44,43.00,18.00,66.67,20.53,-,17.00,81.00,36.00,39.00,53.00
|
submissions/baseline/results-bacup.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"Method": "SOTA",
|
4 |
+
"Submitted By": "multiple",
|
5 |
+
"Github Link": "exploration-lab.github.io/IL-TUR/",
|
6 |
+
"L-NER": {"strict mF1": "48.58"},
|
7 |
+
"RR": {"mF1": "69.01"},
|
8 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
|
9 |
+
"BAIL": {"mF1": "81"},
|
10 |
+
"LSI": {"mF1": "28.08"},
|
11 |
+
"PCR": {"muF1@K": "39.15"},
|
12 |
+
"SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
|
13 |
+
"L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"Method": "BERT",
|
17 |
+
"Submitted By": "multiple",
|
18 |
+
"Github Link": "",
|
19 |
+
"L-NER": {"strict mF1": "39.59"},
|
20 |
+
"RR": {"mF1": "58"},
|
21 |
+
"CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
|
22 |
+
"BAIL": {"mF1": "-"},
|
23 |
+
"LSI": {"mF1": "-"},
|
24 |
+
"PCR": {"muF1@K": "18.44"},
|
25 |
+
"SUMM": {"ROUGE-L": "9.24", "BERTSCORE": "-"},
|
26 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"Method": "LegalBERT",
|
30 |
+
"Submitted By": "multiple",
|
31 |
+
"Github Link": "",
|
32 |
+
"L-NER": {"strict mF1": "45.58"},
|
33 |
+
"RR": {"mF1": "54"},
|
34 |
+
"CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
|
35 |
+
"BAIL": {"mF1": "-"},
|
36 |
+
"LSI": {"mF1": "-"},
|
37 |
+
"PCR": {"muF1@K": "21.74"},
|
38 |
+
"SUMM": {"ROUGE-L": "8.67", "BERTSCORE": "-"},
|
39 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"Method": "InLegalBERT",
|
43 |
+
"Submitted By": "multiple",
|
44 |
+
"Github Link": "",
|
45 |
+
"L-NER": {"strict mF1": "48.58"},
|
46 |
+
"RR": {"mF1": "58"},
|
47 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
|
48 |
+
"BAIL": {"mF1": "-"},
|
49 |
+
"LSI": {"mF1": "-"},
|
50 |
+
"PCR": {"muF1@K": "26.23"},
|
51 |
+
"SUMM": {"ROUGE-L": "7.57", "BERTSCORE": "-"},
|
52 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"Method": "GPT-3.5 (0-shot)",
|
56 |
+
"Submitted By": "IL-TUR",
|
57 |
+
"Github Link": "",
|
58 |
+
"L-NER": {"strict mF1": "30.59"},
|
59 |
+
"RR": {"mF1": "30.95"},
|
60 |
+
"CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
|
61 |
+
"BAIL": {"mF1": "51.04"},
|
62 |
+
"LSI": {"mF1": "21.55"},
|
63 |
+
"PCR": {"muF1@K": "-"},
|
64 |
+
"SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
|
65 |
+
"L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"Method": "GPT-3.5 (1-shot)",
|
69 |
+
"Submitted By": "IL-TUR",
|
70 |
+
"Github Link": "",
|
71 |
+
"L-NER": {"strict mF1": "23.68"},
|
72 |
+
"RR": {"mF1": "30.05"},
|
73 |
+
"CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
|
74 |
+
"BAIL": {"mF1": "46.35"},
|
75 |
+
"LSI": {"mF1": "22.61"},
|
76 |
+
"PCR": {"muF1@K": "-"},
|
77 |
+
"SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
|
78 |
+
"L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"Method": "GPT-3.5 (2-shot)",
|
82 |
+
"Submitted By": "IL-TUR",
|
83 |
+
"Github Link": "",
|
84 |
+
"L-NER": {"strict mF1": "32.84"},
|
85 |
+
"RR": {"mF1": "30.31"},
|
86 |
+
"CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
|
87 |
+
"BAIL": {"mF1": "61"},
|
88 |
+
"LSI": {"mF1": "21.4"},
|
89 |
+
"PCR": {"muF1@K": "-"},
|
90 |
+
"SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
|
91 |
+
"L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"Method": "GPT-4 (0-shot)",
|
95 |
+
"Submitted By": "IL-TUR",
|
96 |
+
"Github Link": "",
|
97 |
+
"L-NER": {"strict mF1": "13.65"},
|
98 |
+
"RR": {"mF1": "37.37"},
|
99 |
+
"CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
|
100 |
+
"BAIL": {"mF1": "51.46"},
|
101 |
+
"LSI": {"mF1": "23.99"},
|
102 |
+
"PCR": {"muF1@K": "-"},
|
103 |
+
"SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
|
104 |
+
"L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"Method": "GPT-4 (1-shot)",
|
108 |
+
"Submitted By": "IL-TUR",
|
109 |
+
"Github Link": "",
|
110 |
+
"L-NER": {"strict mF1": "10.51"},
|
111 |
+
"RR": {"mF1": "37.43"},
|
112 |
+
"CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
|
113 |
+
"BAIL": {"mF1": "56.9"},
|
114 |
+
"LSI": {"mF1": "22.26"},
|
115 |
+
"PCR": {"muF1@K": "-"},
|
116 |
+
"SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
|
117 |
+
"L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"Method": "GPT-4 (2-shot)",
|
121 |
+
"Submitted By": "IL-TUR",
|
122 |
+
"Github Link": "",
|
123 |
+
"L-NER": {"strict mF1": "24.03"},
|
124 |
+
"RR": {"mF1": "38.18"},
|
125 |
+
"CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
|
126 |
+
"BAIL": {"mF1": "66.67"},
|
127 |
+
"LSI": {"mF1": "20.53"},
|
128 |
+
"PCR": {"muF1@K": "-"},
|
129 |
+
"SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
|
130 |
+
"L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
|
131 |
+
}
|
132 |
+
]
|
133 |
+
|
submissions/baseline/results.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"Method": "SOTA",
|
4 |
+
"Submitted By": "multiple",
|
5 |
+
"Github Link": "exploration-lab.github.io/IL-TUR/",
|
6 |
+
"L-NER": {"strict mF1": "48.58"},
|
7 |
+
"RR": {"mF1": "69.01"},
|
8 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
|
9 |
+
"BAIL": {"mF1": "81"},
|
10 |
+
"LSI": {"mF1": "28.08"},
|
11 |
+
"PCR": {"muF1@K": "39.15"},
|
12 |
+
"SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
|
13 |
+
"L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"Method": "BERT",
|
17 |
+
"Submitted By": "multiple",
|
18 |
+
"Github Link": "",
|
19 |
+
"L-NER": {"strict mF1": "39.59"},
|
20 |
+
"RR": {"mF1": "58"},
|
21 |
+
"CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
|
22 |
+
"BAIL": {"mF1": "-"},
|
23 |
+
"LSI": {"mF1": "-"},
|
24 |
+
"PCR": {"muF1@K": "18.44"},
|
25 |
+
"SUMM": {"ROUGE-L": "9.24", "BERTSCORE": "-"},
|
26 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"Method": "LegalBERT",
|
30 |
+
"Submitted By": "multiple",
|
31 |
+
"Github Link": "",
|
32 |
+
"L-NER": {"strict mF1": "45.58"},
|
33 |
+
"RR": {"mF1": "54"},
|
34 |
+
"CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
|
35 |
+
"BAIL": {"mF1": "-"},
|
36 |
+
"LSI": {"mF1": "-"},
|
37 |
+
"PCR": {"muF1@K": "21.74"},
|
38 |
+
"SUMM": {"ROUGE-L": "8.67", "BERTSCORE": "-"},
|
39 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"Method": "InLegalBERT",
|
43 |
+
"Submitted By": "multiple",
|
44 |
+
"Github Link": "",
|
45 |
+
"L-NER": {"strict mF1": "48.58"},
|
46 |
+
"RR": {"mF1": "58"},
|
47 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
|
48 |
+
"BAIL": {"mF1": "-"},
|
49 |
+
"LSI": {"mF1": "-"},
|
50 |
+
"PCR": {"muF1@K": "26.23"},
|
51 |
+
"SUMM": {"ROUGE-L": "7.57", "BERTSCORE": "-"},
|
52 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"Method": "GPT-3.5 (0-shot)",
|
56 |
+
"Submitted By": "IL-TUR",
|
57 |
+
"Github Link": "",
|
58 |
+
"L-NER": {"strict mF1": "30.59"},
|
59 |
+
"RR": {"mF1": "30.95"},
|
60 |
+
"CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
|
61 |
+
"BAIL": {"mF1": "51.04"},
|
62 |
+
"LSI": {"mF1": "21.55"},
|
63 |
+
"PCR": {"muF1@K": "-"},
|
64 |
+
"SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
|
65 |
+
"L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"Method": "GPT-3.5 (1-shot)",
|
69 |
+
"Submitted By": "IL-TUR",
|
70 |
+
"Github Link": "",
|
71 |
+
"L-NER": {"strict mF1": "23.68"},
|
72 |
+
"RR": {"mF1": "30.05"},
|
73 |
+
"CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
|
74 |
+
"BAIL": {"mF1": "46.35"},
|
75 |
+
"LSI": {"mF1": "22.61"},
|
76 |
+
"PCR": {"muF1@K": "-"},
|
77 |
+
"SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
|
78 |
+
"L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"Method": "GPT-3.5 (2-shot)",
|
82 |
+
"Submitted By": "IL-TUR",
|
83 |
+
"Github Link": "",
|
84 |
+
"L-NER": {"strict mF1": "32.84"},
|
85 |
+
"RR": {"mF1": "30.31"},
|
86 |
+
"CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
|
87 |
+
"BAIL": {"mF1": "61"},
|
88 |
+
"LSI": {"mF1": "21.4"},
|
89 |
+
"PCR": {"muF1@K": "-"},
|
90 |
+
"SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
|
91 |
+
"L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"Method": "GPT-4 (0-shot)",
|
95 |
+
"Submitted By": "IL-TUR",
|
96 |
+
"Github Link": "",
|
97 |
+
"L-NER": {"strict mF1": "13.65"},
|
98 |
+
"RR": {"mF1": "37.37"},
|
99 |
+
"CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
|
100 |
+
"BAIL": {"mF1": "51.46"},
|
101 |
+
"LSI": {"mF1": "23.99"},
|
102 |
+
"PCR": {"muF1@K": "-"},
|
103 |
+
"SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
|
104 |
+
"L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"Method": "GPT-4 (1-shot)",
|
108 |
+
"Submitted By": "IL-TUR",
|
109 |
+
"Github Link": "",
|
110 |
+
"L-NER": {"strict mF1": "10.51"},
|
111 |
+
"RR": {"mF1": "37.43"},
|
112 |
+
"CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
|
113 |
+
"BAIL": {"mF1": "56.9"},
|
114 |
+
"LSI": {"mF1": "22.26"},
|
115 |
+
"PCR": {"muF1@K": "-"},
|
116 |
+
"SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
|
117 |
+
"L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"Method": "GPT-4 (2-shot)",
|
121 |
+
"Submitted By": "IL-TUR",
|
122 |
+
"Github Link": "",
|
123 |
+
"L-NER": {"strict mF1": "24.03"},
|
124 |
+
"RR": {"mF1": "38.18"},
|
125 |
+
"CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
|
126 |
+
"BAIL": {"mF1": "66.67"},
|
127 |
+
"LSI": {"mF1": "20.53"},
|
128 |
+
"PCR": {"muF1@K": "-"},
|
129 |
+
"SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
|
130 |
+
"L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
|
131 |
+
}
|
132 |
+
]
|
133 |
+
|
submissions/baseline/submission.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"Method": "GPT-5 (2-shot)",
|
4 |
+
"Submitted By": "IL-TUR",
|
5 |
+
"Github Link": "dummy submission",
|
6 |
+
"L-NER": {"strict mF1": "24.03"},
|
7 |
+
"RR": {"mF1": "38.18"},
|
8 |
+
"CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
|
9 |
+
"BAIL": {"mF1": "66.67"},
|
10 |
+
"LSI": {"mF1": "20.53"},
|
11 |
+
"PCR": {"muF1@K": "-"},
|
12 |
+
"SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
|
13 |
+
"L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
|
14 |
+
}
|
15 |
+
]
|
16 |
+
|
uploads.py
CHANGED
@@ -2,6 +2,7 @@ from email.utils import parseaddr
|
|
2 |
from huggingface_hub import HfApi
|
3 |
import os
|
4 |
import datetime
|
|
|
5 |
import pandas as pd
|
6 |
|
7 |
LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
|
@@ -59,54 +60,72 @@ def add_new_eval(
|
|
59 |
mail,
|
60 |
)
|
61 |
|
62 |
-
# load the file
|
63 |
-
df = pd.read_csv(path_to_file)
|
64 |
-
submission_df = pd.read_csv(path_to_file)
|
65 |
|
66 |
-
# modify the df to include metadata
|
67 |
-
df["Method"] = method_name
|
68 |
-
df["url"] = url
|
69 |
-
df["organisation"] = organisation
|
70 |
-
df["mail"] = parsed_mail
|
71 |
-
df["timestamp"] = datetime.datetime.now()
|
72 |
|
73 |
-
submission_df = pd.read_csv(path_to_file)
|
74 |
-
submission_df["Method"] = method_name
|
75 |
-
submission_df["Submitted By"] = organisation
|
76 |
-
# upload to spaces using the hf api at
|
77 |
|
78 |
-
path_in_repo = f"submissions/{method_name}"
|
79 |
-
file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
|
80 |
|
81 |
# upload the df to spaces
|
82 |
import io
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
path_in_repo=f"{path_in_repo}/{file_name}",
|
91 |
-
path_or_fileobj=buffer,
|
92 |
-
token=TOKEN,
|
93 |
-
repo_type="dataset",
|
94 |
-
)
|
95 |
-
# read the leaderboard
|
96 |
-
leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
|
97 |
|
98 |
-
#
|
99 |
-
|
100 |
-
leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
|
101 |
|
102 |
-
# save the new leaderboard
|
103 |
-
# leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
|
104 |
leaderboard_buffer = io.BytesIO()
|
105 |
-
|
|
|
|
|
|
|
|
|
106 |
leaderboard_buffer.seek(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
api.upload_file(
|
108 |
repo_id=LEADERBOARD_PATH,
|
109 |
-
path_in_repo=f"submissions/baseline/baseline.csv",
|
|
|
110 |
path_or_fileobj=leaderboard_buffer,
|
111 |
token=TOKEN,
|
112 |
repo_type="space",
|
|
|
2 |
from huggingface_hub import HfApi
|
3 |
import os
|
4 |
import datetime
|
5 |
+
import json
|
6 |
import pandas as pd
|
7 |
|
8 |
LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
|
|
|
60 |
mail,
|
61 |
)
|
62 |
|
63 |
+
# # load the file
|
64 |
+
# df = pd.read_csv(path_to_file)
|
65 |
+
# submission_df = pd.read_csv(path_to_file)
|
66 |
|
67 |
+
# # modify the df to include metadata
|
68 |
+
# df["Method"] = method_name
|
69 |
+
# df["url"] = url
|
70 |
+
# df["organisation"] = organisation
|
71 |
+
# df["mail"] = parsed_mail
|
72 |
+
# df["timestamp"] = datetime.datetime.now()
|
73 |
|
74 |
+
# submission_df = pd.read_csv(path_to_file)
|
75 |
+
# submission_df["Method"] = method_name
|
76 |
+
# submission_df["Submitted By"] = organisation
|
77 |
+
# # upload to spaces using the hf api at
|
78 |
|
79 |
+
# path_in_repo = f"submissions/{method_name}"
|
80 |
+
# file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
|
81 |
|
82 |
# upload the df to spaces
|
83 |
import io
|
84 |
|
85 |
+
# read the submission json file
|
86 |
+
with open(path_to_file, "r") as f:
|
87 |
+
submission = json.load(f)
|
88 |
|
89 |
+
with open("submissions/baseline/results.json", "r") as f:
|
90 |
+
results = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
+
# update the results
|
93 |
+
results.append(submission[0])
|
|
|
94 |
|
|
|
|
|
95 |
leaderboard_buffer = io.BytesIO()
|
96 |
+
# df.to_csv(buffer, index=False) # Write the DataFrame to a buffer in CSV format
|
97 |
+
# buffer.seek(0) # Rewind the buffer to the beginning
|
98 |
+
|
99 |
+
# save the results to buffer
|
100 |
+
leaderboard_buffer.write(json.dumps(results).encode())
|
101 |
leaderboard_buffer.seek(0)
|
102 |
+
|
103 |
+
# api.upload_file(
|
104 |
+
# repo_id=RESULTS_PATH,
|
105 |
+
# path_in_repo=f"{path_in_repo}/{file_name}",
|
106 |
+
# path_or_fileobj=buffer,
|
107 |
+
# token=TOKEN,
|
108 |
+
# repo_type="dataset",
|
109 |
+
# )
|
110 |
+
# # read the leaderboard
|
111 |
+
# leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
|
112 |
+
|
113 |
+
# # append the new submission_df csv to the leaderboard
|
114 |
+
# # leaderboard_df = leaderboard_df._append(submission_df)
|
115 |
+
# # leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
|
116 |
+
|
117 |
+
# # save the new leaderboard
|
118 |
+
# # leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
|
119 |
+
# leaderboard_buffer = io.BytesIO()
|
120 |
+
# leaderboard_df.to_csv(leaderboard_buffer, index=False)
|
121 |
+
# leaderboard_buffer.seek(0)
|
122 |
+
# with open("submissions/baseline/results.json", "w") as f:
|
123 |
+
# json.dump(results, f)
|
124 |
+
|
125 |
api.upload_file(
|
126 |
repo_id=LEADERBOARD_PATH,
|
127 |
+
# path_in_repo=f"submissions/baseline/baseline.csv",
|
128 |
+
path_in_repo=f"submissions/results.json",
|
129 |
path_or_fileobj=leaderboard_buffer,
|
130 |
token=TOKEN,
|
131 |
repo_type="space",
|