abhinav-joshi committed on
Commit
e1043c6
1 Parent(s): d1ca5fe

add prediction submission

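This commit adds server-side scoring of raw task predictions: eval_utils.py evaluates an uploaded predictions file against the bundled gold annotations, and uploads.py is switched to that path via SUBMISSION_FORMAT = "predictions". A minimal sketch of the predictions JSON the new code expects, inferred from how eval_utils.py reads each task (every id and value below is a placeholder):

submission = {
    "bail": {"<doc_id>": 1},  # binary bail label per case (0/1 assumed)
    "lsi": {"<doc_id>": ["Section 302", "Section 34"]},  # statute labels per case
    "rr": {"<doc_id>": ["<role_1>", "<role_2>"]},  # one rhetorical role per sentence
    "pcr": {"<query_id>": ["<candidate_id_1>", "<candidate_id_2>"]},  # ranked prior cases
    "summ": {"<doc_id>": "<summary text>"},
    "cjpe": {
        "prediction": {"<doc_id>": 1},
        "explanation": {"<doc_id>": {"expert_1": "<explanation text>"}},
    },
    "lner": {"fold_1": {"<doc_id>": [{"start": 0, "end": 5, "label": "APP"}]}},
    "lmt": {"<dataset>": {"<doc_id>/<language>": "<translated text>"}},
}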
eval_utils.py ADDED
@@ -0,0 +1,435 @@
+ import json
+ import re
+ from collections import defaultdict
+
+ import evaluate
+ import nltk
+ import numpy as np
+ from nervaluate import Evaluator
+ from rouge_score import rouge_scorer
+ from sacrebleu.metrics import BLEU, CHRF
+ from sklearn.metrics import f1_score
+ from tqdm import tqdm
+ from transformers import AutoTokenizer
+
+ from ner_helpers import span2bio
+
+
+ def load_json(file_path):
+     with open(file_path, "r") as f:
+         return json.load(f)
+
+
+ def get_micro_at_k(gold, pred, k):
+     gold_set = set(gold)
+     pred_set = set(pred[:k])
+     return len(gold_set & pred_set), len(gold_set), len(pred_set)
+
+
+ def evaluate_bail(gold_data, pred_data):  # bail prediction: macro-F1 over binary labels
+     gold_labels = []
+     pred_labels = []
+     for id, label in gold_data.items():
+         gold_labels.append(label)
+         pred_labels.append(pred_data.get(id, 0))
+
+     f1 = f1_score(gold_labels, pred_labels, average="macro")
+     print("Macro-F1 on HLDC-all-districts test set:", f1)
+
+     return f"{f1:.2f}"
+
+
+ def evaluate_cjpe(gold_data, pred_data):  # judgment prediction (macro-F1) + explanation (ROUGE/BLEU)
+     # Evaluate prediction
+     gold_labels = []
+     pred_labels = []
+     for id, label in gold_data["prediction"].items():
+         gold_labels.append(label)
+         pred_labels.append(pred_data["prediction"].get(id, 0))
+
+     f1 = f1_score(gold_labels, pred_labels, average="macro")
+     prediction_result = {"cjpe-eval": f1}
+
+     # Evaluate explanation
+     rouge = evaluate.load("rouge")
+     bleu = evaluate.load("bleu")
+
+     gold_explanations = [exp["expert_1"] for exp in gold_data["explanation"].values()]
+     pred_explanations = [exp["expert_1"] for exp in pred_data["explanation"].values()]
+
+     rouge_scores = rouge.compute(
+         predictions=pred_explanations, references=gold_explanations
+     )
+     bleu_score = bleu.compute(
+         predictions=pred_explanations, references=gold_explanations
+     )
+
+     explanation_result = {
+         "cjpe-exp-eval": {
+             "rouge": [rouge_scores],
+             "bleu": [bleu_score],
+         }
+     }
+
+     return {**prediction_result, **explanation_result}
+
+
+ def evaluate_lner(gold_data, pred_data, text_data):  # legal NER: strict macro-F1 (nervaluate), averaged over 3 folds
+     with open("labels.txt") as f:
+         labels = f.read().strip().split("\n")
+
+     results_per_fold = {}
+     for fold in range(1, 4):
+         gold = gold_data[f"fold_{fold}"]
+         pred = pred_data[f"fold_{fold}"]
+         text = text_data[f"fold_{fold}"]
+
+         texts, gold_labels, pred_labels = [], [], []
+
+         for id, gold_label in tqdm(gold.items()):
+             txt = text[id]
+             pred_label = pred.get(id, [])
+
+             txt_seg, gold_bio = span2bio(txt, gold_label)
+             _, pred_bio = span2bio(txt, pred_label)
+
+             texts.append(txt_seg)
+             gold_labels.append(gold_bio)
+             pred_labels.append(pred_bio)
+
+         evaluator = Evaluator(gold_labels, pred_labels, tags=labels, loader="list")
+         results, results_per_tag, _, _ = evaluator.evaluate()
+
+         f1_scores = [results_per_tag[l]["strict"]["f1"] for l in results_per_tag]
+         avg_f1 = sum(f1_scores) / len(f1_scores)
+         print(f"Strict Macro-F1 on Fold {fold}:", avg_f1)
+         results_per_fold[f"fold_{fold}"] = avg_f1
+
+     return {"strict mF1": f"{np.mean(list(results_per_fold.values())):.2f}"}
+
+
+ def evaluate_rr(gold_data, pred_data):  # rhetorical roles: macro-F1 over all sentences
+     all_gold_labels = []
+     all_pred_labels = []
+
+     for id, gold_labels in gold_data.items():
+         pred_labels = pred_data.get(id, ["None"] * len(gold_labels))
+         all_gold_labels.extend(gold_labels)
+         all_pred_labels.extend(pred_labels)
+
+     mf1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
+     print("Macro-F1 on combined test set:", mf1)
+
+     return {"mF1": f"{mf1:.2f}"}
+
+
+ def evaluate_lsi(gold_data, pred_data):  # statute identification (ILSI): multi-label macro-F1
+     with open("lsi_label_vocab.json") as f:
+         label_vocab = json.load(f)
+
+     gold_matrix = np.zeros((len(gold_data), len(label_vocab)))
+     pred_matrix = np.zeros((len(gold_data), len(label_vocab)))
+
+     for i, (id, gold_labels) in enumerate(gold_data.items()):
+         pred_labels = pred_data.get(id, [])
+
+         for label in gold_labels:
+             if label in label_vocab:
+                 gold_matrix[i, label_vocab[label]] = 1
+
+         for label in pred_labels:
+             if label in label_vocab:
+                 pred_matrix[i, label_vocab[label]] = 1
+
+     f1 = f1_score(gold_matrix, pred_matrix, average="macro")
+     print("Macro-F1 on ILSI test set:", f1)
+     return f1
+
+
+ def evaluate_pcr(gold_data, pred_data):  # prior case retrieval (IL-PCR): micro-F1@k averaged over k = 1..20
+     f1_scores = []
+     for k in range(1, 21):
+         correct, gold_total, pred_total = 0, 0, 0
+         for id, gold_candidates in gold_data.items():
+             pred_candidates = pred_data.get(id, [])
+             gold_candidates = [c for c in gold_candidates if c != id]
+             pred_candidates = [c for c in pred_candidates if c != id]
+
+             c, g, p = get_micro_at_k(gold_candidates, pred_candidates, k)
+             correct += c
+             gold_total += g
+             pred_total += p
+
+         precision = correct / pred_total if pred_total > 0 else 0
+         recall = correct / gold_total if gold_total > 0 else 0
+         f1 = (
+             2 * precision * recall / (precision + recall)
+             if precision + recall > 0
+             else 0
+         )
+         f1_scores.append(f1)
+
+         print(f"Micro-F1@{k} on IL-PCR test set:", f1)
+
+     return np.mean(f1_scores)
+
+
+ def evaluate_summ(gold_data, pred_data):  # summarization: ROUGE on whitespace-normalized text
+     gold_summaries = []
+     pred_summaries = []
+
+     for id, gold_summary in gold_data.items():
+         if id in pred_data:
+             gold_summary = re.sub(r"\s+", " ", gold_summary.replace("\n", " ")).strip()
+             pred_summary = re.sub(r"\s+", " ", pred_data[id].replace("\n", " ")).strip()
+
+             gold_summaries.append(gold_summary)
+             pred_summaries.append(pred_summary)
+
+     rouge = evaluate.load("rouge")
+     rouge_scores = rouge.compute(predictions=pred_summaries, references=gold_summaries)
+     print("Rouge-L:", rouge_scores)
+
+     return {"ROUGE-L": rouge_scores, "BERTSCORE": "-"}
+
+
+ def evaluate_lmt(gold_data, pred_data):  # legal MT: BLEU / GLEU / chrF++ averaged per language, then per dataset
+     tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
+     bleu = BLEU()
+     chrfp = CHRF(word_order=2)
+     gleu = evaluate.load("google_bleu")
+
+     G = defaultdict(lambda: defaultdict(list))
+     P = defaultdict(lambda: defaultdict(list))
+
+     for dataset in gold_data:
+         for id, gold_text in gold_data[dataset].items():
+             lang = id.split("/")[1].strip()
+             gold_tokens = " ".join(tokenizer.tokenize(gold_text))
+             pred_tokens = " ".join(tokenizer.tokenize(pred_data[dataset][id]))
+             G[dataset][lang].append(gold_tokens)
+             P[dataset][lang].append(pred_tokens)
+
+     bleu_scores, chrfpp_scores, gleu_scores = [], [], []
+
+     for dataset in G:
+         print("Dataset", dataset)
+         dataset_bleu, dataset_chrfpp, dataset_gleu = [], [], []
+
+         for lang in G[dataset]:
+             gold = G[dataset][lang]
+             pred = P[dataset][lang]
+
+             bleu_score = bleu.corpus_score(pred, [gold]).score
+             chrfpp_score = chrfp.corpus_score(pred, [gold]).score
+             gleu_score = gleu.compute(predictions=pred, references=gold)["google_bleu"]
+
+             dataset_bleu.append(bleu_score)
+             dataset_chrfpp.append(chrfpp_score)
+             dataset_gleu.append(gleu_score)
+
+         bleu_scores.append(sum(dataset_bleu) / len(dataset_bleu))
+         chrfpp_scores.append(sum(dataset_chrfpp) / len(dataset_chrfpp))
+         gleu_scores.append(sum(dataset_gleu) / len(dataset_gleu))
+
+     return {
+         "BLEU": sum(bleu_scores) / len(bleu_scores),
+         "GLEU": sum(gleu_scores) / len(gleu_scores),
+         "chrF++": sum(chrfpp_scores) / len(chrfpp_scores),
+     }
+
+
+ def create_output_json(evaluation_results):  # assemble a single leaderboard row
+     output = {
+         "Method": "GPT-5 (2-shot)",
+         "Submitted By": "IL-TUR",
+         "Github Link": "dummy submission",
+         "L-NER": {"strict mF1": evaluation_results["lner"]["strict mF1"]},
+         "RR": {"mF1": evaluation_results["rr"]["mF1"]},
+         "CJPE": {
+             "mF1": evaluation_results["cjpe"]["mF1"],
+             "ROUGE-L": evaluation_results["cjpe"]["ROUGE-L"],
+             "BLEU": evaluation_results["cjpe"]["BLEU"],
+         },
+         "BAIL": {"mF1": evaluation_results["bail"]},
+         "LSI": {"mF1": evaluation_results["lsi"]},
+         "PCR": {"muF1@K": evaluation_results["pcr"]},
+         "SUMM": {
+             "ROUGE-L": evaluation_results["summ"]["ROUGE-L"],
+             "BERTSCORE": "-",  # Placeholder BERTSCORE
+         },
+         "L-MT": {
+             "BLEU": evaluation_results["lmt"]["BLEU"],
+             "GLEU": evaluation_results["lmt"]["GLEU"],
+             "chrF++": evaluation_results["lmt"]["chrF++"],
+         },
+     }
+     return [output]  # Wrap in a list to match the desired format
+
+
+ def main():
+     # gold_data = load_json("IL_TUR_eval_gold.json")
+     # pred_data = load_json("IL_TUR_eval_submission2.json")
+     gold_data = load_json("submissions/baseline/IL_TUR_eval_gold_small.json")
+     pred_data = load_json("submissions/baseline/IL_TUR_eval_submission_small.json")
+     pred_data = gold_data  # NOTE: overrides the loaded submission, so gold is scored against itself
+     evaluation_results = {}
+
+     for task in pred_data.keys():
+         print(f"Task: {task}")
+
+         if task == "bail":
+             evaluation_results[task] = evaluate_bail(gold_data[task], pred_data[task])
+         elif task == "cjpe":
+             evaluation_results.update(evaluate_cjpe(gold_data[task], pred_data[task]))
+         elif task == "lner":
+             text_data = load_json("lner-text.json")
+             evaluation_results[task] = evaluate_lner(
+                 gold_data[task], pred_data[task], text_data
+             )
+         elif task == "rr":
+             evaluation_results[task] = evaluate_rr(gold_data[task], pred_data[task])
+         elif task == "lsi":
+             evaluation_results[task] = evaluate_lsi(gold_data[task], pred_data[task])
+         elif task == "pcr":
+             evaluation_results[task] = evaluate_pcr(gold_data[task], pred_data[task])
+         elif task == "summ":
+             evaluation_results[task] = evaluate_summ(gold_data[task], pred_data[task])
+         elif task == "lmt":
+             evaluation_results[task] = evaluate_lmt(gold_data[task], pred_data[task])
+
+     # convert the evaluation results to the required format
+     for task, result in evaluation_results.items():
+         if isinstance(result, dict):
+             for subtask, subresult in result.items():
+                 if isinstance(subresult, dict):
+                     for subsubtask, subsubresult in subresult.items():
+                         evaluation_results[task][subtask][
+                             subsubtask
+                         ] = f"{subsubresult:.2f}"
+                 else:
+                     if isinstance(subresult, str):
+                         evaluation_results[task][subtask] = subresult
+                     else:
+                         evaluation_results[task][subtask] = f"{subresult:.2f}"
+         else:
+             if isinstance(result, str):
+                 evaluation_results[task] = result
+             else:
+                 evaluation_results[task] = f"{result:.2f}"
+
+     blank_scores = {
+         "lner": {"strict mF1": "-"},
+         "rr": {"mF1": "-"},
+         "cjpe": {"mF1": "-", "ROUGE-L": "-", "BLEU": "-"},
+         "bail": {"mF1": "-"},
+         "lsi": {"mF1": "-"},
+         "pcr": {"muF1@K": "-"},
+         "summ": {"ROUGE-L": "-", "BERTSCORE": "-"},
+         "lmt": {"BLEU": "-", "GLEU": "-", "chrF++": "-"},
+     }
+
+     print("--------------------------Evaluation Summary--------------------------")
+     for task, result in evaluation_results.items():
+         print(f"{task}: {result}")
+     print("---------------------------------------------------------------------")
+
+     # for tasks that were not present in the submission, add blank scores
+     for task in gold_data.keys():
+         if task not in pred_data:
+             evaluation_results[task] = blank_scores[task]
+
+     # Generate the output JSON
+     output_json = create_output_json(evaluation_results)
+     with open("evaluation_results.json", "w") as f:
+         json.dump(output_json, f, indent=2)
+     print("Evaluation results saved to evaluation_results.json")
+
+
+ def get_evaluation_scores(gold_data, submission_data):  # same flow as main(), driven by an uploaded submission dict
+     evaluation_results = {}
+
+     for task in submission_data.keys():
+         print(f"Task: {task}")
+
+         if task == "bail":
+             evaluation_results[task] = evaluate_bail(
+                 gold_data[task], submission_data[task]
+             )
+         elif task == "cjpe":
+             evaluation_results.update(
+                 evaluate_cjpe(gold_data[task], submission_data[task])
+             )
+         elif task == "lner":
+             text_data = load_json("lner-text.json")
+             evaluation_results[task] = evaluate_lner(
+                 gold_data[task], submission_data[task], text_data
+             )
+         elif task == "rr":
+             evaluation_results[task] = evaluate_rr(
+                 gold_data[task], submission_data[task]
+             )
+         elif task == "lsi":
+             evaluation_results[task] = evaluate_lsi(
+                 gold_data[task], submission_data[task]
+             )
+         elif task == "pcr":
+             evaluation_results[task] = evaluate_pcr(
+                 gold_data[task], submission_data[task]
+             )
+         elif task == "summ":
+             evaluation_results[task] = evaluate_summ(
+                 gold_data[task], submission_data[task]
+             )
+         elif task == "lmt":
+             evaluation_results[task] = evaluate_lmt(
+                 gold_data[task], submission_data[task]
+             )
+
+     # convert the evaluation results to the required format
+     for task, result in evaluation_results.items():
+         if isinstance(result, dict):
+             for subtask, subresult in result.items():
+                 if isinstance(subresult, dict):
+                     for subsubtask, subsubresult in subresult.items():
+                         evaluation_results[task][subtask][
+                             subsubtask
+                         ] = f"{subsubresult:.2f}"
+                 else:
+                     if isinstance(subresult, str):
+                         evaluation_results[task][subtask] = subresult
+                     else:
+                         evaluation_results[task][subtask] = f"{subresult:.2f}"
+         else:
+             if isinstance(result, str):
+                 evaluation_results[task] = result
+             else:
+                 evaluation_results[task] = f"{result:.2f}"
+
+     blank_scores = {
+         "lner": {"strict mF1": "-"},
+         "rr": {"mF1": "-"},
+         "cjpe": {"mF1": "-", "ROUGE-L": "-", "BLEU": "-"},
+         "bail": {"mF1": "-"},
+         "lsi": {"mF1": "-"},
+         "pcr": {"muF1@K": "-"},
+         "summ": {"ROUGE-L": "-", "BERTSCORE": "-"},
+         "lmt": {"BLEU": "-", "GLEU": "-", "chrF++": "-"},
+     }
+
+     # for tasks that were not present in the submission, add blank scores
+     for task in gold_data.keys():
+         if task not in submission_data:
+             evaluation_results[task] = blank_scores[task]
+
+     print("--------------------------Evaluation Summary--------------------------")
+     for task, result in evaluation_results.items():
+         print(f"{task}: {result}")
+     print("---------------------------------------------------------------------")
+     output_json = create_output_json(evaluation_results)
+
+     return output_json
+
+
+ if __name__ == "__main__":
+     main()
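
As committed, running eval_utils.py directly scores the small gold file against itself (the pred_data = gold_data override) and writes evaluation_results.json. To score an actual predictions file with the same machinery, a minimal sketch (the submission path is a placeholder):

from eval_utils import load_json, get_evaluation_scores

gold_data = load_json("submissions/baseline/IL_TUR_eval_gold_small.json")
pred_data = load_json("my_predictions.json")  # placeholder: your predictions file

leaderboard_row = get_evaluation_scores(gold_data, pred_data)  # one-element list in leaderboard format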
evaluation_results.json ADDED
@@ -0,0 +1,38 @@
+ [
+   {
+     "Method": "GPT-5 (2-shot)",
+     "Submitted By": "IL-TUR",
+     "Github Link": "dummy submission",
+     "L-NER": {
+       "strict mF1": "-"
+     },
+     "RR": {
+       "mF1": {
+         "mF1": "0.10"
+       }
+     },
+     "CJPE": {
+       "mF1": "-",
+       "ROUGE-L": "-",
+       "BLEU": "-"
+     },
+     "BAIL": {
+       "mF1": "0.02"
+     },
+     "LSI": {
+       "mF1": "0.26"
+     },
+     "PCR": {
+       "muF1@K": "0.63"
+     },
+     "SUMM": {
+       "ROUGE-L": "-",
+       "BERTSCORE": "-"
+     },
+     "L-MT": {
+       "BLEU": "-",
+       "GLEU": "-",
+       "chrF++": "-"
+     }
+   }
+ ]
labels.txt ADDED
@@ -0,0 +1,12 @@
+ APP
+ RESP
+ A.COUNSEL
+ R.COUNSEL
+ JUDGE
+ WIT
+ AUTH
+ COURT
+ STAT
+ PREC
+ DATE
+ CASENO
lner-text.json ADDED
The diff for this file is too large to render. See raw diff
 
lsi_label_vocab.json ADDED
@@ -0,0 +1,102 @@
+ {
+   "Section 2": 0,
+   "Section 3": 1,
+   "Section 4": 2,
+   "Section 5": 3,
+   "Section 13": 4,
+   "Section 34": 5,
+   "Section 107": 6,
+   "Section 109": 7,
+   "Section 114": 8,
+   "Section 120": 9,
+   "Section 120B": 10,
+   "Section 143": 11,
+   "Section 147": 12,
+   "Section 148": 13,
+   "Section 149": 14,
+   "Section 155": 15,
+   "Section 156": 16,
+   "Section 161": 17,
+   "Section 164": 18,
+   "Section 173": 19,
+   "Section 174A": 20,
+   "Section 186": 21,
+   "Section 188": 22,
+   "Section 190": 23,
+   "Section 193": 24,
+   "Section 200": 25,
+   "Section 201": 26,
+   "Section 228": 27,
+   "Section 229A": 28,
+   "Section 279": 29,
+   "Section 294": 30,
+   "Section 294(b)": 31,
+   "Section 299": 32,
+   "Section 300": 33,
+   "Section 302": 34,
+   "Section 304": 35,
+   "Section 304A": 36,
+   "Section 304B": 37,
+   "Section 306": 38,
+   "Section 307": 39,
+   "Section 308": 40,
+   "Section 313": 41,
+   "Section 320": 42,
+   "Section 323": 43,
+   "Section 324": 44,
+   "Section 325": 45,
+   "Section 326": 46,
+   "Section 332": 47,
+   "Section 336": 48,
+   "Section 337": 49,
+   "Section 338": 50,
+   "Section 341": 51,
+   "Section 342": 52,
+   "Section 353": 53,
+   "Section 354": 54,
+   "Section 363": 55,
+   "Section 364": 56,
+   "Section 365": 57,
+   "Section 366": 58,
+   "Section 366A": 59,
+   "Section 375": 60,
+   "Section 376": 61,
+   "Section 376(2)": 62,
+   "Section 379": 63,
+   "Section 380": 64,
+   "Section 384": 65,
+   "Section 389": 66,
+   "Section 392": 67,
+   "Section 394": 68,
+   "Section 395": 69,
+   "Section 397": 70,
+   "Section 406": 71,
+   "Section 409": 72,
+   "Section 411": 73,
+   "Section 415": 74,
+   "Section 417": 75,
+   "Section 419": 76,
+   "Section 420": 77,
+   "Section 427": 78,
+   "Section 436": 79,
+   "Section 437": 80,
+   "Section 438": 81,
+   "Section 447": 82,
+   "Section 448": 83,
+   "Section 450": 84,
+   "Section 452": 85,
+   "Section 457": 86,
+   "Section 465": 87,
+   "Section 467": 88,
+   "Section 468": 89,
+   "Section 471": 90,
+   "Section 482": 91,
+   "Section 494": 92,
+   "Section 498": 93,
+   "Section 498A": 94,
+   "Section 500": 95,
+   "Section 504": 96,
+   "Section 506": 97,
+   "Section 509": 98,
+   "Section 511": 99
+ }
ner_helpers.py ADDED
@@ -0,0 +1,141 @@
+ from transformers import AutoTokenizer
+ import re
+ import string
+
+
+ class TF_Tokenizer:
+     def __init__(self, model_str):
+         self.tok = AutoTokenizer.from_pretrained(model_str)
+
+     def __call__(self, txt):
+         return self.tok.tokenize(txt)
+
+
+ class WS_Tokenizer:
+     def __init__(self):
+         pass
+
+     def __call__(self, txt):
+         return re.findall(r"[{}]|\w+".format(string.punctuation), txt)
+
+
+ def convert_spans_to_bio(txt, roles, tokenizer_func):
+     roles = sorted(roles, key=lambda x: x["start"])
+     roles_left = [r["start"] for r in roles]
+
+     ttxt = tokenizer_func(txt)
+
+     c = 0
+     cr = -1
+     prev = "O"
+     troles = []
+     for tok in ttxt:
+         if c >= len(txt):
+             break
+
+         while txt[c] == " ":
+             c += 1
+
+         else:  # note: this else belongs to the while loop and always runs (no break inside)
+             if c in roles_left:  # Start of a new role
+                 ind = roles_left.index(c)
+                 cr = roles[ind]["end"]
+                 prev = "I-" + roles[ind]["label"]
+                 troles.append("B-" + roles[ind]["label"])
+             else:
+                 if c < cr:  # Assign previous role
+                     troles.append(prev)
+                 else:  # Assign 'O'
+                     troles.append("O")
+
+         c += len(tok)
+
+     if len(ttxt) != len(troles):
+         troles += ["O"] * (len(ttxt) - len(troles))
+
+     assert len(ttxt) == len(troles)
+     return troles
+
+
+ def convert_bio_to_spans(txt, troles, tokenizer_func):
+     c = 0
+     c2 = 0
+     cr = -1
+     cs = -1
+     prev = "O"
+
+     roles = []
+     ttxt = tokenizer_func(txt)
+
+     if len(ttxt) != len(troles):
+         ttxt = ttxt[: len(troles)]
+
+     for j, tok in enumerate(ttxt):
+         if c >= len(txt):
+             break
+
+         while c < len(txt) and txt[c].isspace():
+             c += 1
+
+         if tok[:2] == "##" or tok == "[UNK]":
+             c += len(tok) - 2 if tok[:2] == "##" else 1
+         else:
+             if troles[j].startswith("B-"):
+                 if cs >= cr:
+                     cr = c
+                     if cs >= 0:
+                         roles.append({"start": cs, "end": c2, "label": prev})
+                 cs = c
+                 prev = troles[j][2:]
+             else:
+                 if troles[j] == "O":
+                     if cs >= cr:
+                         cr = c
+                         if cs >= 0:
+                             roles.append({"start": cs, "end": c2, "label": prev})
+             c += len(tok)
+         c2 = c
+
+     if cs >= cr:
+         if cs >= 0:
+             roles.append({"start": cs, "end": c2, "label": prev})
+
+     return roles
+
+
+ def span2bio(txt, labels):
+     roles = sorted(labels, key=lambda x: x["label"])
+     roles_left = [r["start"] for r in roles]
+
+     ttxt = re.findall(r"[{}]|\w+".format(string.punctuation), txt)
+
+     c = 0
+     cr = -1
+     prev = "O"
+     troles = []
+     for tok in ttxt:
+         if c >= len(txt):
+             break
+
+         while txt[c] == " ":
+             c += 1
+
+         else:  # note: this else belongs to the while loop and always runs (no break inside)
+             if c in roles_left:  # Start of a new role
+                 ind = roles_left.index(c)
+                 cr = roles[ind]["end"]
+                 prev = "I-" + roles[ind]["label"]
+                 troles.append("B-" + roles[ind]["label"])
+             else:
+                 if c < cr:  # Assign previous role
+                     troles.append(prev)
+                 else:  # Assign 'O'
+                     troles.append("O")
+
+         c += len(tok)
+
+     if len(ttxt) != len(troles):
+         troles += ["O"] * (len(ttxt) - len(troles))
+
+     assert len(ttxt) == len(troles)
+     return ttxt, troles
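
For reference, a toy illustration of what span2bio returns (the sentence and span are invented; the "APP" tag comes from labels.txt):

from ner_helpers import span2bio

tokens, tags = span2bio(
    "Mr Sharma appeared before the court.",
    [{"start": 3, "end": 9, "label": "APP"}],  # character span covering "Sharma"
)
# tokens -> ["Mr", "Sharma", "appeared", "before", "the", "court", "."]
# tags   -> ["O", "B-APP", "O", "O", "O", "O", "O"]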
requirements.txt CHANGED
@@ -5,4 +5,5 @@ gradio
  huggingface-hub==0.18.0
  numpy==1.24.2
  APScheduler==3.10.1
- pandas==1.3.4
+ pandas==1.3.4
+ nervaluate==0.2.0
submissions/baseline/IL_TUR_eval_gold_small.json ADDED
The diff for this file is too large to render. See raw diff
 
submissions/baseline/IL_TUR_eval_submission_small.json ADDED
The diff for this file is too large to render. See raw diff
 
uploads.py CHANGED
@@ -6,7 +6,11 @@ import json
  import pandas as pd
  import gradio as gr

+ from eval_utils import get_evaluation_scores
+
+
  LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
+ SUBMISSION_FORMAT = "predictions"
  # RESULTS_PATH = "Exploration-Lab/IL-TUR-Leaderboard-results"
  TOKEN = os.environ.get("TOKEN", None)
  YEAR_VERSION = "2024"
@@ -93,9 +97,21 @@ def add_new_eval(
      # upload the df to spaces
      import io

-     # read the submission json file
-     with open(path_to_file, "r") as f:
-         submission = json.load(f)
+     if SUBMISSION_FORMAT == "predictions":
+         # read the submission json file
+         with open(path_to_file, "r") as f:
+             submission_data = json.load(f)
+
+         # read the gold json file
+         with open("submissions/baseline/IL_TUR_eval_gold_small.json", "r") as f:
+             gold_data = json.load(f)
+
+         submission = get_evaluation_scores(gold_data, submission_data)
+
+     else:
+         # read the submission json file
+         with open(path_to_file, "r") as f:
+             submission = json.load(f)

      with open("submissions/baseline/results.json", "r") as f:
          results = json.load(f)
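
With SUBMISSION_FORMAT set to "predictions", an uploaded file is treated as raw task predictions and scored on the spot via get_evaluation_scores; any other value keeps the previous behaviour, where the uploaded JSON is assumed to already be a leaderboard-format results row. A hedged sketch of the toggle (the "results" value is illustrative, not a constant defined in the code):

SUBMISSION_FORMAT = "predictions"  # score raw predictions against the bundled gold file
# SUBMISSION_FORMAT = "results"    # hypothetical: accept a pre-computed leaderboard row as before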