Spaces:
Sleeping
arosyihuddin committed
Commit • ecfd12f
1 Parent(s): 87e0b7e
update
Browse files
- .gitignore +1 -0
- app.py +4 -7
- src/__pycache__/bert.cpython-310.pyc +0 -0
- src/__pycache__/helper.cpython-310.pyc +0 -0
- src/__pycache__/legalNER.cpython-310.pyc +0 -0
- src/align_word_ids.py +0 -27
- src/{BertModel.py → bert.py} +0 -0
- src/convertTotext.py +0 -22
- src/{clean_text.py → helper.py} +21 -1
- src/legalNER.py +144 -0
- src/pdf_predict.py +0 -48
- src/read_file.py +0 -21
.gitignore
ADDED
@@ -0,0 +1 @@
+venv
app.py
CHANGED
@@ -1,10 +1,7 @@
-import sys
-sys.path.append("/home/pstar7/Documents/gradio/src")
-
 from transformers import BertTokenizerFast
 from gradio_pdf import PDF
-from
-from
+from src.bert import *
+from src.legalNER import *
 import gradio as gr
 
 ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
@@ -24,9 +21,9 @@ def predict(doc : str, model : str) -> str:
     use_model = model_indonlu
     use_tokenizer = tokenizer_indonlu
 
-
+    ner = LegalNER(use_model, use_tokenizer, doc, ids_to_labels, model)
 
-    return
+    return ner.display()
 
 iface = gr.Interface(
     fn=predict,
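A minimal sketch of how these pieces fit together in the updated app.py; the interface components and the selection of model_indonlu/tokenizer_indonlu are assumptions (only the IndoNLU branch and the first gr.Interface lines are visible in the hunks), while the predict body and the dropdown/checkpoint names come from this commit:

# Sketch with assumptions marked: the dropdown string reaches LegalNER as its
# check_point argument, which later picks the .pth checkpoint inside display().
def predict(doc: str, model: str) -> str:
    use_model = model_indonlu          # assumed: chosen earlier in app.py, not shown in the hunk
    use_tokenizer = tokenizer_indonlu

    ner = LegalNER(use_model, use_tokenizer, doc, ids_to_labels, model)
    return ner.display()

iface = gr.Interface(
    fn=predict,
    inputs=[PDF(label="Dokumen Putusan"),                                    # assumed component wiring
            gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'])],      # choices assumed from the two checkpoint branches
    outputs="text",
)
iface.launch()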
src/__pycache__/bert.cpython-310.pyc
ADDED
Binary file (881 Bytes).
src/__pycache__/helper.cpython-310.pyc
ADDED
Binary file (2.26 kB).
src/__pycache__/legalNER.cpython-310.pyc
ADDED
Binary file (4.17 kB).
src/align_word_ids.py
DELETED
@@ -1,27 +0,0 @@
-def align_word_ids(texts, tokenizer, label_all_tokens):
-
-    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
-
-    word_ids = tokenized_inputs.word_ids()
-
-    previous_word_idx = None
-    label_ids = []
-
-    for word_idx in word_ids:
-
-        if word_idx is None:
-            label_ids.append(-100)
-
-        elif word_idx != previous_word_idx:
-            try:
-                label_ids.append(1)
-            except:
-                label_ids.append(-100)
-        else:
-            try:
-                label_ids.append(1 if label_all_tokens else -100)
-            except:
-                label_ids.append(-100)
-        previous_word_idx = word_idx
-
-    return label_ids
src/{BertModel.py → bert.py}
RENAMED
File without changes
src/convertTotext.py
DELETED
@@ -1,22 +0,0 @@
-def convertTotext(data_token, prediction_label):
-    prev_tag = 'O'
-    result = {}
-    temp = ''
-    for i, word in enumerate(data_token):
-        if prediction_label[i] != 'O':
-            if prev_tag == 'O' and temp != '':
-                temp = ''
-
-            if '##' in word:
-                temp += word.replace('##', '')
-
-            else:
-                temp += ' ' + word
-        else:
-            if temp != "":
-                result[prev_tag.replace("I_", "B_")] = temp.strip()
-                temp = ""
-
-        prev_tag = prediction_label[i]
-
-    return result
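This merging logic is not lost: it reappears verbatim as LegalNER.labelToText in the new src/legalNER.py below. A small self-contained trace of what it produces (the tokens and labels are invented):

# Copied from the removed convertTotext.py: WordPiece continuations ('##...') are
# glued onto the previous piece, and a finished span is flushed into `result`
# when an 'O' label is reached.
def convertTotext(data_token, prediction_label):
    prev_tag = 'O'
    result = {}
    temp = ''
    for i, word in enumerate(data_token):
        if prediction_label[i] != 'O':
            if prev_tag == 'O' and temp != '':
                temp = ''
            if '##' in word:
                temp += word.replace('##', '')
            else:
                temp += ' ' + word
        else:
            if temp != "":
                result[prev_tag.replace("I_", "B_")] = temp.strip()
                temp = ""
        prev_tag = prediction_label[i]
    return result

tokens = ['dengan', 'terdakwa', 'budi', '##anto', 'dalam', 'perkara', 'pencurian', '.']
labels = ['O', 'O', 'B_DEFN', 'I_DEFN', 'O', 'O', 'B_CRIA', 'O']
print(convertTotext(tokens, labels))
# -> {'B_DEFN': 'budianto', 'B_CRIA': 'pencurian'}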
src/{clean_text.py → helper.py}
RENAMED
@@ -1,3 +1,5 @@
+import PyPDF2
+import requests
 import re
 
 def clean_text(text):
@@ -12,4 +14,22 @@ def clean_text(text):
     text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
     text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
     text = re.sub(r'[\u2026]+|\.{3,}', '', text)
-    return text.strip()
+    return text.strip()
+
+def read_pdf(pdf):
+    try:
+        pdf_text = ''
+        pdf_file = open(pdf, 'rb')
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+
+        for page_num in range(len(pdf_reader.pages)):
+            page = pdf_reader.pages[page_num]
+            text = clean_text(page.extract_text())
+
+            pdf_text += text
+
+        pdf_file.close()
+        return pdf_text.strip()
+
+    except requests.exceptions.RequestException as e:
+        print("Error:", e)
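The renamed helper can also be exercised on its own; a minimal sketch, assuming a local court-ruling PDF (the path is a placeholder):

from src.helper import read_pdf

pdf_path = "putusan_contoh.pdf"   # placeholder path
text = read_pdf(pdf_path)         # each page extracted with PyPDF2, then passed through clean_text()
segments = text.split(';')        # LegalNER.display() later splits on ';' the same way
print(len(segments), "segments")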
src/legalNER.py
ADDED
@@ -0,0 +1,144 @@
+from src.helper import *
+import gradio as gr
+import torch
+
+class LegalNER():
+    def __init__(self, model, tokenizer, pdf_file, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.pdf = pdf_file
+        self.check_point = check_point
+        self.label_all_tokens = label_all_tokens
+        self.ids_to_labels = ids_to_labels
+        self.prediction_label = ''
+        self.label_convert = {'B_VERN' : 'Nomor Putusan',
+                              'B_DEFN' : 'Nama Terdakwa',
+                              'B_CRIA' : 'Tindak Pidana',
+                              'B_ARTV' : 'Melanggar KUHP',
+                              'B_PENA' : 'Tuntutan Hukum',
+                              'B_PUNI' : 'Putusan Hukum',
+                              'B_TIMV' : 'Tanggal Putusan',
+                              'B_JUDP' : 'Hakim Ketua',
+                              'B_JUDG' : 'Hakim Anggota',
+                              'B_REGI' : 'Panitera',
+                              'B_PROS' : 'Penuntut Umum',
+                              'B_ADVO' : 'Pengacara',
+                              }
+
+    def align_word_ids(self, texts):
+
+        tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
+
+        word_ids = tokenized_inputs.word_ids()
+
+        previous_word_idx = None
+        label_ids = []
+
+        for word_idx in word_ids:
+
+            if word_idx is None:
+                label_ids.append(-100)
+
+            elif word_idx != previous_word_idx:
+                try:
+                    label_ids.append(1)
+                except:
+                    label_ids.append(-100)
+            else:
+                try:
+                    label_ids.append(1 if self.label_all_tokens else -100)
+                except:
+                    label_ids.append(-100)
+            previous_word_idx = word_idx
+
+        return label_ids
+
+    def labelToText(self, data_token):
+        prev_tag = 'O'
+        result = {}
+        temp = ''
+
+        # Combine all tokens back into a single string according to their labels
+        for i, word in enumerate(data_token):
+            if self.prediction_label[i] != 'O':
+                if prev_tag == 'O' and temp != '':
+                    temp = ''
+
+                if '##' in word:
+                    temp += word.replace('##', '')
+
+                else:
+                    temp += ' ' + word
+            else:
+                if temp != "":
+                    result[prev_tag.replace("I_", "B_")] = temp.strip()
+                    temp = ""
+
+            prev_tag = self.prediction_label[i]
+
+        return result
+
+    def labelConverter(self, entity):
+        # Pick the best entity prediction
+        entity_result = {}
+        for i in entity:
+            if len(list(i.keys())) > 1:
+                for y in i.items():
+                    if y[0] not in entity_result:
+                        entity_result[y[0]] = y[1]
+                    else:
+                        if len(entity_result[y[0]]) < len(y[1]):
+                            entity_result[y[0]] = y[1]
+            else:
+                if tuple(i.items())[0] not in entity_result:
+                    entity_result[tuple(i.items())[0][0]] = tuple(i.items())[0][1]
+
+        # Convert the result into a string
+        result = ''
+        for i, (label, data) in enumerate(entity_result.items()):
+            if label in ['B_PENA', 'B_ARTV', 'B_PROS']:
+                result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
+            elif label in ['B_JUDP', 'B_CRIA']:
+                result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
+            elif label in ['B_ADVO', 'B_REGI']:
+                result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
+            else:
+                result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
+
+        return result
+
+    def display(self, progress=gr.Progress()):
+        file_pdf = read_pdf(self.pdf)
+        sentence_file = file_pdf.split(';')
+
+        use_cuda = torch.cuda.is_available()
+        device = torch.device("cuda" if use_cuda else "cpu")
+        if use_cuda:
+            self.model = self.model.cuda()
+
+        file_check_point = 'model/IndoLEM/model_fold_4.pth' if self.check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
+
+        model_weights = torch.load(file_check_point, map_location=torch.device(device))
+        self.model.load_state_dict(model_weights)
+
+        label_extraction = []
+        for text in progress.tqdm(sentence_file, desc="Ekstraksi Entitas"):
+            toknize = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
+            input_ids = toknize['input_ids'].to(device)
+            mask = toknize['attention_mask'].to(device)
+
+            logits = self.model(input_ids, mask, None)
+            label_ids = torch.Tensor(self.align_word_ids(text)).unsqueeze(0).to(device)
+            logits_clean = logits[0][label_ids != -100]
+            predictions = logits_clean.argmax(dim=1).tolist()
+            prediction_label = [self.ids_to_labels[i] for i in predictions]
+
+            input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
+            data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+            self.prediction_label = prediction_label
+            labelConv = self.labelToText(data_token)
+
+            if labelConv:
+                label_extraction.append(labelConv)
+
+        return self.labelConverter(label_extraction)
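labelConverter is what turns the per-segment dictionaries into the text shown in the Gradio output box. A small illustration with invented entity values; None placeholders are enough here because only label_convert is touched by this method:

from src.legalNER import LegalNER

ner = LegalNER(None, None, None, ids_to_labels={})   # placeholders: no model needed for the formatting step

# Invented per-segment extractions, shaped like the output of labelToText():
label_extraction = [
    {'B_VERN': '123/pid.b/2023/pn xyz', 'B_DEFN': 'budianto'},
    {'B_CRIA': 'pencurian'},
]

print(ner.labelConverter(label_extraction))
# Prints a numbered, tab-aligned summary:
# 1. Nomor Putusan  = 123/pid.b/2023/pn xyz
# 2. Nama Terdakwa  = Budianto
# 3. Tindak Pidana  = Pencurian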
src/pdf_predict.py
DELETED
@@ -1,48 +0,0 @@
-from tqdm import tqdm
-import torch
-from read_file import *
-from align_word_ids import *
-from convertTotext import *
-
-def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
-    file_pdf = read_pdf(file_path)
-    sentence_file = file_pdf.split(';')
-
-    use_cuda = torch.cuda.is_available()
-    device = torch.device("cuda" if use_cuda else "cpu")
-    if use_cuda:
-        model = model.cuda()
-
-    file_check_point = 'model/IndoLEM/model_fold_4.pth' if check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
-
-    model_weights = torch.load(file_check_point, map_location=torch.device(device))
-    model.load_state_dict(model_weights)
-
-    label_extraction = []
-    for text in tqdm(sentence_file, desc="Prediction Sentence"):
-        toknize = tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
-        input_ids = toknize['input_ids'].to(device)
-        mask = toknize['attention_mask'].to(device)
-
-        logits = model(input_ids, mask, None)
-        label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
-        logits_clean = logits[0][label_ids != -100]
-        predictions = logits_clean.argmax(dim=1).tolist()
-        prediction_label = [ids_to_labels[i] for i in predictions]
-
-        input_ids_conv = tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
-        data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
-        nerExtraction = convertTotext(data_token, prediction_label)
-
-        if nerExtraction:
-            label_extraction.append(nerExtraction)
-            # print(f"\nText : {text}")
-            # print(f"Predict Label : {prediction_label}")
-            # print()
-
-        # print(f"Hasil Ekstrak NER:")
-        # print(nerExtraction)
-        # print(f"Panjang Token : {len(data_token)}, Panjang Predict Label : {len(prediction_label)}")
-        # print()
-
-    return label_extraction
src/read_file.py
DELETED
@@ -1,21 +0,0 @@
-import PyPDF2
-from clean_text import *
-import requests
-
-def read_pdf(file_pdf):
-    try:
-        pdf_text = ''
-        pdf_file = open(file_pdf, 'rb')
-        pdf_reader = PyPDF2.PdfReader(pdf_file)
-
-        for page_num in range(len(pdf_reader.pages)):
-            page = pdf_reader.pages[page_num]
-            text = clean_text(page.extract_text())
-
-            pdf_text += text
-
-        pdf_file.close()
-        return pdf_text.strip()
-
-    except requests.exceptions.RequestException as e:
-        print("Error:", e)