Spaces:
Sleeping
Sleeping
arosyihuddin
committed on
Commit
•
0ca7583
1
Parent(s):
dd27f28
update UI
Browse files- app.py +32 -56
- data/1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf +0 -0
- data/162_Pid.Sus_2023_PN_Bkl.pdf +0 -0
- data/164_Pid.Sus_2023_PN_Bkl.pdf +0 -0
- data/165_Pdt.P_2023_PN_Bkl.pdf +0 -0
- data/329_pid.b_2023_pn_jkt.brt_20240529090837.pdf +0 -0
- data/428_pid.b_2021_pn_jkt.brt_20240529091234.pdf +0 -0
- src/__pycache__/bert.cpython-311.pyc +0 -0
- src/__pycache__/helper.cpython-311.pyc +0 -0
- src/__pycache__/legalNER.cpython-311.pyc +0 -0
- src/bert.py +0 -14
- src/helper.py +39 -50
- src/legalNER.py +122 -32
app.py
CHANGED
@@ -1,51 +1,29 @@
|
|
1 |
-
from transformers import BertTokenizerFast
|
2 |
from gradio_pdf import PDF
|
3 |
-
from src.
|
4 |
-
from src.legalNER import *
|
5 |
import gradio as gr
|
6 |
from pathlib import Path
|
7 |
|
8 |
dir_ = Path(__file__).parent
|
9 |
|
10 |
-
ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
|
11 |
-
indolem = 'indolem/indobert-base-uncased'
|
12 |
-
indonlu = 'indobenchmark/indobert-base-p2'
|
13 |
-
model_indolem = BertModel(indolem, len(ids_to_labels))
|
14 |
-
model_indonlu = BertModel(indonlu, len(ids_to_labels))
|
15 |
-
tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
|
16 |
-
tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
|
17 |
-
|
18 |
-
def text_extraction(text, model, progress=gr.Progress()):
|
19 |
-
if model == 'IndoBERT (IndoLEM)':
|
20 |
-
use_model = model_indolem
|
21 |
-
use_tokenizer = tokenizer_indolem
|
22 |
-
|
23 |
-
else:
|
24 |
-
use_model = model_indonlu
|
25 |
-
use_tokenizer = tokenizer_indonlu
|
26 |
-
|
27 |
-
legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
|
28 |
-
entitas = legalner.predict(text)
|
29 |
-
new_text = legalner.tokenizer_decode
|
30 |
-
|
31 |
-
return {"text": new_text, "entities": entitas}
|
32 |
-
|
33 |
-
def pdf_extraction(doc, model, progress=gr.Progress()):
|
34 |
-
if model == 'IndoBERT (IndoLEM)':
|
35 |
-
use_model = model_indolem
|
36 |
-
use_tokenizer = tokenizer_indolem
|
37 |
-
|
38 |
-
else:
|
39 |
-
use_model = model_indonlu
|
40 |
-
use_tokenizer = tokenizer_indonlu
|
41 |
-
|
42 |
-
legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
|
43 |
-
|
44 |
-
return legalner.predict(doc)
|
45 |
-
|
46 |
-
|
47 |
with gr.Blocks() as ner:
|
48 |
-
gr.Markdown("#Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
gr.Markdown("## Uji Coba Model dengan Potongan Kalimat")
|
50 |
# Input Text
|
51 |
with gr.Row():
|
@@ -56,15 +34,14 @@ with gr.Blocks() as ner:
|
|
56 |
gr.ClearButton(text, value='Reset')
|
57 |
with gr.Column(scale=3):
|
58 |
output_text = gr.HighlightedText(label="Output Text")
|
59 |
-
|
60 |
button_text.click(fn=text_extraction, inputs=[text, model_text], outputs=output_text, api_name="text")
|
61 |
-
|
62 |
gr.Markdown("## Contoh Inputan Potongan Kalimat")
|
63 |
gr.Examples(
|
64 |
-
examples=[
|
65 |
["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM . DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA . MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoLEM)"],
|
66 |
-
["
|
67 |
-
["PUTUSAN Nomor 77/Pid.B/2023/PN Jkt.Pst DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Jakarta Pusat yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa : 1. Nama lengkap : Arif Bin Santung", "IndoBERT (IndoLEM)"],
|
68 |
],
|
69 |
inputs=[text, model_text],
|
70 |
outputs=output_text,
|
@@ -79,20 +56,19 @@ with gr.Blocks() as ner:
|
|
79 |
model_pdf = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model',value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
|
80 |
button_pdf = gr.Button(value="Extract", variant='primary')
|
81 |
gr.ClearButton(doc, value="Reset")
|
82 |
-
|
83 |
with gr.Column(scale=3):
|
84 |
output_pdf = gr.Textbox(label="Output PDF")
|
85 |
-
|
86 |
button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
|
87 |
-
|
88 |
gr.Examples(
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
],
|
96 |
inputs=[doc],
|
97 |
outputs=output_pdf,
|
98 |
fn=pdf_extraction,
|
|
|
|
|
1 |
from gradio_pdf import PDF
|
2 |
+
from src.helper import *
|
|
|
3 |
import gradio as gr
|
4 |
from pathlib import Path
|
5 |
|
6 |
dir_ = Path(__file__).parent
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
with gr.Blocks() as ner:
|
9 |
+
gr.Markdown("# Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
|
10 |
+
# List Label
|
11 |
+
keterangan_label = [
|
12 |
+
["VERN", "Nomor Putusan"],
|
13 |
+
["DEFN", "Nama Terdakwa"],
|
14 |
+
["CRIA", "Tindak Pidana"],
|
15 |
+
["ARTV", "Melanggar KUHP"],
|
16 |
+
["PENA", "Tuntutan Hukum"],
|
17 |
+
["PUNI", "Putusan Hukum"],
|
18 |
+
["TIMV", "Tanggal Putusan"],
|
19 |
+
["JUDP", "Hakim Ketua"],
|
20 |
+
["JUDG", "Hakim Anggota"],
|
21 |
+
["REGI", "Panitera"],
|
22 |
+
["PROS", "Penuntut Umum"],
|
23 |
+
["ADVO", "Pengacara"],
|
24 |
+
]
|
25 |
+
gr.Markdown("## Penjelasan Label")
|
26 |
+
gr.DataFrame(keterangan_label, headers=["Label", "Keterangan"], height=200)
|
27 |
gr.Markdown("## Uji Coba Model dengan Potongan Kalimat")
|
28 |
# Input Text
|
29 |
with gr.Row():
|
|
|
34 |
gr.ClearButton(text, value='Reset')
|
35 |
with gr.Column(scale=3):
|
36 |
output_text = gr.HighlightedText(label="Output Text")
|
37 |
+
|
38 |
button_text.click(fn=text_extraction, inputs=[text, model_text], outputs=output_text, api_name="text")
|
39 |
+
|
40 |
gr.Markdown("## Contoh Inputan Potongan Kalimat")
|
41 |
gr.Examples(
|
42 |
+
examples=[
|
43 |
["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM . DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA . MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoLEM)"],
|
44 |
+
["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM", "IndoBERT (IndoNLU)"]
|
|
|
45 |
],
|
46 |
inputs=[text, model_text],
|
47 |
outputs=output_text,
|
|
|
56 |
model_pdf = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model',value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
|
57 |
button_pdf = gr.Button(value="Extract", variant='primary')
|
58 |
gr.ClearButton(doc, value="Reset")
|
59 |
+
|
60 |
with gr.Column(scale=3):
|
61 |
output_pdf = gr.Textbox(label="Output PDF")
|
62 |
+
|
63 |
button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
|
64 |
+
|
65 |
gr.Examples(
|
66 |
+
["428_pid.b_2021_pn_jkt.brt_20240529091234.pdf",
|
67 |
+
"1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf",
|
68 |
+
"329_pid.b_2023_pn_jkt.brt_20240529090837.pdf",
|
69 |
+
"168_Pid.Sus_2023_PN_Bkl.pdf",
|
70 |
+
"169_Pid.Sus_2023_PN_Bkl.pdf",
|
71 |
+
"167_Pid.Sus_2023_PN_Bkl.pdf"],
|
|
|
72 |
inputs=[doc],
|
73 |
outputs=output_pdf,
|
74 |
fn=pdf_extraction,
|
data/1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf
ADDED
Binary file (102 kB). View file
|
|
data/162_Pid.Sus_2023_PN_Bkl.pdf
DELETED
Binary file (142 kB)
|
|
data/164_Pid.Sus_2023_PN_Bkl.pdf
DELETED
Binary file (144 kB)
|
|
data/165_Pdt.P_2023_PN_Bkl.pdf
DELETED
Binary file (70.4 kB)
|
|
data/329_pid.b_2023_pn_jkt.brt_20240529090837.pdf
ADDED
Binary file (151 kB). View file
|
|
data/428_pid.b_2021_pn_jkt.brt_20240529091234.pdf
ADDED
Binary file (165 kB). View file
|
|
src/__pycache__/bert.cpython-311.pyc
ADDED
Binary file (1.31 kB). View file
|
|
src/__pycache__/helper.cpython-311.pyc
ADDED
Binary file (2.75 kB). View file
|
|
src/__pycache__/legalNER.cpython-311.pyc
ADDED
Binary file (15.4 kB). View file
|
|
src/bert.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
from transformers import BertForTokenClassification
|
2 |
-
import torch
|
3 |
-
|
4 |
-
class BertModel(torch.nn.Module):
|
5 |
-
def __init__(self, pretrained_model, num_labels):
|
6 |
-
|
7 |
-
super(BertModel, self).__init__()
|
8 |
-
self.bert = BertForTokenClassification.from_pretrained(pretrained_model, num_labels=num_labels)
|
9 |
-
|
10 |
-
def forward(self, input_id, mask, label):
|
11 |
-
|
12 |
-
output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
|
13 |
-
|
14 |
-
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/helper.py
CHANGED
@@ -1,50 +1,39 @@
|
|
1 |
-
import
|
2 |
-
import
|
3 |
-
import
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
for i, word in enumerate(input_ids_conv):
|
41 |
-
if word not in ['[CLS]', '[SEP]', '[PAD]']:
|
42 |
-
if temp != '' and '##' not in word:
|
43 |
-
result += ' ' + temp
|
44 |
-
if '##' in word:
|
45 |
-
temp += word.replace('##', '')
|
46 |
-
else:
|
47 |
-
temp = word
|
48 |
-
if i == len(input_ids_conv)-1:
|
49 |
-
result += ' ' + temp
|
50 |
-
return result.strip()
|
|
|
1 |
+
from transformers import BertTokenizerFast, BertForTokenClassification
|
2 |
+
import gradio as gr
|
3 |
+
from src.legalNER import *
|
4 |
+
|
5 |
+
ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
|
6 |
+
indolem = 'indolem/indobert-base-uncased'
|
7 |
+
indonlu = 'indobenchmark/indobert-base-p2'
|
8 |
+
model_indolem = BertForTokenClassification.from_pretrained(indolem, num_labels=len(ids_to_labels))
|
9 |
+
model_indonlu = BertForTokenClassification.from_pretrained(indonlu, num_labels=len(ids_to_labels))
|
10 |
+
tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
|
11 |
+
tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
|
12 |
+
|
13 |
+
def _build_legalner(model):
    """Return a ``LegalNER`` wired to the checkpoint chosen in the UI.

    Args:
        model: Dropdown label. ``'IndoBERT (IndoLEM)'`` selects the IndoLEM
            model/tokenizer pair; any other value selects the IndoNLU pair.

    Returns:
        A new ``LegalNER`` instance for the selected pair.
    """
    if model == 'IndoBERT (IndoLEM)':
        use_model, use_tokenizer = model_indolem, tokenizer_indolem
    else:
        use_model, use_tokenizer = model_indonlu, tokenizer_indonlu
    return LegalNER(use_model, use_tokenizer, ids_to_labels, model)


def text_extraction(text, model, progress=gr.Progress()):
    """Run entity extraction on a text snippet.

    Args:
        text: Raw sentence(s) to analyse.
        model: Dropdown label used to pick the checkpoint.
        progress: Gradio progress object; not used directly in this function,
            kept so the signature stays unchanged.

    Returns:
        A dict with the decoded text under ``"text"`` and the predicted
        entities under ``"entities"``.
    """
    legalner = _build_legalner(model)
    entitas = legalner.predict(text)
    # Read the decoded text only after predict(), which is what fills it in.
    new_text = legalner.tokenizer_decode
    return {"text": new_text, "entities": entitas}


def pdf_extraction(doc, model, progress=gr.Progress()):
    """Run entity extraction on a PDF document.

    Args:
        doc: Path of the PDF file to analyse.
        model: Dropdown label used to pick the checkpoint.
        progress: Gradio progress object; not used directly in this function,
            kept so the signature stays unchanged.

    Returns:
        Whatever ``LegalNER.predict`` returns for the document.
    """
    return _build_legalner(model).predict(doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/legalNER.py
CHANGED
@@ -1,39 +1,40 @@
|
|
1 |
-
from src.helper import *
|
2 |
import gradio as gr
|
3 |
import torch
|
4 |
|
5 |
class LegalNER():
|
6 |
-
def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)'
|
7 |
self.model = model
|
8 |
self.tokenizer = tokenizer
|
9 |
self.check_point = check_point
|
10 |
-
self.label_all_tokens = label_all_tokens
|
11 |
self.prediction_label = ''
|
12 |
self.data_token = ''
|
13 |
self.ids_to_labels = ids_to_labels
|
14 |
self.label_extraction = []
|
15 |
self.tokenizer_decode = ''
|
16 |
-
self.label_convert = {'
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
}
|
29 |
|
30 |
def align_word_ids(self, texts):
|
|
|
31 |
tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
|
|
|
32 |
word_ids = tokenized_inputs.word_ids()
|
|
|
33 |
previous_word_idx = None
|
34 |
label_ids = []
|
35 |
|
36 |
for word_idx in word_ids:
|
|
|
37 |
if word_idx is None:
|
38 |
label_ids.append(-100)
|
39 |
|
@@ -44,7 +45,7 @@ class LegalNER():
|
|
44 |
label_ids.append(-100)
|
45 |
else:
|
46 |
try:
|
47 |
-
label_ids.append(1
|
48 |
except:
|
49 |
label_ids.append(-100)
|
50 |
previous_word_idx = word_idx
|
@@ -58,6 +59,7 @@ class LegalNER():
|
|
58 |
|
59 |
# Menganggabungkan semua token menjadi satu kalimat sesuai dengan labelnya
|
60 |
for i, word in enumerate(self.data_token):
|
|
|
61 |
if self.prediction_label[i] != 'O':
|
62 |
if prev_tag == 'O' and temp != '':
|
63 |
temp = ''
|
@@ -67,39 +69,84 @@ class LegalNER():
|
|
67 |
|
68 |
else:
|
69 |
temp += ' ' + word
|
|
|
70 |
else:
|
|
|
71 |
if temp != "":
|
72 |
-
|
|
|
73 |
temp = ""
|
74 |
|
75 |
prev_tag = self.prediction_label[i]
|
76 |
|
77 |
-
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
def dis_pdf_prediction(self):
|
80 |
# Memilih prediksi entitas yang paling bagus
|
81 |
entity_result = {}
|
|
|
|
|
82 |
for i in self.label_extraction:
|
|
|
83 |
if len(list(i.keys())) > 1:
|
|
|
84 |
for y in i.items():
|
|
|
85 |
if y[0] not in entity_result:
|
|
|
86 |
entity_result[y[0]] = y[1]
|
87 |
else:
|
|
|
88 |
if len(entity_result[y[0]]) < len(y[1]):
|
89 |
entity_result[y[0]] = y[1]
|
90 |
else:
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
# Mengkonversi hasil ekstraski entitas dalam bentuk
|
95 |
result = ''
|
96 |
-
for i, (label, data) in enumerate(
|
97 |
-
if label in ['
|
98 |
result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
|
99 |
-
elif label in ['
|
|
|
|
|
|
|
|
|
|
|
100 |
result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
|
101 |
-
elif label in ['
|
102 |
-
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t
|
|
|
|
|
|
|
|
|
|
|
103 |
else:
|
104 |
result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
|
105 |
|
@@ -111,11 +158,11 @@ class LegalNER():
|
|
111 |
count_huruf = 0
|
112 |
temp_word = ''
|
113 |
temp_label = ''
|
114 |
-
temp_label = ''
|
115 |
temp_count_huruf = 0
|
116 |
prev_word = ''
|
117 |
for i, (word, label) in enumerate(zip(self.data_token, self.prediction_label)):
|
118 |
if label != 'O':
|
|
|
119 |
if temp_word != '' and '##' not in word:
|
120 |
temp_result['entity'] = temp_label
|
121 |
temp_result['word'] = temp_word
|
@@ -124,14 +171,17 @@ class LegalNER():
|
|
124 |
result.append(temp_result)
|
125 |
temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
|
126 |
|
|
|
127 |
if '##' in word:
|
128 |
temp_word += word.replace('##', '')
|
129 |
|
|
|
130 |
else:
|
131 |
temp_label = label
|
132 |
temp_word = word
|
133 |
temp_count_huruf = count_huruf
|
134 |
|
|
|
135 |
if i == len(self.data_token)-1:
|
136 |
temp_result['entity'] = temp_label
|
137 |
temp_result['word'] = temp_word
|
@@ -140,6 +190,7 @@ class LegalNER():
|
|
140 |
result.append(temp_result)
|
141 |
temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
|
142 |
|
|
|
143 |
if '##' in word:
|
144 |
count_huruf += len(word)-2
|
145 |
|
@@ -148,13 +199,14 @@ class LegalNER():
|
|
148 |
|
149 |
return result
|
150 |
|
|
|
151 |
def fit_transform(self, texts, progress=gr.Progress()):
|
152 |
use_cuda = torch.cuda.is_available()
|
153 |
device = torch.device("cuda" if use_cuda else "cpu")
|
154 |
if use_cuda:
|
155 |
self.model = self.model.cuda()
|
156 |
|
157 |
-
file_check_point = '
|
158 |
|
159 |
model_weights = torch.load(file_check_point, map_location=torch.device(device))
|
160 |
self.model.load_state_dict(model_weights)
|
@@ -172,20 +224,58 @@ class LegalNER():
|
|
172 |
|
173 |
input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
|
174 |
data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
|
175 |
-
self.tokenizer_decode = token_decode(input_ids_conv)
|
176 |
self.data_token = data_token
|
177 |
self.prediction_label = prediction_label
|
178 |
labelConv = self.labelToText()
|
179 |
|
180 |
if labelConv:
|
181 |
-
self.label_extraction.append(labelConv)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
def predict(self, doc):
|
184 |
if '.pdf' not in doc:
|
185 |
self.fit_transform([doc.strip()])
|
186 |
return self.dis_text_prediction()
|
187 |
else:
|
188 |
-
file_pdf = read_pdf(doc)
|
189 |
sentence_file = file_pdf.split(';')
|
190 |
self.fit_transform(sentence_file)
|
191 |
return self.dis_pdf_prediction()
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
|
4 |
class LegalNER():
|
5 |
+
def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
|
6 |
self.model = model
|
7 |
self.tokenizer = tokenizer
|
8 |
self.check_point = check_point
|
|
|
9 |
self.prediction_label = ''
|
10 |
self.data_token = ''
|
11 |
self.ids_to_labels = ids_to_labels
|
12 |
self.label_extraction = []
|
13 |
self.tokenizer_decode = ''
|
14 |
+
self.label_convert = {'VERN' : 'Nomor Putusan',
|
15 |
+
'DEFN' : 'Nama Terdakwa',
|
16 |
+
'CRIA' : 'Tindak Pidana',
|
17 |
+
'ARTV' : 'Melanggar KUHP',
|
18 |
+
'PENA' : 'Tuntutan Hukum',
|
19 |
+
'PUNI' : 'Putusan Hukum',
|
20 |
+
'TIMV' : 'Tanggal Putusan',
|
21 |
+
'JUDP' : 'Hakim Ketua',
|
22 |
+
'JUDG' : 'Hakim Anggota',
|
23 |
+
'REGI' : 'Panitera',
|
24 |
+
'PROS' : 'Penuntut Umum',
|
25 |
+
'ADVO' : 'Pengacara'}
|
|
|
26 |
|
27 |
def align_word_ids(self, texts):
|
28 |
+
|
29 |
tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
|
30 |
+
|
31 |
word_ids = tokenized_inputs.word_ids()
|
32 |
+
|
33 |
previous_word_idx = None
|
34 |
label_ids = []
|
35 |
|
36 |
for word_idx in word_ids:
|
37 |
+
|
38 |
if word_idx is None:
|
39 |
label_ids.append(-100)
|
40 |
|
|
|
45 |
label_ids.append(-100)
|
46 |
else:
|
47 |
try:
|
48 |
+
label_ids.append(1)
|
49 |
except:
|
50 |
label_ids.append(-100)
|
51 |
previous_word_idx = word_idx
|
|
|
59 |
|
60 |
# Menganggabungkan semua token menjadi satu kalimat sesuai dengan labelnya
|
61 |
for i, word in enumerate(self.data_token):
|
62 |
+
# Memproses semua token yang berlabel entitas bukan O
|
63 |
if self.prediction_label[i] != 'O':
|
64 |
if prev_tag == 'O' and temp != '':
|
65 |
temp = ''
|
|
|
69 |
|
70 |
else:
|
71 |
temp += ' ' + word
|
72 |
+
|
73 |
else:
|
74 |
+
# cek jika temp nya ada isinya di tambahkan ke dict result dengan key label sebelumnya
|
75 |
if temp != "":
|
76 |
+
# hanya mengambil label setelah tanda B_ /I_
|
77 |
+
result[prev_tag[2:]] = temp.strip()
|
78 |
temp = ""
|
79 |
|
80 |
prev_tag = self.prediction_label[i]
|
81 |
|
82 |
+
return result # Dictionary {VERN : 120 ...}
|
83 |
+
|
84 |
+
# Menggabungkan setiap token hasil tokenizer dalam bentuk string
|
85 |
+
def token_decode(self, input_ids_conv):
    """Join WordPiece tokens back into one space-separated string.

    Special tokens (``[CLS]``, ``[SEP]``, ``[PAD]``) are skipped. A token
    containing ``##`` is treated as the continuation of the previous word
    and glued to it with the marker removed.

    Args:
        input_ids_conv: Sequence of token strings produced by the tokenizer.

    Returns:
        The reconstructed words joined by single spaces ('' if none).
    """
    special_tokens = ('[CLS]', '[SEP]', '[PAD]')
    words = []
    current = ''
    for word in input_ids_conv:
        # Filter out the tokenizer's bookkeeping tokens.
        if word in special_tokens:
            continue
        if '##' in word:
            # Continuation piece: append to the word being built.
            current += word.replace('##', '')
        else:
            # A new word starts here; store the finished one first.
            if current != '':
                words.append(current)
            current = word
    # Flush the last word after the loop. Doing this here (rather than on
    # "index == last index") keeps the final word even when the sequence
    # ends with [SEP]/[PAD] tokens.
    if current != '':
        words.append(current)
    return ' '.join(words)
|
104 |
|
105 |
def dis_pdf_prediction(self):
|
106 |
# Memilih prediksi entitas yang paling bagus
|
107 |
entity_result = {}
|
108 |
+
|
109 |
+
# Hasil dari extraksi label ini kadang double sehingga perlu di cari mana yang isinya lebih panjang
|
110 |
for i in self.label_extraction:
|
111 |
+
# jika hasil extraksinya lebih dari 1
|
112 |
if len(list(i.keys())) > 1:
|
113 |
+
# looping setiap item
|
114 |
for y in i.items():
|
115 |
+
# cek key nya sudah ada atau belum
|
116 |
if y[0] not in entity_result:
|
117 |
+
# jika belum tambahkan
|
118 |
entity_result[y[0]] = y[1]
|
119 |
else:
|
120 |
+
# membandaingkan mana yang lebih panjang
|
121 |
if len(entity_result[y[0]]) < len(y[1]):
|
122 |
entity_result[y[0]] = y[1]
|
123 |
else:
|
124 |
+
# cek ada atu tidak dalam enity_result kalau tdidak langsung di tambahkan
|
125 |
+
if list(i.items())[0] not in entity_result:
|
126 |
+
entity_result[list(i.items())[0][0]] = list(i.items())[0][1]
|
127 |
+
|
128 |
+
# Mengurutkan hasil entitas yang di dapat berdasarkan label convert
|
129 |
+
sorted_entitu_result = {key: entity_result[key] for key in self.label_convert if key in entity_result}
|
130 |
|
131 |
+
# Mengkonversi hasil ekstraski entitas dalam bentuk String
|
132 |
result = ''
|
133 |
+
for i, (label, data) in enumerate(sorted_entitu_result.items()):
|
134 |
+
if label in ['PENA', 'ARTV']:
|
135 |
result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
|
136 |
+
elif label in ['PROS']:
|
137 |
+
if (i+1) >= 10:
|
138 |
+
result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
|
139 |
+
else:
|
140 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
|
141 |
+
elif label in ['JUDP', 'CRIA']:
|
142 |
result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
|
143 |
+
elif label in ['ADVO']:
|
144 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t = {data.capitalize()}\n'
|
145 |
+
elif label in ['REGI']:
|
146 |
+
if (i+1) >= 10:
|
147 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
|
148 |
+
else:
|
149 |
+
result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t\t = {data.capitalize()}\n'
|
150 |
else:
|
151 |
result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
|
152 |
|
|
|
158 |
count_huruf = 0
|
159 |
temp_word = ''
|
160 |
temp_label = ''
|
|
|
161 |
temp_count_huruf = 0
|
162 |
prev_word = ''
|
163 |
for i, (word, label) in enumerate(zip(self.data_token, self.prediction_label)):
|
164 |
if label != 'O':
|
165 |
+
# menambahkan token ketika token merupakan token tunggal atau tidak di pecah dengan tanda pagar
|
166 |
if temp_word != '' and '##' not in word:
|
167 |
temp_result['entity'] = temp_label
|
168 |
temp_result['word'] = temp_word
|
|
|
171 |
result.append(temp_result)
|
172 |
temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
|
173 |
|
174 |
+
# Jika sebuah kata lanjutan maka di tambahakan langung dengan menghapus tanda pagar
|
175 |
if '##' in word:
|
176 |
temp_word += word.replace('##', '')
|
177 |
|
178 |
+
# Menyimpan token untuk pengecekan iterasi selanjutnya apakah memiliki token lanjutan atau tidak
|
179 |
else:
|
180 |
temp_label = label
|
181 |
temp_word = word
|
182 |
temp_count_huruf = count_huruf
|
183 |
|
184 |
+
# Menambahkan token terakhir yang masih tersimpan dalam temporari variabel
|
185 |
if i == len(self.data_token)-1:
|
186 |
temp_result['entity'] = temp_label
|
187 |
temp_result['word'] = temp_word
|
|
|
190 |
result.append(temp_result)
|
191 |
temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
|
192 |
|
193 |
+
# Perhitungan jumlah huruf untuk pembuatan labelnya
|
194 |
if '##' in word:
|
195 |
count_huruf += len(word)-2
|
196 |
|
|
|
199 |
|
200 |
return result
|
201 |
|
202 |
+
# Fungsi untuk proses Predict dari inputan
|
203 |
def fit_transform(self, texts, progress=gr.Progress()):
|
204 |
use_cuda = torch.cuda.is_available()
|
205 |
device = torch.device("cuda" if use_cuda else "cpu")
|
206 |
if use_cuda:
|
207 |
self.model = self.model.cuda()
|
208 |
|
209 |
+
file_check_point = 'indoBERT-indoLEM-Fold-5.pth' if self.check_point == 'IndoBERT (IndoLEM)' else 'indoBERT-indoNLU-Fold-5.pth'
|
210 |
|
211 |
model_weights = torch.load(file_check_point, map_location=torch.device(device))
|
212 |
self.model.load_state_dict(model_weights)
|
|
|
224 |
|
225 |
input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
|
226 |
data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
|
227 |
+
self.tokenizer_decode = self.token_decode(input_ids_conv)
|
228 |
self.data_token = data_token
|
229 |
self.prediction_label = prediction_label
|
230 |
labelConv = self.labelToText()
|
231 |
|
232 |
if labelConv:
|
233 |
+
self.label_extraction.append(labelConv) # Dictionary {VERN : 120 ...}
|
234 |
+
|
235 |
+
def clean_text(self, text):
    """Strip court-directory boilerplate from one page of extracted PDF text.

    Removes the repeated watermark/header, the disclaimer footer and phone
    line, re-joins a few letter-spaced words, deletes all newlines, drops
    page markers, and normalises spaces, bullet glyphs and ellipses.

    Args:
        text: Raw text of a single PDF page.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Imported locally so the method does not depend on a module-level
    # ``import re`` being present.
    import re

    # Watermark and header
    text = text.replace("Mahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nDirektori Putusan Mahkamah Agung Republik Indonesia\nputusan.mahkamahagung.go.id\n", "")
    # Footer
    # NOTE(review): '[email protected]' looks like a redaction placeholder
    # rather than a real address - confirm this literal against the footer
    # text actually found in the PDFs.
    text = text.replace("\nDisclaimer\nKepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\npelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\nDalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\nEmail : [email protected]", "")
    text = text.replace("Telp : 021-384 3348 (ext.318)", "")
    # Repair words that were split or letter-spaced by the PDF extraction
    text = text.replace('P U T U S A N', 'PUTUSAN').replace('T erdakwa', 'Terdakwa').replace('T empat', 'Tempat').replace('T ahun', 'Tahun')
    # NOTE(review): newlines are removed without inserting a space, so words
    # separated only by a line break get joined - confirm this is intended.
    text = text.replace('P E N E T A P A N', 'PENETAPAN').replace('J u m l a h', 'Jumlah').replace('\n', '')
    # Remove page markers. No newline is left at this point, so ``.*`` drops
    # everything from the marker to the end of the page text, and patterns
    # anchored on a leading "\n" could never match (they were removed).
    text = re.sub(r'Halaman \d+ dari \d+ .*', '', text)
    text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
    # Replace space runs and private-use bullet glyphs with a single space
    text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
    # Drop ellipsis characters and runs of three or more dots
    text = re.sub(r'[\u2026]+|\.{3,}', '', text)
    return text.strip()
|
253 |
+
|
254 |
+
def read_pdf(self, file_pdf):
    """Extract and clean the text of every page of a PDF file.

    Each page's text is passed through ``self.clean_text`` and the results
    are concatenated in page order.

    Args:
        file_pdf: Path of the PDF file to read.

    Returns:
        The concatenated cleaned text, stripped of surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened. Errors raised by PyPDF2 while
            parsing propagate unchanged, so the caller never receives
            ``None`` in place of text.
    """
    # Imported locally so the method does not depend on a module-level
    # ``import PyPDF2`` being present.
    import PyPDF2

    # The context manager closes the file even if parsing fails part-way.
    with open(file_pdf, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages_text = [self.clean_text(page.extract_text())
                      for page in pdf_reader.pages]

    return ''.join(pages_text).strip()
|
272 |
|
273 |
def predict(self, doc):
    """Run extraction on either a PDF path or a plain text snippet.

    Input is treated as a PDF only when it ends with ``.pdf`` (case
    insensitive, ignoring surrounding whitespace). Checking the suffix
    rather than searching for the substring keeps ordinary text that merely
    mentions a ".pdf" file on the text path.

    Args:
        doc: A PDF file path, or the text to analyse.

    Returns:
        The result of ``dis_pdf_prediction()`` for PDF input, otherwise the
        result of ``dis_text_prediction()``.
    """
    if doc.strip().lower().endswith('.pdf'):
        file_pdf = self.read_pdf(doc)
        # The document text is split on ';' and each piece is processed.
        sentence_file = file_pdf.split(';')
        self.fit_transform(sentence_file)
        return self.dis_pdf_prediction()

    self.fit_transform([doc.strip()])
    return self.dis_text_prediction()
|