Spaces:

arosyihuddin
/

gradio-LegalNER

Sleeping

App Files Files Community

arosyihuddin commited on Jun 27

Commit

0ca7583

•

1 Parent(s): dd27f28

update UI

Browse files

Files changed (13) hide show

app.py +32 -56
data/1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf +0 -0
data/162_Pid.Sus_2023_PN_Bkl.pdf +0 -0
data/164_Pid.Sus_2023_PN_Bkl.pdf +0 -0
data/165_Pdt.P_2023_PN_Bkl.pdf +0 -0
data/329_pid.b_2023_pn_jkt.brt_20240529090837.pdf +0 -0
data/428_pid.b_2021_pn_jkt.brt_20240529091234.pdf +0 -0
src/__pycache__/bert.cpython-311.pyc +0 -0
src/__pycache__/helper.cpython-311.pyc +0 -0
src/__pycache__/legalNER.cpython-311.pyc +0 -0
src/bert.py +0 -14
src/helper.py +39 -50
src/legalNER.py +122 -32

app.py CHANGED Viewed

@@ -1,51 +1,29 @@
-from transformers import BertTokenizerFast
 from gradio_pdf import PDF
-from src.bert import *
-from src.legalNER import *
 import gradio as gr
 from pathlib import Path
 dir_ = Path(__file__).parent
-ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
-indolem = 'indolem/indobert-base-uncased'
-indonlu = 'indobenchmark/indobert-base-p2'
-model_indolem = BertModel(indolem, len(ids_to_labels))
-model_indonlu = BertModel(indonlu, len(ids_to_labels))
-tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
-tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
-def text_extraction(text, model, progress=gr.Progress()):
-  if model == 'IndoBERT (IndoLEM)':
-    use_model = model_indolem
-    use_tokenizer = tokenizer_indolem
-  else:
-    use_model = model_indonlu
-    use_tokenizer = tokenizer_indonlu
-  legalner =  LegalNER(use_model, use_tokenizer, ids_to_labels, model)
-  entitas = legalner.predict(text)
-  new_text = legalner.tokenizer_decode
-  return {"text": new_text, "entities": entitas}
-def pdf_extraction(doc, model, progress=gr.Progress()):
-  if model == 'IndoBERT (IndoLEM)':
-    use_model = model_indolem
-    use_tokenizer = tokenizer_indolem
-  else:
-    use_model = model_indonlu
-    use_tokenizer = tokenizer_indonlu
-  legalner =  LegalNER(use_model, use_tokenizer, ids_to_labels, model)
-  return legalner.predict(doc)
 with gr.Blocks() as ner:
-  gr.Markdown("#Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
   gr.Markdown("## Uji Coba Model dengan Potongan Kalimat")
   # Input Text
   with gr.Row():
@@ -56,15 +34,14 @@ with gr.Blocks() as ner:
       gr.ClearButton(text, value='Reset')
     with gr.Column(scale=3):
       output_text = gr.HighlightedText(label="Output Text")
     button_text.click(fn=text_extraction, inputs=[text, model_text], outputs=output_text, api_name="text")
   gr.Markdown("## Contoh Inputan Potongan Kalimat")
   gr.Examples(
-    examples=[
         ["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM . DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA . MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoLEM)"],
-        ["MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoNLU)"],
-        ["PUTUSAN Nomor 77/Pid.B/2023/PN Jkt.Pst DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Jakarta Pusat yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa : 1.	Nama lengkap	: Arif Bin Santung", "IndoBERT (IndoLEM)"],
     ],
     inputs=[text, model_text],
     outputs=output_text,
@@ -79,20 +56,19 @@ with gr.Blocks() as ner:
       model_pdf = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model',value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
       button_pdf = gr.Button(value="Extract", variant='primary')
       gr.ClearButton(doc, value="Reset")
     with gr.Column(scale=3):
       output_pdf = gr.Textbox(label="Output PDF")
   button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
   gr.Examples(
-    examples=[[str(dir_ /"data/165_Pdt.P_2023_PN_Bkl.pdf")],
-              [str(dir_ /'data/162_Pid.Sus_2023_PN_Bkl.pdf')],
-              [str(dir_ /'data/164_Pid.Sus_2023_PN_Bkl.pdf')],
-              [str(dir_ /'data/167_Pid.Sus_2023_PN_Bkl.pdf')],
-              [str(dir_ /'data/168_Pid.Sus_2023_PN_Bkl.pdf')],
-              [str(dir_ /'data/169_Pid.Sus_2023_PN_Bkl.pdf')],
-    ],
     inputs=[doc],
     outputs=output_pdf,
     fn=pdf_extraction,

 from gradio_pdf import PDF
+from src.helper import *
 import gradio as gr
 from pathlib import Path
 dir_ = Path(__file__).parent
 with gr.Blocks() as ner:
+  gr.Markdown("# Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
+  # List Label
+  keterangan_label = [
+      ["VERN", "Nomor Putusan"],
+      ["DEFN", "Nama Terdakwa"],
+      ["CRIA", "Tindak Pidana"],
+      ["ARTV", "Melanggar KUHP"],
+      ["PENA", "Tuntutan Hukum"],
+      ["PUNI", "Putusan Hukum"],
+      ["TIMV", "Tanggal Putusan"],
+      ["JUDP", "Hakim Ketua"],
+      ["JUDG", "Hakim Anggota"],
+      ["REGI", "Panitera"],
+      ["PROS", "Penuntut Umum"],
+      ["ADVO", "Pengacara"],
+  ]
+  gr.Markdown("## Penjelasan Label")
+  gr.DataFrame(keterangan_label, headers=["Label", "Keterangan"], height=200)
   gr.Markdown("## Uji Coba Model dengan Potongan Kalimat")
   # Input Text
   with gr.Row():
       gr.ClearButton(text, value='Reset')
     with gr.Column(scale=3):
       output_text = gr.HighlightedText(label="Output Text")
     button_text.click(fn=text_extraction, inputs=[text, model_text], outputs=output_text, api_name="text")
   gr.Markdown("## Contoh Inputan Potongan Kalimat")
   gr.Examples(
+    examples=[
         ["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM . DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA . MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoLEM)"],
+        ["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM", "IndoBERT (IndoNLU)"]
     ],
     inputs=[text, model_text],
     outputs=output_text,
       model_pdf = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model',value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
       button_pdf = gr.Button(value="Extract", variant='primary')
       gr.ClearButton(doc, value="Reset")
     with gr.Column(scale=3):
       output_pdf = gr.Textbox(label="Output PDF")
   button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
   gr.Examples(
+    ["428_pid.b_2021_pn_jkt.brt_20240529091234.pdf",
+     "1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf",
+     "329_pid.b_2023_pn_jkt.brt_20240529090837.pdf",
+     "168_Pid.Sus_2023_PN_Bkl.pdf",
+     "169_Pid.Sus_2023_PN_Bkl.pdf",
+     "167_Pid.Sus_2023_PN_Bkl.pdf"],
     inputs=[doc],
     outputs=output_pdf,
     fn=pdf_extraction,

data/1558_pid.b_2020_pn_jkt.brt_20240529091451.pdf ADDED Viewed

Binary file (102 kB). View file

data/162_Pid.Sus_2023_PN_Bkl.pdf DELETED Viewed

Binary file (142 kB)

data/164_Pid.Sus_2023_PN_Bkl.pdf DELETED Viewed

Binary file (144 kB)

data/165_Pdt.P_2023_PN_Bkl.pdf DELETED Viewed

Binary file (70.4 kB)

data/329_pid.b_2023_pn_jkt.brt_20240529090837.pdf ADDED Viewed

Binary file (151 kB). View file

data/428_pid.b_2021_pn_jkt.brt_20240529091234.pdf ADDED Viewed

Binary file (165 kB). View file

src/__pycache__/bert.cpython-311.pyc ADDED Viewed

Binary file (1.31 kB). View file

src/__pycache__/helper.cpython-311.pyc ADDED Viewed

Binary file (2.75 kB). View file

src/__pycache__/legalNER.cpython-311.pyc ADDED Viewed

Binary file (15.4 kB). View file

src/bert.py DELETED Viewed

@@ -1,14 +0,0 @@
-from transformers import BertForTokenClassification
-import torch
-class BertModel(torch.nn.Module):
-    def __init__(self, pretrained_model, num_labels):
-        super(BertModel, self).__init__()
-        self.bert = BertForTokenClassification.from_pretrained(pretrained_model, num_labels=num_labels)
-    def forward(self, input_id, mask, label):
-        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
-        return output

src/helper.py CHANGED Viewed

@@ -1,50 +1,39 @@
-import PyPDF2
-import requests
-import re
-def clean_text(text):
-    text = text.replace("Mahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nDirektori Putusan Mahkamah Agung Republik Indonesia\nputusan.mahkamahagung.go.id\n", "")
-    text = text.replace("\nDisclaimer\nKepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\npelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\nDalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\nEmail : [email protected]", "")
-    text = text.replace("Telp : 021-384 3348 (ext.318)", "")
-    text = text.replace('P U T U S A N', 'PUTUSAN').replace('T erdakwa', 'Terdakwa').replace('T empat', 'Tempat').replace('T ahun', 'Tahun')
-    text = text.replace('P  E  N  E  T  A  P  A  N', 'PENETAPAN').replace('J u m l a h', 'Jumlah').replace('\n', '')
-    text = re.sub(r'\nHalaman \d+ dari \d+ .*', '', text)
-    text = re.sub(r'Halaman \d+ dari \d+ .*', '', text)
-    text = re.sub(r'\nHal. \d+ dari \d+ .*', '', text)
-    text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
-    text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
-    text = re.sub(r'[\u2026]+|\.{3,}', '', text)
-    return text.strip()
-def read_pdf(pdf):
-  try:
-    pdf_text = ''
-    pdf_file = open(pdf, 'rb')
-    pdf_reader = PyPDF2.PdfReader(pdf_file)
-    for page_num in range(len(pdf_reader.pages)):
-        page = pdf_reader.pages[page_num]
-        text = clean_text(page.extract_text())
-        pdf_text += text
-    pdf_file.close()
-    return pdf_text.strip()
-  except requests.exceptions.RequestException as e:
-    print("Error:", e)
-def token_decode(input_ids_conv):
-  result = ''
-  temp = ''
-  for i, word in enumerate(input_ids_conv):
-    if word not in ['[CLS]', '[SEP]', '[PAD]']:
-      if temp != '' and '##' not in word:
-        result += ' ' + temp
-      if '##' in word:
-        temp += word.replace('##', '')
-      else:
-        temp = word
-    if i == len(input_ids_conv)-1:
-      result += ' ' + temp
-  return result.strip()

+from transformers import BertTokenizerFast, BertForTokenClassification
+import gradio as gr
+from src.legalNER import *
+# Maps a class index to its tag string: B_/I_-prefixed entity codes plus 'O'.
+ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
+# Pretrained checkpoint names handed to from_pretrained() below.
+indolem = 'indolem/indobert-base-uncased'
+indonlu = 'indobenchmark/indobert-base-p2'
+# One token-classification model per checkpoint, with the classifier sized to
+# the label map. These are module-level statements, so they run on import.
+model_indolem = BertForTokenClassification.from_pretrained(indolem, num_labels=len(ids_to_labels))
+model_indonlu = BertForTokenClassification.from_pretrained(indonlu, num_labels=len(ids_to_labels))
+# Tokenizers matching each checkpoint.
+tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
+tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
+def text_extraction(text, model, progress=gr.Progress()):
+  if model == 'IndoBERT (IndoLEM)':
+    use_model = model_indolem
+    use_tokenizer = tokenizer_indolem
+  else:
+    use_model = model_indonlu
+    use_tokenizer = tokenizer_indonlu
+  legalner =  LegalNER(use_model, use_tokenizer, ids_to_labels, model)
+  entitas = legalner.predict(text)
+  new_text = legalner.tokenizer_decode
+  return {"text": new_text, "entities": entitas}
+def pdf_extraction(doc, model, progress=gr.Progress()):
+  if model == 'IndoBERT (IndoLEM)':
+    use_model = model_indolem
+    use_tokenizer = tokenizer_indolem
+  else:
+    use_model = model_indonlu
+    use_tokenizer = tokenizer_indonlu
+  legalner =  LegalNER(use_model, use_tokenizer, ids_to_labels, model)
+  return legalner.predict(doc)

src/legalNER.py CHANGED Viewed

@@ -1,39 +1,40 @@
-from src.helper import *
 import gradio as gr
 import torch
 class LegalNER():
-  def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
     self.model = model
     self.tokenizer = tokenizer
     self.check_point = check_point
-    self.label_all_tokens = label_all_tokens
     self.prediction_label = ''
     self.data_token = ''
     self.ids_to_labels = ids_to_labels
     self.label_extraction = []
     self.tokenizer_decode = ''
-    self.label_convert = {'B_VERN' : 'Nomor Putusan',
-                   'B_DEFN' : 'Nama Terdakwa',
-                   'B_CRIA' : 'Tindak Pidana',
-                   'B_ARTV' : 'Melanggar KUHP',
-                   'B_PENA' : 'Tuntutan Hukum',
-                   'B_PUNI' : 'Putusan Hukum',
-                   'B_TIMV' : 'Tanggal Putusan',
-                   'B_JUDP' : 'Hakim Ketua',
-                   'B_JUDG' : 'Hakim Anggota',
-                   'B_REGI' : 'Panitera',
-                   'B_PROS' : 'Penuntut Umum',
-                   'B_ADVO' : 'Pengacara',
-                   }
   def align_word_ids(self, texts):
     tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
     word_ids = tokenized_inputs.word_ids()
     previous_word_idx = None
     label_ids = []
     for word_idx in word_ids:
         if word_idx is None:
             label_ids.append(-100)
@@ -44,7 +45,7 @@ class LegalNER():
                 label_ids.append(-100)
         else:
             try:
-                label_ids.append(1 if self.label_all_tokens else -100)
             except:
                 label_ids.append(-100)
         previous_word_idx = word_idx
@@ -58,6 +59,7 @@ class LegalNER():
     # Menganggabungkan semua token menjadi satu kalimat sesuai dengan labelnya
     for i, word in enumerate(self.data_token):
       if self.prediction_label[i] != 'O':
         if prev_tag == 'O' and temp != '':
           temp = ''
@@ -67,39 +69,84 @@ class LegalNER():
         else:
           temp +=  ' ' + word
       else:
         if temp != "":
-          result[prev_tag.replace("I_", "B_")] = temp.strip()
         temp = ""
       prev_tag = self.prediction_label[i]
-    return result
   def dis_pdf_prediction(self):
     # Memilih prediksi entitas yang paling bagus
     entity_result = {}
     for i in self.label_extraction:
       if len(list(i.keys())) > 1:
         for y in i.items():
           if y[0] not in entity_result:
             entity_result[y[0]] = y[1]
           else:
             if len(entity_result[y[0]]) < len(y[1]):
               entity_result[y[0]] = y[1]
       else:
-        if tuple(i.items())[0] not in entity_result:
-          entity_result[tuple(i.items())[0][0]] = tuple(i.items())[0][1]
-    # Mengkonversi hasil ekstraski entitas dalam bentuk List
     result = ''
-    for i, (label, data) in enumerate(entity_result.items()):
-      if label in ['B_PENA', 'B_ARTV', 'B_PROS']:
         result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
-      elif label in ['B_JUDP', 'B_CRIA']:
         result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
-      elif label in ['B_ADVO', 'B_REGI']:
-        result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
       else:
         result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
@@ -111,11 +158,11 @@ class LegalNER():
     count_huruf = 0
     temp_word = ''
     temp_label = ''
-    temp_label = ''
     temp_count_huruf = 0
     prev_word = ''
     for i, (word, label) in enumerate(zip(self.data_token, self.prediction_label)):
       if label != 'O':
         if temp_word != '' and '##' not in word:
           temp_result['entity'] = temp_label
           temp_result['word'] = temp_word
@@ -124,14 +171,17 @@ class LegalNER():
           result.append(temp_result)
           temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
         if '##' in word:
           temp_word += word.replace('##', '')
         else:
           temp_label = label
           temp_word = word
           temp_count_huruf = count_huruf
       if i == len(self.data_token)-1:
         temp_result['entity'] = temp_label
         temp_result['word'] = temp_word
@@ -140,6 +190,7 @@ class LegalNER():
         result.append(temp_result)
         temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
       if '##' in word:
         count_huruf += len(word)-2
@@ -148,13 +199,14 @@ class LegalNER():
     return result
   def fit_transform(self, texts, progress=gr.Progress()):
     use_cuda = torch.cuda.is_available()
     device = torch.device("cuda" if use_cuda else "cpu")
     if use_cuda:
       self.model = self.model.cuda()
-    file_check_point = 'model/IndoLEM/model_fold_4.pth' if self.check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
     model_weights = torch.load(file_check_point, map_location=torch.device(device))
     self.model.load_state_dict(model_weights)
@@ -172,20 +224,58 @@ class LegalNER():
       input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
       data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
-      self.tokenizer_decode = token_decode(input_ids_conv)
       self.data_token = data_token
       self.prediction_label = prediction_label
       labelConv = self.labelToText()
       if labelConv:
-        self.label_extraction.append(labelConv)
   def predict(self, doc):
     if '.pdf' not in doc:
       self.fit_transform([doc.strip()])
       return self.dis_text_prediction()
     else:
-      file_pdf = read_pdf(doc)
       sentence_file = file_pdf.split(';')
       self.fit_transform(sentence_file)
       return self.dis_pdf_prediction()

 import gradio as gr
 import torch
 class LegalNER():
+  # Store the model, tokenizer and label tables and reset per-run state.
+  #   model         -- network used for prediction
+  #   tokenizer     -- tokenizer paired with the model
+  #   ids_to_labels -- class index -> tag string
+  #   check_point   -- display name of the checkpoint; fit_transform compares
+  #                    it with 'IndoBERT (IndoLEM)' to pick the weights file
+  def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
     self.model = model
     self.tokenizer = tokenizer
     self.check_point = check_point
+    # Per-input state, overwritten by fit_transform for each processed text.
     self.prediction_label = ''
     self.data_token = ''
     self.ids_to_labels = ids_to_labels
+    # Collects one {entity code: text} dict per input that yielded entities.
     self.label_extraction = []
+    # Decoded text of the most recently processed input.
     self.tokenizer_decode = ''
+    # Entity code (tag without its B_/I_ prefix) -> display name. The key
+    # order here is also the order used when dis_pdf_prediction sorts results.
+    self.label_convert = {'VERN' : 'Nomor Putusan',
+                          'DEFN' : 'Nama Terdakwa',
+                          'CRIA' : 'Tindak Pidana',
+                          'ARTV' : 'Melanggar KUHP',
+                          'PENA' : 'Tuntutan Hukum',
+                          'PUNI' : 'Putusan Hukum',
+                          'TIMV' : 'Tanggal Putusan',
+                          'JUDP' : 'Hakim Ketua',
+                          'JUDG' : 'Hakim Anggota',
+                          'REGI' : 'Panitera',
+                          'PROS' : 'Penuntut Umum',
+                          'ADVO' : 'Pengacara'}
   def align_word_ids(self, texts):
     tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
     word_ids = tokenized_inputs.word_ids()
     previous_word_idx = None
     label_ids = []
     for word_idx in word_ids:
         if word_idx is None:
             label_ids.append(-100)
                 label_ids.append(-100)
         else:
             try:
+                label_ids.append(1)
             except:
                 label_ids.append(-100)
         previous_word_idx = word_idx
     # Menganggabungkan semua token menjadi satu kalimat sesuai dengan labelnya
     for i, word in enumerate(self.data_token):
+      # Memproses semua token yang berlabel entitas bukan O
       if self.prediction_label[i] != 'O':
         if prev_tag == 'O' and temp != '':
           temp = ''
         else:
           temp +=  ' ' + word
       else:
+        # cek jika temp nya ada isinya di tambahkan ke dict result dengan key label sebelumnya
         if temp != "":
+          # hanya mengambil label setelah tanda B_ /I_
+          result[prev_tag[2:]] = temp.strip()
         temp = ""
       prev_tag = self.prediction_label[i]
+    return result # Dictionary {VERN : 120 ...}
+  # Menggabungkan setiap token hasil tokenizer dalam bentuk string
+  def token_decode(self, input_ids_conv):
+    # Rebuild a space-separated string from a list of token strings.
+    #   input_ids_conv -- token strings; '##'-marked pieces are glued onto the
+    #                     preceding word, '[CLS]'/'[SEP]'/'[PAD]' are dropped
+    # Returns the joined words with surrounding whitespace stripped.
+    result = ''
+    temp = ''
+    for i, word in enumerate(input_ids_conv):
+      # Skip the special tokens
+      if word not in ['[CLS]', '[SEP]', '[PAD]']:
+        # A token without '##' starts a new word, so flush the pending one first
+        if temp != '' and '##' not in word:
+          result += ' ' + temp
+        # Continuation pieces are marked with a double hash "##"
+        if '##' in word:
+          temp += word.replace('##', '')
+        # First piece of a new word
+        else:
+          temp = word
+      # At the last position, flush the word that is still pending; this runs
+      # for special tokens too, which is how the final word gets emitted.
+      if i == len(input_ids_conv)-1:
+        result += ' ' + temp
+    return result.strip()
   def dis_pdf_prediction(self):
     # Memilih prediksi entitas yang paling bagus
     entity_result = {}
+    # Hasil dari extraksi label ini kadang double sehingga perlu di cari mana yang isinya lebih panjang
     for i in self.label_extraction:
+      # jika hasil extraksinya lebih dari 1
       if len(list(i.keys())) > 1:
+        # looping setiap item
         for y in i.items():
+          # cek key nya sudah ada atau belum
           if y[0] not in entity_result:
+            # jika belum tambahkan
             entity_result[y[0]] = y[1]
           else:
+            # membandaingkan mana yang lebih panjang
             if len(entity_result[y[0]]) < len(y[1]):
               entity_result[y[0]] = y[1]
       else:
+        # cek ada atu tidak dalam enity_result kalau tdidak langsung di tambahkan
+        if list(i.items())[0] not in entity_result:
+          entity_result[list(i.items())[0][0]] = list(i.items())[0][1]
+    # Mengurutkan hasil entitas yang di dapat berdasarkan label convert
+    sorted_entitu_result = {key: entity_result[key] for key in self.label_convert if key in entity_result}
+    # Mengkonversi hasil ekstraski entitas dalam bentuk String
     result = ''
+    for i, (label, data) in enumerate(sorted_entitu_result.items()):
+      if label in ['PENA', 'ARTV']:
         result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
+      elif label in ['PROS']:
+        if (i+1) >= 10:
+          result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
+        else:
+          result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
+      elif label in ['JUDP', 'CRIA']:
         result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
+      elif label in ['ADVO']:
+        result += f'{i+1}. {self.label_convert[label]}\t\t\t\t = {data.capitalize()}\n'
+      elif label in ['REGI']:
+        if (i+1) >= 10:
+          result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
+        else:
+          result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t\t = {data.capitalize()}\n'
       else:
         result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
     count_huruf = 0
     temp_word = ''
     temp_label = ''
     temp_count_huruf = 0
     prev_word = ''
     for i, (word, label) in enumerate(zip(self.data_token, self.prediction_label)):
       if label != 'O':
+        # menambahkan token ketika token merupakan token tunggal atau tidak di pecah dengan tanda pagar
         if temp_word != '' and '##' not in word:
           temp_result['entity'] = temp_label
           temp_result['word'] = temp_word
           result.append(temp_result)
           temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+        # Jika sebuah kata lanjutan maka di tambahakan langung dengan menghapus tanda pagar
         if '##' in word:
           temp_word += word.replace('##', '')
+        # Menyimpan token untuk pengecekan iterasi selanjutnya apakah memiliki token lanjutan atau tidak
         else:
           temp_label = label
           temp_word = word
           temp_count_huruf = count_huruf
+      # Menambahkan token terakhir yang masih tersimpan dalam temporari variabel
       if i == len(self.data_token)-1:
         temp_result['entity'] = temp_label
         temp_result['word'] = temp_word
         result.append(temp_result)
         temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+      # Perhitungan jumlah huruf untuk pembuatan labelnya
       if '##' in word:
         count_huruf += len(word)-2
     return result
+  # Fungsi untuk proses Predict dari inputan
   def fit_transform(self, texts, progress=gr.Progress()):
     use_cuda = torch.cuda.is_available()
     device = torch.device("cuda" if use_cuda else "cpu")
     if use_cuda:
       self.model = self.model.cuda()
+    file_check_point = 'indoBERT-indoLEM-Fold-5.pth' if self.check_point == 'IndoBERT (IndoLEM)' else 'indoBERT-indoNLU-Fold-5.pth'
     model_weights = torch.load(file_check_point, map_location=torch.device(device))
     self.model.load_state_dict(model_weights)
       input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
       data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+      self.tokenizer_decode = self.token_decode(input_ids_conv)
       self.data_token = data_token
       self.prediction_label = prediction_label
       labelConv = self.labelToText()
       if labelConv:
+        self.label_extraction.append(labelConv) # Dictionary {VERN : 120 ...}
+  def clean_text(self, text):
+    # Strip fixed boilerplate and page markers from a string and normalize
+    # its spacing. The steps are order-dependent: newlines are removed halfway
+    # through, which changes what the later regexes can match.
+    #   text -- input string (read_pdf passes one page's extracted text)
+    # Returns the cleaned, stripped string.
+    # Watermark and header
+    text = text.replace("Mahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nDirektori Putusan Mahkamah Agung Republik Indonesia\nputusan.mahkamahagung.go.id\n", "")
+    # Footer
+    text = text.replace("\nDisclaimer\nKepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\npelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\nDalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\nEmail : [email protected]", "")
+    text = text.replace("Telp : 021-384 3348 (ext.318)", "")
+    # Normalize spaced-out spellings of specific words
+    text = text.replace('P U T U S A N', 'PUTUSAN').replace('T erdakwa', 'Terdakwa').replace('T empat', 'Tempat').replace('T ahun', 'Tahun')
+    # The trailing replace('\n', '') deletes every newline from the text.
+    text = text.replace('P  E  N  E  T  A  P  A  N', 'PENETAPAN').replace('J u m l a h', 'Jumlah').replace('\n', '')
+    # Remove page markers
+    # NOTE(review): no newline is left at this point, so the two patterns that
+    # start with '\n' cannot match, and '.*' in the other two runs to the end
+    # of the string, dropping everything after the first page marker. The '.'
+    # in 'Hal.' is unescaped and matches any character. Confirm this is intended.
+    text = re.sub(r'\nHalaman \d+ dari \d+ .*', '', text)
+    text = re.sub(r'Halaman \d+ dari \d+ .*', '', text)
+    text = re.sub(r'\nHal. \d+ dari \d+ .*', '', text)
+    text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
+    # Collapse runs of spaces, and turn the listed private-use glyphs into a space
+    text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
+    # Drop ellipsis characters and runs of three or more dots
+    text = re.sub(r'[\u2026]+|\.{3,}', '', text)
+    return text.strip()
+  def read_pdf(self, file_pdf):
+    try:
+      pdf_text = ''
+      pdf_file = open(file_pdf, 'rb')
+      pdf_reader = PyPDF2.PdfReader(pdf_file)
+      for page_num in range(len(pdf_reader.pages)):
+          page = pdf_reader.pages[page_num]
+          # clean text
+          text = self.clean_text(page.extract_text())
+          pdf_text += text
+      pdf_file.close()
+      return pdf_text.strip()
+    except requests.exceptions.RequestException as e:
+      print("Error:", e)
+  # Entry point: run extraction on raw text or on a PDF path.
+  #   doc -- a string; it is treated as a PDF path when it contains '.pdf'
+  #          anywhere, otherwise as the text to analyse
+  # Returns dis_text_prediction() for text and dis_pdf_prediction() for PDFs.
   def predict(self, doc):
+    # Substring test, so any input containing '.pdf' takes the PDF branch.
     if '.pdf' not in doc:
       self.fit_transform([doc.strip()])
       return self.dis_text_prediction()
     else:
+      file_pdf = self.read_pdf(doc)
+      # The document text is split on ';' and each piece is processed as one
+      # input. NOTE(review): read_pdf returns None on its except path, which
+      # would make this split fail.
       sentence_file = file_pdf.split(';')
       self.fit_transform(sentence_file)
       return self.dis_pdf_prediction()