arosyihuddin committed
Commit ecfd12f
1 Parent(s): 87e0b7e
.gitignore ADDED
@@ -0,0 +1 @@
+ venv
app.py CHANGED
@@ -1,10 +1,7 @@
- import sys
- sys.path.append("/home/pstar7/Documents/gradio/src")
- 
  from transformers import BertTokenizerFast
  from gradio_pdf import PDF
- from BertModel import *
- from pdf_predict import *
+ from src.bert import *
+ from src.legalNER import *
  import gradio as gr
  
  ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
@@ -24,9 +21,9 @@ def predict(doc : str, model : str) -> str:
      use_model = model_indonlu
      use_tokenizer = tokenizer_indonlu
  
-     result = pdf_predict(use_model, use_tokenizer, doc, ids_to_labels, model)
+     ner = LegalNER(use_model, use_tokenizer, doc, ids_to_labels, model)
  
-     return result
+     return ner.display()
  
  iface = gr.Interface(
      fn=predict,
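
Note: taken together, the two hunks above swap the free-standing pdf_predict() call for the new LegalNER class. A condensed sketch of the resulting predict(), for orientation only; model_indolem and tokenizer_indolem and the selection logic are assumptions inferred from the visible IndoNLU branch, since the full function body is not part of this diff:

    def predict(doc: str, model: str) -> str:
        # Assumed checkpoint selection; only the IndoNLU branch is visible in the hunk.
        if model == 'IndoBERT (IndoNLU)':
            use_model, use_tokenizer = model_indonlu, tokenizer_indonlu
        else:
            use_model, use_tokenizer = model_indolem, tokenizer_indolem
        ner = LegalNER(use_model, use_tokenizer, doc, ids_to_labels, model)
        return ner.display()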
src/__pycache__/bert.cpython-310.pyc ADDED
Binary file (881 Bytes)
src/__pycache__/helper.cpython-310.pyc ADDED
Binary file (2.26 kB)
src/__pycache__/legalNER.cpython-310.pyc ADDED
Binary file (4.17 kB)
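
Note: the __pycache__/*.pyc files committed above are Python bytecode artifacts. Since this commit already introduces a .gitignore for venv, entries along these lines (illustrative, not part of this commit) would keep compiled caches out of the repository as well:

    venv
    __pycache__/
    *.pyc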
src/align_word_ids.py DELETED
@@ -1,27 +0,0 @@
- def align_word_ids(texts, tokenizer, label_all_tokens):
- 
-     tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
- 
-     word_ids = tokenized_inputs.word_ids()
- 
-     previous_word_idx = None
-     label_ids = []
- 
-     for word_idx in word_ids:
- 
-         if word_idx is None:
-             label_ids.append(-100)
- 
-         elif word_idx != previous_word_idx:
-             try:
-                 label_ids.append(1)
-             except:
-                 label_ids.append(-100)
-         else:
-             try:
-                 label_ids.append(1 if label_all_tokens else -100)
-             except:
-                 label_ids.append(-100)
-         previous_word_idx = word_idx
- 
-     return label_ids
src/{BertModel.py → bert.py} RENAMED
File without changes
src/convertTotext.py DELETED
@@ -1,22 +0,0 @@
- def convertTotext(data_token, prediction_label):
-     prev_tag = 'O'
-     result = {}
-     temp = ''
-     for i, word in enumerate(data_token):
-         if prediction_label[i] != 'O':
-             if prev_tag == 'O' and temp != '':
-                 temp = ''
- 
-             if '##' in word:
-                 temp += word.replace('##', '')
- 
-             else:
-                 temp += ' ' + word
-         else:
-             if temp != "":
-                 result[prev_tag.replace("I_", "B_")] = temp.strip()
-                 temp = ""
- 
-         prev_tag = prediction_label[i]
- 
-     return result
src/{clean_text.py → helper.py} RENAMED
@@ -1,3 +1,5 @@
+ import PyPDF2
+ import requests
  import re
  
  def clean_text(text):
@@ -12,4 +14,22 @@ def clean_text(text):
      text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
      text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
      text = re.sub(r'[\u2026]+|\.{3,}', '', text)
-     return text.strip()
+     return text.strip()
+ 
+ def read_pdf(pdf):
+     try:
+         pdf_text = ''
+         pdf_file = open(pdf, 'rb')
+         pdf_reader = PyPDF2.PdfReader(pdf_file)
+ 
+         for page_num in range(len(pdf_reader.pages)):
+             page = pdf_reader.pages[page_num]
+             text = clean_text(page.extract_text())
+ 
+             pdf_text += text
+ 
+         pdf_file.close()
+         return pdf_text.strip()
+ 
+     except requests.exceptions.RequestException as e:
+         print("Error:", e)
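
Note: read_pdf() above operates on a local file, so the requests.exceptions.RequestException handler can never fire; the failures that can actually occur (a missing file, a corrupt PDF) would propagate uncaught. A sketch of a tighter variant, assuming PyPDF2 >= 2.0 where parse failures raise PyPDF2.errors.PdfReadError:

    import PyPDF2

    def read_pdf(pdf):
        try:
            pdf_text = ''
            # The context manager closes the file even if extraction fails midway.
            with open(pdf, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                for page in pdf_reader.pages:
                    pdf_text += clean_text(page.extract_text())
            return pdf_text.strip()
        except (OSError, PyPDF2.errors.PdfReadError) as e:
            print("Error:", e)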
src/legalNER.py ADDED
@@ -0,0 +1,144 @@
+ from src.helper import *
+ import gradio as gr
+ import torch
+ 
+ class LegalNER():
+     def __init__(self, model, tokenizer, pdf_file, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
+         self.model = model
+         self.tokenizer = tokenizer
+         self.pdf = pdf_file
+         self.check_point = check_point
+         self.label_all_tokens = label_all_tokens
+         self.ids_to_labels = ids_to_labels
+         self.prediction_label = ''
+         self.label_convert = {'B_VERN' : 'Nomor Putusan',
+                               'B_DEFN' : 'Nama Terdakwa',
+                               'B_CRIA' : 'Tindak Pidana',
+                               'B_ARTV' : 'Melanggar KUHP',
+                               'B_PENA' : 'Tuntutan Hukum',
+                               'B_PUNI' : 'Putusan Hukum',
+                               'B_TIMV' : 'Tanggal Putusan',
+                               'B_JUDP' : 'Hakim Ketua',
+                               'B_JUDG' : 'Hakim Anggota',
+                               'B_REGI' : 'Panitera',
+                               'B_PROS' : 'Penuntut Umum',
+                               'B_ADVO' : 'Pengacara',
+                               }
+ 
+     def align_word_ids(self, texts):
+ 
+         tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
+ 
+         word_ids = tokenized_inputs.word_ids()
+ 
+         previous_word_idx = None
+         label_ids = []
+ 
+         for word_idx in word_ids:
+ 
+             if word_idx is None:
+                 label_ids.append(-100)
+ 
+             elif word_idx != previous_word_idx:
+                 try:
+                     label_ids.append(1)
+                 except:
+                     label_ids.append(-100)
+             else:
+                 try:
+                     label_ids.append(1 if self.label_all_tokens else -100)
+                 except:
+                     label_ids.append(-100)
+             previous_word_idx = word_idx
+ 
+         return label_ids
+ 
+     def labelToText(self, data_token):
+         prev_tag = 'O'
+         result = {}
+         temp = ''
+ 
+         # Join all tokens back into phrases according to their predicted labels
+         for i, word in enumerate(data_token):
+             if self.prediction_label[i] != 'O':
+                 if prev_tag == 'O' and temp != '':
+                     temp = ''
+ 
+                 if '##' in word:
+                     temp += word.replace('##', '')
+ 
+                 else:
+                     temp += ' ' + word
+             else:
+                 if temp != "":
+                     result[prev_tag.replace("I_", "B_")] = temp.strip()
+                     temp = ""
+ 
+             prev_tag = self.prediction_label[i]
+ 
+         return result
+ 
+     def labelConverter(self, entity):
+         # Keep the best (longest) prediction for each entity type
+         entity_result = {}
+         for i in entity:
+             if len(list(i.keys())) > 1:
+                 for y in i.items():
+                     if y[0] not in entity_result:
+                         entity_result[y[0]] = y[1]
+                     else:
+                         if len(entity_result[y[0]]) < len(y[1]):
+                             entity_result[y[0]] = y[1]
+             else:
+                 if tuple(i.items())[0] not in entity_result:
+                     entity_result[tuple(i.items())[0][0]] = tuple(i.items())[0][1]
+ 
+         # Convert the result into a display string
+         result = ''
+         for i, (label, data) in enumerate(entity_result.items()):
+             if label in ['B_PENA', 'B_ARTV', 'B_PROS']:
+                 result += f'{i+1}. {self.label_convert[label]}\t = {data.capitalize()}\n'
+             elif label in ['B_JUDP', 'B_CRIA']:
+                 result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
+             elif label in ['B_ADVO', 'B_REGI']:
+                 result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
+             else:
+                 result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
+ 
+         return result
+ 
+     def display(self, progress=gr.Progress()):
+         file_pdf = read_pdf(self.pdf)
+         sentence_file = file_pdf.split(';')
+ 
+         use_cuda = torch.cuda.is_available()
+         device = torch.device("cuda" if use_cuda else "cpu")
+         if use_cuda:
+             self.model = self.model.cuda()
+ 
+         file_check_point = 'model/IndoLEM/model_fold_4.pth' if self.check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
+ 
+         model_weights = torch.load(file_check_point, map_location=torch.device(device))
+         self.model.load_state_dict(model_weights)
+ 
+         label_extraction = []
+         for text in progress.tqdm(sentence_file, desc="Ekstraksi Entitas"):
+             toknize = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
+             input_ids = toknize['input_ids'].to(device)
+             mask = toknize['attention_mask'].to(device)
+ 
+             logits = self.model(input_ids, mask, None)
+             label_ids = torch.Tensor(self.align_word_ids(text)).unsqueeze(0).to(device)
+             logits_clean = logits[0][label_ids != -100]
+             predictions = logits_clean.argmax(dim=1).tolist()
+             prediction_label = [self.ids_to_labels[i] for i in predictions]
+ 
+             input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
+             data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+             self.prediction_label = prediction_label
+             labelConv = self.labelToText(data_token)
+ 
+             if labelConv:
+                 label_extraction.append(labelConv)
+ 
+         return self.labelConverter(label_extraction)
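
Note: align_word_ids() and the label_ids != -100 filter in display() rely on the word_ids() mapping that Hugging Face fast tokenizers expose: None marks special and padding tokens, and a repeated index marks subword pieces of one word. A minimal illustration, assuming the indolem/indobert-base-uncased checkpoint (any BERT fast tokenizer behaves the same way):

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained('indolem/indobert-base-uncased')
    enc = tokenizer('putusan nomor 123', padding='max_length', max_length=8, truncation=True)
    print(enc.word_ids())
    # Something like [None, 0, 1, 2, 2, None, None, None]:
    # None -> [CLS]/[SEP]/[PAD] positions, mapped to -100 and filtered out later;
    # a repeated index -> WordPiece subwords of the same word, kept or masked
    # depending on label_all_tokens.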
src/pdf_predict.py DELETED
@@ -1,48 +0,0 @@
- from tqdm import tqdm
- import torch
- from read_file import *
- from align_word_ids import *
- from convertTotext import *
- 
- def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
-     file_pdf = read_pdf(file_path)
-     sentence_file = file_pdf.split(';')
- 
-     use_cuda = torch.cuda.is_available()
-     device = torch.device("cuda" if use_cuda else "cpu")
-     if use_cuda:
-         model = model.cuda()
- 
-     file_check_point = 'model/IndoLEM/model_fold_4.pth' if check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
- 
-     model_weights = torch.load(file_check_point, map_location=torch.device(device))
-     model.load_state_dict(model_weights)
- 
-     label_extraction = []
-     for text in tqdm(sentence_file, desc="Prediction Sentence"):
-         toknize = tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
-         input_ids = toknize['input_ids'].to(device)
-         mask = toknize['attention_mask'].to(device)
- 
-         logits = model(input_ids, mask, None)
-         label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
-         logits_clean = logits[0][label_ids != -100]
-         predictions = logits_clean.argmax(dim=1).tolist()
-         prediction_label = [ids_to_labels[i] for i in predictions]
- 
-         input_ids_conv = tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
-         data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
-         nerExtraction = convertTotext(data_token, prediction_label)
- 
-         if nerExtraction:
-             label_extraction.append(nerExtraction)
-             # print(f"\nText : {text}")
-             # print(f"Predict Label : {prediction_label}")
-             # print()
- 
-             # print(f"Hasil Ekstrak NER:")
-             # print(nerExtraction)
-             # print(f"Panjang Token : {len(data_token)}, Panjang Predict Label : {len(prediction_label)}")
-             # print()
- 
-     return label_extraction
src/read_file.py DELETED
@@ -1,21 +0,0 @@
- import PyPDF2
- from clean_text import *
- import requests
- 
- def read_pdf(file_pdf):
-     try:
-         pdf_text = ''
-         pdf_file = open(file_pdf, 'rb')
-         pdf_reader = PyPDF2.PdfReader(pdf_file)
- 
-         for page_num in range(len(pdf_reader.pages)):
-             page = pdf_reader.pages[page_num]
-             text = clean_text(page.extract_text())
- 
-             pdf_text += text
- 
-         pdf_file.close()
-         return pdf_text.strip()
- 
-     except requests.exceptions.RequestException as e:
-         print("Error:", e)