# gradio/go.py
# Uploaded by miguelhomepage with huggingface_hub
# ("Upload folder using huggingface_hub", commit 2027574, verified).
#! apt install tesseract-ocr
#! apt install libtesseract-dev
#! pip install pytesseract
#! pip install Pillow
#!sudo apt-get install tesseract-ocr-por
#!pip install openai
#!sudo apt install poppler-utils
import cv2
import pytesseract
import urllib
import numpy as np
import re
import imutils
from PIL import Image
import string
import glob
import os
from openai import OpenAI
import subprocess
import gradio as gr
import shutil
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence.

    Handles both a bare fence and a ``json``-tagged fence. Unlike
    ``str.strip("```json")``, which strips a *set of characters*, this only
    removes the literal fence markers, so an answer that happens to start or
    end with the letters j/s/o/n is left intact.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def askLLM(text,fileType):
    """Ask the OpenAI chat API to extract structured fields from OCR text.

    Args:
        text: OCR output (Portuguese) appended to the prompt.
        fileType: "permanente" (company registry certificate) or "morada"
            (proof of address); selects which prompt is sent.

    Returns:
        The model's answer as a string with any Markdown code fence removed.

    Raises:
        ValueError: if *fileType* has no associated prompt.
        RuntimeError: if the OPENAI_API_KEY environment variable is not set.
    """
    prompts = {
        "permanente": """this is a text written in portuguese.Please extract the access_code,firmName,firmTaxNo,address and titulares.'titulares' are the name of the partners.If there are multiple partners do not create an array, concatenate it with a comma.Give me just the certidao permanente information with nothing else in json format.If there is a field that is not found the value should be null. text:""",
        "morada": """this is a text written in portuguese.Please extract name and address from the text but only when they refer to an individual, not an organization.give the result as a json with the keys 'name' and 'address' and nothing else. if you cant find anything with meaning return an empty string and nothing else. text:""",
    }
    if fileType not in prompts:
        # Previously this fell through and failed later on an unbound `prompt`.
        raise ValueError(f"unsupported fileType for askLLM: {fileType!r}")
    prompt = prompts[fileType] + text

    # SECURITY: the API key must never live in source control. The key that
    # used to be hard-coded here has been published and should be revoked.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")
    client = OpenAI(api_key=api_key)

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    # `content` may be None for some responses; treat that as an empty answer.
    message_content = completion.choices[0].message.content or ""
    return _strip_code_fence(message_content)
def ocr(image_path,fileType,rotation):
    """Run Tesseract OCR on an image and post-process the text.

    Args:
        image_path: Path to the image. For "permanente" this is the first
            page image (``<base>-1.png``); every ``<base>*.png`` page is
            OCR'd and concatenated.
        fileType: "permanente", "morada" or "iban".
        rotation: Extra rotation in degrees applied on top of the orientation
            detected by Tesseract OSD. Ignored for "permanente".

    Returns:
        For "permanente", the answer of ``askLLM``; otherwise the result of
        ``process`` on the recognised text.

    Raises:
        ValueError: if the image at *image_path* cannot be decoded.
    """
    if fileType == 'permanente':
        page_prefix = image_path.replace("-1.png", "")
        # glob.escape keeps characters such as "[" in file names literal.
        page_files = sorted(glob.glob(glob.escape(page_prefix) + '*.png'))
        if not page_files and os.path.isfile(image_path):
            # A directly uploaded image (no "-1.png" page suffix) matches no
            # page pattern; OCR that single file instead of sending the LLM
            # an empty text.
            page_files = [image_path]
        out = "".join(
            pytesseract.image_to_string(cv2.imread(page_file), lang='por', config="--psm 6") + " "
            for page_file in page_files
        )
        return askLLM(out, fileType)

    # Decode from bytes rather than cv2.imread so that decoding failures are
    # detected explicitly below.
    with open(image_path, 'rb') as image_file:
        image_data = image_file.read()
    image_array = np.frombuffer(image_data, np.uint8)
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"could not decode image: {image_path}")

    # Ask Tesseract's orientation detection how far the page is rotated.
    rot_data = pytesseract.image_to_osd(image)
    rot_match = re.search(r'(?<=Rotate: )\d+', rot_data)
    # Fall back to "no rotation" if the OSD output has no Rotate field.
    angle = float(rot_match.group(0)) if rot_match else 0.0
    rotated = imutils.rotate_bound(image, angle)
    if rotation != 0:
        # rotate_bound keeps the whole page visible (imutils.rotate clips
        # non-square images on 90/270 degrees). The negative sign preserves
        # the counter-clockwise direction imutils.rotate used.
        rotated = imutils.rotate_bound(rotated, -rotation)

    out = pytesseract.image_to_string(rotated, lang='por', config="--psm 6")
    return process(out, fileType)
def process(out,fileType):
    """Turn raw OCR text into the final answer for a document type.

    Args:
        out: Text recognised by OCR.
        fileType: "morada" delegates to ``askLLM``; "iban" extracts a bank
            account identifier locally; any other value yields "".

    Returns:
        For "morada", the LLM answer. For "iban", either "NIB" or "PT50"
        followed by up to 21 visible characters found after the marker, or
        "" when no marker is present.
    """
    if fileType == "morada":
        return askLLM(out, fileType)
    if fileType != "iban":
        return ""

    out = out.upper()
    # Markers in increasing priority: a later match overrides an earlier one.
    # "PT5O", "PTS0" and "PTSO" are common OCR misreadings of "PT50".
    prefix = None
    idx = -1
    for candidate in ("NIB", "PT50", "PT5O", "PTS0", "PTSO"):
        position = out.find(candidate)
        if position != -1:
            prefix, idx = candidate, position
    if prefix is None:
        # Track "not found" explicitly: a marker at position 0 is valid and
        # used to be discarded by an `idx != 0` check.
        return ""

    visible_chars = set(string.ascii_letters + string.digits + string.punctuation)
    remaining_string = out[idx + len(prefix):]
    # Drop whitespace and other non-visible characters, keep the first 21.
    account = ''.join([char for char in remaining_string if char in visible_chars][:21])
    # Misread PT50 variants are normalised back to "PT50".
    return ("NIB" if prefix == "NIB" else "PT50") + account
def process_file(file_path,option):
    """Run extraction on an uploaded file, converting PDFs to PNG first.

    Args:
        file_path: Path to the uploaded file. Files ending in ".pdf"
            (case-insensitive) are rendered to ``<base>-N.png`` page images
            with ``pdftoppm``; anything else is treated as an image.
        option: Document type forwarded to ``go``.

    Returns:
        The result of ``go``, or the message "Error converting PDF to PNG"
        when the conversion fails.
    """
    if not file_path.lower().endswith('.pdf'):
        return go(file_path, option)

    output_file_base = os.path.splitext(file_path)[0]
    try:
        subprocess.run(['pdftoppm', '-png', file_path, output_file_base], check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing pdftoppm binary. Return the message
        # instead of handing it to go() as if it were an image path.
        return "Error converting PDF to PNG"

    # The first rendered page is passed on as "<base>-1.png".
    return go(output_file_base + "-1.png", option)
def gradio_process_file(file,option):
    """Gradio click handler: store the upload locally and process it.

    Args:
        file: The uploaded file object from ``gr.File`` (its ``name``
            attribute is used as the source path), or None if nothing was
            uploaded.
        option: Selected document type, forwarded to ``process_file``.

    Returns:
        The extraction result, or "No file uploaded." when *file* is None.
    """
    import tempfile

    if file is None:
        return "No file uploaded."

    upload_dir = "uploads/"
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(upload_dir, exist_ok=True)
    # Give every upload its own directory: files with the same name no longer
    # overwrite each other, and page images left by an earlier upload cannot
    # be picked up when the converted pages are globbed later.
    job_dir = tempfile.mkdtemp(dir=upload_dir)

    original_file_name = os.path.basename(file.name)
    destination_path = os.path.join(job_dir, original_file_name)
    shutil.copy(file.name, destination_path)
    return process_file(destination_path, option)
def go(img,fileType):
    """Run OCR, retrying with extra rotations while nothing is extracted.

    Args:
        img: Path of the image handed to ``ocr``.
        fileType: Document type handed to ``ocr``.

    Returns:
        The first non-empty result from ``ocr`` at 0, 90, 180 or 270 degrees
        of extra rotation, or the last result if all of them are empty.
    """
    result = None
    for rotation in (0, 90, 180, 270):
        result = ocr(img, fileType, rotation)
        # An empty result is either the literal '""' (the LLM's "empty
        # string" answer) or a genuinely empty value, which is what the IBAN
        # path yields when no marker is found; previously only the former
        # triggered a retry.
        is_empty = result is None or result.strip() in ("", '""')
        if not is_empty:
            break
    return result
# Gradio UI: a file upload, a document-type selector and an output box;
# the button sends the upload and the selected type to gradio_process_file.
with gr.Blocks() as app:
    gr.Markdown("# Extração de Docs #")
    # Plain HTML button that reloads the page to start a new extraction.
    gr.HTML('<button onclick="window.location.reload()">Nova extração</button>')

    file_input = gr.File(label="Upload de PDF ou imagem")
    select_option = gr.Dropdown(
        choices=["iban", "morada", "permanente"],
        value="iban",
        label="Escolha",
        interactive=True,
    )
    output_box = gr.Textbox(label="Output", interactive=False)

    process_button = gr.Button("Processar")
    process_button.click(
        gradio_process_file,
        inputs=[file_input, select_option],
        outputs=[output_box],
    )

# share=True asks Gradio for a public share link in addition to the local server.
app.launch(share=True)