# Page banner captured together with the source (not part of the program): "Spaces: Runtime error".
# Setup (system and Python packages this script depends on):
#   apt install tesseract-ocr
#   apt install libtesseract-dev
#   pip install pytesseract
#   pip install Pillow
#   sudo apt-get install tesseract-ocr-por
#   pip install openai
#   sudo apt install poppler-utils
import cv2 | |
import pytesseract | |
import urllib | |
import numpy as np | |
import re | |
import imutils | |
from PIL import Image | |
import string | |
import glob | |
import os | |
from openai import OpenAI | |
import subprocess | |
import gradio as gr | |
import shutil | |
def _strip_code_fence(reply):
    """Return *reply* without a surrounding Markdown ``` / ```json fence."""
    cleaned = (reply or "").strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Only drop the language tag when it directly follows the opening fence.
        if cleaned.startswith("json"):
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def askLLM(text, fileType):
    """Ask the OpenAI chat model to extract structured data from OCR text.

    Args:
        text: OCR output (Portuguese) to be analysed.
        fileType: "permanente" (company registry certificate) or "morada"
            (proof of address); selects the extraction prompt.

    Returns:
        The model reply as a string with any surrounding Markdown code fence
        removed.

    Raises:
        ValueError: if ``fileType`` has no associated prompt.
    """
    prompts = {
        "permanente": """this is a text written in portuguese.Please extract the access_code,firmName,firmTaxNo,address and titulares.'titulares' are the name of the partners.If there are multiple partners do not create an array, concatenate it with a comma.Give me just the certidao permanente information with nothing else in json format.If there is a field that is not found the value should be null. text:""",
        "morada": """this is a text written in portuguese.Please extract name and address from the text but only when they refer to an individual, not an organization.give the result as a json with the keys 'name' and 'address' and nothing else. if you cant find anything with meaning return an empty string and nothing else. text:""",
    }
    if fileType not in prompts:
        raise ValueError(f"Unsupported fileType for askLLM: {fileType!r}")
    prompt = prompts[fileType] + text

    # The key comes from the environment so that no secret lives in the source.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    message_content = completion.choices[0].message.content
    # str.strip("```json") would strip a *set of characters* (and eat leading
    # letters of unfenced replies such as "null"), so the fence is removed
    # explicitly instead.
    return _strip_code_fence(message_content)
def ocr(image_path, fileType, rotation):
    """Run Tesseract OCR on an image and extract the data for ``fileType``.

    Args:
        image_path: Path to the image. For "permanente" this is the first
            page ("<base>-1.png"); every "<base>*.png" page is read.
        fileType: "permanente", "morada" or "iban".
        rotation: Extra rotation in degrees applied after the automatic
            orientation correction. Not used for "permanente".

    Returns:
        For "permanente", the reply of ``askLLM`` on the text of all pages;
        otherwise the result of ``process`` on the recognised text.

    Raises:
        ValueError: if the image file cannot be decoded.
    """
    if fileType == 'permanente':
        # Multi-page document: OCR every page image sharing the base name,
        # in sorted file-name order, and send the combined text to the LLM.
        page_prefix = image_path.replace("-1.png", "")
        page_files = sorted(glob.glob(page_prefix + '*.png'))
        pages_text = "".join(
            pytesseract.image_to_string(cv2.imread(page), lang='por', config="--psm 6") + " "
            for page in page_files
        )
        return askLLM(pages_text, fileType)

    with open(image_path, 'rb') as image_file:
        image_data = image_file.read()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Could not decode image: {image_path}")

    # Ask Tesseract's orientation detection how far the page must be turned.
    rot_data = pytesseract.image_to_osd(image)
    rot_match = re.search(r'(?<=Rotate: )\d+', rot_data)
    # Fall back to "no rotation" when the OSD report has no "Rotate:" line.
    angle = float(rot_match.group(0)) if rot_match else 0.0
    rotated = imutils.rotate_bound(image, angle)
    if rotation != 0:
        rotated = imutils.rotate(rotated, rotation)

    out = pytesseract.image_to_string(rotated, lang='por', config="--psm 6")
    return process(out, fileType)
def process(out, fileType):
    """Turn raw OCR text into the value extracted for ``fileType``.

    Args:
        out: Text recognised by OCR.
        fileType: "morada" delegates to ``askLLM``; "iban" searches the text
            for an IBAN/NIB marker.

    Returns:
        For "morada", the reply of ``askLLM``. For "iban", "PT50" or "NIB"
        followed by up to 21 visible characters found after the marker, or
        an empty string when no marker is present. An empty string for any
        other ``fileType``.
    """
    if fileType == "morada":
        return askLLM(out, fileType)
    if fileType != "iban":
        return ""

    text = out.upper()
    # "PT5O", "PTS0" and "PTSO" are variants of "PT50" that are accepted and
    # normalised to "PT50" (presumably common OCR misreads). When several
    # markers occur, the one listed last wins.
    prefix = None
    position = -1
    for marker in ("NIB", "PT50", "PT5O", "PTS0", "PTSO"):
        found = text.find(marker)
        if found != -1:
            prefix, position = marker, found

    # Use an explicit "not found" state: index 0 is a valid match position
    # (text that starts with the marker) and must not be treated as missing.
    if prefix is None:
        return ""

    visible_chars = set(string.ascii_letters + string.digits + string.punctuation)
    remaining = text[position + len(prefix):]
    # Whitespace is dropped; only the first 21 visible characters are kept.
    number = ''.join([char for char in remaining if char in visible_chars][:21])
    return ("NIB" if prefix == "NIB" else "PT50") + number
def process_file(file_path, option):
    """Extract data from an uploaded file, converting PDFs to PNG first.

    Args:
        file_path: Path to a PDF or an image file.
        option: Document type passed on to ``go`` ("iban", "morada" or
            "permanente").

    Returns:
        The result of ``go`` for the image (or for the first converted PDF
        page), or the string "Error converting PDF to PNG" when the PDF
        could not be converted.
    """
    if not file_path.lower().endswith('.pdf'):
        return go(file_path, option)

    output_file_base = os.path.splitext(file_path)[0]
    try:
        subprocess.run(['pdftoppm', '-png', file_path, output_file_base], check=True)
    except (subprocess.CalledProcessError, OSError):
        # Report the failure to the caller instead of handing the error text
        # to the OCR step as if it were an image path. OSError covers a
        # missing pdftoppm executable.
        return "Error converting PDF to PNG"

    # NOTE(review): assumes the first page is written as "<base>-1.png";
    # pdftoppm may zero-pad page numbers for longer documents — TODO confirm.
    return go(output_file_base + "-1.png", option)
def gradio_process_file(file, option):
    """Gradio callback: store the uploaded file under "uploads/" and process it.

    Args:
        file: Uploaded file object exposing its path as ``file.name``, or
            None when nothing was uploaded.
        option: Document type chosen in the UI, forwarded to ``process_file``.

    Returns:
        "No file uploaded." when ``file`` is None, otherwise the result of
        ``process_file`` for the stored copy.
    """
    if file is None:
        return "No file uploaded."

    uploads_folder = "uploads/"
    folder_missing = not os.path.exists(uploads_folder)
    if folder_missing:
        os.makedirs(uploads_folder)

    # Keep only the base name so the copy always lands inside the uploads folder.
    source_path = file.name
    stored_copy = os.path.join(uploads_folder, os.path.basename(source_path))
    shutil.copy(source_path, stored_copy)
    return process_file(stored_copy, option)
def go(img, fileType):
    """Run ``ocr`` on an image, retrying with extra rotations when it finds nothing.

    The image is tried at 0, 90, 180 and 270 degrees of extra rotation; the
    first non-empty result is returned.

    Args:
        img: Path of the image to read.
        fileType: Document type forwarded to ``ocr``.

    Returns:
        The first result that is not empty, or the result of the last
        attempt when every rotation came back empty.
    """
    result = None
    for rotation in (0, 90, 180, 270):
        result = ocr(img, fileType, rotation)
        # "Nothing found" is reported either as a truly empty string (IBAN
        # path) or as the two-character text '""' (LLM reply); retry on both.
        if result not in ("", '""'):
            break
    return result
# Gradio front end: upload a file, pick the document type, show the extraction.
with gr.Blocks() as app:
    gr.Markdown("# Extração de Docs #")
    # Plain HTML button that reloads the page to start a new extraction.
    gr.HTML('<button onclick="window.location.reload()">Nova extração</button>')
    file_input = gr.File(label="Upload de PDF ou imagem")
    select_option = gr.Dropdown(
        choices=["iban", "morada", "permanente"],
        value="iban",
        label="Escolha",
        interactive=True,
    )
    output_box = gr.Textbox(label="Output", interactive=False)
    process_button = gr.Button("Processar")
    # Clicking the button sends the file and the selected type to the callback.
    process_button.click(
        gradio_process_file,
        inputs=[file_input, select_option],
        outputs=[output_box],
    )

app.launch(share=True)