#! apt install tesseract-ocr
#! apt install libtesseract-dev
#! pip install pytesseract
#! pip install Pillow
#!sudo apt-get install tesseract-ocr-por
#!pip install openai
#!sudo apt install poppler-utils
"""OCR-based extraction of IBANs, addresses and "certidao permanente" data.

A Gradio UI accepts a PDF or an image.  PDFs are rasterised with
``pdftoppm``, pages are read with Tesseract (Portuguese model), and the
recognised text is either parsed locally (``iban``) or sent to an OpenAI
chat model (``morada`` and ``permanente``).
"""

import glob
import os
import re
import shutil
import string
import subprocess
import urllib

import cv2
import gradio as gr
import imutils
import numpy as np
import pytesseract
from openai import OpenAI
from PIL import Image

# Instruction text sent to the LLM, keyed by document type.  The OCR text is
# appended directly after the trailing "text:" marker.
_PROMPTS = {
    "permanente": (
        "this is a text written in portuguese.Please extract the "
        "access_code,firmName,firmTaxNo,address and titulares."
        "'titulares' are the name of the partners."
        "If there are multiple partners do not create an array, "
        "concatenate it with a comma."
        "Give me just the certidao permanente information with nothing "
        "else in json format."
        "If there is a field that is not found the value should be null. "
        "text:"
    ),
    "morada": (
        "this is a text written in portuguese.Please extract name and "
        "address from the text but only when they refer to an individual, "
        "not an organization."
        "give the result as a json with the keys 'name' and 'address' and "
        "nothing else. "
        "if you cant find anything with meaning return an empty string and "
        "nothing else. \n"
        "text:"
    ),
}

# Markers searched for in the OCR text, in priority order (a later match
# overrides an earlier one).  The PT5O / PTS0 / PTSO variants are the
# misreadings of "PT50" that the original code looked for.
_IBAN_PREFIXES = ("NIB", "PT50", "PT5O", "PTS0", "PTSO")

# Characters kept when collecting the account number (drops whitespace and
# anything non-ASCII).
_VISIBLE_CHARS = frozenset(
    string.ascii_letters + string.digits + string.punctuation
)

# Number of visible characters taken after the prefix.
_IBAN_BODY_LENGTH = 21


def _strip_code_fence(reply):
    """Return *reply* without a surrounding Markdown code fence.

    Removes a leading ```` ``` ```` (optionally followed by ``json``) and a
    trailing ```` ``` ````, then trims whitespace.  Unlike
    ``str.strip("```json")`` this removes exact markers only, so payloads
    that begin or end with the letters j/s/o/n are left intact.
    """
    body = reply.strip()
    if body.startswith("```"):
        body = body[3:]
        if body.startswith("json"):
            body = body[4:]
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def askLLM(text, fileType):
    """Ask the OpenAI chat model to extract structured data from OCR text.

    Args:
        text: Text recognised by Tesseract.
        fileType: ``"permanente"`` or ``"morada"``; selects the prompt.

    Returns:
        The model's reply with any Markdown code fence removed.

    Raises:
        ValueError: If *fileType* has no associated prompt.
    """
    try:
        instructions = _PROMPTS[fileType]
    except KeyError:
        raise ValueError(
            f"unsupported fileType for askLLM: {fileType!r}"
        ) from None

    # SECURITY: the API key is no longer embedded in the source.  The client
    # reads it from the OPENAI_API_KEY environment variable.  Any key that
    # was ever committed to this file must be considered leaked and revoked.
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": instructions + text},
        ],
    )
    # ``content`` may be None (e.g. on a refusal); treat that as empty.
    return _strip_code_fence(completion.choices[0].message.content or "")


def _page_images(base):
    """Return the ``<base>-<number>.png`` files, ordered by page number.

    Only files whose name is exactly the base name, a dash, digits and
    ``.png`` are returned, so files that merely share the prefix are
    ignored.
    NOTE(review): pages left over from an earlier conversion with the same
    base name are still picked up; confirm whether uploads need cleaning.
    """
    name_pattern = re.compile(
        re.escape(os.path.basename(base)) + r"-(\d+)\.png"
    )
    numbered = []
    for path in glob.glob(glob.escape(base) + "-*.png"):
        match = name_pattern.fullmatch(os.path.basename(path))
        if match:
            numbered.append((int(match.group(1)), path))
    return [path for _, path in sorted(numbered)]


def ocr(image_path, fileType, rotation):
    """Run OCR on an image and post-process the text for *fileType*.

    Args:
        image_path: Path of the image.  For ``"permanente"`` this may be the
            first page produced by ``pdftoppm`` (``<base>-1.png``, possibly
            zero-padded), in which case all sibling pages are read too.
        fileType: ``"iban"``, ``"morada"`` or ``"permanente"``.
        rotation: Extra rotation in degrees applied after Tesseract's own
            orientation correction; ignored for ``"permanente"``.

    Returns:
        The result of ``askLLM`` (``permanente``) or ``process`` (others).

    Raises:
        ValueError: If the image cannot be decoded.
    """
    if fileType == "permanente":
        first_page = re.fullmatch(r"(.*)-0*1\.png", image_path)
        if first_page:
            pages = _page_images(first_page.group(1))
        else:
            # A directly uploaded image: there are no sibling pages to find.
            pages = [image_path]
        out = ""
        for page in pages:
            image = cv2.imread(page)
            out += pytesseract.image_to_string(
                image, lang="por", config="--psm 6"
            ) + " "
        return askLLM(out, fileType)

    # Decode from bytes rather than cv2.imread, as the original code did.
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"could not decode image: {image_path}")

    # Let Tesseract's orientation detection suggest a correction angle.
    rot_data = pytesseract.image_to_osd(image)
    rot_match = re.search(r"(?<=Rotate: )\d+", rot_data)
    angle = float(rot_match.group(0)) if rot_match else 0.0
    rotated = imutils.rotate_bound(image, angle)
    if rotation != 0:
        rotated = imutils.rotate(rotated, rotation)

    out = pytesseract.image_to_string(rotated, lang="por", config="--psm 6")
    return process(out, fileType)


def process(out, fileType):
    """Turn raw OCR text into the final answer for *fileType*.

    Args:
        out: Text recognised by Tesseract.
        fileType: ``"morada"`` delegates to ``askLLM``; ``"iban"`` extracts
            the account number locally.

    Returns:
        For ``"morada"`` the LLM reply.  For ``"iban"`` either ``"NIB"`` or
        ``"PT50"`` followed by up to 21 visible characters found after the
        marker, or ``""`` when no marker is present.  ``None`` for any other
        *fileType*.
    """
    if fileType == "morada":
        return askLLM(out, fileType)
    if fileType != "iban":
        return None

    text = out.upper()
    found_prefix = None
    found_at = -1
    for candidate in _IBAN_PREFIXES:
        position = text.find(candidate)
        if position != -1:
            found_prefix, found_at = candidate, position

    # Test "was anything found", not "index != 0": a marker at the very
    # start of the text is a valid match.
    if found_prefix is None:
        return ""

    tail = text[found_at + len(found_prefix):]
    body = "".join(
        [char for char in tail if char in _VISIBLE_CHARS][:_IBAN_BODY_LENGTH]
    )
    # Misread variants of PT50 are normalised to the canonical prefix.
    return ("NIB" if found_prefix == "NIB" else "PT50") + body


def process_file(file_path, option):
    """Convert a PDF to PNG pages if needed and run the extraction.

    Args:
        file_path: Path of the uploaded PDF or image.
        option: Document type passed through to ``go``.

    Returns:
        The extraction result, or ``"Error converting PDF to PNG"`` when the
        PDF could not be rasterised.
    """
    if not file_path.lower().endswith(".pdf"):
        return go(file_path, option)

    output_file_base = os.path.splitext(file_path)[0]
    try:
        subprocess.run(
            ["pdftoppm", "-png", file_path, output_file_base], check=True
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing pdftoppm binary.  Report the failure
        # instead of handing the message to go() as if it were a path.
        return "Error converting PDF to PNG"

    # pdftoppm zero-pads page numbers for long documents, so locate the
    # first page rather than assuming it is called "<base>-1.png".
    pages = _page_images(output_file_base)
    if not pages:
        return "Error converting PDF to PNG"
    return go(pages[0], option)


def gradio_process_file(file, option):
    """Gradio callback: store the upload under ``uploads/`` and process it.

    Args:
        file: The object Gradio passes for the file input (its ``name``
            attribute is used as the source path), or ``None``.
        option: Document type selected in the dropdown.

    Returns:
        The extraction result, or ``"No file uploaded."``.
    """
    if file is None:
        return "No file uploaded."

    upload_dir = "uploads/"
    os.makedirs(upload_dir, exist_ok=True)
    destination_path = os.path.join(upload_dir, os.path.basename(file.name))
    shutil.copy(file.name, destination_path)
    return process_file(destination_path, option)


def _is_blank(result):
    """Return True when *result* is an empty answer.

    ``process`` returns ``""`` when no IBAN is found, and the ``morada``
    prompt asks the LLM to answer with an empty string, which may arrive as
    the two-character literal ``""``.  Both count as blank.
    """
    return isinstance(result, str) and result.strip() in ("", '""')


def go(img, fileType):
    """Run ``ocr`` and retry with extra rotations while the answer is blank.

    Args:
        img: Path of the image (or first PDF page) to read.
        fileType: ``"iban"``, ``"morada"`` or ``"permanente"``.

    Returns:
        The first non-blank result, or the result of the last attempt.
    """
    # ocr() ignores the rotation for "permanente", so retrying would only
    # repeat the same OCR and LLM call.
    if fileType == "permanente":
        return ocr(img, fileType, 0)

    result = ocr(img, fileType, 0)
    for extra_rotation in (90, 180, 270):
        if not _is_blank(result):
            break
        result = ocr(img, fileType, extra_rotation)
    return result


with gr.Blocks() as app:
    gr.Markdown("# Extração de Docs #")
    gr.HTML('')
    file_input = gr.File(label="Upload de PDF ou imagem")
    select_option = gr.Dropdown(
        label="Escolha",
        choices=["iban", "morada", "permanente"],
        value="iban",
        interactive=True
    )
    output_box = gr.Textbox(label="Output", interactive=False)
    process_button = gr.Button("Processar")
    process_button.click(gradio_process_file, inputs=[file_input,select_option], outputs=[output_box])

app.launch(share=True)