Spaces:

miguelhomepage
/

gradio

Runtime error

App Files Files Community

gradio / go.py

miguelhomepage

Upload folder using huggingface_hub

2027574 verified about 10 hours ago

raw

history blame contribute delete

5.64 kB

	#! apt install tesseract-ocr
	#! apt install libtesseract-dev
	#! pip install pytesseract
	#! pip install Pillow
	#!sudo apt-get install tesseract-ocr-por
	#!pip install openai
	#!sudo apt install poppler-utils
	import cv2
	import pytesseract
	import urllib
	import numpy as np
	import re
	import imutils
	from PIL import Image
	import string
	import glob
	import os
	from openai import OpenAI
	import subprocess
	import gradio as gr
	import shutil

	def _strip_code_fence(reply):
	    """Return *reply* without a leading ```/```json fence or a trailing ``` fence."""
	    cleaned = (reply or "").strip()
	    # Remove the fences as whole tokens. str.strip("```json") would treat the
	    # argument as a character set and could eat leading/trailing j/s/o/n.
	    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
	    cleaned = re.sub(r"\s*```$", "", cleaned)
	    return cleaned.strip()

	def askLLM(text,fileType):
	    """Ask the OpenAI chat model to extract structured data from OCR text.

	    Args:
	        text: OCR output (Portuguese) to append to the extraction prompt.
	        fileType: "permanente" (company certificate fields) or "morada"
	            (name and address of an individual).

	    Returns:
	        The model reply as a string with any Markdown code fence removed.

	    Raises:
	        ValueError: if fileType is not one of the supported values.
	        RuntimeError: if the OPENAI_API_KEY environment variable is not set.
	    """
	    prompts = {
	        "permanente": """this is a text written in portuguese.Please extract the access_code,firmName,firmTaxNo,address and titulares.'titulares' are the name of the partners.If there are multiple partners do not create an array, concatenate it with a comma.Give me just the certidao permanente information with nothing else in json format.If there is a field that is not found the value should be null. text:""",
	        "morada": """this is a text written in portuguese.Please extract name and address from the text but only when they refer to an individual, not an organization.give the result as a json with the keys 'name' and 'address' and nothing else. if you cant find anything with meaning return an empty string and nothing else. text:""",
	    }
	    # Validate before any network call so an unsupported type fails clearly
	    # instead of hitting an unbound `prompt` variable.
	    if fileType not in prompts:
	        raise ValueError(f"Unsupported fileType for askLLM: {fileType!r}")
	    prompt = prompts[fileType] + text

	    # SECURITY: the API key must never be committed to source control; it is
	    # read from the environment. Any key that was previously hard-coded here
	    # should be considered leaked and revoked.
	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        raise RuntimeError("OPENAI_API_KEY environment variable is not set")
	    client = OpenAI(api_key=api_key)

	    completion = client.chat.completions.create(
	        model="gpt-4o",
	        messages=[
	            {"role": "system", "content": "You are a helpful assistant."},
	            {"role": "user", "content": prompt},
	        ],
	    )
	    # message.content can be None; the helper treats that as an empty reply.
	    return _strip_code_fence(completion.choices[0].message.content)


	def ocr(image_path,fileType,rotation):
	    """Run Tesseract OCR on a document and return the extracted information.

	    Args:
	        image_path: path of the image to read. For "permanente" this is the
	            first page image ("<base>-1.png"); every "<base>*.png" page is read.
	        fileType: "permanente", "morada" or "iban".
	        rotation: extra rotation in degrees applied after the automatic
	            orientation correction. Not used for "permanente".

	    Returns:
	        For "permanente", the askLLM() reply for the concatenated page text;
	        otherwise the result of process() on the recognised text.

	    Raises:
	        ValueError: if the image file cannot be decoded.
	    """
	    if fileType == 'permanente':
	        first_page_suffix = "-1.png"
	        if image_path.endswith(first_page_suffix):
	            base = image_path[:-len(first_page_suffix)]
	        else:
	            base = image_path
	        # glob.escape keeps characters such as "[" in the upload name literal.
	        pages = sorted(glob.glob(glob.escape(base) + '*.png'))
	        if not pages and os.path.isfile(image_path):
	            # A directly uploaded image has no generated page files; OCR the
	            # file itself instead of sending empty text to the LLM.
	            pages = [image_path]
	        out = "".join(
	            pytesseract.image_to_string(cv2.imread(page), lang='por', config="--psm 6") + " "
	            for page in pages
	        )
	        return askLLM(out, fileType)

	    # Read the bytes ourselves and decode from memory rather than passing the
	    # path to OpenCV.
	    with open(image_path, 'rb') as image_file:
	        image_array = np.frombuffer(image_file.read(), np.uint8)
	    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
	    if image is None:
	        raise ValueError(f"Could not decode image: {image_path}")

	    # Tesseract's orientation detection reports a "Rotate: <degrees>" line;
	    # fall back to no correction when that line is absent.
	    rot_data = pytesseract.image_to_osd(image)
	    rot_match = re.search(r'(?<=Rotate: )\d+', rot_data)
	    angle = float(rot_match.group(0)) if rot_match else 0.0
	    rotated = imutils.rotate_bound(image, angle)
	    if rotation != 0:
	        rotated = imutils.rotate(rotated, rotation)

	    out = pytesseract.image_to_string(rotated, lang='por', config="--psm 6")
	    return process(out, fileType)

	def process(out,fileType):
	    """Turn raw OCR text into the final answer for the given document type.

	    Args:
	        out: text recognised by OCR.
	        fileType: "morada" (delegates to askLLM) or "iban" (local parsing).

	    Returns:
	        For "morada", the askLLM() reply. For "iban", "PT50" or "NIB"
	        followed by up to 21 non-whitespace characters that come after the
	        marker, or "" when no marker is found. "" for any other fileType.
	    """
	    if fileType == "morada":
	        return askLLM(out, fileType)
	    if fileType != "iban":
	        return ""

	    text = out.upper()
	    # Markers in increasing priority: a later match overrides an earlier one.
	    # "PT5O", "PTS0" and "PTSO" are common OCR misreadings of "PT50".
	    prefix = None
	    start = -1
	    for candidate in ("NIB", "PT50", "PT5O", "PTS0", "PTSO"):
	        position = text.find(candidate)
	        if position != -1:
	            prefix, start = candidate, position
	    # Test "was anything found", not "position != 0": a marker at the very
	    # start of the text is a valid match.
	    if prefix is None:
	        return ""

	    visible_chars = set(string.ascii_letters + string.digits + string.punctuation)
	    tail = text[start + len(prefix):]
	    account = ''.join([char for char in tail if char in visible_chars][:21])
	    # Misread variants are normalised back to the real "PT50" country prefix.
	    return ("NIB" if prefix == "NIB" else "PT50") + account

	def process_file(file_path,option):
	    """Extract information from an uploaded file, converting PDFs to PNG first.

	    Args:
	        file_path: path of the uploaded PDF or image.
	        option: document type passed on to go() ("iban", "morada", "permanente").

	    Returns:
	        The result of go() for the image (or first PDF page), or the string
	        "Error converting PDF to PNG" when the PDF conversion fails.
	    """
	    if not file_path.lower().endswith('.pdf'):
	        return go(file_path, option)

	    output_file_base = os.path.splitext(file_path)[0]
	    try:
	        subprocess.run(['pdftoppm', '-png', file_path, output_file_base], check=True)
	    except (subprocess.CalledProcessError, OSError):
	        # OSError covers a missing pdftoppm binary. Return the message to the
	        # caller instead of handing it to go() as if it were an image path.
	        return "Error converting PDF to PNG"

	    first_page = output_file_base + "-1.png"
	    if option != "permanente" and not os.path.exists(first_page):
	        # pdftoppm zero-pads page numbers for longer PDFs ("-01.png", ...),
	        # so look the first page up instead of assuming "-1.png".
	        # "permanente" keeps the "-1.png" form because ocr() derives the page
	        # glob from it.
	        pages = sorted(glob.glob(glob.escape(output_file_base) + "-*.png"))
	        if pages:
	            first_page = pages[0]
	    return go(first_page, option)



	def gradio_process_file(file,option):
	    """Gradio callback: store the uploaded file locally and process it.

	    Args:
	        file: value of the gr.File input; either a path string or an object
	            exposing the temporary path as `.name`, or None when empty.
	        option: selected document type ("iban", "morada", "permanente").

	    Returns:
	        "No file uploaded." when nothing was uploaded, otherwise the result
	        of process_file() for the stored copy.
	    """
	    if file is None:
	        return "No file uploaded."
	    import tempfile

	    # Depending on the Gradio version/configuration the component passes a
	    # plain path or a tempfile-like wrapper; accept both.
	    source_path = file if isinstance(file, (str, os.PathLike)) else file.name

	    upload_dir = "uploads/"
	    os.makedirs(upload_dir, exist_ok=True)
	    # One private sub-directory per upload: files with the same name no longer
	    # overwrite each other, and page images generated for an earlier upload
	    # cannot be picked up by the page glob of a later one.
	    work_dir = tempfile.mkdtemp(dir=upload_dir)
	    destination_path = os.path.join(work_dir, os.path.basename(source_path))
	    shutil.copy(source_path, destination_path)
	    return process_file(destination_path, option)






	def go(img,fileType):
	    """Run ocr() at successive rotations until it yields a non-empty result.

	    Args:
	        img: path of the image to analyse.
	        fileType: document type passed on to ocr().

	    Returns:
	        The first non-empty result obtained at 0, 90, 180 or 270 degrees, or
	        the result of the last attempt when every rotation came back empty.
	    """
	    r = ""
	    for rotation in (0, 90, 180, 270):
	        r = ocr(img, fileType, rotation)
	        # "Nothing found" is reported either as the literal '""' (LLM reply)
	        # or as a truly empty string (IBAN parsing); retry on both.
	        if (r or "").strip() not in ("", '""'):
	            break
	    return r

	# Build the single-page UI: upload widget, document-type selector, output box
	# and a button wired to gradio_process_file.
	with gr.Blocks() as app:
	    gr.Markdown("# Extração de Docs #")
	    # Plain HTML button that reloads the page to start a new extraction.
	    gr.HTML('<button onclick="window.location.reload()">Nova extração</button>')

	    file_input = gr.File(label="Upload de PDF ou imagem")
	    select_option = gr.Dropdown(
	        choices=["iban", "morada", "permanente"],
	        value="iban",
	        label="Escolha",
	        interactive=True,
	    )
	    output_box = gr.Textbox(label="Output", interactive=False)
	    process_button = gr.Button("Processar")
	    process_button.click(
	        fn=gradio_process_file,
	        inputs=[file_input, select_option],
	        outputs=[output_box],
	    )

	# Start the web server and request a public share link.
	app.launch(share=True)