#! apt install tesseract-ocr
#! apt install libtesseract-dev
#! pip install pytesseract
#! pip install Pillow
#!sudo apt-get install tesseract-ocr-por
#!pip install openai
#!sudo apt install poppler-utils
import cv2
import pytesseract
import urllib
import numpy as np
import re
import imutils
from PIL import Image
import string
import glob
import os
from openai import OpenAI
import subprocess
import gradio as gr
import shutil

def askLLM(text,fileType):
    """Ask the OpenAI chat model to extract structured fields from OCR text.

    Args:
        text: OCR output (expected to be Portuguese) appended verbatim to the
            prompt.
        fileType: "permanente" (access code, firm name, tax number, address
            and partners) or "morada" (name and address of an individual).

    Returns:
        The model's reply as a string, with a surrounding Markdown code fence
        removed if present. The prompts request JSON, but the reply is not
        parsed or validated here.

    Raises:
        ValueError: if fileType has no prompt defined.
        RuntimeError: if the OPENAI_API_KEY environment variable is not set.
    """
    if fileType == "permanente":
        instructions = "this is a  text written in portuguese.Please extract the access_code,firmName,firmTaxNo,address and titulares.'titulares' are the name of the partners.If there are multiple partners do not create an array, concatenate it with a comma.Give me just the certidao permanente information with nothing else in json format.If there is a field that is not found the value should be null. text:"
    elif fileType == "morada":
        instructions = "this is a  text written in portuguese.Please extract name and address from the text but only when they refer to an individual, not an organization.give the result as a json with the keys 'name' and 'address' and nothing else. if you cant find anything with meaning return an empty string and nothing else. text:"
    else:
        # Fail before any network call instead of hitting an unbound variable.
        raise ValueError(f"Unsupported fileType for askLLM: {fileType!r}")

    # SECURITY: the API key must never live in source control. Any key that
    # was previously committed to this file should be revoked.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": instructions + text},
        ],
    )
    # content may be None (e.g. refusals), so normalise to a string first.
    reply = (completion.choices[0].message.content or "").strip()

    # str.strip("```json") removes any of the characters `, j, s, o, n from
    # both ends (turning e.g. "null" into "ull"); remove the fence markers
    # as whole tokens instead.
    reply = re.sub(r"^```(?:json)?\s*", "", reply)
    reply = re.sub(r"\s*```$", "", reply)
    return reply.strip()


def ocr(image_path,fileType,rotation):
    """Run Tesseract OCR on a document and extract the requested information.

    Args:
        image_path: Path of the image to read. For "permanente" this is the
            first page of a converted PDF ("<base>-1.png"); every sibling
            page "<base>-<number>.png" is read too. If no such pages exist,
            the file at image_path itself is used.
        fileType: "permanente", "morada" or "iban".
        rotation: Extra rotation in degrees applied after the automatic
            orientation correction. Not used for "permanente".

    Returns:
        For "permanente", the askLLM reply for the concatenated text of all
        pages; otherwise the result of process() for the text of the image.

    Raises:
        ValueError: if the image bytes cannot be decoded.
    """
    if fileType == 'permanente':
        first_page_suffix = "-1.png"
        pages = []
        if image_path.endswith(first_page_suffix):
            base = image_path[:-len(first_page_suffix)]
            # Only accept "<base>-<digits>.png" so pages of another upload
            # whose name merely starts with the same text are not mixed in.
            page_name = re.compile(re.escape(os.path.basename(base)) + r"-(\d+)\.png")
            numbered = []
            for candidate in glob.glob(glob.escape(base) + "-*.png"):
                found = page_name.fullmatch(os.path.basename(candidate))
                if found:
                    numbered.append((int(found.group(1)), candidate))
            pages = [path for _, path in sorted(numbered)]
        if not pages and os.path.isfile(image_path):
            # A directly uploaded image has no "-<n>.png" siblings.
            pages = [image_path]

        out = "".join(
            pytesseract.image_to_string(cv2.imread(page), lang='por', config="--psm 6") + " "
            for page in pages
        )
        return askLLM(out, fileType)

    # Read the bytes ourselves and decode from memory rather than handing
    # the path to OpenCV.
    with open(image_path, 'rb') as image_file:
        image_data = image_file.read()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Could not decode image: {image_path}")

    # Orientation detection is best effort: when Tesseract cannot produce
    # OSD data (or reports no "Rotate:" field) fall back to no correction
    # and let the caller's explicit `rotation` do the work.
    angle = 0.0
    try:
        rot_data = pytesseract.image_to_osd(image)
    except pytesseract.TesseractError:
        rot_data = ""
    rotate_field = re.search(r'(?<=Rotate: )\d+', rot_data)
    if rotate_field:
        angle = float(rotate_field.group(0))

    rotated = imutils.rotate_bound(image, angle)
    if rotation != 0:
        rotated = imutils.rotate(rotated, rotation)
    out = pytesseract.image_to_string(rotated, lang='por', config="--psm 6")
    return process(out, fileType)

def process(out,fileType):
    """Turn raw OCR text into the final answer for the given document type.

    Args:
        out: Text produced by OCR.
        fileType: "morada" delegates to askLLM; "iban" searches the text for
            an account number; any other value yields an empty string.

    Returns:
        For "morada", the askLLM reply. For "iban", "NIB" or "PT50" followed
        by up to 21 visible characters found after the marker, or "" when no
        marker is present. "" for any other fileType.
    """
    if fileType == "morada":
        return askLLM(out, fileType)
    if fileType != "iban":
        return ""

    text = out.upper()
    # "PT5O", "PTS0" and "PTSO" are common OCR misreadings of "PT50".
    # Later entries take precedence over earlier ones when several occur.
    markers = ("NIB", "PT50", "PT5O", "PTS0", "PTSO")
    found_marker = None
    found_at = -1
    for marker in markers:
        position = text.find(marker)
        # find() returns -1 when absent; 0 is a valid hit at the very start
        # of the text and must not be mistaken for "not found".
        if position != -1:
            found_marker, found_at = marker, position
    if found_marker is None:
        return ""

    visible_chars = set(string.ascii_letters + string.digits + string.punctuation)
    tail = text[found_at + len(found_marker):]
    # Drop whitespace and non-ASCII noise, then keep the first 21 characters.
    account = ''.join([char for char in tail if char in visible_chars][:21])
    # Misread variants are normalised back to the canonical "PT50" prefix.
    normalized_prefix = "NIB" if found_marker == "NIB" else "PT50"
    return normalized_prefix + account

def process_file(file_path,option):
    """Prepare an uploaded file for OCR and return the extraction result.

    PDFs are converted to one PNG per page with the external ``pdftoppm``
    tool, writing "<name>-<page>.png" next to the PDF. Any other file is
    passed to go() unchanged.

    Args:
        file_path: Path of the uploaded PDF or image.
        option: Document type forwarded to go().

    Returns:
        The result of go(), or "Error converting PDF to PNG" when the
        conversion fails or ``pdftoppm`` is not installed.
    """
    if not file_path.lower().endswith('.pdf'):
        return go(file_path, option)

    output_file_base = os.path.splitext(file_path)[0]
    try:
        subprocess.run(['pdftoppm', '-png', file_path, output_file_base], check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Report the failure to the user instead of passing the message on
        # as if it were an image path.
        return "Error converting PDF to PNG"

    first_page = output_file_base + "-1.png"
    if option != "permanente":
        # pdftoppm zero-pads page numbers for documents with 10+ pages, so
        # the first page may be "-01.png", "-001.png", ...
        # ("permanente" keeps the "-1.png" convention because ocr() derives
        # the page set from that suffix.)
        for suffix in ("-1.png", "-01.png", "-001.png", "-0001.png"):
            candidate = output_file_base + suffix
            if os.path.exists(candidate):
                first_page = candidate
                break
    return go(first_page, option)
	
		
def gradio_process_file(file,option):
    """Gradio callback: store the uploaded file under uploads/ and process it.

    Args:
        file: Upload object from the file component (its ``name`` attribute
            is used as the source path), or None when nothing was uploaded.
        option: Document type chosen in the dropdown.

    Returns:
        The result of process_file() for the stored copy, or
        "No file uploaded." when file is None.
    """
    if file is not None:
        target_dir = "uploads/"
        dir_missing = not os.path.exists(target_dir)
        if dir_missing:
            os.makedirs(target_dir)
        source_path = file.name
        # Keep the original file name; only the directory changes.
        stored_path = os.path.join(target_dir, os.path.basename(source_path))
        shutil.copy(source_path, stored_path)
        return process_file(stored_path, option)
    return "No file uploaded."
	
	
def go(img,fileType):
    """Run ocr() on the image, retrying with extra rotations until text is found.

    Rotations of 0, 90, 180 and 270 degrees are tried in order; the first
    non-empty result is returned.

    Args:
        img: Path of the image forwarded to ocr().
        fileType: Document type forwarded to ocr().

    Returns:
        The first non-empty ocr() result, or the result of the last attempt
        when every rotation came back empty.
    """
    # Two spellings of "nothing found": process() returns "" when no IBAN
    # marker is present, while '""' is presumably an LLM reply consisting of
    # an empty JSON string. Checking only '""' meant the rotation fallback
    # never ran for "iban".
    empty_results = ("", '""')
    result = ""
    for rotation in (0, 90, 180, 270):
        result = ocr(img, fileType, rotation)
        if result not in empty_results:
            break
    return result

# Build the Gradio UI. Components are created inside the Blocks context in
# the order listed here.
with gr.Blocks() as app:
    # Page title.
    gr.Markdown("# Extração de Docs #")
    # Raw HTML button whose onclick handler reloads the page.
    gr.HTML('<button onclick="window.location.reload()">Nova extração</button>')
		
		
    # Upload widget; its value is passed as the first argument of the callback.
    file_input = gr.File(label="Upload de PDF ou imagem")
    # Document type selector; the chosen string is passed as `option` to
    # gradio_process_file. Defaults to "iban".
    select_option = gr.Dropdown(
	  label="Escolha",
	  choices=["iban", "morada", "permanente"],
	  value="iban", 
	  interactive=True
    )
    # Read-only box that shows the string returned by the callback.
    output_box = gr.Textbox(label="Output", interactive=False)
    process_button = gr.Button("Processar")
    # Clicking the button runs gradio_process_file(file_input, select_option)
    # and writes its return value into output_box.
    process_button.click(gradio_process_file, inputs=[file_input,select_option], outputs=[output_box])

# Runs at import time (no __main__ guard).
# NOTE(review): share=True asks Gradio for a publicly reachable link — confirm
# that exposing this tool outside the local machine is intended.
app.launch(share=True)