# hf-similarity-check / extraction_data.py
# Mitul Mohammad Abdullah Al Mukit
# update
# 9312707
# raw
# history blame
# No virus
# 3.55 kB
################# cnocr ##################
from cnocr import CnOcr
from pdfquery import PDFQuery
import openai
import json
from dotenv import load_dotenv
import os
def validate(text):
    """Return *text* with every space and comma removed.

    Args:
        text: The string to clean (e.g. an amount such as ``"1, 234"``).

    Returns:
        A copy of *text* without ``' '`` and ``','`` characters.
    """
    # A single translate pass deletes both characters at once instead of
    # chaining one ``replace`` call per character.
    return text.translate(str.maketrans('', '', ' ,'))
def check_bank(text):
    """Return the first known bank keyword found in *text*.

    Spaces are removed from *text* before searching, so ``"hang seng"``
    matches the keyword ``"hangseng"``. Matching is case-sensitive and the
    keywords are lowercase.

    Args:
        text: The string to search.

    Returns:
        The matching keyword (``'bankofchina'``, ``'hangseng'``, ``'hsbc'``
        or ``'sc'``), or ``False`` when none of them occurs in *text*.
    """
    text = text.replace(' ', '')
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    for bank in bank_list:
        if bank in text:
            return bank
    # Only report "no match" after every keyword has been tried; returning
    # from inside the loop would stop after the first keyword.
    return False
def check_bank_name(img_path):
    """Guess the bank code from a statement's file path.

    A bank matches when either its code or its standard statement title
    occurs as a (case-sensitive) substring of ``str(img_path)``. Banks are
    tried in the order boch, hangseng, hsbc, sc and the first match wins.

    Example file names given for each bank:
        boch     - "Consolidated Statement 2023-01-01"
        hangseng - "Statement of Prestige Banking 2023-03-0" or
                   "Statement of Preferred Banking 2023-03-07"
        hsbc     - "Statement - HSBC One Account 2023-02-10"
        sc       - "statementOfAccount 2023-02-01" (Standard Chartered)

    Args:
        img_path: Path of the statement file; anything ``str()`` accepts.

    Returns:
        ``'boch'``, ``'hangseng'``, ``'hsbc'`` or ``'sc'``, or ``None`` when
        nothing matches.
    """
    standard_names = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    # Convert once instead of twice per loop iteration.
    path_text = str(img_path)
    for bank_name, title in standard_names.items():
        if bank_name in path_text or title in path_text:
            return bank_name
    return None
def check_mr(text):
    """Strip a leading courtesy title (Mr/Ms/Miss/Mrs) from a name.

    The title is matched case-insensitively, with or without a trailing
    period ("Mr" and "Mr." are both recognised).

    Args:
        text: The name string, possibly starting with a title.

    Returns:
        If *text* starts with a title: the remaining words, lowercased and
        concatenated without separators (``"Mr Chan Tai Man"`` becomes
        ``"chantaiman"``). Otherwise *text* is returned unchanged.
    """
    # NOTE(review): the two branches return differently normalised values
    # (lowercased and joined vs. untouched); kept as-is because callers may
    # rely on it -- confirm whether this is intended.
    openings = ('mr', 'ms', 'miss', 'mrs')
    words = text.lower().split()
    # Drop a trailing period so "Mr." is treated like "Mr".
    if words and words[0].rstrip('.') in openings:
        return ''.join(words[1:])
    return text
def _load_json_object(reply):
    """Parse *reply* as JSON, tolerating extra text around a JSON object."""
    try:
        return json.loads(reply)
    except json.JSONDecodeError:
        # Chat models sometimes wrap the object in prose or code fences;
        # retry on the outermost {...} span before giving up.
        start, end = reply.find('{'), reply.rfind('}')
        if start == -1 or end <= start:
            raise
        return json.loads(reply[start:end + 1])


def get_info_from_bank(img_path):
    """Extract structured fields from a bank-statement image.

    Runs CnOcr on the image, sends the recognised text fragments to the
    ``gpt-3.5-turbo`` chat model with an extraction prompt, and parses the
    model's reply as JSON.

    The OpenAI API key is read from the ``data-extraction-api`` environment
    variable after loading a ``.env`` file.

    Args:
        img_path: Path of the statement image passed to ``CnOcr.ocr``.

    Returns:
        The parsed JSON reply. The prompt asks for a dictionary with the
        keys bank, nameStatement, address, totalAsset, totalLiability and
        statementDate, but the actual shape depends on the model's output.
        When ``nameStatement`` is present as a string, any leading courtesy
        title is stripped via ``check_mr``.

    Raises:
        json.JSONDecodeError: If no JSON value can be parsed from the reply.
    """
    # Running the model
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    load_dotenv()
    openai.api_key = os.environ.get("data-extraction-api")

    # Keep every OCR fragment that is not just a lone space or comma.
    invalid_list = (' ', ',')
    data_set_1 = [item['text'] for item in out if item['text'] not in invalid_list]

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": "You are an AI assistant for extracting data with following names(bank, nameStatement, address, totalAsset (only HKD and represent as one number), totalLiability, statementDate) from bank statements. Uppercase and lowercase letters are the same. Store the results in dictionary format"},
            {"role": "user", "content": f"Extract data from the following 2 sets of text: {data_set_1}. (1.) Data that locate in the front part of the text: customer full name (it should be a Chinese name in English spelling and two to three words), address in Hong Kong (including flat, floor, court/estate, region in Hong Kong), bank name, bank statement issue date (verly likely to be within 1-2 years), (2.) Data that mainly locate in the other part of the text: total asset (including investments and deposits) and total liability (often contains DR and includes credit card but might be zero) of the current month."},
        ],
    )

    data = completion['choices'][0]['message']['content']
    bs_data = _load_json_object(data)

    # The model may omit the name or return a non-string; only normalise
    # it when there is actually a string to work on.
    name = bs_data.get("nameStatement") if isinstance(bs_data, dict) else None
    if isinstance(name, str):
        bs_data["nameStatement"] = check_mr(name)
    return bs_data