import PyPDF2
from docx import Document
import io
import os
import base64
import re
import json
import time
from typing import Optional

import tiktoken
from langchain_community.callbacks import get_openai_callback
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import OpenAI


# Read the OpenAI API key from the environment; langchain_openai.OpenAI also picks it up from OPENAI_API_KEY.
api_key = os.getenv('OPENAI_API_KEY')


# class Candidate(BaseModel):
#     brand: Optional[str] = Field(default=None, description="Please identify and provide the primary brand name listed on the receipt. If multiple brand names are present, determine and specify the most prominent or relevant brand associated with the primary transaction on the receipt. If the brand name is not explicitly mentioned, include any contextual details or indirect indicators that might help in accurately identifying the brand.")
#     total_cost: Optional[str] = Field(default=None, description="Identify and provide the 'Total Order Value' listed on the receipt. Please specify the exact section where this value is noted, typically labeled as 'Total', 'Total Amount','total' , 'total amount' ,'total cost','Total Cost','Grand total','grand total'. Include any other labeling variations that might represent the total order value. If the total order value is not present or cannot be determined, explicitly state 'null' as the response.")
#     location: Optional[str] = Field(default=None, description="Please provide the city and state where the purchase was made, as indicated on the receipt. For travel-related receipts, extract the location from which the booking was initiated, focusing on the booking origin or departure city/state, rather than the destination. Look for specific details such as the departure airport code, departure city, or the booking location mentioned in the itinerary or booking confirmation section. If no such information is available, or if it remains unclear, clearly mark the response as 'null'")
#     no_of_items: Optional[str] = Field(default=None, description="Specify the total number of items listed in the order as reflected in the receipt or document. If the total count of items is not explicitly mentioned or if it cannot be determined from the provided document, please assign and return the value 'null'.")
#     purchase_category: Optional[str] = Field(default=None, description="Identify and specify the purchase category. Choose from the following predefined categories: fashion, home, travel, food, groceries, hotels, spa, insurance, or others. If the purchase category is not explicitly stated on the receipt or document, or if it cannot be accurately determined based on the available information, assign and return the value 'null'.")
#     brand_category: Optional[str] = Field(default=None, description="""Based on the receipt information, use one of the following brand categories strictly:
# 1. "Fashion, Dress, Personal"
# 2. "Coffee - Personal"
# 3. "Food - Personal"
# 4. "Travel, Roam, Explore"
# 5. "Shopping, Hunt, Obtain"
# If you don't find any brand category then return 'null'.
# """)
#     Date: Optional[str] = Field(default=None, description="Specify the date of purchase in the format dd-MM-yyyy. If the date of purchase is not explicitly provided on the receipt or document, or if it cannot be accurately determined, assign the value 'null'. Ensure the date is formatted correctly as day, month, and year in two digits each.")

class Candidate(BaseModel):
    brand: Optional[str] = Field(default=None, description="INSERT BRAND NAME FROM THE RECEIPT OCR TEXT. IF NOT PRESENT RETURN null")
    total_cost: Optional[str] = Field(default=None, description="INSERT TOTAL COST FROM THE RECEIPT OCR TEXT (most of the times total cost is the maximum value in the OCR text). IF NOT PRESENT RETURN null")
    location: Optional[str] = Field(default=None, description="INSERT LOCATION FROM THE RECEIPT OCR TEXT. IF NOT PRESENT RETURN null")
    purchase_category: Optional[str] = Field(default=None, description="INSERT PURCHASE CATEGORY FROM THE RECEIPT OCR TEXT. IF NOT PRESENT RETURN null")
    brand_category: Optional[str] = Field(default=None, description="""INSERT BRAND CATEGORY FROM THE RECEIPT OCR TEXT. CHOOSE CLOSEST BRAND CATEGORY BASED ON THE OCR FROM THIS ARRAY ["Fashion and Apparel","Jewelry and Watches","Beauty and Personal Care","Automobiles","Real Estate","Travel and Leisure","Culinary Services","Home and Lifestyle","Technology and Electronics","Sports and Leisure","Art and Collectibles","Health and Wellness","Stationery and Writing Instruments","Children and Baby","Pet Accessories","Financial Services","Airline Services","Accommodation Services","Beverages Services","Services"] ELSE IF NOT PRESENT RETURN null""")
    Date: Optional[str] = Field(default=None, description="INSERT RECEIPT DATE FROM THE RECEIPT OCR TEXT. IF NOT PRESENT RETURN null. FORMAT: dd-mm-yyyy")
    # Currency: Optional[str] = Field(default=None, description="INSERT CURRENCY FROM THE RECEIPT OCR TEXT THAT YOU FIND. IF NOT RETURN null.")

    

    
def strcuture_document_data(raw_text: str) -> dict:
    """Structure raw receipt OCR text into a dict of receipt fields via an OpenAI completion model.

    Returns an empty dict if the model call or output parsing fails.
    """
    try:
        model_name = "gpt-3.5-turbo-instruct"
        temperature = 0.0
        model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
        
        # doc_query = (
        #     "Extract and return strictly a JSON object containing only the  following keys strictly : brand , total_cost , location , no_of_items , purchase_category,brand_category , Date ."
        #     "\nReceipt Data:\n" + raw_text + "\nRemember the response should only be in JSON format very Strictly and it should have these keys brand , total_cost(Try to look for the highest value in the receipt nearby to words total cost or semantically similar words) , location , no_of_items , purchase_category,brand_category , Date , very Strictly.\n"
        # )
        
        doc_query = (
            "Extract and return strictly a JSON object containing only the following keys: brand, total_cost, location, purchase_category, brand_category, Date. Ensure that if a value is not present in the OCR text, it is returned as null."
        )

        parser = PydanticOutputParser(pydantic_object=Candidate)

        prompt = PromptTemplate(
            template="""Your primary goal is to take my receipt OCR text and then return back a parsable json.
            Below is the receipt OCR:\n {raw_text} \n These are the format instructions telling you to convert the data into json :\n {format_instructions}\nDo not include descriptions or explanations from the Candidate class in the JSON output. The response must be a valid JSON object.\n Follow the below instruction very strictly:\n {query} \n""",
            input_variables=["query"],
            partial_variables={"format_instructions": parser.get_format_instructions(), "raw_text": raw_text},
        )
        print("parser.get_format_instructions()")
        print(parser.get_format_instructions())
        prompt_value = prompt.format_prompt(query=doc_query)
        with get_openai_callback() as cb:  # cb tracks token usage and cost for this call
            result = model.invoke(prompt_value.to_string())
            
        time.sleep(0.25)
        print(f"GPT Response {result}")
        # result = extract_json_from_string(result)
        # print(f"Formatted Response : {result}")
        
        class_object = parser.parse(result)
        dict_object = class_object.__dict__
        if all(value is None for value in dict_object.values()):
            print(dict_object)
            print("Got null for dict object")
 
        # print("printing structured json")
        # print(dict_object)
        return dict_object
    except Exception as e:
        print(f"Error occurred: {e}")
        return {}
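
# Illustration only: a minimal sketch of how strcuture_document_data might be called directly.
# The sample OCR text below is made up for demonstration and is not from a real receipt.
def _example_structure_call() -> dict:
    sample_ocr = (
        "COFFEE HOUSE\n"
        "12 Example Street, Springfield\n"
        "1 x Latte   4.50\n"
        "Total 4.50\n"
        "Date: 01-02-2024\n"
    )
    # The result is expected to contain the keys: brand, total_cost, location,
    # purchase_category, brand_category and Date (values may be None).
    return strcuture_document_data(sample_ocr)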

def ensure_token_limit(text, model='gpt-3.5-turbo-instruct', max_tokens=4096):
    # Initialize the tokenizer for the specific model (encoding_for_model maps a model
    # name to its encoding; get_encoding expects an encoding name such as 'cl100k_base')
    tokenizer = tiktoken.encoding_for_model(model)

    # Tokenize the text
    tokens = tokenizer.encode(text)

    # Check the token count
    if len(tokens) > max_tokens:
        # Truncate the text to the maximum token limit
        truncated_tokens = tokens[:max_tokens]
        truncated_text = tokenizer.decode(truncated_tokens)
        return truncated_text
    else:
        return text


def extract_json_from_string(input_string):
    # Define a regular expression pattern to match brace-delimited JSON objects
    # (re.DOTALL lets '.' span newlines; non-greedy matching will not capture nested objects)
    pattern = r'\{.*?\}'

    # Use re.findall() to find all matches of JSON in the input string
    matches = re.findall(pattern, input_string, re.DOTALL)

    # If there are matches, parse each one, skipping fragments that are not valid JSON
    if matches:
        json_data_list = []
        for match in matches:
            try:
                json_data_list.append(json.loads(match))
            except json.JSONDecodeError:
                continue
        return json_data_list or None
    else:
        return None

def extract_text_from_pdf(pdf_data):
    with io.BytesIO(pdf_data) as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ""
        return text

def extract_text_from_docx(docx_data):
    doc = Document(io.BytesIO(docx_data))
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

def extract_text_from_attachment(filename, data):
    if filename.endswith('.pdf'):
        return extract_text_from_pdf(base64.urlsafe_b64decode(data))
    elif filename.endswith('.docx'):
        return extract_text_from_docx(base64.urlsafe_b64decode(data))
    else:
        # Add handling for other document types if needed
        return "Unsupported document type"

def extract_text_from_attachment_outlook(filename, data):
    if filename.endswith('.pdf'):
        return extract_text_from_pdf(data)
    elif filename.endswith('.docx'):
        return extract_text_from_docx(data)
    else:
        return "Unsupported document type"