File size: 5,598 Bytes
d1a66a2
 
 
b53150e
 
 
 
 
 
 
 
 
 
 
 
 
56eb0c5
2f625a1
d1a66a2
b53150e
 
 
 
56eb0c5
73f1c56
56eb0c5
 
 
73f1c56
b53150e
 
 
 
 
 
 
 
 
 
 
 
 
 
56eb0c5
b53150e
 
 
6582487
b53150e
 
 
 
 
 
56eb0c5
b53150e
 
 
 
 
 
 
 
 
 
 
56eb0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b53150e
d1a66a2
 
 
 
 
 
 
 
b53150e
d1a66a2
 
 
 
 
 
b53150e
d1a66a2
b53150e
d1a66a2
b53150e
d1a66a2
 
 
b53150e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import base64
import io
import json
import logging
import os
import re
from typing import List, Optional

import PyPDF2
from docx import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
# from langchain_community.llms import OpenAI
from langchain_community.callbacks import get_openai_callback
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import OpenAI
from typing_extensions import Concatenate

# Read the OpenAI API key from the environment (None when the variable is unset).
api_key = os.environ.get('OPENAI_API_KEY')

# Schema of the entities extracted from a receipt. The field descriptions are
# embedded in the LLM prompt through PydanticOutputParser.get_format_instructions().
#
# Every description tells the model to answer with null when a value is missing,
# so each field is Optional with a None default; a plain `str` field would reject
# null and make the whole parse fail because of a single missing value.
#
# NOTE: documentation is kept in `#` comments on purpose. A class docstring would
# become part of the generated JSON schema and therefore change the prompt.
class Candidate(BaseModel):
    # Brand / merchant name printed on the receipt.
    brand: Optional[str] = Field(default=None, description="Please provide the name of the brand from which the receipt is. If there are multiple brand names present, please specify the most prominent or relevant brand associated with the receipt. If the brand name is not explicitly stated, provide any context or details that might help identify it accurately. If the information is not present or unclear, strictly mark the value as null.")
    # Total order value, kept as text exactly as the model reports it.
    total_cost: Optional[str] = Field(default=None, description="Fetch Total order value of the receipt.you can specify the total order value section on the receipt, such as ‘Total’ or ‘Total Amount’. Additionally, mention any common variations in labeling that might indicate the total order value, such as ‘Grand Total’,’Total Amount’.If you cannot find total order value then mark value as null.")
    # City and state of purchase (booking origin for travel receipts).
    location: Optional[str] = Field(default=None, description="Please provide the city and state where the purchase was made. If the receipt is related to travel, extract the location from where the booking was made, not the destination. Look for details such as the booking origin or departure city/state. If the purchase location is not explicitly stated, provide any relevant clues or context from the receipt to help identify it accurately. If the information is not present or unclear, mark the value as null.For travel receipts, consider extracting details such as the departure airport code, departure city, or booking location mentioned in the itinerary or booking confirmation section. This location typically indicates where the purchase was initiated.")
    # Number of items in the order, kept as text.
    no_of_items: Optional[str] = Field(default=None, description="Number of items in the order.If the information is not present mark the value as null.")
    # One of the categories enumerated in the description below.
    purchase_category: Optional[str] = Field(default=None, description="The purchase category among fashion, home, travel, food, groceries, hotels, spa, insurance and others.If the information is not present mark the value as null.")
    # Purchase date; the prompt requests dd-MM-yyyy but the format is not validated here.
    Date: Optional[str] = Field(default=None, description="Date of purchase. Make sure the date format is in dd-MM-yyyy.If the information is not present mark the value as null.")
    

# async def initialize_openai():
#     model_name = "gpt-3.5-turbo-instruct"
#     # model_name = "text-davinci-003"
#     temperature = 0.0
#     model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
    
def strcuture_document_data(raw_text:str)->dict:
    """Extract structured receipt fields from raw receipt text with an OpenAI model.

    Builds a prompt from the ``Candidate`` schema's format instructions and the
    receipt text, sends it to the ``gpt-3.5-turbo-instruct`` completion model,
    and validates the model's JSON answer against ``Candidate``.

    Args:
        raw_text: Plain text of the receipt.

    Returns:
        A dict keyed by the ``Candidate`` field names, or an empty dict when the
        model call, JSON extraction, or validation fails (the error is logged).
    """
    try:
        model = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.0, max_tokens=800)
        parser = PydanticOutputParser(pydantic_object=Candidate)

        doc_query = "Strictly Return only a json which contains important entities from the given receipt."+"\n"+"Receipt Data : \n" + raw_text
        prompt = PromptTemplate(
            template="Answer the user query.\n{format_instructions}\n{query}\n",
            input_variables=["query"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )
        prompt_text = prompt.format_prompt(query=doc_query).to_string()

        with get_openai_callback() as usage:
            completion = model(prompt_text)
        logging.debug("OpenAI usage for receipt extraction: %s", usage)

        # extract_json_from_string returns a list of decoded objects (or None),
        # while parser.parse expects text. Re-serialise the first object so the
        # parser validates exactly that JSON, free of surrounding chatter.
        try:
            json_objects = extract_json_from_string(completion)
        except ValueError:
            # A malformed fragment must not discard the whole completion.
            json_objects = None

        if json_objects:
            candidate = parser.parse(json.dumps(json_objects[0]))
        else:
            # Nothing could be pre-extracted: let the parser try the raw completion.
            candidate = parser.parse(completion)

        return candidate.dict()
    except Exception:
        # Best-effort contract: callers get an empty dict on any failure.
        logging.exception("Error occurred while structuring receipt data")
        return {}


def extract_json_from_string(input_string):
    """Return every JSON object embedded in ``input_string``.

    The text is scanned for ``{`` characters and a real JSON decoder is run from
    each one, so objects that span several lines, contain nested objects, or
    hold ``}`` inside string values are decoded completely. Brace-delimited
    fragments that are not valid JSON are skipped.

    Args:
        input_string: Text that may contain JSON objects mixed with other text.

    Returns:
        A list of decoded objects (dicts) in order of appearance, or ``None``
        when the input is empty, not a string, or contains no valid JSON object.
    """
    if not isinstance(input_string, str) or not input_string:
        return None

    decoder = json.JSONDecoder()
    json_data_list = []
    position = input_string.find('{')
    while position != -1:
        try:
            json_data, end = decoder.raw_decode(input_string, position)
        except json.JSONDecodeError:
            # Not a valid object at this brace; resume at the next candidate.
            position = input_string.find('{', position + 1)
            continue
        json_data_list.append(json_data)
        # Continue after the decoded object so nested objects are not re-reported.
        position = input_string.find('{', end)

    return json_data_list or None

def extract_text_from_pdf(pdf_data):
    """Return the text of all pages of a PDF, concatenated in page order.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        The extracted page texts joined together without any separator.
    """
    with io.BytesIO(pdf_data) as buffer:
        reader = PyPDF2.PdfReader(buffer)
        # Extract while the buffer is still open; the reader reads from it.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def extract_text_from_docx(docx_data):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        docx_data: Raw bytes of the .docx file.

    Returns:
        The text of each paragraph followed by a newline, in document order.
    """
    document = Document(io.BytesIO(docx_data))
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)

def extract_text_from_attachment(filename, data):
    """Decode a base64url-encoded attachment and return its text.

    The extractor is chosen from the file extension, compared
    case-insensitively so names such as ``RECEIPT.PDF`` are handled.

    Args:
        filename: Attachment file name; a missing (``None``) or empty name is
            treated as an unsupported type.
        data: URL-safe base64 encoded file content.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` files, otherwise the
        literal string ``"Unsupported document type"``.
    """
    normalized_name = (filename or "").lower()
    if normalized_name.endswith('.pdf'):
        extractor = extract_text_from_pdf
    elif normalized_name.endswith('.docx'):
        extractor = extract_text_from_docx
    else:
        # Add handling for other document types if needed
        return "Unsupported document type"

    # Decode only for supported types, so unsupported attachments never fail
    # on malformed payloads.
    return extractor(base64.urlsafe_b64decode(data))