Spaces:

Hushh
/

hushh-valet-chat

Sleeping

File size: 6,592 Bytes

import PyPDF2
from docx import Document
import io
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from typing_extensions import Concatenate
from typing import List
# from langchain_community.llms import OpenAI
from langchain_community.callbacks import get_openai_callback
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
import os
import logging
import base64
from langchain_openai import OpenAI
import re
import json

# Read the OpenAI API key from the environment; None when OPENAI_API_KEY is unset.
api_key = os.environ.get('OPENAI_API_KEY')

# Schema of the fields extracted from a receipt/document by the LLM.
#
# Used as the `pydantic_object` of the PydanticOutputParser built in
# strcuture_document_data; each Field description is written as an
# instruction to the model for that key.
#
# NOTE(review): this class intentionally has no docstring. A class docstring
# would presumably be emitted as the schema description and therefore leak
# into the parser's format instructions -- confirm before adding one.
# NOTE(review): every field is typed `str`, and the descriptions ask the model
# to answer with the literal text 'null' when a value is unknown.
class Candidate(BaseModel):
    # Primary brand/merchant named on the receipt.
    brand: str = Field(description="Please identify and provide the primary brand name listed on the receipt. If multiple brand names are present, determine and specify the most prominent or relevant brand associated with the primary transaction on the receipt. If the brand name is not explicitly mentioned, include any contextual details or indirect indicators that might help in accurately identifying the brand. In cases where the brand name is absent or ambiguous, clearly state 'null' as the response.")
    # Total order value (kept as text, not a number).
    total_cost: str = Field(description="Identify and provide the 'Total Order Value' listed on the receipt. Please specify the exact section where this value is noted, typically labeled as 'Total', 'Total Amount', or similar variations such as 'Grand Total'. Include any other labeling variations that might represent the total order value. If the total order value is not present or cannot be determined, explicitly state 'null' as the response.")
    # City/state of purchase; for travel receipts, the booking origin rather than the destination.
    location: str = Field(description="Please provide the city and state where the purchase was made, as indicated on the receipt. For travel-related receipts, extract the location from which the booking was initiated, focusing on the booking origin or departure city/state, rather than the destination. Look for specific details such as the departure airport code, departure city, or the booking location mentioned in the itinerary or booking confirmation section. These details typically indicate the purchase's origin. If the purchase location is not explicitly stated or if the information is ambiguous, provide any relevant clues or context from the receipt that might assist in accurately identifying the location. If no such information is available, or if it remains unclear, clearly mark the response as 'null'")
    # Number of items in the order (kept as text, not an int).
    no_of_items: str = Field(description="Specify the total number of items listed in the order as reflected in the receipt or document. If the total count of items is not explicitly mentioned or if it cannot be determined from the provided document, please assign and return the value 'null'.")
    # One of the predefined categories listed in the description below.
    purchase_category: str = Field(description="Identify and specify the purchase category. Choose from the following predefined categories: fashion, home, travel, food, groceries, hotels, spa, insurance, or others. If the purchase category is not explicitly stated on the receipt or document, or if it cannot be accurately determined based on the available information, assign and return the value 'null'.")
    # Purchase date. The capitalised name is the key used in the prompt and in the returned dict.
    # NOTE(review): the description asks for dd-MM-yyyy but then says "year in
    # two digits each", which contradicts the four-digit year -- confirm the
    # intended format before changing the prompt text.
    Date: str = Field(description="Specify the date of purchase in the format dd-MM-yyyy. If the date of purchase is not explicitly provided on the receipt or document, or if it cannot be accurately determined, assign the value 'null'. Ensure the date is formatted correctly as day, month, and year in two digits each.")
    

# async def initialize_openai():
#     model_name = "gpt-3.5-turbo-instruct"
#     # model_name = "text-davinci-003"
#     temperature = 0.0
#     model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
    
def strcuture_document_data(raw_text:str)->dict:
    """Extract structured receipt fields from raw document text with an OpenAI model.

    Builds a prompt from ``raw_text`` plus the format instructions generated
    for the ``Candidate`` schema, sends it to ``gpt-3.5-turbo-instruct``,
    pulls the first JSON object out of the completion and validates it
    against ``Candidate``.

    NOTE: the function name is misspelled ("strcuture"); it is kept unchanged
    because callers outside this block may depend on it.

    Args:
        raw_text: Text extracted from the receipt/document.

    Returns:
        A dict holding the ``Candidate`` fields (brand, total_cost, location,
        no_of_items, purchase_category, Date), or ``{}`` when the model call
        fails, the completion contains no JSON object, or validation fails.
    """
    try:
        model_name = "gpt-3.5-turbo-instruct"
        temperature = 0.0
        model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
        doc_query = (
            "Extract and return a JSON object containing only the  following keys strictly : brand , total_cost , location , no_of_items , purchase_category , Date .  "
            "\nReceipt Data:\n" + raw_text
        )
        print(raw_text)
        parser = PydanticOutputParser(pydantic_object=Candidate)

        prompt = PromptTemplate(
            template="Answer the user query.\n{format_instructions}\n{query}\n",
            input_variables=["query"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )
        # Named prompt_value (not `input`) so the builtin is not shadowed.
        prompt_value = prompt.format_prompt(query=doc_query)
        with get_openai_callback() as cb:
            result = model(prompt_value.to_string())
        # The callback object collects usage for the call above; record it.
        logging.info("OpenAI usage: %s", cb)

        print(f"GPT Response {result}")
        extracted = extract_json_from_string(result)
        print(f"Formatted Response : {extracted}")

        if not extracted:
            # The helper returns None when the completion holds no JSON object.
            print("Error occurred: no JSON object found in the model response")
            return {}

        # The helper returns a list of already-decoded objects, while
        # parser.parse() expects text. Re-serialise the first object so the
        # parser can validate it against Candidate.
        class_object = parser.parse(json.dumps(extracted[0]))
        dict_object = class_object.dict()
        print("printing structured json")
        logging.info(dict_object)
        print(dict_object)
        return dict_object
    except Exception as e:
        # Best-effort contract: never raise to the caller, but keep the traceback.
        logging.exception("Failed to structure document data")
        print(f"Error occurred: {e}")
        return {}


def extract_json_from_string(input_string):
    """Find and decode every top-level JSON object embedded in a string.

    The text is scanned for ``{`` characters and a real JSON decoder is run
    from each one. Unlike a ``\\{.*?\\}`` regex, this handles objects that
    span several lines, nested objects, and braces inside string values.
    Fragments that start with ``{`` but are not valid JSON are skipped
    instead of aborting the whole extraction.

    Args:
        input_string: Text that may contain JSON objects (e.g. an LLM reply).

    Returns:
        A list of decoded objects in order of appearance, or ``None`` when
        the input is empty or contains no valid JSON object.
    """
    if not input_string:
        return None

    decoder = json.JSONDecoder()
    json_data_list = []
    start = input_string.find("{")
    while start != -1:
        try:
            json_data, end = decoder.raw_decode(input_string, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next candidate.
            start = input_string.find("{", start + 1)
            continue
        json_data_list.append(json_data)
        # Resume after the decoded object so nested objects are not re-reported.
        start = input_string.find("{", end)

    return json_data_list or None

def extract_text_from_pdf(pdf_data):
    """Return the text of every page of an in-memory PDF, concatenated.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        The extracted text of all pages, joined in page order with no
        separator between pages.
    """
    with io.BytesIO(pdf_data) as buffer:
        reader = PyPDF2.PdfReader(buffer)
        # Extraction happens while the buffer is still open.
        return "".join(page.extract_text() for page in reader.pages)

def extract_text_from_docx(docx_data):
    """Return the paragraph text of an in-memory .docx document.

    Args:
        docx_data: Raw bytes of the .docx file.

    Returns:
        The text of each paragraph in document order, each followed by a
        newline character.
    """
    stream = io.BytesIO(docx_data)
    paragraphs = Document(stream).paragraphs
    return "".join(paragraph.text + "\n" for paragraph in paragraphs)

def extract_text_from_attachment(filename, data):
    """Decode a base64url-encoded attachment and extract its text.

    The extractor is chosen from the file extension, compared
    case-insensitively so that names such as ``INVOICE.PDF`` are handled.
    A missing (``None``) or empty filename is treated as unsupported.

    Args:
        filename: Attachment file name; only its extension is inspected.
        data: URL-safe base64 encoded file content.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` attachments, otherwise
        the string ``"Unsupported document type"``.
    """
    normalized_name = (filename or "").lower()
    if normalized_name.endswith('.pdf'):
        extractor = extract_text_from_pdf
    elif normalized_name.endswith('.docx'):
        extractor = extract_text_from_docx
    else:
        # Add handling for other document types if needed
        return "Unsupported document type"
    return extractor(base64.urlsafe_b64decode(data))