Omkar008 committed on
Commit
b53150e
1 Parent(s): 2a1db67

Update services/utils.py

Browse files
Files changed (1) hide show
  1. services/utils.py +73 -6
services/utils.py CHANGED
@@ -1,8 +1,74 @@
1
  import PyPDF2
2
  from docx import Document
3
  import io
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
- async def extract_text_from_pdf(pdf_data):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  with io.BytesIO(pdf_data) as pdf_file:
7
  pdf_reader = PyPDF2.PdfReader(pdf_file)
8
  text = ""
@@ -11,19 +77,20 @@ async def extract_text_from_pdf(pdf_data):
11
  text += page.extract_text()
12
  return text
13
 
14
async def extract_text_from_docx(docx_data):
    """Return the text of every paragraph in a .docx payload.

    Args:
        docx_data: Raw bytes of the .docx file; wrapped in a `BytesIO`
            before being handed to `Document`.

    Returns:
        The paragraph texts in document order, each followed by a newline.
    """
    doc = Document(io.BytesIO(docx_data))
    # One join instead of repeated `+=` on an ever-growing string.
    return "".join(para.text + "\n" for para in doc.paragraphs)
20
 
21
async def extract_text_from_attachment(filename, data):
    """Extract text from an attachment, choosing the extractor by file extension.

    Args:
        filename: Attachment file name; only its extension is inspected.
        data: Attachment payload, passed unchanged to the matching extractor.

    Returns:
        The extracted text for `.pdf` and `.docx` files, or the literal
        string "Unsupported document type" for anything else.
    """
    # Compare case-insensitively so names such as "INVOICE.PDF" are accepted.
    lowered_name = filename.lower()
    if lowered_name.endswith('.pdf'):
        return await extract_text_from_pdf(data)
    if lowered_name.endswith('.docx'):
        return await extract_text_from_docx(data)
    # Add handling for other document types if needed
    return "Unsupported document type"
29
-
 
 
1
  import PyPDF2
2
  from docx import Document
3
  import io
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from typing_extensions import Concatenate
7
+ from typing import List
8
+ # from langchain_community.llms import OpenAI
9
+ from langchain_community.callbacks import get_openai_callback
10
+ from langchain.output_parsers import PydanticOutputParser
11
+ from langchain.prompts import PromptTemplate
12
+ from langchain_core.pydantic_v1 import BaseModel, Field, validator
13
+ import os
14
+ import logging
15
+ import base64
16
+ from langchain_openai import OpenAI
17
 
18
# Read the OpenAI API key from the environment (None when the variable is unset).
api_key = os.environ.get('OPENAI_API_KEY')
20
+
21
class Candidate(BaseModel):
    # Schema of the entities extracted from a receipt. It is handed to
    # PydanticOutputParser, so every field description below ends up in the
    # prompt sent to the model.
    # NOTE(review): kept as `#` comments rather than a class docstring because
    # pydantic may surface a docstring as the schema description, which would
    # change the generated format instructions -- confirm before converting.

    # Brand that issued the receipt.
    brand: str = Field(
        description=(
            "Please provide the name of the brand from which the receipt is. "
            "If there are multiple brand names present, please specify the most prominent or relevant brand associated with the receipt. "
            "If the brand name is not explicitly stated, provide any context or details that might help identify it accurately. "
            "If the information is not present or unclear, mark the value as Unknown Brand."
        )
    )
    # Total order value as printed on the receipt.
    total_cost: str = Field(
        description=(
            "For example, you can specify the total order value section on the receipt, such as ‘Total’ or ‘Amount Due’, and describe where it's typically located (e.g., bottom right corner). "
            "Additionally, mention any common variations in labeling that might indicate the total order value, such as ‘Grand Total’,’Total Amount’, or ‘Balance Due’."
        )
    )
    # City/state of purchase (booking origin for travel receipts).
    location: str = Field(
        description=(
            "Please provide the city and state where the purchase was made. "
            "If the receipt is related to travel, extract the location from where the booking was made, not the destination. "
            "Look for details such as the booking origin or departure city/state. "
            "If the purchase location is not explicitly stated, provide any relevant clues or context from the receipt to help identify it accurately. "
            "If the information is not present or unclear, mark the value as Unknown Location."
            "For travel receipts, consider extracting details such as the departure airport code, departure city, or booking location mentioned in the itinerary or booking confirmation section. "
            "This location typically indicates where the purchase was initiated."
        )
    )
    # Item count, as a string.
    no_of_items: str = Field(
        description=(
            "Number of items in the order."
            "If the information is not present mark the value as 0."
        )
    )
    # One of the fixed purchase categories named in the description.
    purchase_category: str = Field(
        description=(
            "The purchase category among fashion, home, travel, food, groceries, hotels, spa, insurance and others."
            "If the information is not present mark the value as Unknown Category."
        )
    )
    # Purchase date; the capitalised name is the JSON key requested in the prompt.
    Date: str = Field(
        description=(
            "Date of purchase."
            "If the information is not present mark the value as Unknown Date. "
            "Make sure the date format is in dd-MM-yyyy."
        )
    )
28
+
29
+
30
+ # async def initialize_openai():
31
+ # model_name = "gpt-3.5-turbo-instruct"
32
+ # # model_name = "text-davinci-003"
33
+ # temperature = 0.0
34
+ # model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
35
+
36
def strcuture_document_data(raw_text: str) -> dict:
    """Extract structured receipt fields from raw text using an OpenAI completion model.

    Builds a prompt from `raw_text` and the format instructions derived from
    the `Candidate` schema, sends it to the model and parses the reply.

    NOTE(review): the function name is misspelled ("strcuture"); it is kept
    unchanged so existing callers keep working.

    Args:
        raw_text: Text extracted from the receipt document.

    Returns:
        A dict holding the `Candidate` fields (brand, total_cost, location,
        no_of_items, purchase_category, Date). An empty dict is returned when
        `raw_text` is empty/blank or when any step raises; the exception is
        logged with its traceback instead of being propagated.
    """
    try:
        if not raw_text or not raw_text.strip():
            # Nothing to extract from: skip the model call entirely.
            return {}

        # Debug level only: the raw text is the full document content.
        logging.debug("Raw document text: %s", raw_text)

        model = OpenAI(
            model_name="gpt-3.5-turbo-instruct",
            temperature=0.0,
            max_tokens=800,
        )
        doc_query = "Return only a json which contains important entities from the given receipt.Also json should contain only these keys - brand,total_cost,location,no_of_items,purchase_category,Date. Make sure the date format is dd-MM-yyyy in the json. Response: " + raw_text
        parser = PydanticOutputParser(pydantic_object=Candidate)

        prompt = PromptTemplate(
            template="Answer the user query.\n{format_instructions}\n{query}\n",
            input_variables=["query"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )
        # Named `prompt_text` rather than `input` to avoid shadowing the builtin.
        prompt_text = prompt.format_prompt(query=doc_query).to_string()

        with get_openai_callback():
            # `invoke` is the Runnable entry point; calling the model object
            # directly is deprecated in current LangChain releases.
            result = model.invoke(prompt_text)
        logging.info(result)

        # `.dict()` returns a fresh dict instead of exposing the model's
        # internal `__dict__`.
        return parser.parse(result).dict()
    except Exception:
        # Best-effort contract: callers receive {} on any failure, but the
        # traceback is preserved in the log.
        logging.exception("Error occurred while structuring document data")
        return {}
69
+
70
+
71
+ def extract_text_from_pdf(pdf_data):
72
  with io.BytesIO(pdf_data) as pdf_file:
73
  pdf_reader = PyPDF2.PdfReader(pdf_file)
74
  text = ""
 
77
  text += page.extract_text()
78
  return text
79
 
80
def extract_text_from_docx(docx_data):
    """Return the text of every paragraph in a .docx payload.

    Args:
        docx_data: Raw bytes of the .docx file; wrapped in a `BytesIO`
            before being handed to `Document`.

    Returns:
        The paragraph texts in document order, each followed by a newline.
    """
    doc = Document(io.BytesIO(docx_data))
    # One join instead of repeated `+=` on an ever-growing string.
    return "".join(para.text + "\n" for para in doc.paragraphs)
86
 
87
def extract_text_from_attachment(filename, data):
    """Decode a base64 attachment and extract its text, choosing the extractor by extension.

    Args:
        filename: Attachment file name; only its extension is inspected.
        data: URL-safe base64-encoded attachment payload. It is decoded only
            when the extension is supported.

    Returns:
        The extracted text for `.pdf` and `.docx` files, or the literal
        string "Unsupported document type" for anything else.
    """
    # Compare case-insensitively so names such as "INVOICE.PDF" are accepted.
    lowered_name = filename.lower()
    if lowered_name.endswith('.pdf'):
        return extract_text_from_pdf(base64.urlsafe_b64decode(data))
    if lowered_name.endswith('.docx'):
        return extract_text_from_docx(base64.urlsafe_b64decode(data))
    # Add handling for other document types if needed
    return "Unsupported document type"
95
+
96
+