shivanis14
commited on
Commit
•
6d14138
1
Parent(s):
93f9e41
Create data_extractor
Browse files- data_extractor +231 -0
data_extractor
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
import pymongo
|
4 |
+
import json
|
5 |
+
from flask import Flask, request, jsonify
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from PIL import Image
|
8 |
+
import io
|
9 |
+
import re
|
10 |
+
from bson import ObjectId
|
11 |
+
# Load environment variables from .env file
|
12 |
+
#load_dotenv()
|
13 |
+
|
14 |
+
# Initialize Flask app
|
15 |
+
app = Flask(__name__)

# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# MongoDB connection. The connection string can be supplied through the
# MONGODB_URI environment variable; the literal below is only the legacy
# fallback used when that variable is not set.
# NOTE(review): the fallback embeds database credentials in source control —
# rotate them and drop the fallback once every deployment sets MONGODB_URI.
client = pymongo.MongoClient(
    os.getenv(
        "MONGODB_URI",
        "mongodb+srv://consumewise_db:p123%[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
    )
)
db = client.consumeWise
collection = db.products
|
24 |
+
|
25 |
+
# Instruction text sent as the leading text part of the user message in
# extract_information(); it describes which label fields the model should
# read from the product images. The text is runtime data: editing it changes
# what is sent to the OpenAI API.
label_reader_prompt = """
You will be provided with a set of images corresponding to a single product. These images are found printed on the packaging of the product.
Your goal will be to extract information from these images to populate the schema provided. Here is some information you will routinely encounter. Ensure that you capture complete information, especially for nutritional information and ingredients:
- Ingredients: List of ingredients in the item. They may have some percent listed in brackets. They may also have metadata or classification like Preservative (INS 211) where INS 211 forms the metadata. Structure accordingly. If ingredients have subingredients like sugar: added sugar, trans sugar, treat them as different ingredients.
- Claims: Like a mango fruit juice says contains fruit.
- Nutritional Information: This will have nutrients, serving size, and nutrients listed per serving. Extract the base value for reference.
- FSSAI License number: Extract the license number. There might be many, so store relevant ones.
- Name: Extract the name of the product.
- Brand/Manufactured By: Extract the parent company of this product.
- Serving size: This might be explicitly stated or inferred from the nutrients per serving.
"""
|
37 |
+
|
38 |
+
# Function to extract information from image URLs
|
39 |
+
def extract_information(image_links):
    """Send the label prompt and product images to OpenAI; return its first choice.

    Args:
        image_links: Iterable of image URLs for a single product. Each URL is
            forwarded to the model as an ``image_url`` content part.

    Returns:
        ``response['choices'][0]`` from the chat-completion call, unmodified.
    """
    print("in extract_information")

    # One content part per image, in the order the links were given.
    image_parts = []
    for link in image_links:
        image_parts.append({"type": "image_url", "image_url": {"url": link}})

    def quantity_with_unit():
        # Shape shared by "servingSize" and "packagingSize".
        return {
            "type": "object",
            "properties": {
                "quantity": {"type": "number"},
                "unit": {"type": "string"},
            },
            "required": ["quantity", "unit"],
            "additionalProperties": False
        }

    ingredient_item = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "percent": {"type": "string"},
            "metadata": {"type": "string"},
        },
        "required": ["name", "percent", "metadata"],
        "additionalProperties": False
    }

    nutrient_value_item = {
        "type": "object",
        "properties": {
            "base": {"type": "string"},
            "value": {"type": "number"},
        },
        "required": ["base", "value"],
        "additionalProperties": False
    }

    nutrient_item = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "unit": {"type": "string"},
            "values": {
                "type": "array",
                "items": nutrient_value_item
            },
        },
        "required": ["name", "unit", "values"],
        "additionalProperties": False
    }

    label_schema = {
        "type": "object",
        "properties": {
            "productName": {"type": "string"},
            "brandName": {"type": "string"},
            "ingredients": {
                "type": "array",
                "items": ingredient_item
            },
            "servingSize": quantity_with_unit(),
            "packagingSize": quantity_with_unit(),
            "servingsPerPack": {"type": "number"},
            "nutritionalInformation": {
                "type": "array",
                "items": nutrient_item,
                "additionalProperties": True,
            },
            "fssaiLicenseNumbers": {"type": "array", "items": {"type": "number"}},
            "claims": {"type": "array", "items": {"type": "string"}},
            "shelfLife": {"type": "string"},
        },
        "required": [
            "productName", "brandName", "ingredients", "servingSize",
            "packagingSize", "servingsPerPack", "nutritionalInformation",
            "fssaiLicenseNumbers", "claims", "shelfLife"
        ],
        "additionalProperties": False
    }

    # Single user message: the instruction text followed by the images.
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": label_reader_prompt}] + image_parts,
    }

    # Send the request to OpenAI API with the images and prompt
    response = openai.ChatCompletion.create(
        model="gpt-4o-2024-08-06",
        messages=[user_message],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "label_reader",
                "schema": label_schema,
                "strict": True
            }
        }
    )

    # Only the first choice is handed back to the caller.
    return response['choices'][0]
|
137 |
+
|
138 |
+
# Route to accept image URLs and return extracted JSON data
|
139 |
+
@app.route("/extract", methods=["POST"])
def extract_data():
    """Extract label data from the posted image URLs and store it in MongoDB.

    Expects a JSON object body with an ``image_links`` entry, which is passed
    to ``extract_information``.

    Returns:
        A ``(response, status)`` pair:

        * 200 with the model's JSON text when extraction succeeds; the parsed
          document is inserted into ``collection`` first.
        * 400 when the body is missing, is not a JSON object, or carries no
          ``image_links``.
        * 500 when the model's choice has no message or reports a refusal, or
          when any unexpected error occurs (its text is returned in ``error``).
    """
    try:
        # silent=True yields None for a missing, non-JSON or malformed body
        # instead of raising, so bad requests are answered with 400 rather
        # than falling into the generic 500 handler below.
        data = request.get_json(silent=True)
        image_links = data.get('image_links') if isinstance(data, dict) else None

        if not image_links:
            return jsonify({"error": "No image URLs provided"}), 400

        # Call the extraction function
        extracted_data = extract_information(image_links)
        print("extracted data called")

        if 'message' not in extracted_data or extracted_data['message'].get('refusal'):
            return jsonify({"error": "Failed to extract information"}), 500

        text_ans = extracted_data['message']['content']
        ans = json.loads(text_ans)

        # Store in MongoDB
        print("if condition")
        collection.insert_one(ans)

        # NOTE(review): the response body is the raw JSON *text*, so clients
        # receive a JSON-encoded string rather than an object. Kept as-is for
        # compatibility with existing callers.
        return jsonify(text_ans), 200

    except Exception as error:
        return jsonify({"error": str(error)}), 500
|
165 |
+
|
166 |
+
|
167 |
+
@app.route("/find_product", methods=["GET"])
def find_product():
    """List names of stored products that match the ``name`` query parameter.

    The name is split on whitespace and searched case-insensitively as:
    every leading phrase of two or more words (shortest first), followed by
    each individual word. Matching product names are returned once each, in
    the order they were first found.

    Returns:
        A ``(response, status)`` pair:

        * 200 with ``{"products": [...]}`` when at least one name matched.
        * 404 when nothing matched.
        * 400 when ``name`` is missing or empty.
        * 500 on any unexpected error (its text is returned in ``error``).
    """
    try:
        product_name = request.args.get('name')

        if not product_name:
            return jsonify({"error": "Please provide a valid product name or id"}), 400

        # Split the input product name into words
        words = product_name.split()
        leading_phrases = [' '.join(words[:i]) for i in range(2, len(words) + 1)]
        list_names = leading_phrases + words

        product_list = []
        for term in list_names:
            # The term comes straight from the query string: escape it so it is
            # matched literally. Unescaped, input such as "c++" is an invalid
            # pattern (500) and arbitrary patterns could be injected.
            pattern = re.compile(re.escape(term), re.IGNORECASE)
            for product in collection.find({"productName": {"$regex": pattern}}):
                if product['productName'] not in product_list:
                    product_list.append(product['productName'])

        if product_list:
            return jsonify({"products": product_list}), 200
        return jsonify({"message": "No products found"}), 404

    except Exception as error:
        return jsonify({"error": str(error)}), 500
|
203 |
+
|
204 |
+
# Route to get product information by product name or _id
|
205 |
+
@app.route("/product", methods=["GET"])
def get_product():
    """Return the stored product whose ``productName`` equals the ``name`` parameter.

    Only lookup by exact product name is implemented here.

    Returns:
        A ``(response, status)`` pair:

        * 200 with the product document (``_id`` converted to a string).
        * 404 when no document has that ``productName``.
        * 400 when ``name`` is missing or empty.
        * 500 on any unexpected error (its text is returned in ``error``).
    """
    try:
        product_name = request.args.get('name')

        if not product_name:
            return jsonify({"error": "Please provide a valid product name or id"}), 400

        product = collection.find_one({"productName": product_name})

        if not product:
            print("Product not found.")
            return jsonify({"error": "Product not found"}), 404

        # Convert ObjectId to string for JSON response
        product['_id'] = str(product['_id'])

        # json.dumps raises TypeError on values it cannot encode; default=str
        # ensures this debug dump on its own can never fail the request.
        print(f"Found product: {json.dumps(product, indent=4, default=str)}")
        return jsonify(product), 200

    except Exception as error:
        return jsonify({"error": str(error)}), 500
|
229 |
+
# Main function to run Flask app
|
230 |
+
if __name__ == "__main__":
    # Flask's debug mode enables the interactive debugger, which must not be
    # reachable on a deployed server. It is therefore opt-in: set
    # FLASK_DEBUG=1 (or true/yes/on) for local development.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)
|