shivanis14 committed on
Commit
6d14138
1 Parent(s): 93f9e41

Create data_extractor

Browse files
Files changed (1) hide show
  1. data_extractor +231 -0
data_extractor ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ import pymongo
4
+ import json
5
+ from flask import Flask, request, jsonify
6
+ from dotenv import load_dotenv
7
+ from PIL import Image
8
+ import io
9
+ import re
10
+ from bson import ObjectId
11
# Load environment variables (OPENAI_API_KEY, MONGODB_URI) from a local .env
# file when one exists. Variables already present in the environment win.
load_dotenv()

# Initialize Flask app
app = Flask(__name__)

# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# MongoDB connection.
# The connection string carries credentials, so it is read from the environment
# instead of being committed to the repository. Fail fast with a clear message
# rather than letting the driver fall back to a default host.
MONGODB_URI = os.getenv("MONGODB_URI")
if not MONGODB_URI:
    raise RuntimeError("MONGODB_URI environment variable is not set")
client = pymongo.MongoClient(MONGODB_URI)
db = client.consumeWise
collection = db.products

# Prompt sent alongside the product images; it describes the label fields the
# model is expected to extract into the JSON schema.
label_reader_prompt = """
You will be provided with a set of images corresponding to a single product. These images are found printed on the packaging of the product.
Your goal will be to extract information from these images to populate the schema provided. Here is some information you will routinely encounter. Ensure that you capture complete information, especially for nutritional information and ingredients:
- Ingredients: List of ingredients in the item. They may have some percent listed in brackets. They may also have metadata or classification like Preservative (INS 211) where INS 211 forms the metadata. Structure accordingly. If ingredients have subingredients like sugar: added sugar, trans sugar, treat them as different ingredients.
- Claims: Like a mango fruit juice says contains fruit.
- Nutritional Information: This will have nutrients, serving size, and nutrients listed per serving. Extract the base value for reference.
- FSSAI License number: Extract the license number. There might be many, so store relevant ones.
- Name: Extract the name of the product.
- Brand/Manufactured By: Extract the parent company of this product.
- Serving size: This might be explicitly stated or inferred from the nutrients per serving.
"""
37
+
38
def extract_information(image_links):
    """Send the label images and the extraction prompt to the OpenAI chat API.

    Args:
        image_links: Iterable of image URLs; each one becomes an ``image_url``
            content part of the single user message.

    Returns:
        The first entry of the ``choices`` list in the API response.
    """
    print("in extract_information")

    # One image content part per URL, appended after the text prompt.
    image_parts = []
    for link in image_links:
        image_parts.append({"type": "image_url", "image_url": {"url": link}})

    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": label_reader_prompt}] + image_parts,
    }

    def quantity_with_unit():
        # Shared shape of servingSize and packagingSize; a fresh dict per use.
        return {
            "type": "object",
            "properties": {
                "quantity": {"type": "number"},
                "unit": {"type": "string"},
            },
            "required": ["quantity", "unit"],
            "additionalProperties": False,
        }

    ingredient_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "percent": {"type": "string"},
            "metadata": {"type": "string"},
        },
        "required": ["name", "percent", "metadata"],
        "additionalProperties": False,
    }

    nutrient_value_schema = {
        "type": "object",
        "properties": {
            "base": {"type": "string"},
            "value": {"type": "number"},
        },
        "required": ["base", "value"],
        "additionalProperties": False,
    }

    nutrient_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "unit": {"type": "string"},
            "values": {"type": "array", "items": nutrient_value_schema},
        },
        "required": ["name", "unit", "values"],
        "additionalProperties": False,
    }

    label_schema = {
        "type": "object",
        "properties": {
            "productName": {"type": "string"},
            "brandName": {"type": "string"},
            "ingredients": {"type": "array", "items": ingredient_schema},
            "servingSize": quantity_with_unit(),
            "packagingSize": quantity_with_unit(),
            "servingsPerPack": {"type": "number"},
            "nutritionalInformation": {
                "type": "array",
                "items": nutrient_schema,
                "additionalProperties": True,
            },
            "fssaiLicenseNumbers": {"type": "array", "items": {"type": "number"}},
            "claims": {"type": "array", "items": {"type": "string"}},
            "shelfLife": {"type": "string"},
        },
        "required": [
            "productName", "brandName", "ingredients", "servingSize",
            "packagingSize", "servingsPerPack", "nutritionalInformation",
            "fssaiLicenseNumbers", "claims", "shelfLife",
        ],
        "additionalProperties": False,
    }

    # Request a reply constrained to the label schema.
    response = openai.ChatCompletion.create(
        model="gpt-4o-2024-08-06",
        messages=[user_message],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "label_reader",
                "schema": label_schema,
                "strict": True,
            },
        },
    )

    return response['choices'][0]
137
+
138
# Route to accept image URLs and return extracted JSON data
@app.route("/extract", methods=["POST"])
def extract_data():
    """Extract label data from the posted image URLs and store it in MongoDB.

    Expects a JSON object body with an ``image_links`` list of URL strings.

    Returns:
        200 with the model's JSON text (serialized by ``jsonify`` as a JSON
        string, kept as-is for existing clients) after the parsed document is
        inserted into ``collection``;
        400 when the body is missing/not JSON or ``image_links`` is absent,
        empty or not a list of strings;
        500 when the model refuses or returns no message, or on any
        unexpected error.
    """
    # silent=True returns None for a missing, malformed or non-JSON body
    # instead of raising, so client mistakes are answered with 400, not 500.
    payload = request.get_json(silent=True)
    image_links = payload.get('image_links') if isinstance(payload, dict) else None

    if not image_links:
        return jsonify({"error": "No image URLs provided"}), 400

    # A bare string would otherwise be iterated character by character.
    if not isinstance(image_links, list) or not all(
        isinstance(link, str) and link for link in image_links
    ):
        return jsonify({"error": "image_links must be a list of image URL strings"}), 400

    try:
        # Call the extraction function
        extracted_data = extract_information(image_links)
        print("extracted data called")

        if 'message' not in extracted_data or extracted_data['message'].get('refusal'):
            return jsonify({"error": "Failed to extract information"}), 500

        text_ans = extracted_data['message']['content']
        ans = json.loads(text_ans)

        # Store in MongoDB. insert_one adds an ObjectId ``_id`` to ``ans``,
        # which is why the original text, not ``ans``, is returned.
        print("if condition")
        collection.insert_one(ans)
        return jsonify(text_ans), 200

    except Exception as error:
        return jsonify({"error": str(error)}), 500
165
+
166
+
167
@app.route("/find_product", methods=["GET"])
def find_product():
    """Return names of stored products that loosely match the ``name`` query.

    The query is split on whitespace. Every leading phrase of two or more
    words is searched first, then every single word. Each term is matched
    case-insensitively as a literal substring of ``productName``.

    Returns:
        200 with ``{"products": [...]}`` (unique names, in first-match order);
        404 when nothing matches;
        400 when ``name`` is missing or empty;
        500 on any unexpected error.
    """
    try:
        product_name = request.args.get('name')
        if not product_name:
            return jsonify({"error": "Please provide a valid product name or id"}), 400

        # Split the input product name into words
        words = product_name.split()
        leading_phrases = [' '.join(words[:i]) for i in range(2, len(words) + 1)]
        # dict.fromkeys drops repeated terms while keeping their order, so the
        # same query is never sent twice.
        search_terms = list(dict.fromkeys(leading_phrases + words))

        product_list = []
        for term in search_terms:
            # re.escape: the term is user input and must match literally.
            # Unescaped metacharacters could raise re.error or change the
            # meaning of the search.
            pattern = re.compile(re.escape(term), re.IGNORECASE)
            for product in collection.find({"productName": {"$regex": pattern}}):
                if product['productName'] not in product_list:
                    product_list.append(product['productName'])

        if product_list:
            return jsonify({"products": product_list}), 200
        return jsonify({"message": "No products found"}), 404
    except Exception as error:
        return jsonify({"error": str(error)}), 500
203
+
204
# Route to get product information by exact product name
@app.route("/product", methods=["GET"])
def get_product():
    """Look up one product whose ``productName`` equals the ``name`` query.

    Returns:
        200 with the stored document (its ``_id`` converted to a string);
        400 when ``name`` is missing or empty;
        404 when no document matches;
        500 on any unexpected error.
    """
    try:
        requested_name = request.args.get('name')
        if not requested_name:
            return jsonify({"error": "Please provide a valid product name or id"}), 400

        match = collection.find_one({"productName": requested_name})
        if not match:
            print("Product not found.")
            return jsonify({"error": "Product not found"}), 404

        # Convert ObjectId to string for JSON response
        match['_id'] = str(match['_id'])
        print(f"Found product: {json.dumps(match, indent=4)}")
        return jsonify(match), 200
    except Exception as error:
        return jsonify({"error": str(error)}), 500
229
# Main function to run Flask app
if __name__ == "__main__":
    # Debug mode exposes the interactive Werkzeug debugger, which can execute
    # arbitrary code, so it is opt-in via FLASK_DEBUG=1 (or "true") rather
    # than always on.
    app.run(debug=os.getenv("FLASK_DEBUG", "").lower() in ("1", "true"))