File size: 9,073 Bytes
6d14138 226d230 6d14138 e4e94ef 6d14138 e4e94ef 205afd4 6d14138 8914c30 074e309 6d14138 24757a7 205afd4 6d14138 205afd4 6d14138 fd29cb3 e577340 6d14138 fd29cb3 f5c8650 6d14138 f5c8650 e577340 6d14138 205afd4 6d14138 205afd4 6d14138 205afd4 6d14138 0c1c9f5 6d14138 0c1c9f5 6d14138 205afd4 6d14138 205afd4 53cbac4 6d14138 205afd4 6d14138 205afd4 6d14138 205afd4 6d14138 53cbac4 6d14138 205afd4 53cbac4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import os
import pymongo
import json
from PIL import Image
import io
import re
from bson import ObjectId
from openai import OpenAI
# Set OpenAI Client
# The API key is read from the OPENAI_API_KEY environment variable.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# MongoDB connection
# The connection string is taken from the MONGODB_URI environment variable
# when it is set and non-empty; otherwise the historical literal is used so
# existing deployments keep working.
# NOTE(review): the fallback embeds database credentials in source control.
# Rotate that password and remove the literal once MONGODB_URI is configured
# everywhere.
# NOTE(review): the host part of the literal ("[email protected]") looks
# mangled — confirm it against the real deployment value.
_DEFAULT_MONGODB_URI = "mongodb+srv://consumewise_db:p123%[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
client = pymongo.MongoClient(os.getenv("MONGODB_URI") or _DEFAULT_MONGODB_URI)
db = client.consumeWise
collection = db.products
# Prompt text passed to the OpenAI API together with the product images.
# It lists the label fields the model should extract: ingredients, claims,
# nutritional information, FSSAI license numbers, product name, brand /
# manufacturer and serving size.
# NOTE: this is a runtime string; any edit changes what the model is asked.
label_reader_prompt = """
You will be provided with a set of images corresponding to a single product. These images are found printed on the packaging of the product.
Your goal will be to extract information from these images to populate the schema provided. Here is some information you will routinely encounter. Ensure that you capture complete information, especially for nutritional information and ingredients:
- Ingredients: List of ingredients in the item. They may have some percent listed in brackets. They may also have metadata or classification like Preservative (INS 211) where INS 211 forms the metadata. Structure accordingly. If ingredients have subingredients like sugar: added sugar, trans sugar, treat them as different ingredients.
- Claims: Like a mango fruit juice says contains fruit.
- Nutritional Information: This will have nutrients, serving size, and nutrients listed per serving. Extract the base value for reference.
- FSSAI License number: Extract the license number. There might be many, so store relevant ones.
- Name: Extract the name of the product.
- Brand/Manufactured By: Extract the parent company of this product.
- Serving size: This might be explicitly stated or inferred from the nutrients per serving.
"""
# Function to extract information from image URLs
def extract_information(image_links):
    """Ask the OpenAI chat completions API to read product label images.

    Sends ``label_reader_prompt`` followed by one ``image_url`` entry per item
    of *image_links* in a single user message, and requests a response that
    follows the strict ``label_reader`` JSON schema built below.

    Returns the ``content`` attribute of the first choice's message.
    """
    print("in extract_information")

    def quantity_with_unit():
        # Shape used by both "servingSize" and "packagingSize"; a fresh dict
        # is built on every call so the two schema entries stay independent.
        return {
            "type": "object",
            "properties": {
                "quantity": {"type": "number"},
                "unit": {"type": "string"},
            },
            "required": ["quantity", "unit"],
            "additionalProperties": False,
        }

    # One image part per link, placed after the text prompt in the message.
    image_parts = []
    for link in image_links:
        image_parts.append({"type": "image_url", "image_url": {"url": link}})
    user_content = [{"type": "text", "text": label_reader_prompt}] + image_parts

    # Sub-schemas are named individually; key order mirrors the original
    # literal so the serialized schema is unchanged.
    ingredient_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "percent": {"type": "string"},
            "metadata": {"type": "string"},
        },
        "required": ["name", "percent", "metadata"],
        "additionalProperties": False,
    }

    nutrient_value_schema = {
        "type": "object",
        "properties": {
            "base": {"type": "string"},
            "value": {"type": "number"},
        },
        "required": ["base", "value"],
        "additionalProperties": False,
    }

    nutrient_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "unit": {"type": "string"},
            "values": {"type": "array", "items": nutrient_value_schema},
        },
        "required": ["name", "unit", "values"],
        "additionalProperties": False,
    }

    label_schema = {
        "type": "object",
        "properties": {
            "productName": {"type": "string"},
            "brandName": {"type": "string"},
            "ingredients": {"type": "array", "items": ingredient_schema},
            "servingSize": quantity_with_unit(),
            "packagingSize": quantity_with_unit(),
            "servingsPerPack": {"type": "number"},
            "nutritionalInformation": {
                "type": "array",
                "items": nutrient_schema,
                "additionalProperties": True,
            },
            "fssaiLicenseNumbers": {"type": "array", "items": {"type": "number"}},
            "claims": {"type": "array", "items": {"type": "string"}},
            "shelfLife": {"type": "string"},
        },
        "required": [
            "productName", "brandName", "ingredients", "servingSize",
            "packagingSize", "servingsPerPack", "nutritionalInformation",
            "fssaiLicenseNumbers", "claims", "shelfLife",
        ],
        "additionalProperties": False,
    }

    # Send the request to OpenAI API with the images and prompt
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": user_content}],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "label_reader",
                "schema": label_schema,
                "strict": True,
            },
        },
    )

    # Extract and return the relevant response
    first_choice = response.choices[0]
    return first_choice.message.content
# Extract label data from images and store it
def extract_data(image_links):
    """Run label extraction on *image_links* and save the result to MongoDB.

    Returns the raw JSON string produced by ``extract_information`` on
    success. Returns ``{"error": <message>}`` when no links are given, when
    the extraction yields nothing, or when any exception is raised on the way.
    """
    try:
        if not image_links:
            return {"error": "No image URLs provided"}

        # Call the extraction function
        raw_json = extract_information(image_links)
        print(f"extracted data : {raw_json} ")
        print(f"extracted data : {type(raw_json)} ")

        if not raw_json:
            return {"error": "Failed to extract information"}

        # Store the parsed document in MongoDB; the caller gets the raw string
        collection.insert_one(json.loads(raw_json))
        return raw_json
    except Exception as error:
        return {"error": str(error)}
def find_product(product_name):
    """Search stored products whose ``productName`` contains the given words.

    The name is split on whitespace. Search terms are the leading phrases of
    two or more words (shortest first), followed by each individual word.
    Every term is matched case-insensitively as a literal substring of
    ``productName``; matching names are collected in first-seen order without
    duplicates.

    Returns:
        ``{"products": [...], "message": "Products found"}`` when at least one
        name matched, ``{"products": [], "message": "No products found"}``
        when none did, or ``{"error": <message>}`` for an empty
        *product_name* or when any exception is raised.
    """
    try:
        if not product_name:
            return {"error": "Please provide a valid product name or id"}

        words = product_name.split()
        leading_phrases = [' '.join(words[:i]) for i in range(2, len(words) + 1)]
        # dict.fromkeys drops repeated terms (e.g. a word typed twice) while
        # keeping order, so the same query is not sent more than once.
        search_terms = list(dict.fromkeys(leading_phrases + words))

        product_list = []
        for term in search_terms:
            # The term comes from user input: escape it so characters such as
            # "(", "+" or "." are matched literally instead of being parsed
            # as regex syntax (which could raise or match unintended names).
            pattern = re.compile(re.escape(term), re.IGNORECASE)
            for product in collection.find({"productName": {"$regex": pattern}}):
                name = product['productName']
                if name not in product_list:
                    product_list.append(name)

        if product_list:
            return {"products": product_list, "message": "Products found"}
        return {"products": [], "message": "No products found"}
    except Exception as error:
        return {"error": str(error)}
def get_product(product_name):
    """Fetch one product by exact ``productName`` and return it as JSON text.

    Returns the matching document serialized with ``json.dumps(indent=4)``
    (its ``_id`` converted to a string first). Returns
    ``{"error": <message>}`` when *product_name* is empty, when no document
    matches, or when any exception is raised.
    """
    try:
        if not product_name:
            return {"error": "Please provide a valid product name or id"}

        document = collection.find_one({"productName": product_name})
        if not document:
            print("Product not found.")
            return {"error": "Product not found"}

        # Convert the ObjectId to a string before dumping the document
        document['_id'] = str(document['_id'])
        serialized = json.dumps(document, indent=4)
        print(f"Found product: {serialized}")
        return serialized
    except Exception as error:
        return {"error": str(error)}
|