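# Hugging Face Space: ask Llama 3.2 Vision whether an uploaded image contains a
# data logger, returning both the raw model output and the parsed JSON verdict.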
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForPreTraining, BitsAndBytesConfig
import gradio as gr
import json
import os
import re
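
# Load the processor and the 4-bit-quantized Llama 3.2 Vision model from the Hub.
# HUGGINGFACE_TOKEN (an access token for the gated checkpoint, e.g. set as a Space
# secret) must be available in the environment.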
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
token = os.getenv("HUGGINGFACE_TOKEN").strip()

processor = AutoProcessor.from_pretrained(model_name, token=token)
model = AutoModelForPreTraining.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",  # 4-bit bitsandbytes models can't be moved with .to(); let accelerate place them on the GPU
    token=token,
)
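

# Run one image/prompt pair through the model and return
# (raw decoded text, parsed JSON verdict).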
def analyze_image(image, prompt):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100)
    full_response = processor.decode(output[0])
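    # The decoded output still contains the chat template and the prompt, and the
    # prompt itself shows two example JSON objects, so take the *last* JSON-looking match.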
    try:
        # Find all JSON-like structures in the response
        json_matches = list(re.finditer(r'\{.*?\}', full_response, re.DOTALL))
        if json_matches:
            # Take the last match
            last_json_str = json_matches[-1].group(0)
            try:
                processed_json = json.loads(last_json_str)
            except json.JSONDecodeError as e:
                processed_json = {"error": f"Invalid JSON in model output: {e}", "full_response": full_response}
        else:
            processed_json = {"error": "No JSON found in model output", "full_response": full_response}
    except Exception as e:
        processed_json = {"error": str(e), "full_response": full_response}
    return full_response, processed_json
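

# Default prompt: asks the model to answer with a single JSON object
# ({"present": ..., "reason": ...}) that the parsing above can extract.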
default_prompt = """Analyze this image and determine if it contains a data logger. A data logger is typically a small, black electronic device used to monitor and record data over time, such as voltage, temperature, or current, via external sensors.
Carefully examine the image and provide a detailed response. If a data logger is present in the image, respond with:
{"present": true, "reason": "Detailed explanation of why you believe it's a data logger, including specific visual cues you've identified"}
If no data logger is visible, respond with:
{"present": false, "reason": "Detailed explanation of why you believe there's no data logger, describing what you see instead"}
Ensure your response is in valid JSON format."""
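
# Optional local sanity check using the bundled example image (a minimal sketch;
# uncomment to try the pipeline without the UI):
# raw, parsed = analyze_image(Image.open("bad.png"), default_prompt)
# print(parsed)

# Gradio UI: image upload plus an editable prompt in; raw model text and parsed JSON out.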
iface = gr.Interface(
    fn=analyze_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Prompt", value=default_prompt, lines=10)
    ],
    outputs=[
        gr.Textbox(label="Full Response", lines=10),
        gr.JSON(label="Processed JSON")
    ],
    title="Llama 3.2 Vision",
    cache_examples=False,
    description="Upload an image and customize the prompt to check if it contains a data logger.",
    examples=[
        ["bad.png", default_prompt]
    ]
)

iface.launch()