Spaces: Running on Zero
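Below is a minimal app.py for a ZeroGPU ("Zero") Space serving Llama 3.2 11B Vision Instruct through a Gradio interface. Two things are specific to running on Zero: the GPU is allocated per call, so the inference function must be decorated with @spaces.GPU, and the Llama checkpoint is gated, so the Space needs a HUGGING_FACE_HUB_TOKEN secret.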
import gradio as gr
import spaces
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Hugging Face token
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if not hf_token:
    raise ValueError("HUGGING_FACE_HUB_TOKEN not found.")

# Model
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)

@spaces.GPU  # ZeroGPU: request a GPU for the duration of this call
def predict(image, text):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": text}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # add_special_tokens=False: the chat template has already inserted them
    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=250)
    response = processor.decode(outputs[0], skip_special_tokens=True)
    # Keep only the model's reply: everything after the first "assistant" header
    response = response.split("assistant", 1)[1].strip()
    return f"\n{response}"

# Example photos and prompts (image files are expected alongside app.py)
examples = [
    ["Cowboy Hat.jpg", "Describe the photo"],
    ["Kynda Coffee.jpg", "Search for the business name on his t-shirt to get a description of where the person is."],
    ["Norway.jpg", "Where is this person?"],
]

# Gradio (examples must be a list of per-input values; file paths work for image examples)
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),
        gr.Textbox(label="Text Input"),
    ],
    outputs=gr.Textbox(label="Output"),
    title="Llama 3.2 11B Vision Instruct Chat",
    description="Image + text chat.",
    examples=examples,
)

interface.launch()
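
ZeroGPU releases the GPU as soon as the decorated function returns. If a call needs more than the default GPU window, the spaces.GPU decorator also accepts a duration argument; a sketch reusing the predict function above (the 120-second value is illustrative, not from the original code):

@spaces.GPU(duration=120)  # ask ZeroGPU for a longer window on each call
def predict(image, text):
    ...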