import gradio as gr
import spaces
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration
from PIL import Image
# Hugging Face token
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if not hf_token:
    raise ValueError("HUGGING_FACE_HUB_TOKEN not found.")
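# Note: on Hugging Face Spaces this token is typically stored as a repository
# secret named HUGGING_FACE_HUB_TOKEN (the Llama 3.2 models are gated).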
# Model
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
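# Rough sizing note: the 11B model's weights take ~22 GB in bfloat16, so an
# A100/A10G-class GPU is assumed; device_map="auto" lets accelerate place
# modules across whatever devices are available.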
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)  # use_auth_token is deprecated in favor of token
@spaces.GPU
def predict(image, text):
    # Build a single-turn chat containing one image plus the user's prompt
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": text}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # add_special_tokens=False: the chat template has already inserted them
    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=250)
    response = processor.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the first "assistant" marker; [-1] avoids an
    # IndexError if the marker is missing from the decoded output
    response = response.split("assistant", 1)[-1].strip()
    return f"\n{response}"
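# Quick local smoke test (a sketch, assuming the example images below exist
# next to this file):
#   print(predict(Image.open("Norway.jpg"), "Where is this person?"))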
# Example photos and prompts
examples = [
    {"image": "Cowboy Hat.jpg", "text": "Describe the photo"},
    {"image": "Kynda Coffee.jpg", "text": "Search for the business name on his t-shirt to get a description of where the person is."},
    {"image": "Norway.jpg", "text": "Where is this person?"}
]
# Load example images
example_images = [Image.open(example["image"]) for example in examples]
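# Note: Gradio also accepts file paths directly in `examples`; pre-loading with
# PIL here just keeps the example type aligned with gr.Image(type="pil").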
# Gradio
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),
        gr.Textbox(label="Text Input")
    ],
    outputs=gr.Textbox(label="Output"),
    title="Llama 3.2 11B Vision Instruct Chat",
    description="Image + text chat.",
    # gr.Interface expects examples as a list of [input1, input2, ...] rows, not dicts
    examples=[[image, example["text"]] for image, example in zip(example_images, examples)]
)
interface.launch()