import os

import gradio as gr
import spaces
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Authenticate with the Hub so the gated Llama 3.2 weights can be downloaded.
# "SECRET_ENV_VARIABLE" is the name of the Space secret that stores the token.
huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
login(huggingface_token)
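# Note: on a Space, the token is added under Settings -> Variables and secrets;
# the name passed to os.getenv() above must match the secret's name exactly.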
# Load the Llama 3.2 Vision model and its processor.
def load_llama_model():
    model_id = "meta-llama/Llama-3.2-11B-Vision"

    # Load model and processor
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",
    )
    model.tie_weights()

    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor
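# Load the model a single time at startup; reloading the 11B checkpoint on
# every request would dominate latency. On ZeroGPU, GPU work still happens
# only inside functions decorated with @spaces.GPU (see process_input below).
model, processor = load_llama_model()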
# Generate a completion for a text prompt and an optional image.
# @spaces.GPU requests a ZeroGPU device for the duration of each call.
@spaces.GPU
def process_input(text: str, image: Image.Image | None = None):
    if image is not None:
        # Gradio delivers the upload as a PIL Image object (type="pil" below).
        vision_input = image.convert("RGB").resize((224, 224))
        # Llama 3.2 Vision expects the <|image|> token ahead of the text.
        prompt = f"<|image|><|begin_of_text|>{text}"

        # Process image and text together
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
    else:
        # Text-only input: pass the prompt by keyword so the processor does
        # not mistake it for an image (its first positional argument is images).
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode the output to return readable text
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
    return decoded_output
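# Quick smoke test, for a REPL rather than app.py itself (assumes the bundled
# ./examples/rococo.jpg exists next to this file):
#
#   from PIL import Image
#   print(process_input("The llama is ", Image.open("./examples/rococo.jpg")))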
# Gradio Interface Setup
def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    # Use type="pil" so process_input receives a PIL Image object
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=5)

    # Two example prompts with bundled images for multimodal analysis
    examples = [
        ["The llama is ", "./examples/rococo.jpg"],
        ["The cute hamster is wearing ", "./examples/weather_events.png"],
    ]

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        examples=examples,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model. You can also try out the provided examples.",
    )

    # Launch the demo
    interface.launch()
# Run the demo
if __name__ == "__main__":
    demo()
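# The Space's requirements.txt would need at least the following (exact pins
# are assumptions, not taken from the original Space):
#   gradio
#   spaces
#   torch
#   transformers>=4.45  # first release with Mllama / Llama 3.2 Vision support
#   accelerate          # required for device_map="auto"
#   huggingface_hub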