# Hugging Face Space: Running on Zero (ZeroGPU)
from __future__ import annotations

import os
import hashlib
import torch
from threading import Thread
from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
import gradio as gr

# Initialize the model and processor
def initialize_model_and_processor():
    model = AutoModel.from_pretrained(
        "OEvortex/HelpingAI-Vision", torch_dtype=torch.float16, trust_remote_code=True
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    processor = AutoProcessor.from_pretrained("OEvortex/HelpingAI-Vision", trust_remote_code=True)
    return model, processor
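
# Note: this Space is labeled "Running on Zero" (ZeroGPU). On that hardware the
# GPU-bound entry point is normally marked with the @spaces.GPU decorator from
# the `spaces` package. The original code does not include it, so the sketch
# below is an assumption about how it would typically be wired, not part of
# this app:
#
#   import spaces
#
#   @spaces.GPU
#   def answer_question(...):
#       ...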
# Process an image and cache the resulting features on disk, keyed by the image
# bytes and the crop/token settings. Relies on the global `model` and `processor`
# initialized below. (Kept from the original code; the Gradio path further down
# computes features via `processor(...)` directly.)
def cached_vision_process(image, max_crops, num_tokens):
    image_hash = hashlib.sha256(image.tobytes()).hexdigest()
    cache_path = f"visual_cache/{image_hash}-{max_crops}-{num_tokens}.pt"
    if os.path.exists(cache_path):
        # Load on CPU first so cached tensors survive device changes between runs
        return torch.load(cache_path, map_location="cpu").to(model.device, dtype=model.dtype)
    processor_outputs = processor.image_processor([image], max_crops)
    pixel_values = [value.to(model.device, model.dtype) for value in processor_outputs["pixel_values"]]
    coords = [value.to(model.device, model.dtype) for value in processor_outputs["coords"]]
    image_outputs = model.vision_model(pixel_values, coords, num_tokens)
    image_features = model.multi_modal_projector(image_outputs)
    os.makedirs("visual_cache", exist_ok=True)
    torch.save(image_features, cache_path)
    return image_features.to(model.device, model.dtype)
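
# A small convenience sketch, not part of the original app: the cache above
# grows without bound, so a helper like this could reclaim disk space. The
# function name and behavior are assumptions for illustration; nothing in the
# app calls it.
def clear_visual_cache(cache_dir="visual_cache"):
    if not os.path.isdir(cache_dir):
        return 0
    removed = 0
    for name in os.listdir(cache_dir):
        if name.endswith(".pt"):
            os.remove(os.path.join(cache_dir, name))
            removed += 1
    return removed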
# Answer a question about an image, streaming tokens as they are generated
def answer_question(image, question, max_crops, num_tokens, sample, temperature, top_k):
    if not question.strip() or image is None:
        # This function is a generator, so the message must be yielded;
        # a plain `return "..."` value would never reach the UI
        yield "Please provide both an image and a question."
        return
    prompt = f"""user
<image>
{question}
assistant
"""
    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
    with torch.inference_mode():
        inputs = processor(prompt, [image], model, max_crops=max_crops, num_tokens=num_tokens)
    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "image_features": inputs["image_features"],
        "streamer": streamer,
        "max_length": 1000,
        "use_cache": True,
        "eos_token_id": processor.tokenizer.eos_token_id,
        "pad_token_id": processor.tokenizer.eos_token_id,
        "temperature": temperature,
        "do_sample": sample,
        "top_k": top_k,
    }
    # Run generation in a background thread so tokens can be streamed as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    output_started = False
    for new_text in streamer:
        # Skip the echoed prompt; real output starts after the "assistant" marker
        if not output_started:
            if "assistant" in new_text:
                output_started = True
            continue
        buffer += new_text
        if len(buffer) > 1:
            yield buffer
    # Emit the final text even if it never cleared the length guard above
    yield buffer
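
# Illustrative only, not part of the original app: answer_question is a
# generator, so outside Gradio it can be driven as below. The image path
# "example.jpg" is a hypothetical placeholder, and the app never calls this.
def run_example(image_path="example.jpg", question="Describe this image."):
    from PIL import Image  # Pillow ships as a Gradio dependency
    image = Image.open(image_path).convert("RGB")
    answer = ""
    for partial in answer_question(image, question, max_crops=0, num_tokens=728,
                                   sample=False, temperature=0.0, top_k=0):
        answer = partial  # each yield is the full text so far, not a delta
    return answer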
# Initialize the model and processor
model, processor = initialize_model_and_processor()

# Gradio interface setup
with gr.Blocks() as demo:
    with gr.Group():
        with gr.Row():
            prompt = gr.Textbox(label="Question", placeholder="e.g. Describe this image", scale=4)
            submit = gr.Button("Send", scale=1)
        with gr.Row():
            max_crops = gr.Slider(minimum=0, maximum=200, step=5, value=0, label="Max crops")
            num_tokens = gr.Slider(minimum=728, maximum=2184, step=10, value=728, label="Number of image tokens")
        with gr.Row():
            img = gr.Image(type="pil", label="Upload or Drag an Image")
            output = gr.TextArea(label="Answer")
        with gr.Row():
            sample = gr.Checkbox(label="Sample", value=False)
            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0, label="Temperature")
            top_k = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Top-K")

    submit.click(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)
    prompt.submit(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)

demo.queue().launch(debug=True)