# HelpingAI-V / app.py
from __future__ import annotations
import os
import hashlib
import torch
from threading import Thread
from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
import gradio as gr
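
# Only torch, transformers and gradio are third-party dependencies here;
# os, hashlib and threading come from the standard library.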
# Initialize the model and processor
def initialize_model_and_processor():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(
        "OEvortex/HelpingAI-Vision",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    ).to(device)
    processor = AutoProcessor.from_pretrained("OEvortex/HelpingAI-Vision", trust_remote_code=True)
    return model, processor
# Process an image into vision features, caching the result on disk keyed by
# the image bytes, max_crops and num_tokens.
def cached_vision_process(image, max_crops, num_tokens):
    image_hash = hashlib.sha256(image.tobytes()).hexdigest()
    cache_path = f"visual_cache/{image_hash}-{max_crops}-{num_tokens}.pt"
    if os.path.exists(cache_path):
        return torch.load(cache_path).to(model.device, dtype=model.dtype)
    else:
        processor_outputs = processor.image_processor([image], max_crops)
        pixel_values = [value.to(model.device, model.dtype) for value in processor_outputs["pixel_values"]]
        coords = [value.to(model.device, model.dtype) for value in processor_outputs["coords"]]
        image_outputs = model.vision_model(pixel_values, coords, num_tokens)
        image_features = model.multi_modal_projector(image_outputs)
        os.makedirs("visual_cache", exist_ok=True)
        torch.save(image_features, cache_path)
        return image_features.to(model.device, model.dtype)
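
# Note: cached_vision_process is not wired into the Gradio callbacks below;
# answer_question passes the raw image straight to the processor, so the
# on-disk cache is effectively unused in this file.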
# Answer a question about an image, streaming the reply as it is generated.
def answer_question(image, question, max_crops, num_tokens, sample, temperature, top_k):
    if not question.strip() or image is None:
        # This is a generator, so the message must be yielded, not returned,
        # for it to show up in the Gradio output box.
        yield "Please provide both an image and a question."
        return
    prompt = f"""user
<image>
{question}
assistant
"""
    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
    with torch.inference_mode():
        inputs = processor(prompt, [image], model, max_crops=max_crops, num_tokens=num_tokens)
    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "image_features": inputs["image_features"],
        "streamer": streamer,
        "max_length": 1000,
        "use_cache": True,
        "eos_token_id": processor.tokenizer.eos_token_id,
        "pad_token_id": processor.tokenizer.eos_token_id,
        "temperature": temperature,
        "do_sample": sample,
        "top_k": top_k,
    }
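
    # Note: with the "Sample" checkbox off, do_sample is False and generation
    # is greedy, so the temperature and top_k settings are effectively ignored.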
    # Run generation in a background thread so this generator can consume the
    # streamer and yield partial output to the UI as it arrives.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    output_started = False
    for new_text in streamer:
        # The streamer also echoes the prompt; skip everything up to the
        # "assistant" marker, then start accumulating the model's reply.
        if not output_started:
            if "assistant" in new_text:
                output_started = True
            continue
        buffer += new_text
        if len(buffer) > 1:
            yield buffer
    return buffer
# Initialize the model and processor
model, processor = initialize_model_and_processor()
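
# model and processor are module-level globals; cached_vision_process and
# answer_question rely on them being defined before the UI is launched.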
# Gradio interface setup
with gr.Blocks() as demo:
    with gr.Group():
        with gr.Row():
            prompt = gr.Textbox(label="Question", placeholder="e.g. Describe this?", scale=4)
            submit = gr.Button("Send", scale=1)
        with gr.Row():
            max_crops = gr.Slider(minimum=0, maximum=200, step=5, value=0, label="Max crops")
            num_tokens = gr.Slider(minimum=728, maximum=2184, step=10, value=728, label="Number of image tokens")
        with gr.Row():
            img = gr.Image(type="pil", label="Upload or Drag an Image")
            output = gr.TextArea(label="Answer")
        with gr.Row():
            sample = gr.Checkbox(label="Sample", value=False)
            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0, label="Temperature")
            top_k = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Top-K")

    submit.click(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)
    prompt.submit(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)

demo.queue().launch(debug=True)
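
# Rough local-run sketch (an assumption, not part of the Space config): with
# torch, transformers and gradio installed, `python app.py` starts the demo,
# and gradio serves it on http://127.0.0.1:7860 by default. A CUDA GPU is
# recommended, since the float16 weights are moved to "cuda" when available.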