from __future__ import annotations

import hashlib
import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModel, AutoProcessor, TextIteratorStreamer


# Initialize the model and processor
def initialize_model_and_processor():
    model = AutoModel.from_pretrained(
        "OEvortex/HelpingAI-Vision",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    processor = AutoProcessor.from_pretrained(
        "OEvortex/HelpingAI-Vision", trust_remote_code=True
    )
    return model, processor


# Process an image and cache the projected image features on disk,
# keyed by the image hash and the crop/token settings
def cached_vision_process(image, max_crops, num_tokens):
    image_hash = hashlib.sha256(image.tobytes()).hexdigest()
    cache_path = f"visual_cache/{image_hash}-{max_crops}-{num_tokens}.pt"
    if os.path.exists(cache_path):
        return torch.load(cache_path).to(model.device, dtype=model.dtype)
    else:
        processor_outputs = processor.image_processor([image], max_crops)
        pixel_values = [
            value.to(model.device, model.dtype)
            for value in processor_outputs["pixel_values"]
        ]
        coords = [
            value.to(model.device, model.dtype)
            for value in processor_outputs["coords"]
        ]
        image_outputs = model.vision_model(pixel_values, coords, num_tokens)
        image_features = model.multi_modal_projector(image_outputs)
        os.makedirs("visual_cache", exist_ok=True)
        torch.save(image_features, cache_path)
        return image_features.to(model.device, model.dtype)


# Answer a question about an image, streaming the reply as it is generated
def answer_question(image, question, max_crops, num_tokens, sample, temperature, top_k):
    if not question.strip() or image is None:
        yield "Please provide both an image and a question."
        return

    prompt = f"""user
{question}
assistant
"""
    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
    with torch.inference_mode():
        inputs = processor(prompt, [image], model, max_crops=max_crops, num_tokens=num_tokens)
        generation_kwargs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "image_features": inputs["image_features"],
            "streamer": streamer,
            "max_length": 1000,
            "use_cache": True,
            "eos_token_id": processor.tokenizer.eos_token_id,
            "pad_token_id": processor.tokenizer.eos_token_id,
            "temperature": temperature,
            "do_sample": sample,
            "top_k": top_k,
        }
        # Run generation in a background thread so the streamer can be consumed here
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        buffer = ""
        output_started = False
        for new_text in streamer:
            # The streamer replays the prompt first; skip everything up to the
            # "assistant" marker, then stream the actual answer
            if not output_started:
                if "assistant" in new_text:
                    output_started = True
                continue
            buffer += new_text
            if len(buffer) > 1:
                yield buffer
    yield buffer


# Initialize the model and processor
model, processor = initialize_model_and_processor()

# Gradio interface setup
with gr.Blocks() as demo:
    with gr.Group():
        with gr.Row():
            prompt = gr.Textbox(label="Question", placeholder="e.g. Describe this?", scale=4)
            submit = gr.Button("Send", scale=1)
        with gr.Row():
            max_crops = gr.Slider(minimum=0, maximum=200, step=5, value=0, label="Max crops")
            num_tokens = gr.Slider(minimum=728, maximum=2184, step=10, value=728, label="Number of image tokens")
        with gr.Row():
            img = gr.Image(type="pil", label="Upload or Drag an Image")
            output = gr.TextArea(label="Answer")
        with gr.Row():
            sample = gr.Checkbox(label="Sample", value=False)
            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0, label="Temperature")
            top_k = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Top-K")

    submit.click(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)
    prompt.submit(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)

demo.queue().launch(debug=True)
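
# A minimal sketch of an alternative launch, if the demo should be reachable from
# other machines (these launch arguments are standard Gradio options, not part of
# the original script; adjust or drop them as needed):
#
#     demo.queue().launch(server_name="0.0.0.0", share=True, debug=True)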