# HelpingAI-Vision demo: a Hugging Face Space running on ZeroGPU.
from __future__ import annotations

import hashlib
import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
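
# `spaces` provides the ZeroGPU integration: a function decorated with
# @spaces.GPU is attached to a GPU for the duration of each call
# (see the request handler below).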
# Select device/dtype before loading so the model lands on the right device
# (half precision on GPU, full precision on CPU-only hosts).
if torch.cuda.is_available():
    DEVICE = "cuda"
    DTYPE = torch.float16
else:
    DEVICE = "cpu"
    DTYPE = torch.float32

model = AutoModel.from_pretrained(
    "OEvortex/HelpingAI-Vision", torch_dtype=DTYPE, trust_remote_code=True
).to(DEVICE)
processor = AutoProcessor.from_pretrained("OEvortex/HelpingAI-Vision", trust_remote_code=True)
def cached_vision_process(image, max_crops, num_tokens):
    """Encode an image to features, caching results on disk keyed by image hash."""
    image_hash = hashlib.sha256(image.tobytes()).hexdigest()
    cache_path = f"visual_cache/{image_hash}-{max_crops}-{num_tokens}.pt"
    if os.path.exists(cache_path):
        return torch.load(cache_path, map_location=DEVICE).to(DEVICE, dtype=DTYPE)
    # Cache miss: run the vision tower and projector once, then persist.
    processor_outputs = processor.image_processor([image], max_crops)
    pixel_values = [
        value.to(model.device).to(model.dtype)
        for value in processor_outputs["pixel_values"]
    ]
    coords = [
        value.to(model.device).to(model.dtype)
        for value in processor_outputs["coords"]
    ]
    image_outputs = model.vision_model(pixel_values, coords, num_tokens)
    image_features = model.multi_modal_projector(image_outputs)
    os.makedirs("visual_cache", exist_ok=True)
    torch.save(image_features, cache_path)
    return image_features.to(DEVICE, dtype=DTYPE)
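
# NOTE: cached_vision_process is an optional fast path and is not wired into the
# handler below; a hypothetical use would precompute features per image, e.g.:
#     image_features = cached_vision_process(image, max_crops, num_tokens)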
@spaces.GPU
def answer_question(image, question, max_crops, num_tokens, sample, temperature, top_k):
    if question is None or question.strip() == "":
        yield "Please ask me anything."
        return
    if image is None:
        yield "Please upload a picture."
        return
    prompt = f"""user
<image>
{question}
assistant
"""
    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
    with torch.inference_mode():
        inputs = processor(prompt, [image], model, max_crops=max_crops, num_tokens=num_tokens)
        generation_kwargs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "image_features": inputs["image_features"],
            "streamer": streamer,
            "max_length": 1000,
            "use_cache": True,
            "eos_token_id": processor.tokenizer.eos_token_id,
            "pad_token_id": processor.tokenizer.eos_token_id,
            "temperature": temperature,
            "do_sample": sample,
            "top_k": top_k,
        }
    # Generate on a background thread while this generator streams the output.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # The streamer echoes the prompt, so skip chunks until the "assistant"
    # turn marker appears, then yield the growing answer.
    buffer = ""
    output_started = False
    for new_text in streamer:
        if not output_started:
            if "assistant" in new_text:
                output_started = True
            continue
        buffer += new_text
        if len(buffer) > 1:
            yield buffer
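
# Hypothetical smoke test for the handler above (not part of the Space UI):
#     from PIL import Image
#     for partial in answer_question(Image.open("example.jpg"), "Describe this image",
#                                    max_crops=0, num_tokens=728, sample=False,
#                                    temperature=0, top_k=0):
#         print(partial)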
with gr.Blocks() as demo:
    with gr.Group():
        with gr.Row():
            prompt = gr.Textbox(
                label="Question", placeholder="e.g. Describe this image", scale=4
            )
            submit = gr.Button("Send", scale=1)
        with gr.Row():
            max_crops = gr.Slider(minimum=0, maximum=200, step=5, value=0, label="Max crops")
            num_tokens = gr.Slider(minimum=728, maximum=2184, step=10, value=728, label="Number of image tokens")
        with gr.Row():
            img = gr.Image(type="pil", label="Upload or drag an image")
            output = gr.TextArea(label="Answer")
        with gr.Row():
            sample = gr.Checkbox(label="Sample", value=False)
            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0, label="Temperature")
            top_k = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Top-K")
    submit.click(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)
    prompt.submit(answer_question, [img, prompt, max_crops, num_tokens, sample, temperature, top_k], output)

# queue() lets the generator handler stream partial answers into the UI.
demo.queue().launch(debug=True)