Spaces:

Harshithtd
/

llama_VLM

Runtime error

App Files Files Community

llama_VLM / app.py

Harshithtd

Update app.py

acc7d19 verified about 2 months ago

raw

history blame

3.34 kB

	from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
	from PIL import Image
	import torch
	from threading import Thread
	import gradio as gr
	from gradio import FileData
	import time
	import spaces
	# Load model directly (Auto classes used for the processor and model below)
	from transformers import AutoProcessor, AutoModelForPreTraining

	# Checkpoint id shared by the processor and the model. Keeping it in one
	# constant ensures both are always loaded from the same repository.
	MODEL_ID = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit"

	# Both objects are loaded once at import time; no device placement is
	# requested here.
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	model = AutoModelForPreTraining.from_pretrained(MODEL_ID)

	# NOTE(review): the `spaces` package is known for its `GPU` decorator;
	# confirm that `spaces.CPU` actually exists -- TODO verify against the
	# installed version.
	@spaces.CPU
	def bot_streaming(message, history, max_new_tokens=250):
	    """Stream a model reply for a multimodal chat message.

	    Rebuilds the chat-template message list (and the matching list of PIL
	    images) from ``history`` plus the new ``message``, runs
	    ``model.generate`` on a background thread, and yields the accumulated
	    reply text every time the streamer produces a new chunk.

	    Args:
	        message: Mapping with a ``"text"`` entry (the prompt) and a
	            ``"files"`` entry (a list whose items are either path strings
	            or mappings with a ``"path"`` key). An image is attached only
	            when exactly one file is present.
	        history: Sequence of ``(user, assistant)`` pairs. A pair whose user
	            part is a tuple is treated as an image upload whose first
	            element is the image path; the prompt text and the reply for
	            that image are read from the pair that follows it.
	        max_new_tokens: Forwarded to ``model.generate``.

	    Yields:
	        The reply text generated so far.
	    """
	    txt = message["text"]

	    messages = []
	    images = []

	    for i, turn in enumerate(history):
	        user_part = turn[0]

	        if isinstance(user_part, tuple):
	            # Image upload: its prompt text and reply live in the next pair.
	            # A trailing upload with no following pair is skipped instead of
	            # raising IndexError.
	            if i + 1 >= len(history):
	                continue
	            follow_up = history[i + 1]
	            messages.append({"role": "user", "content": [{"type": "text", "text": follow_up[0]}, {"type": "image"}]})
	            messages.append({"role": "assistant", "content": [{"type": "text", "text": follow_up[1]}]})
	            images.append(Image.open(user_part[0]).convert("RGB"))
	        elif isinstance(user_part, str):
	            # Text-only turn. Only look back when a previous pair exists:
	            # history[i - 1] at i == 0 would wrap around to the last pair.
	            # When the previous pair was an image upload, this turn was
	            # already emitted together with that image and is skipped.
	            if i == 0 or isinstance(history[i - 1][0], str):
	                messages.append({"role": "user", "content": [{"type": "text", "text": user_part}]})
	                messages.append({"role": "assistant", "content": [{"type": "text", "text": turn[1]}]})

	    files = message["files"]
	    if len(files) == 1:
	        upload = files[0]
	        # The upload is either a plain path string or a mapping with "path".
	        path = upload if isinstance(upload, str) else upload["path"]
	        images.append(Image.open(path).convert("RGB"))
	        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
	    else:
	        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

	    texts = processor.apply_chat_template(messages, add_generation_prompt=True)

	    # The inputs are used as returned by the processor; no `.to(...)` device
	    # move is applied.
	    if images:
	        inputs = processor(text=texts, images=images, return_tensors="pt")
	    else:
	        inputs = processor(text=texts, return_tensors="pt")

	    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
	    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

	    # Generation runs on a worker thread so this generator can consume the
	    # streamer while tokens are still being produced.
	    thread = Thread(target=model.generate, kwargs=generation_kwargs)
	    thread.start()

	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        time.sleep(0.01)
	        yield buffer

	    # The streamer is exhausted, so generation is finishing; reap the worker.
	    thread.join()

	# Slider shown under the chat as an additional input; its value is passed to
	# `bot_streaming` after `message` and `history`.
	_token_slider = gr.Slider(
	    label="Maximum number of new tokens to generate",
	    minimum=10,
	    maximum=500,
	    step=10,
	    value=250,
	)

	# Markdown blurb displayed in the interface.
	_description = (
	    "Try Multimodal Llama by Meta with transformers in this demo. "
	    "Upload an image, and start chatting about it. "
	    "To learn more about Llama Vision, visit "
	    "[our blog post](https://huggingface.co/blog/llama32)."
	)

	# Multimodal chat UI wired to the streaming handler; no examples are
	# provided and example caching is disabled.
	demo = gr.ChatInterface(
	    fn=bot_streaming,
	    multimodal=True,
	    textbox=gr.MultimodalTextbox(),
	    additional_inputs=[_token_slider],
	    title="Multimodal Llama",
	    description=_description,
	    examples=[],
	    cache_examples=False,
	    stop_btn="Stop Generation",
	    fill_height=True,
	)

	# Start the app with debug output enabled.
	demo.launch(debug=True)