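"""
Multimodal extension for text-generation-webui.

Adds an image-upload widget to the chat UI and, when a picture is sent,
embeds it into the prompt through MultimodalEmbedder so the model receives
image embeddings alongside the text.
"""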

import base64
import re
import time
from functools import partial
from io import BytesIO

import gradio as gr
import torch

from extensions.multimodal.multimodal_embedder import MultimodalEmbedder
from modules import shared
from modules.logging_colors import logger

params = {
    "add_all_images_to_prompt": False,
    # device to run vision encoder on
    "vision_device": None,
    # bits to load vision encoder in, either 16 or 32
    "vision_bits": 32,
    # device to run multimodal projector on
    "projector_device": None,
    # multimodal projector bits, either 32 or 16
    "projector_bits": 32
}
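
# These defaults can be overridden from the web UI settings file. A minimal
# sketch, assuming the extension is registered under the name "multimodal"
# (extension params are typically exposed as "<extension>-<param>" keys):
#
#   multimodal-vision_bits: 16
#   multimodal-projector_device: cuda:1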

# If 'state' is True, the next chat input is hijacked: 'value' is then a
# callable (a partial of add_chat_picture) applied to the user input
input_hijack = {
    'state': False,
    'value': ["", ""]
}

# initialized in ui(), so that params are loaded from settings
multimodal_embedder: MultimodalEmbedder = None


def chat_input_modifier(text, visible_text, state):
    global input_hijack
    if input_hijack['state']:
        input_hijack['state'] = False
        # After an image upload, 'value' is partial(add_chat_picture, picture),
        # so this call expands to add_chat_picture(picture, text, visible_text)
        return input_hijack['value'](text, visible_text)
    else:
        return text, visible_text


def add_chat_picture(picture, text, visible_text):
    # Resize the image so that its shortest edge is 336px, the input resolution
    # of the CLIP vision tower in llava-v1.5 (earlier versions used 224),
    # keeping the aspect ratio
    max_hw, min_hw = max(picture.size), min(picture.size)
    aspect_ratio = max_hw / min_hw
    shortest_edge = int(max(336 / aspect_ratio, 336))
    longest_edge = int(shortest_edge * aspect_ratio)
    w = shortest_edge if picture.width < picture.height else longest_edge
    h = shortest_edge if picture.width >= picture.height else longest_edge
    picture = picture.resize((w, h))
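    # e.g. a 1024x768 upload: aspect_ratio = 4/3, shortest_edge = 336,
    # longest_edge = 448, and (w, h) = (448, 336)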

    buffer = BytesIO()
    picture.save(buffer, format="PNG")
    img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
    # Note: the payload is PNG, but the MIME type is declared as image/jpeg
    # because that is the exact prefix the regex in tokenizer_modifier() matches
    image = f'<img src="data:image/jpeg;base64,{img_str}">'

    if '<image>' in text:
        text = text.replace('<image>', image)
    else:
        text = image + '\n' + text

    if visible_text == '' or visible_text is None:
        visible_text = text
    elif '<image>' in visible_text:
        visible_text = visible_text.replace('<image>', image)
    else:
        visible_text = visible_text + '\n' + image

    return text, visible_text
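

# Prompt length has to come from the embedder: each embedded image occupies
# many positions in the input embeddings (hundreds for LLaVA-style projectors),
# which a plain tokenizer call would not account for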
def custom_tokenized_length(prompt):
    return multimodal_embedder.len_in_tokens(prompt)


def tokenizer_modifier(state, prompt, input_ids, input_embeds):
    global params
    start_ts = time.time()
    image_match = re.search(r'<img src="data:image/jpeg;base64,[A-Za-z0-9+/=]+">', prompt)

    if image_match is None:
        return prompt, input_ids, input_embeds

    prompt, input_ids, input_embeds, total_embedded = multimodal_embedder.forward(prompt, state, params)
    logger.info(f'Embedded {total_embedded} image(s) in {time.time()-start_ts:.2f}s')
    # Add the batch dimension and move the tensors to the model's device/dtype
    return (prompt,
            input_ids.unsqueeze(0).to(shared.model.device, dtype=torch.int64),
            input_embeds.unsqueeze(0).to(shared.model.device, dtype=shared.model.dtype))


def ui():
    global multimodal_embedder
    multimodal_embedder = MultimodalEmbedder(params)
    with gr.Column():
        picture_select = gr.Image(label='Send a picture', type='pil')
        # The models don't seem to deal well with multiple images
        single_image_checkbox = gr.Checkbox(False, label='Embed all images, not only the last one')

    # Prepare the input hijack
    picture_select.upload(
        lambda picture: input_hijack.update({"state": True, "value": partial(add_chat_picture, picture)}),
        [picture_select],
        None
    )
    picture_select.clear(lambda: input_hijack.update({"state": False, "value": ["", ""]}), None, None)
    single_image_checkbox.change(lambda x: params.update({"add_all_images_to_prompt": x}), single_image_checkbox, None)
    # Clear the picture widget whenever a new generation is started
    shared.gradio['Generate'].click(lambda: None, None, picture_select)
    shared.gradio['textbox'].submit(lambda: None, None, picture_select)