Spaces:

h2oai
/

h2ovl-mississippi

Running on A10G

Shanshan Wang

update

84ce9df 16 days ago

9.67 kB

	import gradio as gr
	from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
	import torch
	import torchvision.transforms as T
	from PIL import Image
	import logging

	logging.basicConfig(level=logging.INFO)
	from torchvision.transforms.functional import InterpolationMode
	import os
	from huggingface_hub import login
	# Hugging Face access token, read from the `hf_token` environment variable
	# (None when the variable is unset). It is passed as `use_auth_token` when
	# the model and tokenizer are loaded.
	hf_token = os.environ.get('hf_token', None)

	# # Define the path to your model
	# path = "h2oai/h2ovl-mississippi-2b"

	# Define the models and their paths.
	# Keys are the display names offered in the model dropdown; values are the
	# repository ids handed to `from_pretrained`.
	model_paths = {
	"H2OVL-Mississippi-2B":"h2oai/h2ovl-mississippi-2b-prerel",
	"H2OVL-Mississippi-0.8B":"h2oai/h2ovl-mississippi-800m-prerel",
	# Add more models as needed
	}


	def load_model_and_set_image_function(model_name):
	    """Load the model and tokenizer registered under ``model_name``.

	    Args:
	        model_name: A key of ``model_paths`` (the value of the model
	            dropdown). ``None``, an empty value or an unknown key selects
	            no model.

	    Returns:
	        A ``(model, tokenizer)`` tuple. The model is loaded in bfloat16,
	        switched to eval mode and moved to CUDA. ``(None, None)`` is
	        returned when no known model is selected, which the chat callbacks
	        report as "Please select a model".
	    """
	    # The dropdown starts without a value; do not fail on an empty selection.
	    if not model_name:
	        return None, None

	    model_path = model_paths.get(model_name)
	    if model_path is None:
	        logging.warning("Unknown model selection: %r", model_name)
	        return None, None

	    # NOTE(review): recent transformers releases deprecate `use_auth_token`
	    # in favour of `token`; switch once the pinned version is confirmed.
	    model = AutoModel.from_pretrained(
	        model_path,
	        torch_dtype=torch.bfloat16,
	        low_cpu_mem_usage=True,
	        trust_remote_code=True,
	        use_auth_token=hf_token,
	    ).eval().cuda()

	    tokenizer = AutoTokenizer.from_pretrained(
	        model_path,
	        trust_remote_code=True,
	        use_fast=False,
	        use_auth_token=hf_token,
	    )

	    return model, tokenizer


	def inference(image,
	              user_message,
	              temperature,
	              top_p,
	              max_new_tokens,
	              tile_num,
	              chatbot, state,
	              image_state,
	              model_state,
	              tokenizer_state):
	    """Run one chat turn against the selected model and update the transcript.

	    Args:
	        image: Newly supplied image, or None to keep using ``image_state``.
	        user_message: The user's question for this turn.
	        temperature: Sampling temperature; 0 disables sampling.
	        top_p: Nucleus sampling threshold.
	        max_new_tokens: Upper bound on generated tokens.
	        tile_num: Forwarded to ``model.chat`` as ``max_tiles``.
	        chatbot: Transcript as a list of ``(user, assistant)`` pairs, or None.
	        state: History returned by a previous ``model.chat`` call, or None
	            for a fresh conversation.
	        image_state: Image remembered from an earlier turn, or None.
	        model_state: Loaded model, or None when nothing is selected.
	        tokenizer_state: Loaded tokenizer, or None when nothing is selected.

	    Returns:
	        ``(chatbot, state, image_state, "")``; the trailing empty string is
	        wired to the message textbox and clears it.
	    """
	    # Normalise the transcript first: every branch below appends to it.
	    if chatbot is None:
	        chatbot = []

	    if model_state is None or tokenizer_state is None:
	        chatbot.append(("System", "Please select a model to start the conversation."))
	        return chatbot, state, image_state, ""

	    # Remember a newly supplied image; otherwise fall back to the stored one.
	    if image is not None:
	        image_state = image
	    elif image_state is None:
	        chatbot.append(("System", "Please provide an image to start the conversation."))
	        return chatbot, state, image_state, ""

	    # Show the user message immediately; the answer is filled in below.
	    chatbot.append((user_message, None))

	    temperature = float(temperature)
	    generation_config = dict(
	        num_beams=1,
	        max_new_tokens=int(max_new_tokens),
	        do_sample=temperature != 0.0,  # a temperature of 0 means greedy decoding
	        temperature=temperature,
	        top_p=float(top_p),
	    )

	    # `state` may be None here; the original author notes that model.chat
	    # treats None as an empty history.
	    response_text, state = model_state.chat(
	        tokenizer_state,
	        image_state,
	        user_message,
	        max_tiles=int(tile_num),
	        generation_config=generation_config,
	        history=state,
	        return_history=True,
	    )

	    # Replace the placeholder entry with the completed exchange.
	    chatbot[-1] = (user_message, response_text)

	    return chatbot, state, image_state, ""

	def regenerate_response(chatbot,
	                        temperature,
	                        top_p,
	                        max_new_tokens,
	                        tile_num,
	                        state,
	                        image_state,
	                        model_state,
	                        tokenizer_state):
	    """Re-run the last user message and replace the last answer with a new one.

	    Args:
	        chatbot: Transcript as a list of ``(user, assistant)`` pairs, or None.
	        temperature: Sampling temperature; 0 disables sampling.
	        top_p: Nucleus sampling threshold.
	        max_new_tokens: Upper bound on generated tokens.
	        tile_num: Forwarded to ``model.chat`` as ``max_tiles``.
	        state: History returned by the previous ``model.chat`` call. Its
	            last item is dropped before regenerating (presumably the most
	            recent question/answer exchange; confirm against the model's
	            chat implementation).
	        image_state: Image used for the conversation, or None.
	        model_state: Loaded model, or None when nothing is selected.
	        tokenizer_state: Loaded tokenizer, or None when nothing is selected.

	    Returns:
	        ``(chatbot, state, image_state)`` with the last transcript entry
	        replaced by the regenerated exchange, or with a "System" notice
	        appended when there is nothing to regenerate.
	    """
	    # Normalise the transcript first: every branch below appends to it.
	    if chatbot is None:
	        chatbot = []

	    if model_state is None or tokenizer_state is None:
	        chatbot.append(("System", "Please select a model to start the conversation."))
	        return chatbot, state, image_state

	    # Without a transcript, a history and an image there is no turn to redo.
	    nothing_to_redo = (
	        len(chatbot) == 0
	        or state is None
	        or image_state is None
	        or len(state) == 0
	    )
	    if nothing_to_redo:
	        chatbot.append(("System", "Nothing to regenerate. Please start a conversation first."))
	        return chatbot, state, image_state

	    last_user_message = chatbot[-1][0]

	    # Drop the most recent history item so the model answers the question
	    # afresh; an empty history is passed as None.
	    history = state[:-1]
	    if len(history) == 0:
	        history = None

	    temperature = float(temperature)
	    generation_config = dict(
	        num_beams=1,
	        max_new_tokens=int(max_new_tokens),
	        do_sample=temperature != 0.0,  # a temperature of 0 means greedy decoding
	        temperature=temperature,
	        top_p=float(top_p),
	    )

	    response_text, state = model_state.chat(
	        tokenizer_state,
	        image_state,
	        last_user_message,
	        max_tiles=int(tile_num),
	        generation_config=generation_config,
	        history=history,
	        return_history=True,
	    )

	    # Replace, rather than append: the old answer is no longer part of the
	    # history, so the transcript must not keep showing it as a separate turn.
	    chatbot[-1] = (last_user_message, response_text)

	    return chatbot, state, image_state


	def clear_all():
	    """Return reset values for the chatbot, state, image_state and image_input outputs."""
	    empty_transcript = []
	    cleared = None
	    # Order matches the outputs wired to the Clear button.
	    return empty_transcript, cleared, cleared, cleared


	# Build the Gradio interface: a model picker, an image + conversation panel,
	# generation-parameter sliders, action buttons and a set of examples.
	# NOTE(review): the indentation of this section has been lost; the nesting of
	# the `with` containers below must be restored before the file can run, and
	# the intended layout should be confirmed while doing so.
	with gr.Blocks() as demo:
	gr.Markdown("# H2OVL-Mississippi")

	# Values carried between callbacks: chat history, current image, and the
	# loaded model and tokenizer.
	state= gr.State()
	image_state = gr.State()
	model_state = gr.State()
	tokenizer_state = gr.State()
	# NOTE(review): image_load_function_state is not referenced by any callback
	# visible in this file.
	image_load_function_state = gr.State()

	with gr.Row():
	model_dropdown = gr.Dropdown(
	choices=list(model_paths.keys()),
	label="Select Model"
	)

	# When the model selection changes, load the new model
	# and store it, with its tokenizer, in model_state / tokenizer_state.
	model_dropdown.change(
	fn=load_model_and_set_image_function,
	inputs=[model_dropdown],
	outputs=[model_state, tokenizer_state]
	)

	with gr.Row(equal_height=True):
	# First column with image input
	with gr.Column(scale=1):
	image_input = gr.Image(type="filepath", label="Upload an Image")

	# Second column with chatbot and user input
	with gr.Column(scale=2):
	chatbot = gr.Chatbot(label="Conversation")
	user_input = gr.Textbox(label="What is your question", placeholder="Type your message here")


	# Generation parameters; the slider values are passed to the callbacks,
	# which convert them with float() / int().
	with gr.Accordion('Parameters', open=False):
	with gr.Row():
	temperature_input = gr.Slider(
	minimum=0.0,
	maximum=1.0,
	step=0.1,
	value=0.2,
	interactive=True,
	label="Temperature")
	top_p_input = gr.Slider(
	minimum=0.0,
	maximum=1.0,
	step=0.1,
	value=0.9,
	interactive=True,
	label="Top P")
	max_new_tokens_input = gr.Slider(
	minimum=0,
	maximum=4096,
	step=64,
	value=1024,
	interactive=True,
	label="Max New Tokens (default: 1024)")
	# Forwarded to model.chat as max_tiles.
	tile_num = gr.Slider(
	minimum=2,
	maximum=12,
	step=1,
	value=6,
	interactive=True,
	label="Tile Number (default: 6)"
	)

	with gr.Row():
	submit_button = gr.Button("Submit")
	regenerate_button = gr.Button("Regenerate")
	clear_button = gr.Button("Clear")

	# When the submit button is clicked, call the inference function.
	# The input order must match the parameter order of inference(); the fourth
	# output receives the empty string it returns, which clears the textbox.
	submit_button.click(
	fn=inference,
	inputs=[
	image_input,
	user_input,
	temperature_input,
	top_p_input,
	max_new_tokens_input,
	tile_num,
	chatbot,
	state,
	image_state,
	model_state,
	tokenizer_state
	],
	outputs=[chatbot, state, image_state, user_input]
	)
	# When the regenerate button is clicked, re-run the last inference.
	# The input order must match the parameter order of regenerate_response().
	regenerate_button.click(
	fn=regenerate_response,
	inputs=[
	chatbot,
	temperature_input,
	top_p_input,
	max_new_tokens_input,
	tile_num,
	state,
	image_state,
	model_state,
	tokenizer_state,
	],
	outputs=[chatbot, state, image_state]
	)

	# Clear resets the transcript, history, stored image and image widget; the
	# loaded model and tokenizer are left untouched.
	clear_button.click(
	fn=clear_all,
	inputs=None,
	outputs=[chatbot, state, image_state, image_input]
	)
	# Example image/question pairs that populate image_input and user_input.
	gr.Examples(
	examples=[
	["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
	["assets/receipt.jpg", "Read the text on the image"],
	["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],
	["assets/CBA-1H23-Results-Presentation_wheel.png", "What is the efficiency of H2O.AI in document processing?"],
	],
	inputs = [image_input, user_input],
	label = "examples",
	)

	# Start the app.
	demo.launch()