# Shanshan Wang
# clean up image_state
# 7826ae6
# raw
# history blame
# 9.27 kB
import gradio as gr
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
import torch
import torchvision.transforms as T
from PIL import Image
import logging
logging.basicConfig(level=logging.INFO)
from torchvision.transforms.functional import InterpolationMode
import os
from huggingface_hub import login
# Access token for the Hugging Face Hub, taken from the `hf_token`
# environment variable; stays None when the variable is not set.
hf_token = os.environ.get("hf_token")

# Maps the model name offered in the UI to the repository id that is
# handed to `from_pretrained`. Register additional models here.
model_paths = {
    "H2OVL-Mississippi-2B": "h2oai/h2ovl-mississippi-2b",
    "H2OVL-Mississippi-0.8B": "h2oai/h2ovl-mississippi-800m",
}
def load_model_and_set_image_function(model_name):
    """Load the model and tokenizer registered under ``model_name``.

    Args:
        model_name: A key of ``model_paths`` (the value of the model
            dropdown), or ``None``/empty when nothing is selected.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded in bfloat16,
        put in eval mode and moved to CUDA. ``(None, None)`` is returned
        when no model name is given, so the chat handlers fall back to
        their "Please select a model" message.

    Raises:
        KeyError: If ``model_name`` is non-empty but not in ``model_paths``.
    """
    # The dropdown may deliver None/"" when no choice is selected; treat that
    # as "no model loaded" instead of failing on the dictionary lookup.
    if not model_name:
        return None, None

    # Get the model path from the model_paths dictionary
    model_path = model_paths[model_name]

    # Load the model
    model = AutoModel.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        use_auth_token=hf_token,
    ).eval().cuda()

    # The slow tokenizer is requested explicitly (use_fast=False).
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
        use_fast=False,
        use_auth_token=hf_token,
    )

    return model, tokenizer
def inference(image_input,
              user_message,
              temperature,
              top_p,
              max_new_tokens,
              tile_num,
              chatbot,
              state,
              model_state,
              tokenizer_state):
    """Run one chat turn against the loaded model and update the transcript.

    Args:
        image_input: The image to ask about (the UI supplies a file path),
            or ``None`` when no image has been provided.
        user_message: The question typed by the user.
        temperature: Sampling temperature; ``0`` turns sampling off.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Upper bound on generated tokens.
        tile_num: Passed to ``model.chat`` as ``max_tiles``.
        chatbot: List of ``(user, assistant)`` pairs shown in the UI, or
            ``None`` when the conversation has not started.
        state: History previously returned by ``model.chat``, or ``None``.
        model_state: The loaded model, or ``None`` if none is selected.
        tokenizer_state: The loaded tokenizer, or ``None``.

    Returns:
        ``(chatbot, state, "")``: the updated transcript, the updated
        history, and an empty string (wired to the question textbox).
    """
    # Normalise the transcript first: every branch below appends to it, and
    # it can still be None when the very first click happens.
    if chatbot is None:
        chatbot = []

    if model_state is None or tokenizer_state is None:
        chatbot.append(("System", "Please select a model to start the conversation."))
        return chatbot, state, ""

    if image_input is None:
        chatbot.append(("System", "Please provide an image to start the conversation."))
        return chatbot, state, ""

    # Set generation config; sampling is switched off at temperature 0.
    do_sample = float(temperature) != 0.0
    generation_config = dict(
        num_beams=1,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        temperature=float(temperature),
        top_p=float(top_p),
    )

    # Call model.chat with history. `state` is passed through unchanged; on
    # the first turn it is None (presumably treated as an empty history by
    # model.chat, per the original author's note).
    response_text, new_state = model_state.chat(
        tokenizer_state,
        image_input,
        user_message,
        max_tiles=int(tile_num),
        generation_config=generation_config,
        history=state,
        return_history=True,
    )

    # Record the completed turn and hand back the new history.
    chatbot.append((user_message, response_text))
    return chatbot, new_state, ""
def regenerate_response(chatbot,
                        temperature,
                        top_p,
                        max_new_tokens,
                        tile_num,
                        state,
                        image_input,
                        model_state,
                        tokenizer_state):
    """Re-run the last user question and replace its answer.

    Args:
        chatbot: List of ``(user, assistant)`` pairs shown in the UI, or
            ``None``.
        temperature: Sampling temperature; ``0`` turns sampling off.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Upper bound on generated tokens.
        tile_num: Passed to ``model.chat`` as ``max_tiles``.
        state: History previously returned by ``model.chat`` (its last
            entry is dropped before regenerating), or ``None``.
        image_input: The image the conversation is about.
        model_state: The loaded model, or ``None`` if none is selected.
        tokenizer_state: The loaded tokenizer, or ``None``.

    Returns:
        ``(chatbot, state)``: the updated transcript and history. When
        there is nothing to regenerate, or no model is loaded, a "System"
        notice is appended and ``state`` is returned unchanged.
    """
    # Normalise the transcript first: every branch below appends to it.
    if chatbot is None:
        chatbot = []

    if model_state is None or tokenizer_state is None:
        chatbot.append(("System", "Please select a model to start the conversation."))
        return chatbot, state

    # Find the most recent real turn. "System" notices added by the handlers
    # are skipped so they are never sent to the model as a user question.
    last_turn_index = next(
        (i for i in range(len(chatbot) - 1, -1, -1) if chatbot[i][0] != "System"),
        None,
    )
    if last_turn_index is None or not state:
        chatbot.append(("System", "Nothing to regenerate. Please start a conversation first."))
        return chatbot, state

    last_user_message = chatbot[last_turn_index][0]

    # Drop the last turn from the history; an empty history is passed as None.
    history = state[:-1] or None

    # Set generation config; sampling is switched off at temperature 0.
    do_sample = float(temperature) != 0.0
    generation_config = dict(
        num_beams=1,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        temperature=float(temperature),
        top_p=float(top_p),
    )

    # Regenerate the response
    response_text, new_state = model_state.chat(
        tokenizer_state,
        image_input,
        last_user_message,
        max_tiles=int(tile_num),
        generation_config=generation_config,
        history=history,
        return_history=True,
    )

    # Replace the old answer in place so the transcript keeps one entry per
    # turn, matching the history in which the old turn was replaced as well.
    chatbot[last_turn_index] = (last_user_message, response_text)
    return chatbot, new_state
def clear_all():
    """Produce the reset values used by the Clear button.

    Returns:
        A 4-tuple: an empty chatbot transcript, ``None`` for the history
        state, ``None`` for the image input, and an empty string for the
        question textbox.
    """
    fresh_chat, fresh_state, fresh_image, fresh_text = [], None, None, ""
    return fresh_chat, fresh_state, fresh_image, fresh_text
# Build the Gradio interface
# The UI consists of a model selector, an image upload next to the chat
# window, a collapsible panel of generation parameters, and three buttons
# (Submit / Regenerate / Clear) wired to the handlers defined above.
with gr.Blocks() as demo:
gr.Markdown("# **H2OVL-Mississippi**")
# State holders: `state` carries the history returned by model.chat,
# `model_state` / `tokenizer_state` carry the objects returned by
# load_model_and_set_image_function.
state= gr.State()
model_state = gr.State()
tokenizer_state = gr.State()
# NOTE(review): image_load_function_state is not used by any event handler
# visible in this file -- possibly a leftover; confirm before removing.
image_load_function_state = gr.State()
with gr.Row():
model_dropdown = gr.Dropdown(
choices=list(model_paths.keys()),
label="Select Model"
)
# When the model selection changes, load the new model
model_dropdown.change(
fn=load_model_and_set_image_function,
inputs=[model_dropdown],
outputs=[model_state, tokenizer_state]
)
with gr.Row(equal_height=True):
# First column with image input
with gr.Column(scale=1):
# type="filepath": the handlers receive the uploaded image as a file path.
image_input = gr.Image(type="filepath", label="Upload an Image")
# Second column with chatbot and user input
with gr.Column(scale=2):
chatbot = gr.Chatbot(label="Conversation")
user_input = gr.Textbox(label="What is your question", placeholder="Type your message here")
# Generation parameters, collapsed by default.
with gr.Accordion('Parameters', open=False):
with gr.Row():
# A temperature of 0.0 makes the handlers turn sampling off (do_sample=False).
temperature_input = gr.Slider(
minimum=0.0,
maximum=1.0,
step=0.1,
value=0.2,
interactive=True,
label="Temperature")
top_p_input = gr.Slider(
minimum=0.0,
maximum=1.0,
step=0.1,
value=0.9,
interactive=True,
label="Top P")
# NOTE(review): the slider allows 0 new tokens -- confirm the model
# accepts max_new_tokens=0.
max_new_tokens_input = gr.Slider(
minimum=0,
maximum=4096,
step=64,
value=1024,
interactive=True,
label="Max New Tokens (default: 1024)")
# Forwarded to model.chat as max_tiles.
tile_num = gr.Slider(
minimum=2,
maximum=12,
step=1,
value=6,
interactive=True,
label="Tile Number (default: 6)"
)
with gr.Row():
submit_button = gr.Button("Submit")
regenerate_button = gr.Button("Regenerate")
clear_button = gr.Button("Clear")
# When the submit button is clicked, call the inference function
# The third output is the empty string returned by inference, which is
# written back to the question textbox.
submit_button.click(
fn=inference,
inputs=[
image_input,
user_input,
temperature_input,
top_p_input,
max_new_tokens_input,
tile_num,
chatbot,
state,
model_state,
tokenizer_state
],
outputs=[chatbot, state, user_input]
)
# When the regenerate button is clicked, re-run the last inference
regenerate_button.click(
fn=regenerate_response,
inputs=[
chatbot,
temperature_input,
top_p_input,
max_new_tokens_input,
tile_num,
state,
image_input,
model_state,
tokenizer_state,
],
outputs=[chatbot, state]
)
# Clear resets the chatbot, history, image and textbox. model_state and
# tokenizer_state are not among the outputs, so the loaded model is kept.
clear_button.click(
fn=clear_all,
inputs=None,
outputs=[chatbot, state, image_input, user_input]
)
# Example image/question pairs that fill the image input and the textbox.
# The image paths are relative (assets/...).
gr.Examples(
examples=[
["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
["assets/receipt.jpg", "Read the text on the image"],
["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],
["assets/CBA-1H23-Results-Presentation_wheel.png", "What is the efficiency of H2O.AI in document processing?"],
],
inputs = [image_input, user_input],
label = "examples",
)
# Start the Gradio app.
demo.launch()