"""Gradio demo for the H2OVL-Mississippi vision-language models."""

import os
import threading

import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Cache loaded models and tokenizers so switching back to a model is instant.
model_cache = {}
tokenizer_cache = {}
model_lock = threading.Lock()

hf_token = os.environ.get('hf_token', None)

# Define the models and their Hugging Face Hub paths
model_paths = {
    "H2OVL-Mississippi-2B": "h2oai/h2ovl-mississippi-2b",
    "H2OVL-Mississippi-0.8B": "h2oai/h2ovl-mississippi-800m",
    # Add more models as needed
}

example_prompts = [
    "Read the text and provide word-by-word OCR for the document.",
    "Extract the text from the image.",
    "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}",
    "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount",
]


def load_model_and_set_image_function(model_name):
    # Get the model path from the model_paths dictionary
    model_path = model_paths[model_name]

    with model_lock:
        if model_name in model_cache:
            # Model is already loaded; it will be retrieved from the cache
            print(f"Model {model_name} is already loaded. Retrieving from cache.")
        else:
            # Load the model and tokenizer, then add them to the cache
            print(f"Loading model {model_name}...")
            model = AutoModel.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                token=hf_token,
                # device_map="auto"
            ).eval().cuda()
            tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                trust_remote_code=True,
                use_fast=False,
                token=hf_token
            )
            model_cache[model_name] = model
            tokenizer_cache[model_name] = tokenizer
            print(f"Model {model_name} loaded successfully.")

    return model_name


def inference(image_input, user_message, temperature, top_p,
              max_new_tokens, tile_num, chatbot, state, model_name):
    # Gradio may pass None for an empty chatbot; normalize it first so the
    # error paths below can append to it safely
    if chatbot is None:
        chatbot = []

    # A model must be selected before the conversation can start
    if model_name is None:
        chatbot.append(("System", "Please select a model to start the conversation."))
        return chatbot, state, ""

    with model_lock:
        if model_name not in model_cache:
            chatbot.append(("System", "Model not loaded. Please wait for the model to load."))
            return chatbot, state, ""
        model = model_cache[model_name]
        tokenizer = tokenizer_cache[model_name]

    # Check for an empty or invalid user message
    if not user_message or user_message.strip() == '' or user_message.lower() == 'system':
        chatbot.append(("System", "Please enter a valid message to continue the conversation."))
        return chatbot, state, ""

    # An image is required to start the conversation
    if image_input is None:
        chatbot.append(("System", "Please provide an image to start the conversation."))
        return chatbot, state, ""

    # No history initialization is needed: model.chat treats history=None as
    # an empty history

    # Append the user message; the model's response is filled in below
    chatbot.append((user_message, None))

    # Set the generation config; temperature 0 disables sampling (greedy)
    do_sample = (float(temperature) != 0.0)
    generation_config = dict(
        num_beams=1,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        temperature=float(temperature),
        top_p=float(top_p),
    )

    # Call model.chat with the conversation history
    response_text, new_state = model.chat(
        tokenizer,
        image_input,
        user_message,
        max_tiles=int(tile_num),
        generation_config=generation_config,
        history=state,
        return_history=True
    )

    # Update the state with the new history
    state = new_state

    # Update the chatbot with the model's response
    chatbot[-1] = (user_message, response_text)
    return chatbot, state, ""
def regenerate_response(chatbot, temperature, top_p, max_new_tokens,
                        tile_num, state, image_input, model_name):
    if chatbot is None:
        chatbot = []

    # A model must be selected before anything can be regenerated
    if model_name is None:
        chatbot.append(("System", "Please select a model to start the conversation."))
        return chatbot, state

    with model_lock:
        if model_name not in model_cache:
            chatbot.append(("System", "Model not loaded. Please wait for the model to load."))
            return chatbot, state
        model = model_cache[model_name]
        tokenizer = tokenizer_cache[model_name]

    # There must be a previous user message to regenerate from
    if len(chatbot) == 0:
        chatbot.append(("System", "Nothing to regenerate. Please start a conversation first."))
        return chatbot, state

    # Get the last user message
    last_user_message, _ = chatbot[-1]

    # Check for an empty or invalid last user message
    if not last_user_message or last_user_message.strip() == '' or last_user_message.lower() == 'system':
        chatbot.append(("System", "Cannot regenerate response for an empty or invalid message."))
        return chatbot, state

    # Remove the last (question, answer) pair from the history so the same
    # question can be re-asked below
    if state is not None and len(state) > 0:
        state = state[:-1]
        if len(state) == 0:
            state = None
    else:
        state = None

    # Set the generation config; temperature 0 disables sampling (greedy)
    do_sample = (float(temperature) != 0.0)
    generation_config = dict(
        num_beams=1,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        temperature=float(temperature),
        top_p=float(top_p),
    )

    # Regenerate the response, excluding the last assistant answer
    response_text, new_state = model.chat(
        tokenizer,
        image_input,
        last_user_message,
        max_tiles=int(tile_num),
        generation_config=generation_config,
        history=state,
        return_history=True
    )

    # Update the state with the new history
    state = new_state

    # Update the chatbot with the regenerated response
    chatbot[-1] = (last_user_message, response_text)
    return chatbot, state
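
# Note (assumption, inferred from the state[:-1] trimming above): the h2ovl
# remote code returns `history` as a list of (question, answer) pairs, so
# dropping the last pair removes the question being re-asked together with
# its previous answer. A sketch of one round trip:
#
#     _, history = model.chat(tokenizer, image_path, "Q1",
#                             generation_config=cfg, history=None,
#                             return_history=True)
#     # history -> [("Q1", "A1")]; pass it back via history= for turn two.
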
def clear_all():
    # Clear the chatbot and state, and reset the image and user inputs
    return [], None, None, ""


# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# **H2OVL-Mississippi**")

    state = gr.State()
    model_state = gr.State()
    # tokenizer_state = gr.State()
    # image_load_function_state = gr.State()

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=list(model_paths.keys()),
            label="Select Model",
            value="H2OVL-Mississippi-2B"
        )

    with gr.Row(equal_height=True):
        # First column: image input
        with gr.Column(scale=1):
            image_input = gr.Image(type="filepath", label="Upload an Image")

        # Second column: chatbot and user input
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            user_input = gr.Dropdown(
                label="What is your question?",
                choices=example_prompts,
                value=None,
                allow_custom_value=True,
                interactive=True
            )

    def reset_chatbot_state():
        # Reset the chatbot and conversation history
        return [], None

    # When the model selection changes, load the new model...
    model_dropdown.change(
        fn=load_model_and_set_image_function,
        inputs=[model_dropdown],
        outputs=[model_state]
    )
    # ...and reset the chatbot and state
    model_dropdown.change(
        fn=reset_chatbot_state,
        inputs=None,
        outputs=[chatbot, state]
    )

    # Reset chatbot and state when the image input changes
    image_input.change(
        fn=reset_chatbot_state,
        inputs=None,
        outputs=[chatbot, state]
    )

    # Load the default model when the app starts
    demo.load(
        fn=load_model_and_set_image_function,
        inputs=[model_dropdown],
        outputs=[model_state]
    )

    with gr.Accordion('Parameters', open=False):
        with gr.Row():
            temperature_input = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.1, value=0.2,
                interactive=True, label="Temperature")
            top_p_input = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.1, value=0.9,
                interactive=True, label="Top P")
            max_new_tokens_input = gr.Slider(
                minimum=0, maximum=4096, step=64, value=1024,
                interactive=True, label="Max New Tokens (default: 1024)")
            tile_num = gr.Slider(
                minimum=2, maximum=12, step=1, value=6,
                interactive=True, label="Tile Number (default: 6)")

    with gr.Row():
        submit_button = gr.Button("Submit")
        regenerate_button = gr.Button("Regenerate")
        clear_button = gr.Button("Clear")

    # When the submit button is clicked, call the inference function
    submit_button.click(
        fn=inference,
        inputs=[
            image_input, user_input, temperature_input, top_p_input,
            max_new_tokens_input, tile_num, chatbot, state, model_state
        ],
        outputs=[chatbot, state, user_input]
    )
    # When the regenerate button is clicked, re-run the last inference
    regenerate_button.click(
        fn=regenerate_response,
        inputs=[
            chatbot, temperature_input, top_p_input, max_new_tokens_input,
            tile_num, state, image_input, model_state
        ],
        outputs=[chatbot, state]
    )

    clear_button.click(
        fn=clear_all,
        inputs=None,
        outputs=[chatbot, state, image_input, user_input]
    )

    def example_clicked(image_value, user_input_value):
        # Selecting an example also resets the chatbot and state
        return image_value, user_input_value, [], None

    gr.Examples(
        examples=[
            ["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
            ["assets/handwritten-note-example.jpg", "Read the text on the image"],
            ["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],
            ["assets/CBA-1H23-Results-Presentation_wheel.png", "What is the efficiency of H2O.AI in document processing?"],
        ],
        inputs=[image_input, user_input],
        outputs=[image_input, user_input, chatbot, state],
        fn=example_clicked,
        label="Examples",
    )

demo.queue()
demo.launch(max_threads=10)
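
# ---------------------------------------------------------------------------
# Minimal standalone sketch (commented out, since demo.launch() above blocks):
# calling the cached model directly for a single-turn query, mirroring the
# keyword arguments used in inference(). The image path is a placeholder and
# the chat signature is assumed from the h2ovl remote code loaded above.
#
#     load_model_and_set_image_function("H2OVL-Mississippi-2B")
#     model = model_cache["H2OVL-Mississippi-2B"]
#     tokenizer = tokenizer_cache["H2OVL-Mississippi-2B"]
#     answer, history = model.chat(
#         tokenizer,
#         "assets/invoice.png",
#         "Extract the text from the image.",
#         max_tiles=6,
#         generation_config=dict(num_beams=1, max_new_tokens=512,
#                                do_sample=False, temperature=0.0, top_p=0.9),
#         history=None,
#         return_history=True,
#     )
#     print(answer)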