Spaces · Running on A10G
Commit 6c5150b · Shanshan Wang committed
1 Parent(s): c65d305

added 0.8b model in the model list
app.py CHANGED
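In short, this commit replaces the single hard-coded 2B checkpoint with a small model registry (adding the 0.8B model), defers model loading to a dropdown callback, and picks an image-loading routine per model. The sketch below is a minimal, self-contained illustration of that registry-plus-dispatch idea; the dictionary contents are copied from the diff, while the small print harness and the helper name describe_model are hypothetical and not part of app.py.

    # Registry of display names -> Hub checkpoints, as added in this commit.
    model_paths = {
        "H2OVL-Mississippi-2B": "h2oai/h2ovl-mississippi-2b",
        "H2OVL-Mississippi-0.8B": "h2oai/h2ovl-mississippi-800m",
    }

    def describe_model(model_name: str) -> str:
        # Mirrors the dispatch in load_model_and_set_image_function:
        # 0.8B uses the plain single-view loader, 2B keeps the multi-scale (MSAC) loader.
        if "0.8B" in model_name:
            loader = "load_image1 (single view)"
        elif "2B" in model_name:
            loader = "load_image_msac (multi-scale)"
        else:
            loader = "load_image1 (default)"
        return f"{model_name}: checkpoint={model_paths[model_name]}, image loader={loader}"

    for name in model_paths:
        print(describe_model(name))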
@@ -11,8 +11,15 @@ import os
 from huggingface_hub import login
 hf_token = os.environ.get('hf_token', None)
 
-# Define the path to your model
-path = "h2oai/h2ovl-mississippi-2b"
+# # Define the path to your model
+# path = "h2oai/h2ovl-mississippi-2b"
+
+# Define the models and their paths
+model_paths = {
+    "H2OVL-Mississippi-2B": "h2oai/h2ovl-mississippi-2b",
+    "H2OVL-Mississippi-0.8B": "h2oai/h2ovl-mississippi-800m",
+    # Add more models as needed
+}
 
 # image preprocesing
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
@@ -126,7 +133,7 @@ def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbna
         thumbnail_img = image.resize((image_size, image_size))
         processed_images.append(thumbnail_img)
     return processed_images
-def load_image1(image_file, input_size=448, min_num=1, max_num=12):
+def load_image1(image_file, input_size=448, min_num=1, max_num=6):
     if isinstance(image_file, str):
        image = Image.open(image_file).convert('RGB')
     else:
@@ -134,7 +141,7 @@ def load_image1(image_file, input_size=448, min_num=1, max_num=12):
     transform = build_transform(input_size=input_size)
     images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num)
     pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values)
+    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
     return pixel_values, target_aspect_ratio
 
 def load_image2(image_file, input_size=448, min_num=1, max_num=12, target_aspect_ratio=None):
@@ -146,43 +153,99 @@ def load_image2(image_file, input_size=448, min_num=1, max_num=12, target_aspect
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num, prior_aspect_ratio=target_aspect_ratio)
     pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values)
+    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
     return pixel_values
 
 def load_image_msac(file_name):
     pixel_values, target_aspect_ratio = load_image1(file_name, min_num=1, max_num=6)
-    pixel_values = pixel_values.to(torch.bfloat16).cuda()
+    # pixel_values = pixel_values.to(torch.bfloat16).cuda()
     pixel_values2 = load_image2(file_name, min_num=3, max_num=6, target_aspect_ratio=target_aspect_ratio)
-    pixel_values2 = pixel_values2.to(torch.bfloat16).cuda()
+    # pixel_values2 = pixel_values2.to(torch.bfloat16).cuda()
     pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
     return pixel_values
-# Load the model and tokenizer
-model = AutoModel.from_pretrained(
-    path,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    trust_remote_code=True,
-    use_auth_token=hf_token
-).eval().cuda()
-
-tokenizer = AutoTokenizer.from_pretrained(
-    path,
-    trust_remote_code=True,
-    use_fast=False,
-    use_auth_token=hf_token
-)
-tokenizer.pad_token = tokenizer.unk_token
-tokenizer.eos_token = "<|end|>"
-model.generation_config.pad_token_id = tokenizer.pad_token_id
-
-
-def inference(image, user_message, temperature, top_p, max_new_tokens, chatbot,state, image_state):
-    # if image is provided, store it in image_state:
-    if chatbot is None:
-        chatbot = []
+
+
+def load_model_and_set_image_function(model_name):
+    # Get the model path from the model_paths dictionary
+    model_path = model_paths[model_name]
+
+    # Load the model
+    model = AutoModel.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True,
+        use_auth_token=hf_token
+    ).eval().cuda()
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_path,
+        trust_remote_code=True,
+        use_fast=False,
+        use_auth_token=hf_token
+    )
+    tokenizer.pad_token = tokenizer.unk_token
+    tokenizer.eos_token = "<|end|>"
+    model.generation_config.pad_token_id = tokenizer.pad_token_id
+
+    # Set the appropriate image loading function
+    if "0.8B" in model_name:
+        image_load_function = lambda x: load_image1(x)[0]
+    elif "2B" in model_name:
+        image_load_function = load_image_msac
+    else:
+        image_load_function = load_image1  # Default function
+
+    return model, tokenizer, image_load_function
+
+
+# # Load the model and tokenizer
+# model = AutoModel.from_pretrained(
+#     path,
+#     torch_dtype=torch.bfloat16,
+#     low_cpu_mem_usage=True,
+#     trust_remote_code=True,
+#     use_auth_token=hf_token
+# ).eval().cuda()
+
+# tokenizer = AutoTokenizer.from_pretrained(
+#     path,
+#     trust_remote_code=True,
+#     use_fast=False,
+#     use_auth_token=hf_token
+# )
+# tokenizer.pad_token = tokenizer.unk_token
+# tokenizer.eos_token = "<|end|>"
+# model.generation_config.pad_token_id = tokenizer.pad_token_id
+
+
+def inference(image,
+              user_message,
+              temperature,
+              top_p,
+              max_new_tokens,
+              chatbot,state,
+              image_state,
+              model_state,
+              tokenizer_state,
+              image_load_function_state):
+
+    # Check if model_state is None
+    if model_state is None or tokenizer_state is None:
+        chatbot.append(("System", "Please select a model to start the conversation."))
+        return chatbot, state, image_state, ""
+
+    model = model_state
+    tokenizer = tokenizer_state
+    image_load_function = image_load_function_state
+
+
+    # # if image is provided, store it in image_state:
+    # if chatbot is None:
+    #     chatbot = []
 
     if image is not None:
-        image_state = load_image_msac(image)
+        image_state = image_load_function(image)
     else:
         # If image_state is None, then no image has been provided yet
         if image_state is None:
@@ -225,8 +288,24 @@ def inference(image, user_message, temperature, top_p, max_new_tokens, chatbot,s
 
     return chatbot, state, image_state, ""
 
-def regenerate_response(chatbot, temperature, top_p, max_new_tokens,
-                        state, image_state):
+def regenerate_response(chatbot,
+                        temperature,
+                        top_p,
+                        max_new_tokens,
+                        state,
+                        image_state,
+                        model_state,
+                        tokenizer_state):
+
+    # Check if model_state is None
+    if model_state is None or tokenizer_state is None:
+        chatbot.append(("System", "Please select a model to start the conversation."))
+        return chatbot, state, image_state
+
+    model = model_state
+    tokenizer = tokenizer_state
+
+
     # Check if there is a previous user message
     if chatbot is None or len(chatbot) == 0:
         chatbot = []
@@ -284,6 +363,22 @@ with gr.Blocks() as demo:
 
     state= gr.State()
     image_state = gr.State()
+    model_state = gr.State()
+    tokenizer_state = gr.State()
+    image_load_function_state = gr.State()
+
+    with gr.Row():
+        model_dropdown = gr.Dropdown(
+            choices=list(model_paths.keys()),
+            label="Select Model"
+        )
+
+    # When the model selection changes, load the new model
+    model_dropdown.change(
+        fn=load_model_and_set_image_function,
+        inputs=[model_dropdown],
+        outputs=[model_state, tokenizer_state, image_load_function_state]
+    )
 
     with gr.Row(equal_height=True):
         # First column with image input
@@ -329,13 +424,34 @@ with gr.Blocks() as demo:
     # When the submit button is clicked, call the inference function
     submit_button.click(
         fn=inference,
-        inputs=[image_input, user_input, temperature_input, top_p_input, max_new_tokens_input, chatbot, state, image_state],
+        inputs=[
+            image_input,
+            user_input,
+            temperature_input,
+            top_p_input,
+            max_new_tokens_input,
+            chatbot,
+            state,
+            image_state,
+            model_state,
+            tokenizer_state,
+            image_load_function_state
+        ],
         outputs=[chatbot, state, image_state, user_input]
     )
     # When the regenerate button is clicked, re-run the last inference
     regenerate_button.click(
         fn=regenerate_response,
-        inputs=[chatbot, temperature_input, top_p_input, max_new_tokens_input, state, image_state],
+        inputs=[
+            chatbot,
+            temperature_input,
+            top_p_input,
+            max_new_tokens_input,
+            state,
+            image_state,
+            model_state,
+            tokenizer_state,
+        ],
         outputs=[chatbot, state, image_state]
     )
 
@@ -347,13 +463,11 @@ with gr.Blocks() as demo:
     gr.Examples(
         examples=[
             ["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
-
+            ["assets/receipt.jpg", "Read the text on the image"],
            ["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],
            ["assets/CBA-1H23-Results-Presentation_wheel.png", "What is the efficiency of H2O.AI in document processing?"],
         ],
         inputs = [image_input, user_input],
-        # outputs = [chatbot, state, image_state, user_input],
-        # fn=inference,
         label = "examples",
     )
 
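As a usage note, the per-model loading path introduced above can be exercised outside the Gradio app. Below is a minimal sketch, assuming a CUDA GPU, network access to the Hugging Face Hub, that AutoModel/AutoTokenizer come from transformers, and that an hf_token environment variable is set if the checkpoints require authentication; the from_pretrained arguments are copied from the diff, while the surrounding script is illustrative only.

    import os
    import torch
    from transformers import AutoModel, AutoTokenizer

    model_path = "h2oai/h2ovl-mississippi-800m"  # the 0.8B entry added by this commit
    hf_token = os.environ.get('hf_token', None)

    # Load the model the same way load_model_and_set_image_function does.
    model = AutoModel.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        use_auth_token=hf_token
    ).eval().cuda()

    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
        use_fast=False,
        use_auth_token=hf_token
    )
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.eos_token = "<|end|>"
    model.generation_config.pad_token_id = tokenizer.pad_token_id

Newer transformers releases prefer the token= keyword over the deprecated use_auth_token=, but the latter is kept here to match the diff.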