Spaces: Running on A10G

Shanshan Wang committed • 84ce9df
Parent(s): 6c5150b

update
app.py
CHANGED
@@ -16,154 +16,11 @@ hf_token = os.environ.get('hf_token', None)
 
 # Define the models and their paths
 model_paths = {
-    "H2OVL-Mississippi-2B":"h2oai/h2ovl-mississippi-2b",
-    "H2OVL-Mississippi-0.8B":"h2oai/h2ovl-mississippi-800m",
+    "H2OVL-Mississippi-2B":"h2oai/h2ovl-mississippi-2b-prerel",
+    "H2OVL-Mississippi-0.8B":"h2oai/h2ovl-mississippi-800m-prerel",
     # Add more models as needed
 }
 
-# image preprocesing
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-
-def build_transform(input_size):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose([
-        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-        T.ToTensor(),
-        T.Normalize(mean=MEAN, std=STD)
-    ])
-    return transform
-
-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-    best_ratio_diff = float('inf')
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images, target_aspect_ratio
-
-def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, prior_aspect_ratio=None):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    new_target_ratios = []
-    if prior_aspect_ratio is not None:
-        for i in target_ratios:
-            if prior_aspect_ratio[0]%i[0] != 0 and prior_aspect_ratio[1]%i[1] != 0:
-                new_target_ratios.append(i)
-            else:
-                continue
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images
-def load_image1(image_file, input_size=448, min_num=1, max_num=6):
-    if isinstance(image_file, str):
-        image = Image.open(image_file).convert('RGB')
-    else:
-        image = image_file
-    transform = build_transform(input_size=input_size)
-    images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
-    return pixel_values, target_aspect_ratio
-
-def load_image2(image_file, input_size=448, min_num=1, max_num=12, target_aspect_ratio=None):
-
-    if isinstance(image_file, str):
-        image = Image.open(image_file).convert('RGB')
-    else:
-        image = image_file
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num, prior_aspect_ratio=target_aspect_ratio)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
-    return pixel_values
-
-def load_image_msac(file_name):
-    pixel_values, target_aspect_ratio = load_image1(file_name, min_num=1, max_num=6)
-    # pixel_values = pixel_values.to(torch.bfloat16).cuda()
-    pixel_values2 = load_image2(file_name, min_num=3, max_num=6, target_aspect_ratio=target_aspect_ratio)
-    # pixel_values2 = pixel_values2.to(torch.bfloat16).cuda()
-    pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
-    return pixel_values
-
 
 def load_model_and_set_image_function(model_name):
     # Get the model path from the model_paths dictionary
@@ -184,51 +41,20 @@ def load_model_and_set_image_function(model_name):
         use_fast=False,
         use_auth_token=hf_token
     )
-    tokenizer.pad_token = tokenizer.unk_token
-    tokenizer.eos_token = "<|end|>"
-    model.generation_config.pad_token_id = tokenizer.pad_token_id
-
-    # Set the appropriate image loading function
-    if "0.8B" in model_name:
-        image_load_function = lambda x: load_image1(x)[0]
-    elif "2B" in model_name:
-        image_load_function = load_image_msac
-    else:
-        image_load_function = load_image1  # Default function
 
-    return model, tokenizer
+    return model, tokenizer
 
 
-# # Load the model and tokenizer
-# model = AutoModel.from_pretrained(
-#     path,
-#     torch_dtype=torch.bfloat16,
-#     low_cpu_mem_usage=True,
-#     trust_remote_code=True,
-#     use_auth_token=hf_token
-# ).eval().cuda()
-
-# tokenizer = AutoTokenizer.from_pretrained(
-#     path,
-#     trust_remote_code=True,
-#     use_fast=False,
-#     use_auth_token=hf_token
-# )
-# tokenizer.pad_token = tokenizer.unk_token
-# tokenizer.eos_token = "<|end|>"
-# model.generation_config.pad_token_id = tokenizer.pad_token_id
-
-
 def inference(image,
               user_message,
              temperature,
              top_p,
              max_new_tokens,
+             tile_num,
              chatbot,state,
              image_state,
              model_state,
-             tokenizer_state
-             image_load_function_state):
+             tokenizer_state):
 
     # Check if model_state is None
     if model_state is None or tokenizer_state is None:
@@ -237,15 +63,14 @@ def inference(image,
 
     model = model_state
     tokenizer = tokenizer_state
-    image_load_function = image_load_function_state
 
 
-    #
-
-
+    # if image is provided, store it in image_state:
+    if chatbot is None:
+        chatbot = []
 
     if image is not None:
-        image_state =
+        image_state = image
     else:
         # If image_state is None, then no image has been provided yet
         if image_state is None:
@@ -276,6 +101,7 @@ def inference(image,
         tokenizer,
         image_state,
         user_message,
+        max_tiles = int(tile_num),
         generation_config=generation_config,
         history=state,
         return_history=True
@@ -292,6 +118,7 @@ def regenerate_response(chatbot,
                         temperature,
                         top_p,
                         max_new_tokens,
+                        tile_num,
                         state,
                         image_state,
                         model_state,
@@ -339,6 +166,7 @@ def regenerate_response(chatbot,
         tokenizer,
         image_state,
         last_user_message,
+        max_tiles = int(tile_num),
         generation_config=generation_config,
         history=state,  # Exclude last assistant's response
         return_history=True
@@ -377,13 +205,13 @@ with gr.Blocks() as demo:
     model_dropdown.change(
         fn=load_model_and_set_image_function,
         inputs=[model_dropdown],
-        outputs=[model_state, tokenizer_state
+        outputs=[model_state, tokenizer_state]
     )
 
     with gr.Row(equal_height=True):
         # First column with image input
        with gr.Column(scale=1):
-            image_input = gr.Image(type="
+            image_input = gr.Image(type="filepath", label="Upload an Image")
 
         # Second column with chatbot and user input
         with gr.Column(scale=2):
@@ -397,7 +225,7 @@ with gr.Blocks() as demo:
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
-                    value=0.
+                    value=0.2,
                    interactive=True,
                    label="Temperature")
                top_p_input = gr.Slider(
@@ -413,7 +241,14 @@ with gr.Blocks() as demo:
                    step=64,
                    value=1024,
                    interactive=True,
-                    label="Max New Tokens (default: 1024)"
+                    label="Max New Tokens (default: 1024)")
+                tile_num = gr.Slider(
+                    minimum=2,
+                    maximum=12,
+                    step=1,
+                    value=6,
+                    interactive=True,
+                    label="Tile Number (default: 6)"
                )
 
        with gr.Row():
@@ -430,12 +265,12 @@ with gr.Blocks() as demo:
            temperature_input,
            top_p_input,
            max_new_tokens_input,
+            tile_num,
            chatbot,
            state,
            image_state,
            model_state,
-            tokenizer_state
-            image_load_function_state
+            tokenizer_state
        ],
        outputs=[chatbot, state, image_state, user_input]
    )
@@ -447,6 +282,7 @@ with gr.Blocks() as demo:
            temperature_input,
            top_p_input,
            max_new_tokens_input,
+            tile_num,
            state,
            image_state,
            model_state,
@@ -471,6 +307,4 @@ with gr.Blocks() as demo:
        label = "examples",
    )
 
-
-
 demo.launch()
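For orientation, the net effect of this commit is that app.py no longer does its own tile-based image preprocessing (build_transform, dynamic_preprocess, load_image_msac are deleted); instead the UI gains a "Tile Number" slider and forwards max_tiles to the model call. The sketch below shows how those pieces fit together outside Gradio. It is a minimal, hedged example: the chat method name and its acceptance of a max_tiles keyword are assumed from the argument list visible in the hunks (InternVL/H2OVL-style remote code), the generation_config dict is assumed, and example.jpg and the prompt are placeholders; the prerelease repos may also require an auth token, which is omitted here.

    # Minimal sketch, assuming an InternVL/H2OVL-style `chat` method with a `max_tiles` kwarg.
    import torch
    from transformers import AutoModel, AutoTokenizer

    model_path = "h2oai/h2ovl-mississippi-2b-prerel"  # from the updated model_paths dict

    # Same loading pattern as the (previously commented-out) code in app.py.
    model = AutoModel.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    ).eval().cuda()
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, use_fast=False
    )

    # Assumed shape of generation_config; defaults mirror the UI sliders
    # (max_new_tokens=1024, temperature=0.2).
    generation_config = dict(max_new_tokens=1024, do_sample=True, temperature=0.2)

    # image_state is a file path because the UI now uses gr.Image(type="filepath");
    # max_tiles corresponds to int(tile_num) from the new slider (range 2-12, default 6).
    response, history = model.chat(
        tokenizer,
        "example.jpg",            # image_state (hypothetical path)
        "Describe this image.",   # user_message (hypothetical prompt)
        max_tiles=6,
        generation_config=generation_config,
        history=None,
        return_history=True,
    )
    print(response)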