Shanshan Wang committed on
Commit
84ce9df
1 Parent(s): 6c5150b
Files changed (1)
  1. app.py +26 -192
app.py CHANGED
@@ -16,154 +16,11 @@ hf_token = os.environ.get('hf_token', None)
 
 # Define the models and their paths
 model_paths = {
-    "H2OVL-Mississippi-2B":"h2oai/h2ovl-mississippi-2b",
-    "H2OVL-Mississippi-0.8B":"h2oai/h2ovl-mississippi-800m",
+    "H2OVL-Mississippi-2B":"h2oai/h2ovl-mississippi-2b-prerel",
+    "H2OVL-Mississippi-0.8B":"h2oai/h2ovl-mississippi-800m-prerel",
     # Add more models as needed
 }
 
-# image preprocesing
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-
-def build_transform(input_size):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose([
-        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-        T.ToTensor(),
-        T.Normalize(mean=MEAN, std=STD)
-    ])
-    return transform
-
-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-    best_ratio_diff = float('inf')
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images, target_aspect_ratio
-
-def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, prior_aspect_ratio=None):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    new_target_ratios = []
-    if prior_aspect_ratio is not None:
-        for i in target_ratios:
-            if prior_aspect_ratio[0]%i[0] != 0 and prior_aspect_ratio[1]%i[1] != 0:
-                new_target_ratios.append(i)
-            else:
-                continue
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images
-def load_image1(image_file, input_size=448, min_num=1, max_num=6):
-    if isinstance(image_file, str):
-        image = Image.open(image_file).convert('RGB')
-    else:
-        image = image_file
-    transform = build_transform(input_size=input_size)
-    images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
-    return pixel_values, target_aspect_ratio
-
-def load_image2(image_file, input_size=448, min_num=1, max_num=12, target_aspect_ratio=None):
-
-    if isinstance(image_file, str):
-        image = Image.open(image_file).convert('RGB')
-    else:
-        image = image_file
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num, prior_aspect_ratio=target_aspect_ratio)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
-    return pixel_values
-
-def load_image_msac(file_name):
-    pixel_values, target_aspect_ratio = load_image1(file_name, min_num=1, max_num=6)
-    # pixel_values = pixel_values.to(torch.bfloat16).cuda()
-    pixel_values2 = load_image2(file_name, min_num=3, max_num=6, target_aspect_ratio=target_aspect_ratio)
-    # pixel_values2 = pixel_values2.to(torch.bfloat16).cuda()
-    pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
-    return pixel_values
-
 
 def load_model_and_set_image_function(model_name):
     # Get the model path from the model_paths dictionary
@@ -184,51 +41,20 @@ def load_model_and_set_image_function(model_name):
         use_fast=False,
         use_auth_token=hf_token
     )
-    tokenizer.pad_token = tokenizer.unk_token
-    tokenizer.eos_token = "<|end|>"
-    model.generation_config.pad_token_id = tokenizer.pad_token_id
-
-    # Set the appropriate image loading function
-    if "0.8B" in model_name:
-        image_load_function = lambda x: load_image1(x)[0]
-    elif "2B" in model_name:
-        image_load_function = load_image_msac
-    else:
-        image_load_function = load_image1 # Default function
 
-    return model, tokenizer, image_load_function
+    return model, tokenizer
 
 
-# # Load the model and tokenizer
-# model = AutoModel.from_pretrained(
-#     path,
-#     torch_dtype=torch.bfloat16,
-#     low_cpu_mem_usage=True,
-#     trust_remote_code=True,
-#     use_auth_token=hf_token
-# ).eval().cuda()
-
-# tokenizer = AutoTokenizer.from_pretrained(
-#     path,
-#     trust_remote_code=True,
-#     use_fast=False,
-#     use_auth_token=hf_token
-# )
-# tokenizer.pad_token = tokenizer.unk_token
-# tokenizer.eos_token = "<|end|>"
-# model.generation_config.pad_token_id = tokenizer.pad_token_id
-
-
 def inference(image,
               user_message,
               temperature,
              top_p,
              max_new_tokens,
+             tile_num,
             chatbot,state,
             image_state,
             model_state,
-            tokenizer_state,
-            image_load_function_state):
+            tokenizer_state):
 
    # Check if model_state is None
    if model_state is None or tokenizer_state is None:
@@ -237,15 +63,14 @@ def inference(image,
 
    model = model_state
    tokenizer = tokenizer_state
-    image_load_function = image_load_function_state
 
 
-    # # if image is provided, store it in image_state:
-    # if chatbot is None:
-    #     chatbot = []
+    # if image is provided, store it in image_state:
+    if chatbot is None:
+        chatbot = []
 
    if image is not None:
-        image_state = image_load_function(image)
+        image_state = image
    else:
        # If image_state is None, then no image has been provided yet
        if image_state is None:
@@ -276,6 +101,7 @@ def inference(image,
        tokenizer,
        image_state,
        user_message,
+        max_tiles = int(tile_num),
        generation_config=generation_config,
        history=state,
        return_history=True
@@ -292,6 +118,7 @@ def regenerate_response(chatbot,
                        temperature,
                        top_p,
                        max_new_tokens,
+                        tile_num,
                        state,
                        image_state,
                        model_state,
@@ -339,6 +166,7 @@ def regenerate_response(chatbot,
        tokenizer,
        image_state,
        last_user_message,
+        max_tiles = int(tile_num),
        generation_config=generation_config,
        history=state, # Exclude last assistant's response
        return_history=True
@@ -377,13 +205,13 @@ with gr.Blocks() as demo:
    model_dropdown.change(
        fn=load_model_and_set_image_function,
        inputs=[model_dropdown],
-        outputs=[model_state, tokenizer_state, image_load_function_state]
+        outputs=[model_state, tokenizer_state]
    )
 
    with gr.Row(equal_height=True):
        # First column with image input
        with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Upload an Image")
+            image_input = gr.Image(type="filepath", label="Upload an Image")
 
        # Second column with chatbot and user input
        with gr.Column(scale=2):
@@ -397,7 +225,7 @@ with gr.Blocks() as demo:
                minimum=0.0,
                maximum=1.0,
                step=0.1,
-                value=0.0,
+                value=0.2,
                interactive=True,
                label="Temperature")
            top_p_input = gr.Slider(
@@ -413,7 +241,14 @@ with gr.Blocks() as demo:
                step=64,
                value=1024,
                interactive=True,
-                label="Max New Tokens (default: 1024)"
+                label="Max New Tokens (default: 1024)")
+            tile_num = gr.Slider(
+                minimum=2,
+                maximum=12,
+                step=1,
+                value=6,
+                interactive=True,
+                label="Tile Number (default: 6)"
            )
 
            with gr.Row():
@@ -430,12 +265,12 @@ with gr.Blocks() as demo:
            temperature_input,
            top_p_input,
            max_new_tokens_input,
+            tile_num,
            chatbot,
            state,
            image_state,
            model_state,
-            tokenizer_state,
-            image_load_function_state
+            tokenizer_state
        ],
        outputs=[chatbot, state, image_state, user_input]
    )
@@ -447,6 +282,7 @@ with gr.Blocks() as demo:
            temperature_input,
            top_p_input,
            max_new_tokens_input,
+            tile_num,
            state,
            image_state,
            model_state,
@@ -471,6 +307,4 @@ with gr.Blocks() as demo:
        label = "examples",
    )
 
-
-
 demo.launch()
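
Taken together, the hunks above drop the in-app tiling helpers (build_transform, find_closest_aspect_ratio, dynamic_preprocess/dynamic_preprocess2, load_image1/load_image2, load_image_msac), point the demo at the "-prerel" model repos, and instead forward the uploaded file path plus a max_tiles value (from the new "Tile Number" slider) into the model call, leaving image tiling to the model's own remote code. For orientation only, here is a minimal, hypothetical sketch of the resulting call path; the chat() method name, the acceptance of a plain file path, and the contents of generation_config are assumptions based on the unchanged parts of app.py, not something this diff shows.

# Hypothetical sketch -- not part of the commit.
import torch
from transformers import AutoModel, AutoTokenizer

path = "h2oai/h2ovl-mississippi-2b-prerel"   # new pre-release repo id from this commit
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Assumed to mirror the slider values wired up elsewhere in app.py.
generation_config = dict(max_new_tokens=1024, temperature=0.2, top_p=0.9)

# gr.Image(type="filepath") now hands the app a path string; it is stored in
# image_state unchanged, and tiling is controlled by max_tiles (the new slider)
# instead of the removed load_image*/dynamic_preprocess helpers.
response, history = model.chat(
    tokenizer,
    "example.jpg",            # hypothetical uploaded image path
    "Describe this image.",   # user_message
    max_tiles=6,              # int(tile_num); slider range 2-12, default 6
    generation_config=generation_config,
    history=None,
    return_history=True,
)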