Spaces: Running on A10G

Shanshan Wang committed • 84ce9df
Parent(s): 6c5150b

update
app.py
CHANGED
@@ -16,154 +16,11 @@ hf_token = os.environ.get('hf_token', None)
 
 # Define the models and their paths
 model_paths = {
-    "H2OVL-Mississippi-2B":"h2oai/h2ovl-mississippi-2b",
-    "H2OVL-Mississippi-0.8B":"h2oai/h2ovl-mississippi-800m",
+    "H2OVL-Mississippi-2B":"h2oai/h2ovl-mississippi-2b-prerel",
+    "H2OVL-Mississippi-0.8B":"h2oai/h2ovl-mississippi-800m-prerel",
     # Add more models as needed
 }
 
-# image preprocesing
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-
-def build_transform(input_size):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose([
-        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-        T.ToTensor(),
-        T.Normalize(mean=MEAN, std=STD)
-    ])
-    return transform
-
-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-    best_ratio_diff = float('inf')
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images, target_aspect_ratio
-
-def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, prior_aspect_ratio=None):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    new_target_ratios = []
-    if prior_aspect_ratio is not None:
-        for i in target_ratios:
-            if prior_aspect_ratio[0]%i[0] != 0 and prior_aspect_ratio[1]%i[1] != 0:
-                new_target_ratios.append(i)
-            else:
-                continue
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images
-def load_image1(image_file, input_size=448, min_num=1, max_num=6):
-    if isinstance(image_file, str):
-        image = Image.open(image_file).convert('RGB')
-    else:
-        image = image_file
-    transform = build_transform(input_size=input_size)
-    images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
-    return pixel_values, target_aspect_ratio
-
-def load_image2(image_file, input_size=448, min_num=1, max_num=12, target_aspect_ratio=None):
-
-    if isinstance(image_file, str):
-        image = Image.open(image_file).convert('RGB')
-    else:
-        image = image_file
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num, prior_aspect_ratio=target_aspect_ratio)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values).to(torch.bfloat16).cuda()
-    return pixel_values
-
-def load_image_msac(file_name):
-    pixel_values, target_aspect_ratio = load_image1(file_name, min_num=1, max_num=6)
-    # pixel_values = pixel_values.to(torch.bfloat16).cuda()
-    pixel_values2 = load_image2(file_name, min_num=3, max_num=6, target_aspect_ratio=target_aspect_ratio)
-    # pixel_values2 = pixel_values2.to(torch.bfloat16).cuda()
-    pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
-    return pixel_values
-
 
 def load_model_and_set_image_function(model_name):
     # Get the model path from the model_paths dictionary
@@ -184,51 +41,20 @@ def load_model_and_set_image_function(model_name):
         use_fast=False,
         use_auth_token=hf_token
     )
-    tokenizer.pad_token = tokenizer.unk_token
-    tokenizer.eos_token = "<|end|>"
-    model.generation_config.pad_token_id = tokenizer.pad_token_id
-
-    # Set the appropriate image loading function
-    if "0.8B" in model_name:
-        image_load_function = lambda x: load_image1(x)[0]
-    elif "2B" in model_name:
-        image_load_function = load_image_msac
-    else:
-        image_load_function = load_image1  # Default function
 
-    return model, tokenizer
+    return model, tokenizer
 
 
-# # Load the model and tokenizer
-# model = AutoModel.from_pretrained(
-#     path,
-#     torch_dtype=torch.bfloat16,
-#     low_cpu_mem_usage=True,
-#     trust_remote_code=True,
-#     use_auth_token=hf_token
-# ).eval().cuda()
-
-# tokenizer = AutoTokenizer.from_pretrained(
-#     path,
-#     trust_remote_code=True,
-#     use_fast=False,
-#     use_auth_token=hf_token
-# )
-# tokenizer.pad_token = tokenizer.unk_token
-# tokenizer.eos_token = "<|end|>"
-# model.generation_config.pad_token_id = tokenizer.pad_token_id
-
-
 def inference(image,
               user_message,
              temperature,
              top_p,
              max_new_tokens,
+             tile_num,
              chatbot,state,
              image_state,
              model_state,
-             tokenizer_state
-             image_load_function_state):
+             tokenizer_state):
 
     # Check if model_state is None
     if model_state is None or tokenizer_state is None:
@@ -237,15 +63,14 @@ def inference(image,
 
     model = model_state
     tokenizer = tokenizer_state
-    image_load_function = image_load_function_state
 
 
-    #
-
-
+    # if image is provided, store it in image_state:
+    if chatbot is None:
+        chatbot = []
 
     if image is not None:
-        image_state =
+        image_state = image
     else:
         # If image_state is None, then no image has been provided yet
         if image_state is None:
@@ -276,6 +101,7 @@ def inference(image,
         tokenizer,
         image_state,
         user_message,
+        max_tiles = int(tile_num),
         generation_config=generation_config,
         history=state,
         return_history=True
@@ -292,6 +118,7 @@ def regenerate_response(chatbot,
                         temperature,
                         top_p,
                         max_new_tokens,
+                        tile_num,
                         state,
                         image_state,
                         model_state,
@@ -339,6 +166,7 @@ def regenerate_response(chatbot,
         tokenizer,
         image_state,
         last_user_message,
+        max_tiles = int(tile_num),
         generation_config=generation_config,
         history=state,  # Exclude last assistant's response
         return_history=True
@@ -377,13 +205,13 @@ with gr.Blocks() as demo:
     model_dropdown.change(
         fn=load_model_and_set_image_function,
         inputs=[model_dropdown],
-        outputs=[model_state, tokenizer_state
+        outputs=[model_state, tokenizer_state]
     )
 
     with gr.Row(equal_height=True):
         # First column with image input
        with gr.Column(scale=1):
-            image_input = gr.Image(type="
+            image_input = gr.Image(type="filepath", label="Upload an Image")
 
         # Second column with chatbot and user input
         with gr.Column(scale=2):
@@ -397,7 +225,7 @@ with gr.Blocks() as demo:
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
-                    value=0.
+                    value=0.2,
                    interactive=True,
                    label="Temperature")
                top_p_input = gr.Slider(
@@ -413,7 +241,14 @@ with gr.Blocks() as demo:
                    step=64,
                    value=1024,
                    interactive=True,
-                    label="Max New Tokens (default: 1024)"
+                    label="Max New Tokens (default: 1024)")
+                tile_num = gr.Slider(
+                    minimum=2,
+                    maximum=12,
+                    step=1,
+                    value=6,
+                    interactive=True,
+                    label="Tile Number (default: 6)"
                )
 
        with gr.Row():
@@ -430,12 +265,12 @@ with gr.Blocks() as demo:
            temperature_input,
            top_p_input,
            max_new_tokens_input,
+            tile_num,
            chatbot,
            state,
            image_state,
            model_state,
-            tokenizer_state
-            image_load_function_state
+            tokenizer_state
        ],
        outputs=[chatbot, state, image_state, user_input]
    )
@@ -447,6 +282,7 @@ with gr.Blocks() as demo:
            temperature_input,
            top_p_input,
            max_new_tokens_input,
+            tile_num,
            state,
            image_state,
            model_state,
@@ -471,6 +307,4 @@ with gr.Blocks() as demo:
        label = "examples",
    )
 
-
-
 demo.launch()
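For orientation, the net effect of this commit is that app.py no longer does its own tile-based image preprocessing (build_transform, dynamic_preprocess, load_image_msac are deleted); instead the UI gains a "Tile Number" slider and forwards max_tiles to the model call. The sketch below shows how those pieces fit together outside Gradio. It is a minimal, hedged example: the chat method name and its acceptance of a max_tiles keyword are assumed from the argument list visible in the hunks (InternVL/H2OVL-style remote code), the generation_config dict is assumed, and example.jpg and the prompt are placeholders; the prerelease repos may also require an auth token, which is omitted here.

    # Minimal sketch, assuming an InternVL/H2OVL-style `chat` method with a `max_tiles` kwarg.
    import torch
    from transformers import AutoModel, AutoTokenizer

    model_path = "h2oai/h2ovl-mississippi-2b-prerel"  # from the updated model_paths dict

    # Same loading pattern as the (previously commented-out) code in app.py.
    model = AutoModel.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    ).eval().cuda()
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, use_fast=False
    )

    # Assumed shape of generation_config; defaults mirror the UI sliders
    # (max_new_tokens=1024, temperature=0.2).
    generation_config = dict(max_new_tokens=1024, do_sample=True, temperature=0.2)

    # image_state is a file path because the UI now uses gr.Image(type="filepath");
    # max_tiles corresponds to int(tile_num) from the new slider (range 2-12, default 6).
    response, history = model.chat(
        tokenizer,
        "example.jpg",            # image_state (hypothetical path)
        "Describe this image.",   # user_message (hypothetical prompt)
        max_tiles=6,
        generation_config=generation_config,
        history=None,
        return_history=True,
    )
    print(response)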