fix: format .py
- config.json +1 -1
- generation_config.json +1 -1
- got_vision_b.py +23 -41
- modeling_GOT.py +201 -204
- render_tools.py +6 -13
- requirements.txt +6 -0
- special_tokens_map.json +1 -1
- tokenization_qwen.py +21 -40
config.json CHANGED
@@ -35,4 +35,4 @@
   "use_im_start_end": true,
   "use_sliding_window": false,
   "vocab_size": 151860
-}
\ No newline at end of file
+}
generation_config.json CHANGED
@@ -3,4 +3,4 @@
   "eos_token_id": 151643,
   "max_new_tokens": 2048,
   "transformers_version": "4.37.2"
-}
\ No newline at end of file
+}
got_vision_b.py CHANGED
@@ -1,10 +1,9 @@
-import torch
-import torch.nn.functional as F
-from typing import Optional, Tuple, Type
 from functools import partial
-import torch.nn as nn
-from typing import Type
+from typing import Optional, Tuple, Type
 
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 
 
 class MLPBlock(nn.Module):
@@ -23,7 +22,6 @@ class MLPBlock(nn.Module):
         return self.lin2(self.act(self.lin1(x)))
 
 
-
 class LayerNorm2d(nn.Module):
     def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
         super().__init__()
@@ -39,7 +37,6 @@ class LayerNorm2d(nn.Module):
         return x
 
 
-
 class ImageEncoderViT(nn.Module):
     def __init__(
         self,
@@ -91,9 +88,7 @@ class ImageEncoderViT(nn.Module):
         self.pos_embed: Optional[nn.Parameter] = None
         if use_abs_pos:
             # Initialize absolute positional embedding with pretrain image size.
-            self.pos_embed = nn.Parameter(
-                torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)
-            )
+            self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
 
         self.blocks = nn.ModuleList()
         for i in range(depth):
@@ -129,7 +124,6 @@ class ImageEncoderViT(nn.Module):
             LayerNorm2d(out_chans),
         )
 
-
         self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
         self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)
 
@@ -145,7 +139,6 @@ class ImageEncoderViT(nn.Module):
         x = self.net_2(x)
         x = self.net_3(x)
 
-
         return x
 
 
@@ -247,9 +240,7 @@ class Attention(nn.Module):
 
         self.use_rel_pos = use_rel_pos
         if self.use_rel_pos:
-            assert (
-                input_size is not None
-            ), "Input size must be provided if using relative positional encoding."
+            assert input_size is not None, "Input size must be provided if using relative positional encoding."
             # initialize relative positional embeddings
             self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
             self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
@@ -297,9 +288,7 @@ def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
     return windows, (Hp, Wp)
 
 
-def window_unpartition(
-    windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
-) -> torch.Tensor:
+def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]) -> torch.Tensor:
     """
     Window unpartition into original sequences and removing padding.
     Args:
@@ -385,9 +374,7 @@ def add_decomposed_rel_pos(
     rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
     rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
 
-    attn = (
-        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
-    ).view(B, q_h * q_w, k_h * k_w)
+    attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w)
 
     return attn
 
@@ -415,9 +402,7 @@ class PatchEmbed(nn.Module):
         """
         super().__init__()
 
-        self.proj = nn.Conv2d(
-            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
-        )
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
@@ -426,7 +411,6 @@ class PatchEmbed(nn.Module):
         return x
 
 
-
 def build_GOT_vit_b(checkpoint=None):
     return _build_GOT_vision(
         encoder_embed_dim=768,
@@ -448,21 +432,19 @@ def _build_GOT_vision(
     image_size = 1024
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size
-    image_encoder=ImageEncoderViT(
-            depth=encoder_depth,
-            embed_dim=encoder_embed_dim,
-            img_size=image_size,
-            mlp_ratio=4,
-            norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
-            num_heads=encoder_num_heads,
-            patch_size=vit_patch_size,
-            qkv_bias=True,
-            use_rel_pos=True,
-            global_attn_indexes=encoder_global_attn_indexes,
-            window_size=14,
-            out_chans=prompt_embed_dim,
-        )
-
+    image_encoder = ImageEncoderViT(
+        depth=encoder_depth,
+        embed_dim=encoder_embed_dim,
+        img_size=image_size,
+        mlp_ratio=4,
+        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
+        num_heads=encoder_num_heads,
+        patch_size=vit_patch_size,
+        qkv_bias=True,
+        use_rel_pos=True,
+        global_attn_indexes=encoder_global_attn_indexes,
+        window_size=14,
+        out_chans=prompt_embed_dim,
+    )
 
     return image_encoder
-
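Note (not part of the diff): a minimal sketch of how the reformatted encoder is typically exercised. The flat `from got_vision_b import ...` path is an assumption for a standalone run (inside the repo it is the relative import `.got_vision_b`); the shapes follow from `img_size=1024`, `patch_size=16` and the stride-2 `net_2`/`net_3` convolutions above.

import torch

from got_vision_b import build_GOT_vit_b  # relative import (.got_vision_b) when used inside the repo

encoder = build_GOT_vit_b().eval()              # ViT-B backbone configured for 1024x1024 inputs
with torch.no_grad():
    page = torch.zeros(1, 3, 1024, 1024)        # dummy RGB page at the expected resolution
    feats = encoder(page)                       # neck 64x64x256 -> net_2 32x32x512 -> net_3 16x16x1024
print(feats.shape)                              # torch.Size([1, 1024, 16, 16])
tokens = feats.flatten(2).permute(0, 2, 1)      # [1, 256, 1024]: the 256 image tokens consumed by modeling_GOT.py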
modeling_GOT.py CHANGED
@@ -1,27 +1,32 @@
-
-from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM, StoppingCriteria, TextStreamer
+import dataclasses
+from enum import Enum, auto
+from io import BytesIO
 from typing import List, Optional, Tuple, Union
-
+
 import requests
-from PIL import Image
-from io import BytesIO
 import torch
 import torch.nn as nn
+from PIL import Image
 from torch.nn import CrossEntropyLoss
-from .got_vision_b import build_GOT_vit_b
 from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
-import dataclasses
+from transformers import Qwen2Config, Qwen2ForCausalLM, Qwen2Model, StoppingCriteria, TextStreamer
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+
+from .got_vision_b import build_GOT_vit_b
+
 ###
 
 DEFAULT_IMAGE_TOKEN = "<image>"
-DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
-DEFAULT_IM_START_TOKEN = '<img>'
-DEFAULT_IM_END_TOKEN = '</img>'
+DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"
+DEFAULT_IM_START_TOKEN = "<img>"
+DEFAULT_IM_END_TOKEN = "</img>"
+
 
-from enum import auto, Enum
 class SeparatorStyle(Enum):
     """Different separator style."""
+
     SINGLE = auto()
     TWO = auto()
     MPT = auto()
@@ -30,6 +35,7 @@ class SeparatorStyle(Enum):
 @dataclasses.dataclass
 class Conversation:
     """A class that keeps all conversation history."""
+
     system: str
     roles: List[str]
     messages: List[List[str]]
@@ -43,7 +49,7 @@ class Conversation:
 
     def get_prompt(self):
         if self.sep_style == SeparatorStyle.SINGLE:
-            ret = self.system + self.sep + '\n'
+            ret = self.system + self.sep + "\n"
             for role, message in self.messages:
                 if message:
                     if type(message) is tuple:
@@ -65,9 +71,9 @@
             return ret
         if self.sep_style == SeparatorStyle.MPT:
             if self.system:
-                ret = self.system + self.sep 
+                ret = self.system + self.sep
             else:
-                ret = ''
+                ret = ""
             for role, message in self.messages:
                 if message:
                     if type(message) is tuple:
@@ -79,7 +85,6 @@
         else:
             raise ValueError(f"Invalid style: {self.sep_style}")
 
-
     def append_message(self, role, message):
         self.messages.append([role, message])
 
@@ -91,8 +96,8 @@
             offset=self.offset,
             sep_style=self.sep_style,
             sep=self.sep,
-            sep2=self.sep2)
-
+            sep2=self.sep2,
+        )
 
 
 class KeywordsStoppingCriteria(StoppingCriteria):
@@ -111,12 +116,12 @@ class KeywordsStoppingCriteria(StoppingCriteria):
         for keyword_id in self.keyword_ids:
             if output_ids[0, -1] == keyword_id:
                 return True
-        outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
+        outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len :], skip_special_tokens=True)[0]
         for keyword in self.keywords:
            if keyword in outputs:
                 return True
         return False
-        
+
 
 class GOTImageEvalProcessor:
     def __init__(self, image_size=384, mean=None, std=None):
@@ -129,18 +134,16 @@ class GOTImageEvalProcessor:
 
         self.transform = transforms.Compose(
             [
-                transforms.Resize(
-                    (image_size, image_size), interpolation=InterpolationMode.BICUBIC
-                ),
+                transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
                 transforms.ToTensor(),
                 self.normalize,
             ]
         )
+
     def __call__(self, item):
         return self.transform(item)
 
 
-
 class GOTConfig(Qwen2Config):
     model_type = "GOT"
 
@@ -153,28 +156,24 @@ class GOTQwenModel(Qwen2Model):
 
         self.vision_tower_high = build_GOT_vit_b()
 
-        self.mm_projector_vary =  nn.Linear(1024, 1024)
-
+        self.mm_projector_vary = nn.Linear(1024, 1024)
 
     def initialize_vision_modules(
-        self, 
+        self,
         vision_tower,
         pretrained_stage1_model=None,
         freeze_vision_tower=False,
         use_im_start_end=False,
         vision_select_layer=-1,
        dtype=torch.float16,
-        device="cuda"
+        device="cuda",
     ):
-
-
         image_processor_high = GOTImageEvalProcessor(image_size=1024)
-
+
         self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)
 
         self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)
 
-
         image_token_len = 256
 
         self.config.vision_tower = vision_tower
@@ -184,13 +183,12 @@
 
         self.config.vision_select_layer = vision_select_layer
         self.config.freeze_vision_tower = freeze_vision_tower
-        
+
         return dict(
             image_processor_high=image_processor_high,
             image_token_len=image_token_len,
         )
-
-
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -204,19 +202,16 @@
         images: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
-
         # HACK: replace back original embeddings for LLaVA pretraining
-        orig_embeds_params = getattr(self, 'orig_embeds_params', None)
+        orig_embeds_params = getattr(self, "orig_embeds_params", None)
         if orig_embeds_params is not None:
             with torch.no_grad():
-                self.get_input_embeddings().weight[:-self.num_new_tokens] = orig_embeds_params[:-self.num_new_tokens].data
+                self.get_input_embeddings().weight[: -self.num_new_tokens] = orig_embeds_params[: -self.num_new_tokens].data
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-
-        vision_tower_high = getattr(self, 'vision_tower_high', None)
-
+        vision_tower_high = getattr(self, "vision_tower_high", None)
 
         if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
             use_im_start_end = getattr(self.config, "use_im_start_end", -1)
@@ -232,15 +227,15 @@
             im_start_token = 151857
 
             im_end_token = 151858
-            
+
             image_features = []
-            
+
             for image in images:
                 P, C, H, W = image.shape
                 if P == 1:
                     with torch.set_grad_enabled(False):
                         cnn_feature = vision_tower_high(image)
-                        cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1) # 256*1024
+                        cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1)  # 256*1024
                     image_feature = self.mm_projector_vary(cnn_feature)
                     image_features.append(image_feature)
 
@@ -249,7 +244,7 @@
                     image_patches_features = []
                     for image_patch in image_patches:
                         image_p = torch.stack([image_patch])
-                        
+
                         with torch.set_grad_enabled(False):
                             cnn_feature_p = vision_tower_high(image_p)
                             cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
@@ -258,21 +253,20 @@
                     image_feature = torch.cat(image_patches_features, dim=1)
                     image_features.append(image_feature)
 
-
             dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
             dummy_image_features = dummy_image_features_2
             use_im_start_end = True
             new_input_embeds = []
             for cur_input_ids, cur_input_embeds, cur_image_features in zip(input_ids, inputs_embeds, image_features):
                 if (cur_input_ids == im_patch_token).sum() == 0:
-                    cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
+                    cur_input_embeds = cur_input_embeds + (0.0 * dummy_image_features).sum()
                     new_input_embeds.append(cur_input_embeds)
                     continue
 
                 if use_im_start_end:
                     if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
                         raise ValueError("The number of image start tokens and image end tokens should be the same.")
-                    
+
                     image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
                     for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
                         per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
@@ -280,17 +274,16 @@
 
                         if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
                             raise ValueError("The image end token should follow the image start token.")
-                        
+
                         cur_input_embeds = torch.cat(
                             (
-                                cur_input_embeds[:image_start_token_pos+1], 
-                                per_cur_image_features, 
-                                cur_input_embeds[image_start_token_pos + num_patches + 1:]
-                            ), 
-                            dim=0
+                                cur_input_embeds[: image_start_token_pos + 1],
+                                per_cur_image_features,
+                                cur_input_embeds[image_start_token_pos + num_patches + 1 :],
+                            ),
+                            dim=0,
                         )
 
-
                         new_input_embeds.append(cur_input_embeds)
                 else:
                     raise NotImplementedError
@@ -298,14 +291,18 @@
             inputs_embeds = torch.stack(new_input_embeds, dim=0)
 
         return super(GOTQwenModel, self).forward(
-            input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds, use_cache=use_cache, position_ids=position_ids,
-            output_attentions=output_attentions, output_hidden_states=output_hidden_states,
-            return_dict=return_dict
+            input_ids=None,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
         )
 
 
-
 class GOTQwenForCausalLM(Qwen2ForCausalLM):
     config_class = GOTConfig
     # supports_gradient_checkpointing = True
@@ -336,15 +333,12 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         output_hidden_states: Optional[bool] = None,
         images: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
-
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        outputs = self.model(
+        outputs = self.model(
             input_ids=input_ids,
             past_key_values=past_key_values,
             attention_mask=attention_mask,
@@ -354,8 +348,7 @@
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             images=images,
-            return_dict=return_dict
-
+            return_dict=return_dict,
         )
 
         hidden_states = outputs[0]
@@ -389,10 +382,7 @@
             attentions=outputs.attentions,
         )
 
-
-    def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
-    ):
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs):
         # Omit tokens covered by past_key_values
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
@@ -416,11 +406,7 @@
             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
 
             # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-            if (
-                max_cache_length is not None
-                and attention_mask is not None
-                and cache_length + input_ids.shape[1] > max_cache_length
-            ):
+            if max_cache_length is not None and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length:
                 attention_mask = attention_mask[:, -max_cache_length:]
 
             position_ids = kwargs.get("position_ids", None)
@@ -448,16 +434,9 @@
         )
         return model_inputs
 
-    def initialize_vision_tokenizer(
-        self,
-        tokenizer,
-        freeze_lm_model=False,
-        pretrained_stage1_model=None,
-        device="cuda"
-    ):
+    def initialize_vision_tokenizer(self, tokenizer, freeze_lm_model=False, pretrained_stage1_model=None, device="cuda"):
         config = self.get_model().config
 
-
         self.resize_token_embeddings(len(tokenizer))
 
         config.im_patch_token = 151859
@@ -469,11 +448,11 @@
         config.im_start_token, config.im_end_token = 151857, 151858
 
     def load_image(self, image_file):
-        if image_file.startswith('http') or image_file.startswith('https'):
+        if image_file.startswith("http") or image_file.startswith("https"):
             response = requests.get(image_file)
-            image = Image.open(BytesIO(response.content)).convert('RGB')
+            image = Image.open(BytesIO(response.content)).convert("RGB")
         else:
-            image = Image.open(image_file).convert('RGB')
+            image = Image.open(image_file).convert("RGB")
         return image
 
     def disable_torch_init(self):
@@ -481,15 +460,26 @@
         Disable the redundant torch default initialization to accelerate model creation.
         """
         import torch
+
         setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
         setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
 
-    def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
-
+    def chat(
+        self,
+        tokenizer,
+        image_file,
+        ocr_type,
+        ocr_box="",
+        ocr_color="",
+        render=False,
+        save_render_file=None,
+        print_prompt=False,
+        gradio_input=False,
+        stream_flag=False,
+    ):
         self.disable_torch_init()
 
-
-        image_processor_high =  GOTImageEvalProcessor(image_size=1024)
+        image_processor_high = GOTImageEvalProcessor(image_size=1024)
 
         use_im_start_end = True
 
@@ -501,38 +491,37 @@
             image = self.load_image(image_file)
 
         w, h = image.size
-        
-        if ocr_type == 'format':
-            qs = 'OCR with format: '
+
+        if ocr_type == "format":
+            qs = "OCR with format: "
         else:
-            qs = 'OCR: '
+            qs = "OCR: "
 
         if ocr_box:
             bbox = eval(ocr_box)
             if len(bbox) == 2:
-                bbox[0] = int(bbox[0]/w*1000)
-                bbox[1] = int(bbox[1]/h*1000)
+                bbox[0] = int(bbox[0] / w * 1000)
+                bbox[1] = int(bbox[1] / h * 1000)
             if len(bbox) == 4:
-                bbox[0] = int(bbox[0]/w*1000)
-                bbox[1] = int(bbox[1]/h*1000)
-                bbox[2] = int(bbox[2]/w*1000)
-                bbox[3] = int(bbox[3]/h*1000)
-            if ocr_type == 'format':
-                qs = str(bbox) + ' ' + 'OCR with format: '
+                bbox[0] = int(bbox[0] / w * 1000)
+                bbox[1] = int(bbox[1] / h * 1000)
+                bbox[2] = int(bbox[2] / w * 1000)
+                bbox[3] = int(bbox[3] / h * 1000)
+            if ocr_type == "format":
+                qs = str(bbox) + " " + "OCR with format: "
            else:
-                qs = str(bbox) + ' ' + 'OCR: '
+                qs = str(bbox) + " " + "OCR: "
 
         if ocr_color:
-            if ocr_type == 'format':
-                qs = '[' + ocr_color + ']' + ' ' + 'OCR with format: '
+            if ocr_type == "format":
+                qs = "[" + ocr_color + "]" + " " + "OCR with format: "
            else:
-                qs = '[' + ocr_color + ']' + ' ' + 'OCR: '
+                qs = "[" + ocr_color + "]" + " " + "OCR: "
 
         if use_im_start_end:
-            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN + "\n" + qs
         else:
-            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
-
+            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
 
         conv_mpt = Conversation(
             system="""<|im_start|>system
@@ -571,109 +560,113 @@
                     input_ids,
                     images=[image_tensor_1.unsqueeze(0).half().cuda()],
                     do_sample=False,
-                    num_beams = 1,
-                    no_repeat_ngram_size = 20,
+                    num_beams=1,
+                    no_repeat_ngram_size=20,
                     streamer=streamer,
                     max_new_tokens=4096,
-                    stopping_criteria=[stopping_criteria]
-                    )
+                    stopping_criteria=[stopping_criteria],
+                )
         else:
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_tensor_1.unsqueeze(0).half().cuda()],
                     do_sample=False,
-                    num_beams = 1,
-                    no_repeat_ngram_size = 20,
+                    num_beams=1,
+                    no_repeat_ngram_size=20,
                     # streamer=streamer,
                     max_new_tokens=4096,
-                    stopping_criteria=[stopping_criteria]
-                    )
-
-        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
-
+                    stopping_criteria=[stopping_criteria],
+                )
+
+        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
+
         if outputs.endswith(stop_str):
-            outputs = outputs[:-len(stop_str)]
+            outputs = outputs[: -len(stop_str)]
         outputs = outputs.strip()
         response_str = outputs
 
         if render:
-            print('==============rendering===============')
-            from .render_tools import svg_to_html, content_mmd_to_html, tik_html, translation_table
+            print("==============rendering===============")
+            from .render_tools import content_mmd_to_html, svg_to_html, tik_html, translation_table
 
-            if '**kern' in outputs:
+            if "**kern" in outputs:
                 import verovio
+
                 tk = verovio.toolkit()
                 tk.loadData(outputs)
-                tk.setOptions({"pageWidth": 2100, "footer": 'none',
-                               'barLineWidth': 0.5, 'beamMaxSlope': 15,
-                               'staffLineWidth': 0.2, 'spacingStaff': 6})
+                tk.setOptions({"pageWidth": 2100, "footer": "none", "barLineWidth": 0.5, "beamMaxSlope": 15, "staffLineWidth": 0.2, "spacingStaff": 6})
                 tk.getPageCount()
                 svg = tk.renderToSVG()
-                svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
+                svg = svg.replace('overflow="inherit"', 'overflow="visible"')
 
                 svg_to_html(svg, save_render_file)
 
-            if ocr_type == 'format' and '**kern' not in outputs:
-
-
-                if '\\begin{tikzpicture}' not in outputs:
+            if ocr_type == "format" and "**kern" not in outputs:
+                if "\\begin{tikzpicture}" not in outputs:
                     html_path_2 = save_render_file
-                    right_num = outputs.count('\\right')
-                    left_num = outputs.count('\left')
+                    right_num = outputs.count("\\right")
+                    left_num = outputs.count("\left")
 
                     if right_num != left_num:
-                        outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
-
+                        outputs = (
+                            outputs.replace("\left(", "(")
+                            .replace("\\right)", ")")
+                            .replace("\left[", "[")
+                            .replace("\\right]", "]")
+                            .replace("\left{", "{")
+                            .replace("\\right}", "}")
+                            .replace("\left|", "|")
+                            .replace("\\right|", "|")
+                            .replace("\left.", ".")
+                            .replace("\\right.", ".")
+                        )
 
-                    outputs = outputs.replace('"', '``').replace('$', '')
+                    outputs = outputs.replace('"', "``").replace("$", "")
 
-                    outputs_list = outputs.split('\n')
-                    gt= ''
+                    outputs_list = outputs.split("\n")
+                    gt = ""
                     for out in outputs_list:
-                        gt +=  '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
-
-                    gt = gt[:-2]
+                        gt += '"' + out.replace("\\", "\\\\") + r"\n" + '"' + "+" + "\n"
 
+                    gt = gt[:-2]
 
                     lines = content_mmd_to_html
                     lines = lines.split("const text =")
-                    new_web = lines[0] + 'const text =' + gt + lines[1]
+                    new_web = lines[0] + "const text =" + gt + lines[1]
 
                 else:
                     html_path_2 = save_render_file
                     outputs = outputs.translate(translation_table)
-                    outputs_list = outputs.split('\n')
-                    gt= ''
+                    outputs_list = outputs.split("\n")
+                    gt = ""
                     for out in outputs_list:
                         if out:
-                            if '\\begin{tikzpicture}' not in out and '\\end{tikzpicture}' not in out:
-                                while out[-1] == ' ':
+                            if "\\begin{tikzpicture}" not in out and "\\end{tikzpicture}" not in out:
+                                while out[-1] == " ":
                                     out = out[:-1]
                                     if out is None:
                                         break
-
+
                                 if out:
-                                    if out[-1] != ';':
-                                        gt += out[:-1] + ';\n'
+                                    if out[-1] != ";":
+                                        gt += out[:-1] + ";\n"
                                     else:
-                                        gt += out + '\n'
+                                        gt += out + "\n"
                                 else:
-                                    gt += out + '\n'
-
+                                    gt += out + "\n"
 
                     lines = tik_html
                     lines = lines.split("const text =")
                     new_web = lines[0] + gt + lines[1]
 
-                with open(html_path_2, 'w') as web_f_new:
+                with open(html_path_2, "w") as web_f_new:
                     web_f_new.write(new_web)
         return response_str
 
     def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
-
         def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-            best_ratio_diff = float('inf')
+            best_ratio_diff = float("inf")
             best_ratio = (1, 1)
             area = width * height
             for ratio in target_ratios:
@@ -687,20 +680,19 @@
                     best_ratio = ratio
            # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
            return best_ratio
-        
+
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
 
         # calculate the existing image aspect ratio
         target_ratios = set(
-            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and
-            i * j >= min_num)
+            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num
+        )
         # print(target_ratios)
         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
 
         # find the closest aspect ratio to the target
-        target_aspect_ratio = find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+        target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
 
         # print(target_aspect_ratio)
         # calculate the target width and height
@@ -716,7 +708,7 @@
                 (i % (target_width // image_size)) * image_size,
                 (i // (target_width // image_size)) * image_size,
                 ((i % (target_width // image_size)) + 1) * image_size,
-                ((i // (target_width // image_size)) + 1) * image_size
+                ((i // (target_width // image_size)) + 1) * image_size,
             )
             # split the image
             split_img = resized_img.crop(box)
@@ -727,18 +719,15 @@
             processed_images.append(thumbnail_img)
         return processed_images
 
-
-    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
+    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag=False):
         # Model
         self.disable_torch_init()
-        multi_page=False
-
+        multi_page = False
 
-        image_processor_high =  GOTImageEvalProcessor(image_size=1024)
+        image_processor_high = GOTImageEvalProcessor(image_size=1024)
 
         use_im_start_end = True
 
-
         image_token_len = 256
 
         image_list = []
@@ -747,7 +736,7 @@
        # multi_page = True
 
        if multi_page:
-            qs = 'OCR with format across multi pages: '
+            qs = "OCR with format across multi pages: "
            # only for png files
            # import glob
            # from natsort import natsorted
@@ -763,10 +752,10 @@
            # print("len ll: ", ll)
 
        else:
-            if ocr_type == 'format':
-                qs = 'OCR with format upon the patch reference: '
+            if ocr_type == "format":
+                qs = "OCR with format upon the patch reference: "
            else:
-                qs = 'OCR upon the patch reference: '
+                qs = "OCR upon the patch reference: "
            if gradio_input:
                img = image_file.copy()
            else:
@@ -778,17 +767,14 @@
            image_tensor_1 = image_processor_high(image)
            image_list.append(image_tensor_1)
 
-
        image_list = torch.stack(image_list)
 
-        print('====new images batch size======:  \n', image_list.shape)
-
+        print("====new images batch size======: \n", image_list.shape)
 
        if use_im_start_end:
-            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len * ll + DEFAULT_IM_END_TOKEN + "\n" + qs
        else:
-            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
-
+            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
 
        conv_mpt = Conversation(
            system="""<|im_start|>system
@@ -825,57 +811,68 @@
                     input_ids,
                     images=[image_list.half().cuda()],
                     do_sample=False,
-                    num_beams = 1,
+                    num_beams=1,
                     # no_repeat_ngram_size = 20,
                     streamer=streamer,
                     max_new_tokens=4096,
-                    stopping_criteria=[stopping_criteria]
-                    )
+                    stopping_criteria=[stopping_criteria],
+                )
         else:
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_list.half().cuda()],
                     do_sample=False,
-                    num_beams = 1,
+                    num_beams=1,
                     # no_repeat_ngram_size = 20,
                     # streamer=streamer,
                     max_new_tokens=4096,
-                    stopping_criteria=[stopping_criteria]
-                    )
-
-        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
-
+                    stopping_criteria=[stopping_criteria],
+                )
+
+        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
 
         if outputs.endswith(stop_str):
-            outputs = outputs[:-len(stop_str)]
-        outputs = outputs.strip()
+            outputs = outputs[: -len(stop_str)]
+        outputs = outputs.strip()
         response_str = outputs
 
         if render:
-            print('==============rendering===============')
+            print("==============rendering===============")
             from .render_tools import content_mmd_to_html
+
             html_path_2 = save_render_file
-            right_num = outputs.count('\\right')
-            left_num = outputs.count('\left')
+            right_num = outputs.count("\\right")
+            left_num = outputs.count("\left")
 
             if right_num != left_num:
-                outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
-
-
-            outputs = outputs.replace('"', '``').replace('$', '')
-
-            outputs_list = outputs.split('\n')
-            gt= ''
+                outputs = (
+                    outputs.replace("\left(", "(")
+                    .replace("\\right)", ")")
+                    .replace("\left[", "[")
+                    .replace("\\right]", "]")
+                    .replace("\left{", "{")
+                    .replace("\\right}", "}")
+                    .replace("\left|", "|")
+                    .replace("\\right|", "|")
+                    .replace("\left.", ".")
+                    .replace("\\right.", ".")
+                )
+
+            outputs = outputs.replace('"', "``").replace("$", "")
+
+            outputs_list = outputs.split("\n")
+            gt = ""
             for out in outputs_list:
-                gt +=  '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
-
+                gt += '"' + out.replace("\\", "\\\\") + r"\n" + '"' + "+" + "\n"
+
             gt = gt[:-2]
 
             lines = content_mmd_to_html
             lines = lines.split("const text =")
-            new_web = lines[0] + 'const text =' + gt + lines[1]
-
-            with open(html_path_2, 'w') as web_f_new:
+            new_web = lines[0] + "const text =" + gt + lines[1]
+
+            with open(html_path_2, "w") as web_f_new:
                 web_f_new.write(new_web)
 
-        return response_str
+        return response_str
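Note (not part of the diff): a hedged usage sketch of the chat()/chat_crop() entry points reformatted above, loaded through transformers' trust_remote_code mechanism. The repository id, image path, and extra loading arguments are placeholders, not taken from this commit; only the ocr_type values and method names come from the code itself.

from transformers import AutoModel, AutoTokenizer

REPO = "path-or-hub-id-of-this-repo"  # placeholder: local clone or Hub id of this repository
tokenizer = AutoTokenizer.from_pretrained(REPO, trust_remote_code=True)
model = AutoModel.from_pretrained(REPO, trust_remote_code=True, pad_token_id=tokenizer.eos_token_id)
model = model.eval().cuda()  # chat()/chat_crop() move tensors to CUDA internally

plain = model.chat(tokenizer, "page.png", ocr_type="ocr")           # plain-text OCR
formatted = model.chat(tokenizer, "page.png", ocr_type="format")    # formatted output (LaTeX/markdown-style)
sliced = model.chat_crop(tokenizer, "page.png", ocr_type="format")  # dynamic multi-crop OCR for dense pages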
render_tools.py CHANGED
@@ -1,13 +1,9 @@
+punctuation_dict = {",": ",", "。": "."}
 
-punctuation_dict = {
-    ",": ",",
-    "。": ".",
-
-}
 translation_table = str.maketrans(punctuation_dict)
-
-def svg_to_html(svg_content, output_filename):
 
+
+def svg_to_html(svg_content, output_filename):
     html_content = f"""
 <!DOCTYPE html>
 <html lang="en">
@@ -24,9 +20,8 @@ def svg_to_html(svg_content, output_filename):
 </html>
 """
 
-    with open(output_filename, 'w') as file:
+    with open(output_filename, "w") as file:
         file.write(html_content)
-
 
 
 content_mmd_to_html = """<!DOCTYPE html>
@@ -34,7 +29,7 @@ content_mmd_to_html = """<!DOCTYPE html>
 <meta charset="UTF-8">
 <title>Title</title>
 <script>
-const text = 
+const text =
 </script>
 <style>
 #content {
@@ -71,7 +66,6 @@ content_mmd_to_html = """<!DOCTYPE html>
 """
 
 
-
 tik_html = """
 <!DOCTYPE html>
 
@@ -92,5 +86,4 @@ const text =
 </html>"""
 
 
-
-# print(tik_html)
+# print(tik_html)
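Note (not part of the diff): a small illustration of the two helpers kept in this module, mirroring how modeling_GOT.py uses them; the flat import path and the output file name are assumptions for a standalone run.

from render_tools import svg_to_html, translation_table  # relative import (.render_tools) inside the repo

print("你好,世界。".translate(translation_table))  # full-width punctuation mapped to ASCII: 你好,世界.
svg_to_html("<svg xmlns='http://www.w3.org/2000/svg'></svg>", "sheet_music.html")  # wraps an SVG in a standalone HTML page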
requirements.txt ADDED
@@ -0,0 +1,6 @@
+tiktoken
+transformers
+torch
+torchvision
+requests
+verovio
special_tokens_map.json CHANGED
@@ -6,4 +6,4 @@
     "rstrip": false,
     "single_word": false
   }
-}
\ No newline at end of file
+}
tokenization_qwen.py CHANGED
@@ -12,7 +12,7 @@ import unicodedata
 from typing import Collection, Dict, List, Set, Tuple, Union
 
 import tiktoken
-from transformers import PreTrainedTokenizer, AddedToken
+from transformers import AddedToken, PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -37,10 +37,8 @@ SPECIAL_TOKENS = (
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
     with open(tiktoken_bpe_file, "rb") as f:
         contents = f.read()
-    return {
-        base64.b64decode(token): int(rank)
-        for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
+    return {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
+
 
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
@@ -51,19 +49,19 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         vocab_file,
         errors="replace",
-        image_start_tag='<img>',
-        image_end_tag='</img>',
-        image_pad_tag='<imgpad>',
-        ref_start_tag='<ref>',
-        ref_end_tag='</ref>',
-        box_start_tag='<box>',
-        box_end_tag='</box>',
-        quad_start_tag='<quad>',
-        quad_end_tag='</quad>',
+        image_start_tag="<img>",
+        image_end_tag="</img>",
+        image_pad_tag="<imgpad>",
+        ref_start_tag="<ref>",
+        ref_end_tag="</ref>",
+        box_start_tag="<box>",
+        box_end_tag="</box>",
+        quad_start_tag="<quad>",
+        quad_end_tag="</quad>",
         **kwargs,
     ):
         super().__init__(**kwargs)
-        
+
         self.image_start_tag = image_start_tag
         self.image_end_tag = image_end_tag
         self.image_pad_tag = image_pad_tag
@@ -73,24 +71,13 @@ class QWenTokenizer(PreTrainedTokenizer):
         self.box_end_tag = box_end_tag
         self.quad_start_tag = quad_start_tag
         self.quad_end_tag = quad_end_tag
-        self.IMAGE_ST = (
-            ref_start_tag, ref_end_tag,
-            box_start_tag, box_end_tag,
-            quad_start_tag, quad_end_tag,
-            image_start_tag, image_end_tag,
-            image_pad_tag
-        )
+        self.IMAGE_ST = (ref_start_tag, ref_end_tag, box_start_tag, box_end_tag, quad_start_tag, quad_end_tag, image_start_tag, image_end_tag, image_pad_tag)
 
         self.errors = errors  # how to handle errors in decoding
 
         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
-        self.special_tokens = {
-            token: index
-            for index, token in enumerate(
-                SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
-            )
-        }
-
+        self.special_tokens = {token: index for index, token in enumerate(SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks))}
+
         self.img_start_id = self.special_tokens[self.image_start_tag]
         self.img_end_id = self.special_tokens[self.image_end_tag]
         self.img_pad_id = self.special_tokens[self.image_pad_tag]
@@ -111,9 +98,7 @@ class QWenTokenizer(PreTrainedTokenizer):
             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
 
-        self.decoder = {
-            v: k for k, v in self.mergeable_ranks.items()
-        }  # type: dict[int, bytes|str]
+        self.decoder = {v: k for k, v in self.mergeable_ranks.items()}  # type: dict[int, bytes|str]
         self.decoder.update({v: k for k, v in self.special_tokens.items()})
 
         self.tokenizer = enc  # type: tiktoken.Encoding
@@ -128,9 +113,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def get_vocab(self) -> Dict[bytes, int]:
         return self.mergeable_ranks
 
-    def convert_tokens_to_ids(
-        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
-    ) -> List[int]:
+    def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]:
         ids = []
         if isinstance(tokens, (str, bytes)):
             if tokens in self.special_tokens:
@@ -146,11 +129,11 @@ class QWenTokenizer(PreTrainedTokenizer):
 
     def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError('Adding regular tokens is not supported')
+            raise ValueError("Adding regular tokens is not supported")
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
             if surface_form not in SPECIAL_TOKENS:
-                raise ValueError('Adding unknown special tokens is not supported')
+                raise ValueError("Adding unknown special tokens is not supported")
         return 0
 
     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
@@ -197,9 +180,7 @@ class QWenTokenizer(PreTrainedTokenizer):
         text = unicodedata.normalize("NFC", text)
 
         # this implementation takes a detour: text -> token id -> token surface forms
-        for t in self.tokenizer.encode(
-            text, allowed_special=allowed_special, disallowed_special=disallowed_special
-        ):
+        for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special):
             tokens.append(self.decoder[t])
         return tokens