Mageia committed
Commit 71819c7
1 Parent(s): e5bb317

fix: format .py

config.json CHANGED
@@ -35,4 +35,4 @@
35
  "use_im_start_end": true,
36
  "use_sliding_window": false,
37
  "vocab_size": 151860
38
- }
 
35
  "use_im_start_end": true,
36
  "use_sliding_window": false,
37
  "vocab_size": 151860
38
+ }
generation_config.json CHANGED
@@ -3,4 +3,4 @@
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
  "transformers_version": "4.37.2"
6
- }
 
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
  "transformers_version": "4.37.2"
6
+ }
got_vision_b.py CHANGED
@@ -1,10 +1,9 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from typing import Optional, Tuple, Type
4
  from functools import partial
5
- import torch.nn as nn
6
- from typing import Type
7
 
 
 
 
8
 
9
 
10
  class MLPBlock(nn.Module):
@@ -23,7 +22,6 @@ class MLPBlock(nn.Module):
23
  return self.lin2(self.act(self.lin1(x)))
24
 
25
 
26
-
27
  class LayerNorm2d(nn.Module):
28
  def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
29
  super().__init__()
@@ -39,7 +37,6 @@ class LayerNorm2d(nn.Module):
39
  return x
40
 
41
 
42
-
43
  class ImageEncoderViT(nn.Module):
44
  def __init__(
45
  self,
@@ -91,9 +88,7 @@ class ImageEncoderViT(nn.Module):
91
  self.pos_embed: Optional[nn.Parameter] = None
92
  if use_abs_pos:
93
  # Initialize absolute positional embedding with pretrain image size.
94
- self.pos_embed = nn.Parameter(
95
- torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)
96
- )
97
 
98
  self.blocks = nn.ModuleList()
99
  for i in range(depth):
@@ -129,7 +124,6 @@ class ImageEncoderViT(nn.Module):
129
  LayerNorm2d(out_chans),
130
  )
131
 
132
-
133
  self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
134
  self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)
135
 
@@ -145,7 +139,6 @@ class ImageEncoderViT(nn.Module):
145
  x = self.net_2(x)
146
  x = self.net_3(x)
147
 
148
-
149
  return x
150
 
151
 
@@ -247,9 +240,7 @@ class Attention(nn.Module):
247
 
248
  self.use_rel_pos = use_rel_pos
249
  if self.use_rel_pos:
250
- assert (
251
- input_size is not None
252
- ), "Input size must be provided if using relative positional encoding."
253
  # initialize relative positional embeddings
254
  self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
255
  self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
@@ -297,9 +288,7 @@ def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, T
297
  return windows, (Hp, Wp)
298
 
299
 
300
- def window_unpartition(
301
- windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
302
- ) -> torch.Tensor:
303
  """
304
  Window unpartition into original sequences and removing padding.
305
  Args:
@@ -385,9 +374,7 @@ def add_decomposed_rel_pos(
385
  rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
386
  rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
387
 
388
- attn = (
389
- attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
390
- ).view(B, q_h * q_w, k_h * k_w)
391
 
392
  return attn
393
 
@@ -415,9 +402,7 @@ class PatchEmbed(nn.Module):
415
  """
416
  super().__init__()
417
 
418
- self.proj = nn.Conv2d(
419
- in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
420
- )
421
 
422
  def forward(self, x: torch.Tensor) -> torch.Tensor:
423
  x = self.proj(x)
@@ -426,7 +411,6 @@ class PatchEmbed(nn.Module):
426
  return x
427
 
428
 
429
-
430
  def build_GOT_vit_b(checkpoint=None):
431
  return _build_GOT_vision(
432
  encoder_embed_dim=768,
@@ -448,21 +432,19 @@ def _build_GOT_vision(
448
  image_size = 1024
449
  vit_patch_size = 16
450
  image_embedding_size = image_size // vit_patch_size
451
- image_encoder=ImageEncoderViT(
452
- depth=encoder_depth,
453
- embed_dim=encoder_embed_dim,
454
- img_size=image_size,
455
- mlp_ratio=4,
456
- norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
457
- num_heads=encoder_num_heads,
458
- patch_size=vit_patch_size,
459
- qkv_bias=True,
460
- use_rel_pos=True,
461
- global_attn_indexes=encoder_global_attn_indexes,
462
- window_size=14,
463
- out_chans=prompt_embed_dim,
464
- )
465
-
466
 
467
  return image_encoder
468
-
 
 
 
 
1
  from functools import partial
2
+ from typing import Optional, Tuple, Type
 
3
 
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
 
8
 
9
  class MLPBlock(nn.Module):
 
22
  return self.lin2(self.act(self.lin1(x)))
23
 
24
 
 
25
  class LayerNorm2d(nn.Module):
26
  def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
27
  super().__init__()
 
37
  return x
38
 
39
 
 
40
  class ImageEncoderViT(nn.Module):
41
  def __init__(
42
  self,
 
88
  self.pos_embed: Optional[nn.Parameter] = None
89
  if use_abs_pos:
90
  # Initialize absolute positional embedding with pretrain image size.
91
+ self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
 
 
92
 
93
  self.blocks = nn.ModuleList()
94
  for i in range(depth):
 
124
  LayerNorm2d(out_chans),
125
  )
126
 
 
127
  self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
128
  self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)
129
 
 
139
  x = self.net_2(x)
140
  x = self.net_3(x)
141
 
 
142
  return x
143
 
144
 
 
240
 
241
  self.use_rel_pos = use_rel_pos
242
  if self.use_rel_pos:
243
+ assert input_size is not None, "Input size must be provided if using relative positional encoding."
 
 
244
  # initialize relative positional embeddings
245
  self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
246
  self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
 
288
  return windows, (Hp, Wp)
289
 
290
 
291
+ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]) -> torch.Tensor:
 
 
292
  """
293
  Window unpartition into original sequences and removing padding.
294
  Args:
 
374
  rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
375
  rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
376
 
377
+ attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w)
 
 
378
 
379
  return attn
380
 
 
402
  """
403
  super().__init__()
404
 
405
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
 
 
406
 
407
  def forward(self, x: torch.Tensor) -> torch.Tensor:
408
  x = self.proj(x)
 
411
  return x
412
 
413
 
 
414
  def build_GOT_vit_b(checkpoint=None):
415
  return _build_GOT_vision(
416
  encoder_embed_dim=768,
 
432
  image_size = 1024
433
  vit_patch_size = 16
434
  image_embedding_size = image_size // vit_patch_size
435
+ image_encoder = ImageEncoderViT(
436
+ depth=encoder_depth,
437
+ embed_dim=encoder_embed_dim,
438
+ img_size=image_size,
439
+ mlp_ratio=4,
440
+ norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
441
+ num_heads=encoder_num_heads,
442
+ patch_size=vit_patch_size,
443
+ qkv_bias=True,
444
+ use_rel_pos=True,
445
+ global_attn_indexes=encoder_global_attn_indexes,
446
+ window_size=14,
447
+ out_chans=prompt_embed_dim,
448
+ )
 
449
 
450
  return image_encoder
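For reference, a minimal sketch of how the reformatted got_vision_b.py encoder is exercised. It assumes the file is importable as a local module and that prompt_embed_dim resolves to 256 in the elided part of _build_GOT_vision (consistent with net_2's hard-coded 256 input channels); the shapes are inferred from the diff, not measured here.

import torch
from got_vision_b import build_GOT_vit_b  # assumption: this repo's file is on the import path

encoder = build_GOT_vit_b()                 # the ViT-B encoder built above, window size 14, rel-pos enabled
dummy = torch.zeros(1, 3, 1024, 1024)       # one RGB image at the 1024-pixel eval resolution
with torch.no_grad():
    feats = encoder(dummy)                  # neck -> net_2 -> net_3 downsamples 64x64 to 16x16 with 1024 channels
tokens = feats.flatten(2).permute(0, 2, 1)  # (1, 256, 1024), the layout mm_projector_vary consumes in modeling_GOT.py
print(tokens.shape)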
 
modeling_GOT.py CHANGED
@@ -1,27 +1,32 @@
1
- from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM, StoppingCriteria, TextStreamer
2
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 
3
  from typing import List, Optional, Tuple, Union
4
- from transformers.cache_utils import Cache
5
  import requests
6
- from PIL import Image
7
- from io import BytesIO
8
  import torch
9
  import torch.nn as nn
 
10
  from torch.nn import CrossEntropyLoss
11
- from .got_vision_b import build_GOT_vit_b
12
  from torchvision import transforms
13
  from torchvision.transforms.functional import InterpolationMode
14
- import dataclasses
15
  ###
16
 
17
  DEFAULT_IMAGE_TOKEN = "<image>"
18
- DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
19
- DEFAULT_IM_START_TOKEN = '<img>'
20
- DEFAULT_IM_END_TOKEN = '</img>'
 
21
 
22
- from enum import auto, Enum
23
  class SeparatorStyle(Enum):
24
  """Different separator style."""
 
25
  SINGLE = auto()
26
  TWO = auto()
27
  MPT = auto()
@@ -30,6 +35,7 @@ class SeparatorStyle(Enum):
30
  @dataclasses.dataclass
31
  class Conversation:
32
  """A class that keeps all conversation history."""
 
33
  system: str
34
  roles: List[str]
35
  messages: List[List[str]]
@@ -43,7 +49,7 @@ class Conversation:
43
 
44
  def get_prompt(self):
45
  if self.sep_style == SeparatorStyle.SINGLE:
46
- ret = self.system + self.sep + '\n'
47
  for role, message in self.messages:
48
  if message:
49
  if type(message) is tuple:
@@ -65,9 +71,9 @@ class Conversation:
65
  return ret
66
  if self.sep_style == SeparatorStyle.MPT:
67
  if self.system:
68
- ret = self.system + self.sep
69
  else:
70
- ret = ''
71
  for role, message in self.messages:
72
  if message:
73
  if type(message) is tuple:
@@ -79,7 +85,6 @@ class Conversation:
79
  else:
80
  raise ValueError(f"Invalid style: {self.sep_style}")
81
 
82
-
83
  def append_message(self, role, message):
84
  self.messages.append([role, message])
85
 
@@ -91,8 +96,8 @@ class Conversation:
91
  offset=self.offset,
92
  sep_style=self.sep_style,
93
  sep=self.sep,
94
- sep2=self.sep2)
95
-
96
 
97
 
98
  class KeywordsStoppingCriteria(StoppingCriteria):
@@ -111,12 +116,12 @@ class KeywordsStoppingCriteria(StoppingCriteria):
111
  for keyword_id in self.keyword_ids:
112
  if output_ids[0, -1] == keyword_id:
113
  return True
114
- outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
115
  for keyword in self.keywords:
116
  if keyword in outputs:
117
  return True
118
  return False
119
-
120
 
121
  class GOTImageEvalProcessor:
122
  def __init__(self, image_size=384, mean=None, std=None):
@@ -129,18 +134,16 @@ class GOTImageEvalProcessor:
129
 
130
  self.transform = transforms.Compose(
131
  [
132
- transforms.Resize(
133
- (image_size, image_size), interpolation=InterpolationMode.BICUBIC
134
- ),
135
  transforms.ToTensor(),
136
  self.normalize,
137
  ]
138
  )
 
139
  def __call__(self, item):
140
  return self.transform(item)
141
 
142
 
143
-
144
  class GOTConfig(Qwen2Config):
145
  model_type = "GOT"
146
 
@@ -153,28 +156,24 @@ class GOTQwenModel(Qwen2Model):
153
 
154
  self.vision_tower_high = build_GOT_vit_b()
155
 
156
- self.mm_projector_vary = nn.Linear(1024, 1024)
157
-
158
 
159
  def initialize_vision_modules(
160
- self,
161
  vision_tower,
162
  pretrained_stage1_model=None,
163
  freeze_vision_tower=False,
164
  use_im_start_end=False,
165
  vision_select_layer=-1,
166
  dtype=torch.float16,
167
- device="cuda"
168
  ):
169
-
170
-
171
  image_processor_high = GOTImageEvalProcessor(image_size=1024)
172
-
173
  self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)
174
 
175
  self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)
176
 
177
-
178
  image_token_len = 256
179
 
180
  self.config.vision_tower = vision_tower
@@ -184,13 +183,12 @@ class GOTQwenModel(Qwen2Model):
184
 
185
  self.config.vision_select_layer = vision_select_layer
186
  self.config.freeze_vision_tower = freeze_vision_tower
187
-
188
  return dict(
189
  image_processor_high=image_processor_high,
190
  image_token_len=image_token_len,
191
  )
192
-
193
-
194
  def forward(
195
  self,
196
  input_ids: torch.LongTensor = None,
@@ -204,19 +202,16 @@ class GOTQwenModel(Qwen2Model):
204
  images: Optional[torch.FloatTensor] = None,
205
  return_dict: Optional[bool] = None,
206
  ) -> Union[Tuple, BaseModelOutputWithPast]:
207
-
208
  # HACK: replace back original embeddings for LLaVA pretraining
209
- orig_embeds_params = getattr(self, 'orig_embeds_params', None)
210
  if orig_embeds_params is not None:
211
  with torch.no_grad():
212
- self.get_input_embeddings().weight[:-self.num_new_tokens] = orig_embeds_params[:-self.num_new_tokens].data
213
 
214
  if inputs_embeds is None:
215
  inputs_embeds = self.embed_tokens(input_ids)
216
 
217
-
218
- vision_tower_high = getattr(self, 'vision_tower_high', None)
219
-
220
 
221
  if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
222
  use_im_start_end = getattr(self.config, "use_im_start_end", -1)
@@ -232,15 +227,15 @@ class GOTQwenModel(Qwen2Model):
232
  im_start_token = 151857
233
 
234
  im_end_token = 151858
235
-
236
  image_features = []
237
-
238
  for image in images:
239
  P, C, H, W = image.shape
240
  if P == 1:
241
  with torch.set_grad_enabled(False):
242
  cnn_feature = vision_tower_high(image)
243
- cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1) # 256*1024
244
  image_feature = self.mm_projector_vary(cnn_feature)
245
  image_features.append(image_feature)
246
 
@@ -249,7 +244,7 @@ class GOTQwenModel(Qwen2Model):
249
  image_patches_features = []
250
  for image_patch in image_patches:
251
  image_p = torch.stack([image_patch])
252
-
253
  with torch.set_grad_enabled(False):
254
  cnn_feature_p = vision_tower_high(image_p)
255
  cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
@@ -258,21 +253,20 @@ class GOTQwenModel(Qwen2Model):
258
  image_feature = torch.cat(image_patches_features, dim=1)
259
  image_features.append(image_feature)
260
 
261
-
262
  dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
263
  dummy_image_features = dummy_image_features_2
264
  use_im_start_end = True
265
  new_input_embeds = []
266
  for cur_input_ids, cur_input_embeds, cur_image_features in zip(input_ids, inputs_embeds, image_features):
267
  if (cur_input_ids == im_patch_token).sum() == 0:
268
- cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
269
  new_input_embeds.append(cur_input_embeds)
270
  continue
271
 
272
  if use_im_start_end:
273
  if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
274
  raise ValueError("The number of image start tokens and image end tokens should be the same.")
275
-
276
  image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
277
  for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
278
  per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
@@ -280,17 +274,16 @@ class GOTQwenModel(Qwen2Model):
280
 
281
  if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
282
  raise ValueError("The image end token should follow the image start token.")
283
-
284
  cur_input_embeds = torch.cat(
285
  (
286
- cur_input_embeds[:image_start_token_pos+1],
287
- per_cur_image_features,
288
- cur_input_embeds[image_start_token_pos + num_patches + 1:]
289
- ),
290
- dim=0
291
  )
292
 
293
-
294
  new_input_embeds.append(cur_input_embeds)
295
  else:
296
  raise NotImplementedError
@@ -298,14 +291,18 @@ class GOTQwenModel(Qwen2Model):
298
  inputs_embeds = torch.stack(new_input_embeds, dim=0)
299
 
300
  return super(GOTQwenModel, self).forward(
301
- input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
302
- inputs_embeds=inputs_embeds, use_cache=use_cache, position_ids = position_ids,
303
- output_attentions=output_attentions, output_hidden_states=output_hidden_states,
304
- return_dict=return_dict
 
 
 
 
 
305
  )
306
 
307
 
308
-
309
  class GOTQwenForCausalLM(Qwen2ForCausalLM):
310
  config_class = GOTConfig
311
  # supports_gradient_checkpointing = True
@@ -336,15 +333,12 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
336
  output_hidden_states: Optional[bool] = None,
337
  images: Optional[torch.FloatTensor] = None,
338
  return_dict: Optional[bool] = None,
339
-
340
  ) -> Union[Tuple, CausalLMOutputWithPast]:
341
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
342
- output_hidden_states = (
343
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
344
- )
345
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
346
 
347
- outputs = self.model(
348
  input_ids=input_ids,
349
  past_key_values=past_key_values,
350
  attention_mask=attention_mask,
@@ -354,8 +348,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
354
  output_attentions=output_attentions,
355
  output_hidden_states=output_hidden_states,
356
  images=images,
357
- return_dict=return_dict
358
-
359
  )
360
 
361
  hidden_states = outputs[0]
@@ -389,10 +382,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
389
  attentions=outputs.attentions,
390
  )
391
 
392
-
393
- def prepare_inputs_for_generation(
394
- self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
395
- ):
396
  # Omit tokens covered by past_key_values
397
  if past_key_values is not None:
398
  if isinstance(past_key_values, Cache):
@@ -416,11 +406,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
416
  # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
417
 
418
  # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
419
- if (
420
- max_cache_length is not None
421
- and attention_mask is not None
422
- and cache_length + input_ids.shape[1] > max_cache_length
423
- ):
424
  attention_mask = attention_mask[:, -max_cache_length:]
425
 
426
  position_ids = kwargs.get("position_ids", None)
@@ -448,16 +434,9 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
448
  )
449
  return model_inputs
450
 
451
- def initialize_vision_tokenizer(
452
- self,
453
- tokenizer,
454
- freeze_lm_model=False,
455
- pretrained_stage1_model=None,
456
- device="cuda"
457
- ):
458
  config = self.get_model().config
459
 
460
-
461
  self.resize_token_embeddings(len(tokenizer))
462
 
463
  config.im_patch_token = 151859
@@ -469,11 +448,11 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
469
  config.im_start_token, config.im_end_token = 151857, 151858
470
 
471
  def load_image(self, image_file):
472
- if image_file.startswith('http') or image_file.startswith('https'):
473
  response = requests.get(image_file)
474
- image = Image.open(BytesIO(response.content)).convert('RGB')
475
  else:
476
- image = Image.open(image_file).convert('RGB')
477
  return image
478
 
479
  def disable_torch_init(self):
@@ -481,15 +460,26 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
481
  Disable the redundant torch default initialization to accelerate model creation.
482
  """
483
  import torch
 
484
  setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
485
  setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
486
 
487
- def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
488
-
489
  self.disable_torch_init()
490
 
491
-
492
- image_processor_high = GOTImageEvalProcessor(image_size=1024)
493
 
494
  use_im_start_end = True
495
 
@@ -501,38 +491,37 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
501
  image = self.load_image(image_file)
502
 
503
  w, h = image.size
504
-
505
- if ocr_type == 'format':
506
- qs = 'OCR with format: '
507
  else:
508
- qs = 'OCR: '
509
 
510
  if ocr_box:
511
  bbox = eval(ocr_box)
512
  if len(bbox) == 2:
513
- bbox[0] = int(bbox[0]/w*1000)
514
- bbox[1] = int(bbox[1]/h*1000)
515
  if len(bbox) == 4:
516
- bbox[0] = int(bbox[0]/w*1000)
517
- bbox[1] = int(bbox[1]/h*1000)
518
- bbox[2] = int(bbox[2]/w*1000)
519
- bbox[3] = int(bbox[3]/h*1000)
520
- if ocr_type == 'format':
521
- qs = str(bbox) + ' ' + 'OCR with format: '
522
  else:
523
- qs = str(bbox) + ' ' + 'OCR: '
524
 
525
  if ocr_color:
526
- if ocr_type == 'format':
527
- qs = '[' + ocr_color + ']' + ' ' + 'OCR with format: '
528
  else:
529
- qs = '[' + ocr_color + ']' + ' ' + 'OCR: '
530
 
531
  if use_im_start_end:
532
- qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
533
  else:
534
- qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
535
-
536
 
537
  conv_mpt = Conversation(
538
  system="""<|im_start|>system
@@ -571,109 +560,113 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
571
  input_ids,
572
  images=[image_tensor_1.unsqueeze(0).half().cuda()],
573
  do_sample=False,
574
- num_beams = 1,
575
- no_repeat_ngram_size = 20,
576
  streamer=streamer,
577
  max_new_tokens=4096,
578
- stopping_criteria=[stopping_criteria]
579
- )
580
  else:
581
  with torch.autocast("cuda", dtype=torch.bfloat16):
582
  output_ids = self.generate(
583
  input_ids,
584
  images=[image_tensor_1.unsqueeze(0).half().cuda()],
585
  do_sample=False,
586
- num_beams = 1,
587
- no_repeat_ngram_size = 20,
588
  # streamer=streamer,
589
  max_new_tokens=4096,
590
- stopping_criteria=[stopping_criteria]
591
- )
592
-
593
- outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
594
-
595
  if outputs.endswith(stop_str):
596
- outputs = outputs[:-len(stop_str)]
597
  outputs = outputs.strip()
598
  response_str = outputs
599
 
600
  if render:
601
- print('==============rendering===============')
602
- from .render_tools import svg_to_html, content_mmd_to_html, tik_html, translation_table
603
 
604
- if '**kern' in outputs:
605
  import verovio
 
606
  tk = verovio.toolkit()
607
  tk.loadData(outputs)
608
- tk.setOptions({"pageWidth": 2100, "footer": 'none',
609
- 'barLineWidth': 0.5, 'beamMaxSlope': 15,
610
- 'staffLineWidth': 0.2, 'spacingStaff': 6})
611
  tk.getPageCount()
612
  svg = tk.renderToSVG()
613
- svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
614
 
615
  svg_to_html(svg, save_render_file)
616
 
617
- if ocr_type == 'format' and '**kern' not in outputs:
618
-
619
-
620
- if '\\begin{tikzpicture}' not in outputs:
621
  html_path_2 = save_render_file
622
- right_num = outputs.count('\\right')
623
- left_num = outputs.count('\left')
624
 
625
  if right_num != left_num:
626
- outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
627
-
628
 
629
- outputs = outputs.replace('"', '``').replace('$', '')
630
 
631
- outputs_list = outputs.split('\n')
632
- gt= ''
633
  for out in outputs_list:
634
- gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
635
-
636
- gt = gt[:-2]
637
 
 
638
 
639
  lines = content_mmd_to_html
640
  lines = lines.split("const text =")
641
- new_web = lines[0] + 'const text =' + gt + lines[1]
642
 
643
  else:
644
  html_path_2 = save_render_file
645
  outputs = outputs.translate(translation_table)
646
- outputs_list = outputs.split('\n')
647
- gt= ''
648
  for out in outputs_list:
649
  if out:
650
- if '\\begin{tikzpicture}' not in out and '\\end{tikzpicture}' not in out:
651
- while out[-1] == ' ':
652
  out = out[:-1]
653
  if out is None:
654
  break
655
-
656
  if out:
657
- if out[-1] != ';':
658
- gt += out[:-1] + ';\n'
659
  else:
660
- gt += out + '\n'
661
  else:
662
- gt += out + '\n'
663
-
664
 
665
  lines = tik_html
666
  lines = lines.split("const text =")
667
  new_web = lines[0] + gt + lines[1]
668
 
669
- with open(html_path_2, 'w') as web_f_new:
670
  web_f_new.write(new_web)
671
  return response_str
672
 
673
  def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
674
-
675
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
676
- best_ratio_diff = float('inf')
677
  best_ratio = (1, 1)
678
  area = width * height
679
  for ratio in target_ratios:
@@ -687,20 +680,19 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
687
  best_ratio = ratio
688
  # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
689
  return best_ratio
690
-
691
  orig_width, orig_height = image.size
692
  aspect_ratio = orig_width / orig_height
693
 
694
  # calculate the existing image aspect ratio
695
  target_ratios = set(
696
- (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
697
- i * j <= max_num and i * j >= min_num)
698
  # print(target_ratios)
699
  target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
700
 
701
  # find the closest aspect ratio to the target
702
- target_aspect_ratio = find_closest_aspect_ratio(
703
- aspect_ratio, target_ratios, orig_width, orig_height, image_size)
704
 
705
  # print(target_aspect_ratio)
706
  # calculate the target width and height
@@ -716,7 +708,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
716
  (i % (target_width // image_size)) * image_size,
717
  (i // (target_width // image_size)) * image_size,
718
  ((i % (target_width // image_size)) + 1) * image_size,
719
- ((i // (target_width // image_size)) + 1) * image_size
720
  )
721
  # split the image
722
  split_img = resized_img.crop(box)
@@ -727,18 +719,15 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
727
  processed_images.append(thumbnail_img)
728
  return processed_images
729
 
730
-
731
- def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
732
  # Model
733
  self.disable_torch_init()
734
- multi_page=False
735
-
736
 
737
- image_processor_high = GOTImageEvalProcessor(image_size=1024)
738
 
739
  use_im_start_end = True
740
 
741
-
742
  image_token_len = 256
743
 
744
  image_list = []
@@ -747,7 +736,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
747
  # multi_page = True
748
 
749
  if multi_page:
750
- qs = 'OCR with format across multi pages: '
751
  # only for png files
752
  # import glob
753
  # from natsort import natsorted
@@ -763,10 +752,10 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
763
  # print("len ll: ", ll)
764
 
765
  else:
766
- if ocr_type == 'format':
767
- qs = 'OCR with format upon the patch reference: '
768
  else:
769
- qs = 'OCR upon the patch reference: '
770
  if gradio_input:
771
  img = image_file.copy()
772
  else:
@@ -778,17 +767,14 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
778
  image_tensor_1 = image_processor_high(image)
779
  image_list.append(image_tensor_1)
780
 
781
-
782
  image_list = torch.stack(image_list)
783
 
784
- print('====new images batch size======: \n',image_list.shape)
785
-
786
 
787
  if use_im_start_end:
788
- qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
789
  else:
790
- qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
791
-
792
 
793
  conv_mpt = Conversation(
794
  system="""<|im_start|>system
@@ -825,57 +811,68 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
825
  input_ids,
826
  images=[image_list.half().cuda()],
827
  do_sample=False,
828
- num_beams = 1,
829
  # no_repeat_ngram_size = 20,
830
  streamer=streamer,
831
  max_new_tokens=4096,
832
- stopping_criteria=[stopping_criteria]
833
- )
834
  else:
835
  with torch.autocast("cuda", dtype=torch.bfloat16):
836
  output_ids = self.generate(
837
  input_ids,
838
  images=[image_list.half().cuda()],
839
  do_sample=False,
840
- num_beams = 1,
841
  # no_repeat_ngram_size = 20,
842
  # streamer=streamer,
843
  max_new_tokens=4096,
844
- stopping_criteria=[stopping_criteria]
845
- )
 
 
846
 
847
- outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
848
-
849
  if outputs.endswith(stop_str):
850
- outputs = outputs[:-len(stop_str)]
851
- outputs = outputs.strip()
852
  response_str = outputs
853
 
854
  if render:
855
- print('==============rendering===============')
856
  from .render_tools import content_mmd_to_html
 
857
  html_path_2 = save_render_file
858
- right_num = outputs.count('\\right')
859
- left_num = outputs.count('\left')
860
 
861
  if right_num != left_num:
862
- outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
863
-
864
-
865
- outputs = outputs.replace('"', '``').replace('$', '')
866
-
867
- outputs_list = outputs.split('\n')
868
- gt= ''
869
  for out in outputs_list:
870
- gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
871
-
872
  gt = gt[:-2]
873
 
874
  lines = content_mmd_to_html
875
  lines = lines.split("const text =")
876
- new_web = lines[0] + 'const text =' + gt + lines[1]
877
-
878
- with open(html_path_2, 'w') as web_f_new:
879
  web_f_new.write(new_web)
880
 
881
- return response_str
 
1
+ import dataclasses
2
+ from enum import Enum, auto
3
+ from io import BytesIO
4
  from typing import List, Optional, Tuple, Union
5
+
6
  import requests
 
 
7
  import torch
8
  import torch.nn as nn
9
+ from PIL import Image
10
  from torch.nn import CrossEntropyLoss
 
11
  from torchvision import transforms
12
  from torchvision.transforms.functional import InterpolationMode
13
+ from transformers import Qwen2Config, Qwen2ForCausalLM, Qwen2Model, StoppingCriteria, TextStreamer
14
+ from transformers.cache_utils import Cache
15
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
16
+
17
+ from .got_vision_b import build_GOT_vit_b
18
+
19
  ###
20
 
21
  DEFAULT_IMAGE_TOKEN = "<image>"
22
+ DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"
23
+ DEFAULT_IM_START_TOKEN = "<img>"
24
+ DEFAULT_IM_END_TOKEN = "</img>"
25
+
26
 
 
27
  class SeparatorStyle(Enum):
28
  """Different separator style."""
29
+
30
  SINGLE = auto()
31
  TWO = auto()
32
  MPT = auto()
 
35
  @dataclasses.dataclass
36
  class Conversation:
37
  """A class that keeps all conversation history."""
38
+
39
  system: str
40
  roles: List[str]
41
  messages: List[List[str]]
 
49
 
50
  def get_prompt(self):
51
  if self.sep_style == SeparatorStyle.SINGLE:
52
+ ret = self.system + self.sep + "\n"
53
  for role, message in self.messages:
54
  if message:
55
  if type(message) is tuple:
 
71
  return ret
72
  if self.sep_style == SeparatorStyle.MPT:
73
  if self.system:
74
+ ret = self.system + self.sep
75
  else:
76
+ ret = ""
77
  for role, message in self.messages:
78
  if message:
79
  if type(message) is tuple:
 
85
  else:
86
  raise ValueError(f"Invalid style: {self.sep_style}")
87
 
 
88
  def append_message(self, role, message):
89
  self.messages.append([role, message])
90
 
 
96
  offset=self.offset,
97
  sep_style=self.sep_style,
98
  sep=self.sep,
99
+ sep2=self.sep2,
100
+ )
101
 
102
 
103
  class KeywordsStoppingCriteria(StoppingCriteria):
 
116
  for keyword_id in self.keyword_ids:
117
  if output_ids[0, -1] == keyword_id:
118
  return True
119
+ outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len :], skip_special_tokens=True)[0]
120
  for keyword in self.keywords:
121
  if keyword in outputs:
122
  return True
123
  return False
124
+
125
 
126
  class GOTImageEvalProcessor:
127
  def __init__(self, image_size=384, mean=None, std=None):
 
134
 
135
  self.transform = transforms.Compose(
136
  [
137
+ transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
 
 
138
  transforms.ToTensor(),
139
  self.normalize,
140
  ]
141
  )
142
+
143
  def __call__(self, item):
144
  return self.transform(item)
145
 
146
 
 
147
  class GOTConfig(Qwen2Config):
148
  model_type = "GOT"
149
 
 
156
 
157
  self.vision_tower_high = build_GOT_vit_b()
158
 
159
+ self.mm_projector_vary = nn.Linear(1024, 1024)
 
160
 
161
  def initialize_vision_modules(
162
+ self,
163
  vision_tower,
164
  pretrained_stage1_model=None,
165
  freeze_vision_tower=False,
166
  use_im_start_end=False,
167
  vision_select_layer=-1,
168
  dtype=torch.float16,
169
+ device="cuda",
170
  ):
 
 
171
  image_processor_high = GOTImageEvalProcessor(image_size=1024)
172
+
173
  self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)
174
 
175
  self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)
176
 
 
177
  image_token_len = 256
178
 
179
  self.config.vision_tower = vision_tower
 
183
 
184
  self.config.vision_select_layer = vision_select_layer
185
  self.config.freeze_vision_tower = freeze_vision_tower
186
+
187
  return dict(
188
  image_processor_high=image_processor_high,
189
  image_token_len=image_token_len,
190
  )
191
+
 
192
  def forward(
193
  self,
194
  input_ids: torch.LongTensor = None,
 
202
  images: Optional[torch.FloatTensor] = None,
203
  return_dict: Optional[bool] = None,
204
  ) -> Union[Tuple, BaseModelOutputWithPast]:
 
205
  # HACK: replace back original embeddings for LLaVA pretraining
206
+ orig_embeds_params = getattr(self, "orig_embeds_params", None)
207
  if orig_embeds_params is not None:
208
  with torch.no_grad():
209
+ self.get_input_embeddings().weight[: -self.num_new_tokens] = orig_embeds_params[: -self.num_new_tokens].data
210
 
211
  if inputs_embeds is None:
212
  inputs_embeds = self.embed_tokens(input_ids)
213
 
214
+ vision_tower_high = getattr(self, "vision_tower_high", None)
 
 
215
 
216
  if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
217
  use_im_start_end = getattr(self.config, "use_im_start_end", -1)
 
227
  im_start_token = 151857
228
 
229
  im_end_token = 151858
230
+
231
  image_features = []
232
+
233
  for image in images:
234
  P, C, H, W = image.shape
235
  if P == 1:
236
  with torch.set_grad_enabled(False):
237
  cnn_feature = vision_tower_high(image)
238
+ cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1) # 256*1024
239
  image_feature = self.mm_projector_vary(cnn_feature)
240
  image_features.append(image_feature)
241
 
 
244
  image_patches_features = []
245
  for image_patch in image_patches:
246
  image_p = torch.stack([image_patch])
247
+
248
  with torch.set_grad_enabled(False):
249
  cnn_feature_p = vision_tower_high(image_p)
250
  cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
 
253
  image_feature = torch.cat(image_patches_features, dim=1)
254
  image_features.append(image_feature)
255
 
 
256
  dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
257
  dummy_image_features = dummy_image_features_2
258
  use_im_start_end = True
259
  new_input_embeds = []
260
  for cur_input_ids, cur_input_embeds, cur_image_features in zip(input_ids, inputs_embeds, image_features):
261
  if (cur_input_ids == im_patch_token).sum() == 0:
262
+ cur_input_embeds = cur_input_embeds + (0.0 * dummy_image_features).sum()
263
  new_input_embeds.append(cur_input_embeds)
264
  continue
265
 
266
  if use_im_start_end:
267
  if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
268
  raise ValueError("The number of image start tokens and image end tokens should be the same.")
269
+
270
  image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
271
  for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
272
  per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
 
274
 
275
  if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
276
  raise ValueError("The image end token should follow the image start token.")
277
+
278
  cur_input_embeds = torch.cat(
279
  (
280
+ cur_input_embeds[: image_start_token_pos + 1],
281
+ per_cur_image_features,
282
+ cur_input_embeds[image_start_token_pos + num_patches + 1 :],
283
+ ),
284
+ dim=0,
285
  )
286
 
 
287
  new_input_embeds.append(cur_input_embeds)
288
  else:
289
  raise NotImplementedError
 
291
  inputs_embeds = torch.stack(new_input_embeds, dim=0)
292
 
293
  return super(GOTQwenModel, self).forward(
294
+ input_ids=None,
295
+ attention_mask=attention_mask,
296
+ past_key_values=past_key_values,
297
+ inputs_embeds=inputs_embeds,
298
+ use_cache=use_cache,
299
+ position_ids=position_ids,
300
+ output_attentions=output_attentions,
301
+ output_hidden_states=output_hidden_states,
302
+ return_dict=return_dict,
303
  )
304
 
305
 
 
306
  class GOTQwenForCausalLM(Qwen2ForCausalLM):
307
  config_class = GOTConfig
308
  # supports_gradient_checkpointing = True
 
333
  output_hidden_states: Optional[bool] = None,
334
  images: Optional[torch.FloatTensor] = None,
335
  return_dict: Optional[bool] = None,
 
336
  ) -> Union[Tuple, CausalLMOutputWithPast]:
337
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
338
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
339
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
340
 
341
+ outputs = self.model(
342
  input_ids=input_ids,
343
  past_key_values=past_key_values,
344
  attention_mask=attention_mask,
 
348
  output_attentions=output_attentions,
349
  output_hidden_states=output_hidden_states,
350
  images=images,
351
+ return_dict=return_dict,
 
352
  )
353
 
354
  hidden_states = outputs[0]
 
382
  attentions=outputs.attentions,
383
  )
384
 
385
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs):
 
 
 
386
  # Omit tokens covered by past_key_values
387
  if past_key_values is not None:
388
  if isinstance(past_key_values, Cache):
 
406
  # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
407
 
408
  # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
409
+ if max_cache_length is not None and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length:
 
 
 
 
410
  attention_mask = attention_mask[:, -max_cache_length:]
411
 
412
  position_ids = kwargs.get("position_ids", None)
 
434
  )
435
  return model_inputs
436
 
437
+ def initialize_vision_tokenizer(self, tokenizer, freeze_lm_model=False, pretrained_stage1_model=None, device="cuda"):
438
  config = self.get_model().config
439
 
 
440
  self.resize_token_embeddings(len(tokenizer))
441
 
442
  config.im_patch_token = 151859
 
448
  config.im_start_token, config.im_end_token = 151857, 151858
449
 
450
  def load_image(self, image_file):
451
+ if image_file.startswith("http") or image_file.startswith("https"):
452
  response = requests.get(image_file)
453
+ image = Image.open(BytesIO(response.content)).convert("RGB")
454
  else:
455
+ image = Image.open(image_file).convert("RGB")
456
  return image
457
 
458
  def disable_torch_init(self):
 
460
  Disable the redundant torch default initialization to accelerate model creation.
461
  """
462
  import torch
463
+
464
  setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
465
  setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
466
 
467
+ def chat(
468
+ self,
469
+ tokenizer,
470
+ image_file,
471
+ ocr_type,
472
+ ocr_box="",
473
+ ocr_color="",
474
+ render=False,
475
+ save_render_file=None,
476
+ print_prompt=False,
477
+ gradio_input=False,
478
+ stream_flag=False,
479
+ ):
480
  self.disable_torch_init()
481
 
482
+ image_processor_high = GOTImageEvalProcessor(image_size=1024)
 
483
 
484
  use_im_start_end = True
485
 
 
491
  image = self.load_image(image_file)
492
 
493
  w, h = image.size
494
+
495
+ if ocr_type == "format":
496
+ qs = "OCR with format: "
497
  else:
498
+ qs = "OCR: "
499
 
500
  if ocr_box:
501
  bbox = eval(ocr_box)
502
  if len(bbox) == 2:
503
+ bbox[0] = int(bbox[0] / w * 1000)
504
+ bbox[1] = int(bbox[1] / h * 1000)
505
  if len(bbox) == 4:
506
+ bbox[0] = int(bbox[0] / w * 1000)
507
+ bbox[1] = int(bbox[1] / h * 1000)
508
+ bbox[2] = int(bbox[2] / w * 1000)
509
+ bbox[3] = int(bbox[3] / h * 1000)
510
+ if ocr_type == "format":
511
+ qs = str(bbox) + " " + "OCR with format: "
512
  else:
513
+ qs = str(bbox) + " " + "OCR: "
514
 
515
  if ocr_color:
516
+ if ocr_type == "format":
517
+ qs = "[" + ocr_color + "]" + " " + "OCR with format: "
518
  else:
519
+ qs = "[" + ocr_color + "]" + " " + "OCR: "
520
 
521
  if use_im_start_end:
522
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN + "\n" + qs
523
  else:
524
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
 
525
 
526
  conv_mpt = Conversation(
527
  system="""<|im_start|>system
 
560
  input_ids,
561
  images=[image_tensor_1.unsqueeze(0).half().cuda()],
562
  do_sample=False,
563
+ num_beams=1,
564
+ no_repeat_ngram_size=20,
565
  streamer=streamer,
566
  max_new_tokens=4096,
567
+ stopping_criteria=[stopping_criteria],
568
+ )
569
  else:
570
  with torch.autocast("cuda", dtype=torch.bfloat16):
571
  output_ids = self.generate(
572
  input_ids,
573
  images=[image_tensor_1.unsqueeze(0).half().cuda()],
574
  do_sample=False,
575
+ num_beams=1,
576
+ no_repeat_ngram_size=20,
577
  # streamer=streamer,
578
  max_new_tokens=4096,
579
+ stopping_criteria=[stopping_criteria],
580
+ )
581
+
582
+ outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
583
+
584
  if outputs.endswith(stop_str):
585
+ outputs = outputs[: -len(stop_str)]
586
  outputs = outputs.strip()
587
  response_str = outputs
588
 
589
  if render:
590
+ print("==============rendering===============")
591
+ from .render_tools import content_mmd_to_html, svg_to_html, tik_html, translation_table
592
 
593
+ if "**kern" in outputs:
594
  import verovio
595
+
596
  tk = verovio.toolkit()
597
  tk.loadData(outputs)
598
+ tk.setOptions({"pageWidth": 2100, "footer": "none", "barLineWidth": 0.5, "beamMaxSlope": 15, "staffLineWidth": 0.2, "spacingStaff": 6})
 
 
599
  tk.getPageCount()
600
  svg = tk.renderToSVG()
601
+ svg = svg.replace('overflow="inherit"', 'overflow="visible"')
602
 
603
  svg_to_html(svg, save_render_file)
604
 
605
+ if ocr_type == "format" and "**kern" not in outputs:
606
+ if "\\begin{tikzpicture}" not in outputs:
 
 
607
  html_path_2 = save_render_file
608
+ right_num = outputs.count("\\right")
609
+ left_num = outputs.count("\left")
610
 
611
  if right_num != left_num:
612
+ outputs = (
613
+ outputs.replace("\left(", "(")
614
+ .replace("\\right)", ")")
615
+ .replace("\left[", "[")
616
+ .replace("\\right]", "]")
617
+ .replace("\left{", "{")
618
+ .replace("\\right}", "}")
619
+ .replace("\left|", "|")
620
+ .replace("\\right|", "|")
621
+ .replace("\left.", ".")
622
+ .replace("\\right.", ".")
623
+ )
624
 
625
+ outputs = outputs.replace('"', "``").replace("$", "")
626
 
627
+ outputs_list = outputs.split("\n")
628
+ gt = ""
629
  for out in outputs_list:
630
+ gt += '"' + out.replace("\\", "\\\\") + r"\n" + '"' + "+" + "\n"
 
 
631
 
632
+ gt = gt[:-2]
633
 
634
  lines = content_mmd_to_html
635
  lines = lines.split("const text =")
636
+ new_web = lines[0] + "const text =" + gt + lines[1]
637
 
638
  else:
639
  html_path_2 = save_render_file
640
  outputs = outputs.translate(translation_table)
641
+ outputs_list = outputs.split("\n")
642
+ gt = ""
643
  for out in outputs_list:
644
  if out:
645
+ if "\\begin{tikzpicture}" not in out and "\\end{tikzpicture}" not in out:
646
+ while out[-1] == " ":
647
  out = out[:-1]
648
  if out is None:
649
  break
650
+
651
  if out:
652
+ if out[-1] != ";":
653
+ gt += out[:-1] + ";\n"
654
  else:
655
+ gt += out + "\n"
656
  else:
657
+ gt += out + "\n"
 
658
 
659
  lines = tik_html
660
  lines = lines.split("const text =")
661
  new_web = lines[0] + gt + lines[1]
662
 
663
+ with open(html_path_2, "w") as web_f_new:
664
  web_f_new.write(new_web)
665
  return response_str
666
 
667
  def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
 
668
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
669
+ best_ratio_diff = float("inf")
670
  best_ratio = (1, 1)
671
  area = width * height
672
  for ratio in target_ratios:
 
680
  best_ratio = ratio
681
  # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
682
  return best_ratio
683
+
684
  orig_width, orig_height = image.size
685
  aspect_ratio = orig_width / orig_height
686
 
687
  # calculate the existing image aspect ratio
688
  target_ratios = set(
689
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num
690
+ )
691
  # print(target_ratios)
692
  target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
693
 
694
  # find the closest aspect ratio to the target
695
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
 
696
 
697
  # print(target_aspect_ratio)
698
  # calculate the target width and height
 
708
  (i % (target_width // image_size)) * image_size,
709
  (i // (target_width // image_size)) * image_size,
710
  ((i % (target_width // image_size)) + 1) * image_size,
711
+ ((i // (target_width // image_size)) + 1) * image_size,
712
  )
713
  # split the image
714
  split_img = resized_img.crop(box)
 
719
  processed_images.append(thumbnail_img)
720
  return processed_images
721
 
722
+ def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag=False):
 
723
  # Model
724
  self.disable_torch_init()
725
+ multi_page = False
 
726
 
727
+ image_processor_high = GOTImageEvalProcessor(image_size=1024)
728
 
729
  use_im_start_end = True
730
 
 
731
  image_token_len = 256
732
 
733
  image_list = []
 
736
  # multi_page = True
737
 
738
  if multi_page:
739
+ qs = "OCR with format across multi pages: "
740
  # only for png files
741
  # import glob
742
  # from natsort import natsorted
 
752
  # print("len ll: ", ll)
753
 
754
  else:
755
+ if ocr_type == "format":
756
+ qs = "OCR with format upon the patch reference: "
757
  else:
758
+ qs = "OCR upon the patch reference: "
759
  if gradio_input:
760
  img = image_file.copy()
761
  else:
 
767
  image_tensor_1 = image_processor_high(image)
768
  image_list.append(image_tensor_1)
769
 
 
770
  image_list = torch.stack(image_list)
771
 
772
+ print("====new images batch size======: \n", image_list.shape)
 
773
 
774
  if use_im_start_end:
775
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len * ll + DEFAULT_IM_END_TOKEN + "\n" + qs
776
  else:
777
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
 
778
 
779
  conv_mpt = Conversation(
780
  system="""<|im_start|>system
 
811
  input_ids,
812
  images=[image_list.half().cuda()],
813
  do_sample=False,
814
+ num_beams=1,
815
  # no_repeat_ngram_size = 20,
816
  streamer=streamer,
817
  max_new_tokens=4096,
818
+ stopping_criteria=[stopping_criteria],
819
+ )
820
  else:
821
  with torch.autocast("cuda", dtype=torch.bfloat16):
822
  output_ids = self.generate(
823
  input_ids,
824
  images=[image_list.half().cuda()],
825
  do_sample=False,
826
+ num_beams=1,
827
  # no_repeat_ngram_size = 20,
828
  # streamer=streamer,
829
  max_new_tokens=4096,
830
+ stopping_criteria=[stopping_criteria],
831
+ )
832
+
833
+ outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
834
 
 
 
835
  if outputs.endswith(stop_str):
836
+ outputs = outputs[: -len(stop_str)]
837
+ outputs = outputs.strip()
838
  response_str = outputs
839
 
840
  if render:
841
+ print("==============rendering===============")
842
  from .render_tools import content_mmd_to_html
843
+
844
  html_path_2 = save_render_file
845
+ right_num = outputs.count("\\right")
846
+ left_num = outputs.count("\left")
847
 
848
  if right_num != left_num:
849
+ outputs = (
850
+ outputs.replace("\left(", "(")
851
+ .replace("\\right)", ")")
852
+ .replace("\left[", "[")
853
+ .replace("\\right]", "]")
854
+ .replace("\left{", "{")
855
+ .replace("\\right}", "}")
856
+ .replace("\left|", "|")
857
+ .replace("\\right|", "|")
858
+ .replace("\left.", ".")
859
+ .replace("\\right.", ".")
860
+ )
861
+
862
+ outputs = outputs.replace('"', "``").replace("$", "")
863
+
864
+ outputs_list = outputs.split("\n")
865
+ gt = ""
866
  for out in outputs_list:
867
+ gt += '"' + out.replace("\\", "\\\\") + r"\n" + '"' + "+" + "\n"
868
+
869
  gt = gt[:-2]
870
 
871
  lines = content_mmd_to_html
872
  lines = lines.split("const text =")
873
+ new_web = lines[0] + "const text =" + gt + lines[1]
874
+
875
+ with open(html_path_2, "w") as web_f_new:
876
  web_f_new.write(new_web)
877
 
878
+ return response_str
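A hedged usage sketch of the chat entry points reformatted above. It assumes this repository is loaded through transformers with trust_remote_code (so AutoModel/AutoTokenizer resolve to GOTQwenForCausalLM and the Qwen tokenizer) and that a CUDA device is available, since chat() moves tensors to .cuda() internally; the repo path is a placeholder, not a value taken from this commit.

from transformers import AutoModel, AutoTokenizer

repo = "<repo-or-local-path>"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True, pad_token_id=tokenizer.eos_token_id)
model = model.eval().cuda()

# ocr_type selects between the "OCR: " and "OCR with format: " prompts built in chat()
plain_text = model.chat(tokenizer, "page.png", ocr_type="ocr")
formatted = model.chat(tokenizer, "page.png", ocr_type="format", render=True, save_render_file="page.html")

# chat_crop() tiles large pages via dynamic_preprocess() before running OCR per patch
cropped = model.chat_crop(tokenizer, "large_page.png", ocr_type="format")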
render_tools.py CHANGED
@@ -1,13 +1,9 @@
 
1
 
2
- punctuation_dict = {
3
- ",": ",",
4
- "。": ".",
5
-
6
- }
7
  translation_table = str.maketrans(punctuation_dict)
8
-
9
- def svg_to_html(svg_content, output_filename):
10
 
 
 
11
  html_content = f"""
12
  <!DOCTYPE html>
13
  <html lang="en">
@@ -24,9 +20,8 @@ def svg_to_html(svg_content, output_filename):
24
  </html>
25
  """
26
 
27
- with open(output_filename, 'w') as file:
28
  file.write(html_content)
29
-
30
 
31
 
32
  content_mmd_to_html = """<!DOCTYPE html>
@@ -34,7 +29,7 @@ content_mmd_to_html = """<!DOCTYPE html>
34
  <meta charset="UTF-8">
35
  <title>Title</title>
36
  <script>
37
- const text =
38
  </script>
39
  <style>
40
  #content {
@@ -71,7 +66,6 @@ content_mmd_to_html = """<!DOCTYPE html>
71
  """
72
 
73
 
74
-
75
  tik_html = """
76
  <!DOCTYPE html>
77
 
@@ -92,5 +86,4 @@ const text =
92
  </html>"""
93
 
94
 
95
-
96
- # print(tik_html)
 
1
+ punctuation_dict = {",": ",", "。": "."}
2
 
 
 
 
 
 
3
  translation_table = str.maketrans(punctuation_dict)
 
 
4
 
5
+
6
+ def svg_to_html(svg_content, output_filename):
7
  html_content = f"""
8
  <!DOCTYPE html>
9
  <html lang="en">
 
20
  </html>
21
  """
22
 
23
+ with open(output_filename, "w") as file:
24
  file.write(html_content)
 
25
 
26
 
27
  content_mmd_to_html = """<!DOCTYPE html>
 
29
  <meta charset="UTF-8">
30
  <title>Title</title>
31
  <script>
32
+ const text =
33
  </script>
34
  <style>
35
  #content {
 
66
  """
67
 
68
 
 
69
  tik_html = """
70
  <!DOCTYPE html>
71
 
 
86
  </html>"""
87
 
88
 
89
+ # print(tik_html)
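As a quick illustration of the helpers kept in render_tools.py, the translation table above maps full-width punctuation to ASCII before the rendered HTML is assembled; a self-contained example:

punctuation_dict = {",": ",", "。": "."}
translation_table = str.maketrans(punctuation_dict)
print("你好,世界。".translate(translation_table))  # -> 你好,世界.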
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ tiktoken
2
+ transformers
3
+ torch
4
+ torchvision
5
+ requests
6
+ verovio
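A small, hedged check that the newly added (unpinned) requirements resolve to importable modules; the import names below simply mirror the package names listed in requirements.txt.

import importlib

for name in ("tiktoken", "transformers", "torch", "torchvision", "requests", "verovio"):
    importlib.import_module(name)  # raises ImportError if the requirement is missing
print("all requirements importable")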
special_tokens_map.json CHANGED
@@ -6,4 +6,4 @@
6
  "rstrip": false,
7
  "single_word": false
8
  }
9
- }
 
6
  "rstrip": false,
7
  "single_word": false
8
  }
9
+ }
tokenization_qwen.py CHANGED
@@ -12,7 +12,7 @@ import unicodedata
12
  from typing import Collection, Dict, List, Set, Tuple, Union
13
 
14
  import tiktoken
15
- from transformers import PreTrainedTokenizer, AddedToken
16
 
17
  logger = logging.getLogger(__name__)
18
 
@@ -37,10 +37,8 @@ SPECIAL_TOKENS = (
37
  def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
38
  with open(tiktoken_bpe_file, "rb") as f:
39
  contents = f.read()
40
- return {
41
- base64.b64decode(token): int(rank)
42
- for token, rank in (line.split() for line in contents.splitlines() if line)
43
- }
44
 
45
  class QWenTokenizer(PreTrainedTokenizer):
46
  """QWen tokenizer."""
@@ -51,19 +49,19 @@ class QWenTokenizer(PreTrainedTokenizer):
51
  self,
52
  vocab_file,
53
  errors="replace",
54
- image_start_tag='<img>',
55
- image_end_tag='</img>',
56
- image_pad_tag='<imgpad>',
57
- ref_start_tag='<ref>',
58
- ref_end_tag='</ref>',
59
- box_start_tag='<box>',
60
- box_end_tag='</box>',
61
- quad_start_tag='<quad>',
62
- quad_end_tag='</quad>',
63
  **kwargs,
64
  ):
65
  super().__init__(**kwargs)
66
-
67
  self.image_start_tag = image_start_tag
68
  self.image_end_tag = image_end_tag
69
  self.image_pad_tag = image_pad_tag
@@ -73,24 +71,13 @@ class QWenTokenizer(PreTrainedTokenizer):
73
  self.box_end_tag = box_end_tag
74
  self.quad_start_tag = quad_start_tag
75
  self.quad_end_tag = quad_end_tag
76
- self.IMAGE_ST = (
77
- ref_start_tag, ref_end_tag,
78
- box_start_tag, box_end_tag,
79
- quad_start_tag, quad_end_tag,
80
- image_start_tag, image_end_tag,
81
- image_pad_tag
82
- )
83
 
84
  self.errors = errors # how to handle errors in decoding
85
 
86
  self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
87
- self.special_tokens = {
88
- token: index
89
- for index, token in enumerate(
90
- SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
91
- )
92
- }
93
-
94
  self.img_start_id = self.special_tokens[self.image_start_tag]
95
  self.img_end_id = self.special_tokens[self.image_end_tag]
96
  self.img_pad_id = self.special_tokens[self.image_pad_tag]
@@ -111,9 +98,7 @@ class QWenTokenizer(PreTrainedTokenizer):
111
  len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
112
  ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
113
 
114
- self.decoder = {
115
- v: k for k, v in self.mergeable_ranks.items()
116
- } # type: dict[int, bytes|str]
117
  self.decoder.update({v: k for k, v in self.special_tokens.items()})
118
 
119
  self.tokenizer = enc # type: tiktoken.Encoding
@@ -128,9 +113,7 @@ class QWenTokenizer(PreTrainedTokenizer):
128
  def get_vocab(self) -> Dict[bytes, int]:
129
  return self.mergeable_ranks
130
 
131
- def convert_tokens_to_ids(
132
- self, tokens: Union[bytes, str, List[Union[bytes, str]]]
133
- ) -> List[int]:
134
  ids = []
135
  if isinstance(tokens, (str, bytes)):
136
  if tokens in self.special_tokens:
@@ -146,11 +129,11 @@ class QWenTokenizer(PreTrainedTokenizer):
146
 
147
  def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
148
  if not special_tokens and new_tokens:
149
- raise ValueError('Adding regular tokens is not supported')
150
  for token in new_tokens:
151
  surface_form = token.content if isinstance(token, AddedToken) else token
152
  if surface_form not in SPECIAL_TOKENS:
153
- raise ValueError('Adding unknown special tokens is not supported')
154
  return 0
155
 
156
  def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
@@ -197,9 +180,7 @@ class QWenTokenizer(PreTrainedTokenizer):
197
  text = unicodedata.normalize("NFC", text)
198
 
199
  # this implementation takes a detour: text -> token id -> token surface forms
200
- for t in self.tokenizer.encode(
201
- text, allowed_special=allowed_special, disallowed_special=disallowed_special
202
- ):
203
  tokens.append(self.decoder[t])
204
  return tokens
205
 
 
12
  from typing import Collection, Dict, List, Set, Tuple, Union
13
 
14
  import tiktoken
15
+ from transformers import AddedToken, PreTrainedTokenizer
16
 
17
  logger = logging.getLogger(__name__)
18
 
 
37
  def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
38
  with open(tiktoken_bpe_file, "rb") as f:
39
  contents = f.read()
40
+ return {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
41
+
 
 
42
 
43
  class QWenTokenizer(PreTrainedTokenizer):
44
  """QWen tokenizer."""
 
49
  self,
50
  vocab_file,
51
  errors="replace",
52
+ image_start_tag="<img>",
53
+ image_end_tag="</img>",
54
+ image_pad_tag="<imgpad>",
55
+ ref_start_tag="<ref>",
56
+ ref_end_tag="</ref>",
57
+ box_start_tag="<box>",
58
+ box_end_tag="</box>",
59
+ quad_start_tag="<quad>",
60
+ quad_end_tag="</quad>",
61
  **kwargs,
62
  ):
63
  super().__init__(**kwargs)
64
+
65
  self.image_start_tag = image_start_tag
66
  self.image_end_tag = image_end_tag
67
  self.image_pad_tag = image_pad_tag
 
71
  self.box_end_tag = box_end_tag
72
  self.quad_start_tag = quad_start_tag
73
  self.quad_end_tag = quad_end_tag
74
+ self.IMAGE_ST = (ref_start_tag, ref_end_tag, box_start_tag, box_end_tag, quad_start_tag, quad_end_tag, image_start_tag, image_end_tag, image_pad_tag)
75
 
76
  self.errors = errors # how to handle errors in decoding
77
 
78
  self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
79
+ self.special_tokens = {token: index for index, token in enumerate(SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks))}
80
+
 
 
 
 
 
81
  self.img_start_id = self.special_tokens[self.image_start_tag]
82
  self.img_end_id = self.special_tokens[self.image_end_tag]
83
  self.img_pad_id = self.special_tokens[self.image_pad_tag]
 
98
  len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
99
  ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
100
 
101
+ self.decoder = {v: k for k, v in self.mergeable_ranks.items()} # type: dict[int, bytes|str]
 
 
102
  self.decoder.update({v: k for k, v in self.special_tokens.items()})
103
 
104
  self.tokenizer = enc # type: tiktoken.Encoding
 
113
  def get_vocab(self) -> Dict[bytes, int]:
114
  return self.mergeable_ranks
115
 
116
+ def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]:
 
 
117
  ids = []
118
  if isinstance(tokens, (str, bytes)):
119
  if tokens in self.special_tokens:
 
129
 
130
  def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
131
  if not special_tokens and new_tokens:
132
+ raise ValueError("Adding regular tokens is not supported")
133
  for token in new_tokens:
134
  surface_form = token.content if isinstance(token, AddedToken) else token
135
  if surface_form not in SPECIAL_TOKENS:
136
+ raise ValueError("Adding unknown special tokens is not supported")
137
  return 0
138
 
139
  def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
 
180
  text = unicodedata.normalize("NFC", text)
181
 
182
  # this implementation takes a detour: text -> token id -> token surface forms
183
+ for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special):
 
 
184
  tokens.append(self.decoder[t])
185
  return tokens
186
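For context on the vocab file that _load_tiktoken_bpe above parses, here is a self-contained sketch of the format (one base64-encoded token plus its integer rank per line); the helper mirrors the comprehension in the reformatted function but reads from an in-memory blob instead of a file.

import base64

def load_tiktoken_bpe_bytes(contents: bytes):
    # same comprehension as _load_tiktoken_bpe, minus the file I/O
    return {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}

sample = b"\n".join([base64.b64encode(b"hello") + b" 0", base64.b64encode(b" world") + b" 1"])
print(load_tiktoken_bpe_bytes(sample))  # {b'hello': 0, b' world': 1}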