Error when using qwen_vl_guidance for inference

#1 opened by turtle3791

Hello,
I tried modifying the Qwen-VL example from this page to load your qwen_vl_guidance model for inference, but got the following error.

Traceback (most recent call last):
File "/workspace/dev/qwenvl/qwenvlinfsamp1.py", line 10, in
tokenizer = AutoTokenizer.from_pretrained("RhapsodyAI/qwen_vl_guidance", trust_remote_code=True, use_auth_token=hugging_face_token)
File "/workspace/dev/qwenvl/venvqwenvl/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py", line 847, in from_pretrained
return tokenizer_class.from_pretrained(
File "/workspace/dev/qwenvl/venvqwenvl/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2089, in from_pretrained
return cls._from_pretrained(
File "/workspace/dev/qwenvl/venvqwenvl/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2311, in _from_pretrained
tokenizer = cls(*init_inputs, **init_kwargs)
File "/workspace/models/modules/transformers_modules/RhapsodyAI/qwen_vl_guidance/8e86496b80cbf2f8b90b06cab31ab2c4c3252b75/tokenization_qwen.py", line 120, in init
super().init(**kwargs)
File "/workspace/dev/qwenvl/venvqwenvl/lib/python3.10/site-packages/transformers/tokenization_utils.py", line 367, in init
self._add_tokens(
File "/workspace/models/modules/transformers_modules/RhapsodyAI/qwen_vl_guidance/8e86496b80cbf2f8b90b06cab31ab2c4c3252b75/tokenization_qwen.py", line 227, in _add_tokens
if surface_form not in SPECIAL_TOKENS + self.IMAGE_ST:
AttributeError: 'QWenTokenizer' object has no attribute 'IMAGE_ST'
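
The likely failure mode is an initialization-order pitfall: in newer transformers releases (roughly 4.34 and later), PreTrainedTokenizer.__init__ itself calls self._add_tokens, while the remote QWenTokenizer code sets IMAGE_ST only after calling super().__init__, so the overridden _add_tokens runs before the attribute exists. A stripped-down, illustrative sketch of the pattern (class names here are made up, not the actual Qwen code):

class Base:
    def __init__(self, **kwargs):
        # Newer transformers versions register special tokens
        # from inside the base __init__:
        self._add_tokens(["<sp>"])

    def _add_tokens(self, tokens):
        return 0

class QWenLikeTokenizer(Base):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)           # triggers _add_tokens below
        self.IMAGE_ST = ("<img>", "</img>")  # assigned too late

    def _add_tokens(self, tokens):
        # References self.IMAGE_ST, which __init__ has not assigned yet
        return [t for t in tokens if t not in self.IMAGE_ST]

QWenLikeTokenizer()  # AttributeError: ... no attribute 'IMAGE_ST'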

My code works fine with the original Qwen/Qwen-VL model but breaks when I use RhapsodyAI/qwen_vl_guidance. Could you provide sample inference code on the model page?

Here is my code:

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch

torch.manual_seed(1234)

hugging_face_token = 'hf_XXXXxxxxxxxx'

tokenizer = AutoTokenizer.from_pretrained("RhapsodyAI/qwen_vl_guidance", trust_remote_code=True, use_auth_token=hugging_face_token)

# use cuda device
model = AutoModelForCausalLM.from_pretrained("RhapsodyAI/qwen_vl_guidance", device_map="cuda", trust_remote_code=True, use_auth_token=hugging_face_token).eval()

# Specify hyperparameters for generation (No need to do this if you are using transformers>=4.32.0)
# model.generation_config = GenerationConfig.from_pretrained("RhapsodyAI/qwen_vl_guidance", trust_remote_code=True, use_auth_token=hugging_face_token)

query = tokenizer.from_list_format([
    {'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
    {'text': 'Generate the caption in English with grounding:'},
])
inputs = tokenizer(query, return_tensors='pt')
inputs = inputs.to(model.device)
pred = model.generate(**inputs)
response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
print(response)
# <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach
image = tokenizer.draw_bbox_on_latest_picture(response)
if image:
  image.save('2.jpg')
else:
  print("no box")
Rhapsody org

Hello,
I have added inference code to the Model Card section. However, I suspect your error comes from a transformers version mismatch. You can try transformers==4.32.0.
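
For reference, a minimal way to confirm the environment before loading the model (the pin itself would be pip install "transformers==4.32.0"; the assertion below just encodes the version suggested above):

import transformers

# The remote QWenTokenizer code predates the 4.34 tokenizer refactor,
# hence the suggestion to pin transformers to 4.32.0.
print(transformers.__version__)
assert transformers.__version__.startswith("4.32"), \
    "Try: pip install 'transformers==4.32.0'"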
