TypeError: argument 'text': 'AddedToken' object cannot be converted to 'PyString'
TypeError Traceback (most recent call last)
Cell In[4], line 4
1 from transformers import AutoTokenizer, AutoModelForCausalLM
2 import torch
----> 4 tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", trust_remote_code=True)
5 model = AutoModelForCausalLM.from_pretrained("databricks/dbrx-instruct", device_map="cpu", torch_dtype=torch.bfloat16, trust_remote_code=True)
7 input_text = "What does it take to build a great LLM?"
File ~/anaconda3/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:714, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
712 if os.path.isdir(pretrained_model_name_or_path):
713 tokenizer_class.register_for_auto_class()
--> 714 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
715 elif config_tokenizer_class is not None:
716 tokenizer_class = None
File ~/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1854, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
1851 else:
1852 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1854 return cls._from_pretrained(
1855 resolved_vocab_files,
1856 pretrained_model_name_or_path,
1857 init_configuration,
1858 *init_inputs,
1859 token=token,
1860 cache_dir=cache_dir,
1861 local_files_only=local_files_only,
1862 _commit_hash=commit_hash,
1863 _is_local=is_local,
1864 **kwargs,
1865 )
File ~/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2090, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
2087 tokenizer.add_tokens(tokens, special_tokens=is_last_special)
2089 # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
-> 2090 added_tokens = tokenizer.sanitize_special_tokens()
2091 if added_tokens:
2092 logger.warning_advice(
2093 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
2094 " fine-tuned or trained."
2095 )
File ~/.cache/huggingface/modules/transformers_modules/databricks/dbrx-instruct/464e701f50aef4c1b59c81fb5667819a5d08e108/tiktoken.py:370, in TiktokenTokenizerWrapper.sanitize_special_tokens(self)
368 actual_new_tokens = []
369 for token in self.all_special_tokens_extended:
--> 370 encoded = self.encoding.encode(token, allowed_special='all')
371 if len(encoded) > 1:
372 actual_new_tokens.append(token)
File ~/anaconda3/lib/python3.11/site-packages/tiktoken/core.py:120, in Encoding.encode(self, text, allowed_special, disallowed_special)
117 raise_disallowed_special_token(match.group())
119 try:
--> 120 return self._core_bpe.encode(text, allowed_special)
121 except UnicodeEncodeError:
122 # BPE operates on bytes, but the regex operates on unicode. If we pass a str that is
123 # invalid UTF-8 to Rust, it will rightfully complain. Here we do a quick and dirty
(...)
126 # string, but given that this is input we want to support, maybe that's okay.
127 # Also we use errors="replace" to handle weird things like lone surrogates.
128 text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
TypeError: argument 'text': 'AddedToken' object cannot be converted to 'PyString'
I am trying to run:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("databricks/dbrx-instruct", device_map="cpu", torch_dtype=torch.bfloat16, trust_remote_code=True)
I am facing the above error.
The code you shared works fine for me. What version of transformers do you have?
Hey @daking, I am getting the same error as @Madhugraj.
I am using transformers version 4.33.2 and tiktoken version 0.6.0.
I'm following the code provided in the HF repo and replacing the HF token with one generated per the instructions.
Please let me know if you'd like any more context or info and I'll be happy to send that your way.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", trust_remote_code=True, token="hf_YOUR_TOKEN")
model = AutoModelForCausalLM.from_pretrained("databricks/dbrx-instruct", device_map="cpu", torch_dtype=torch.bfloat16, trust_remote_code=True, token="hf_YOUR_TOKEN")
input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
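(Side note: rather than pasting the token into every call, logging in once via huggingface_hub should also work; this is just an alternative way to supply the same token, not part of the repro.)
from huggingface_hub import login
login(token="hf_YOUR_TOKEN")  # subsequent from_pretrained calls can then drop the token= argument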
That's a pretty old version of transformers :) Can you upgrade?
Thanks for the quick reply! That seemed to be the problem. I definitely updated via pip before running, but it looks like there was an issue in the .venv, so recreating the venv and reinstalling transformers fixed the issue.
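For anyone else hitting this, a quick sanity check that the active environment is actually using the upgraded packages (nothing DBRX-specific here):
import transformers, tiktoken
print(transformers.__version__)  # should be a recent release, not 4.33.x
print(transformers.__file__)     # confirms which site-packages / venv it is loaded from
print(tiktoken.__version__)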
Upgrading did the job
pip install --upgrade transformers
The model card example now suggests installing at least 4.39.2 via pip, which should help avoid this.
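For completeness, the minimum version can also be pinned explicitly (the exact pin spelling here is just a suggestion, not copied from the model card):
pip install "transformers>=4.39.2"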
I am also getting the below error:
TypeError: argument 'text': 'AddedToken' object cannot be converted to 'PyString'
My versions of transformers and tiktoken:
%pip show transformers
Name: transformers
Version: 4.39.3
%pip show tiktoken
Name: tiktoken
Version: 0.6.0
I haven't found a solution to this yet; can somebody please help?
Hi @Sudipta179002, could you try running again with the latest revision? We have updated the tokenizer to remove the dependency on tiktoken and just use GPT2Tokenizer. It may unblock your issue.
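If the previously downloaded repo code is still cached locally, re-fetching it explicitly should pick up the updated tokenizer. This is only a sketch using the standard force_download argument of from_pretrained; everything else stays as in the earlier snippets:
from transformers import AutoTokenizer
# force a re-download so the updated (tiktoken-free) tokenizer code is used
tokenizer = AutoTokenizer.from_pretrained(
    "databricks/dbrx-instruct",
    trust_remote_code=True,
    force_download=True,
    token="hf_YOUR_TOKEN",
)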
Hi @abhi-db, thanks mate for the quick reply. What changes do I need to make in the code below?
Code:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", trust_remote_code=True, token="hf_YOUR_TOKEN")
model = AutoModelForCausalLM.from_pretrained("databricks/dbrx-instruct", device_map="cpu", torch_dtype=torch.bfloat16, trust_remote_code=True, token="hf_YOUR_TOKEN")
input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
After adding GPT2Tokenizer, I am now getting the error below:
ValueError: Non-consecutive added token '<|pad|>' found. Should have index 100278 but has index 100277 in saved vocabulary.