|
"""Tokenization classes for Arctic.""" |
|
|
|
from typing import Any, Dict, Optional |
|
|
|
from transformers.models.llama import LlamaTokenizer |
|
|
|
|
|
class ArcticTokenizer(LlamaTokenizer):
    """SentencePiece tokenizer for Arctic models.

    A thin subclass of ``LlamaTokenizer`` that only adjusts a handful of
    constructor defaults (notably ``legacy=False`` and ``add_prefix_space=True``)
    and supplies a ChatML-style default chat template. All tokenization logic
    lives in the parent class.
    """

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        legacy=False,
        add_prefix_space=True,
        **kwargs,
    ):
        """Forward every option verbatim to ``LlamaTokenizer.__init__``.

        Args:
            vocab_file: Path to the SentencePiece vocabulary file.
            unk_token / bos_token / eos_token / pad_token: Special tokens.
            sp_model_kwargs: Extra kwargs for the underlying SentencePiece
                processor, passed through untouched.
            add_bos_token / add_eos_token: Whether to prepend/append the
                BOS/EOS tokens when encoding.
            clean_up_tokenization_spaces: Whether decoding strips extra spaces.
            use_default_system_prompt: Whether the Llama default system prompt
                should be used.
            spaces_between_special_tokens: Whether to join special tokens with
                spaces when decoding.
            legacy: Selects the (non-)legacy SentencePiece handling; Arctic
                defaults to the fixed (non-legacy) behavior.
            add_prefix_space: Whether a leading space is added before encoding.
            **kwargs: Any further ``LlamaTokenizer`` options.
        """
        # Pass each option explicitly so an unexpected duplicate in **kwargs
        # still surfaces as a TypeError, exactly as a direct call would.
        super().__init__(
            vocab_file,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    @property
    def default_chat_template(self):
        """ChatML-style Jinja template used when no chat template is set.

        Wraps each message as ``<|im_start|>{role}\\n{content}<|im_end|>\\n``
        and, when ``add_generation_prompt`` is true, appends the opening
        ``<|im_start|>assistant`` marker for the model to continue from.
        """
        # Assembled from pieces for readability; the joined result is
        # byte-identical to the original single concatenated literal.
        pieces = (
            "{% for message in messages %}",
            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}",
            "{% endfor %}",
            "{% if add_generation_prompt %}",
            "{{ '<|im_start|>assistant\n' }}",
            "{% endif %}",
        )
        return "".join(pieces)
|
|