File size: 9,837 Bytes
256a159 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
from typing import Dict, Optional, Union
import torch
from opencompass.utils.prompt import PromptList
from .huggingface import HuggingFace
# Type alias: a prompt is either a structured ``PromptList`` or a plain string.
PromptType = Union[PromptList, str]
class ModelScope(HuggingFace):
    """Model wrapper around ModelScope models.

    Args:
        path (str): The name or path to ModelScope's model.
        ms_cache_dir: Set the cache dir to MS model cache dir. If None, it will
            use the env variable MS_MODEL_HUB. Defaults to None.
        max_seq_len (int): The maximum length of the input sequence. Defaults
            to 2048.
        tokenizer_path (str): The path to the tokenizer. Defaults to None.
        tokenizer_kwargs (dict, optional): Keyword arguments for the
            tokenizer. Defaults to None, which is treated as {}.
        peft_path (str, optional): The name or path to the ModelScope's PEFT
            model. If None, the original model will not be converted to PEFT.
            Defaults to None.
        tokenizer_only (bool): If True, only the tokenizer will be initialized.
            Defaults to False.
        model_kwargs (dict, optional): Keyword arguments for the model, used
            in loader. Defaults to None, which is treated as
            dict(device_map='auto').
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case the requirement of injecting or
            wrapping of any meta instructions.
        extract_pred_after_decode (bool): Whether to extract the prediction
            string from the decoded output string, instead of extract the
            prediction tokens before decoding. Defaults to False.
        batch_padding (bool): If False, inference will be performed in
            for-loop without batch padding.
        pad_token_id (int): The id of the padding token. Defaults to None. Use
            (#vocab + pad_token_id) if get negative value.
        mode (str, optional): The method of input truncation when input length
            exceeds max_seq_len. 'mid' represents the part of input to
            truncate. Defaults to 'none'.

    Note:
        About ``extract_pred_after_decode``: Commonly, we should extract the
        prediction tokens before decoding. But for some tokenizers using
        ``sentencepiece``, like LLaMA, this behavior may change the number of
        whitespaces, which is harmful for Python programming tasks.
    """

    def __init__(self,
                 path: str,
                 ms_cache_dir: Optional[str] = None,
                 max_seq_len: int = 2048,
                 tokenizer_path: Optional[str] = None,
                 tokenizer_kwargs: Optional[dict] = None,
                 peft_path: Optional[str] = None,
                 tokenizer_only: bool = False,
                 model_kwargs: Optional[dict] = None,
                 meta_template: Optional[Dict] = None,
                 extract_pred_after_decode: bool = False,
                 batch_padding: bool = False,
                 pad_token_id: Optional[int] = None,
                 mode: str = 'none'):
        # Build the default dicts per call: a dict literal in the signature
        # would be one shared object reused by every instance.
        if tokenizer_kwargs is None:
            tokenizer_kwargs = {}
        if model_kwargs is None:
            model_kwargs = dict(device_map='auto')

        # The parent class names its cache argument ``hf_cache_dir``; the
        # ModelScope cache dir is forwarded through it.
        super().__init__(
            path=path,
            hf_cache_dir=ms_cache_dir,
            max_seq_len=max_seq_len,
            tokenizer_path=tokenizer_path,
            tokenizer_kwargs=tokenizer_kwargs,
            peft_path=peft_path,
            tokenizer_only=tokenizer_only,
            model_kwargs=model_kwargs,
            meta_template=meta_template,
            extract_pred_after_decode=extract_pred_after_decode,
            batch_padding=batch_padding,
            pad_token_id=pad_token_id,
            mode=mode,
        )

    def _load_tokenizer(self, path: str, tokenizer_path: Optional[str],
                        tokenizer_kwargs: dict):
        """Load the ModelScope tokenizer and make sure it has a pad token.

        Args:
            path (str): Model name or path; used for the tokenizer when
                ``tokenizer_path`` is empty, and for the generation config
                fallback.
            tokenizer_path (str, optional): Dedicated tokenizer name or path.
            tokenizer_kwargs (dict): Extra keyword arguments forwarded to
                ``AutoTokenizer.from_pretrained``.

        Raises:
            ValueError: If no pad token id can be determined from
                ``self.pad_token_id``, the tokenizer's eos token or the
                model's generation config.
        """
        from modelscope import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path if tokenizer_path else path, **tokenizer_kwargs)

        # A patch for some models without pad_token_id
        if self.pad_token_id is not None:
            # A negative id is an offset from the end of the vocabulary.
            if self.pad_token_id < 0:
                self.pad_token_id += self.tokenizer.vocab_size
            if self.tokenizer.pad_token_id is None:
                self.logger.debug(f'Using {self.pad_token_id} as pad_token_id')
            elif self.tokenizer.pad_token_id != self.pad_token_id:
                self.logger.warning(
                    'pad_token_id is not consistent with the tokenizer. Using '
                    f'{self.pad_token_id} as pad_token_id')
            # The explicitly configured id always wins over the tokenizer's.
            self.tokenizer.pad_token_id = self.pad_token_id
        elif self.tokenizer.pad_token_id is None:
            self.logger.warning('pad_token_id is not set for the tokenizer.')
            if self.tokenizer.eos_token is not None:
                self.logger.warning(
                    f'Using eos_token_id {self.tokenizer.eos_token} '
                    'as pad_token_id.')
                self.tokenizer.pad_token = self.tokenizer.eos_token
            else:
                # Last resort: take the pad id from the generation config.
                from modelscope import GenerationConfig
                gcfg = GenerationConfig.from_pretrained(path)

                if gcfg.pad_token_id is not None:
                    self.logger.warning(
                        f'Using pad_token_id {gcfg.pad_token_id} '
                        'as pad_token_id.')
                    self.tokenizer.pad_token_id = gcfg.pad_token_id
                else:
                    raise ValueError(
                        'pad_token_id is not set for this tokenizer. Try to '
                        'set pad_token_id via passing '
                        '`pad_token_id={PAD_TOKEN_ID}` in model_cfg.')

        # A patch for llama when batch_padding = True
        if 'decapoda-research/llama' in path or \
                (tokenizer_path and
                 'decapoda-research/llama' in tokenizer_path):
            self.logger.warning('We set new pad_token_id for LLaMA model')
            # keep consistent with official LLaMA repo
            # https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb  # noqa
            self.tokenizer.bos_token = '<s>'
            self.tokenizer.eos_token = '</s>'
            self.tokenizer.pad_token_id = 0

    def _set_model_kwargs_torch_dtype(self, model_kwargs):
        """Normalise ``model_kwargs['torch_dtype']`` in place.

        A missing key defaults to ``torch.float16``. The string aliases
        ``'torch.float16'``, ``'torch.bfloat16'``, ``'torch.float'``,
        ``'auto'`` and ``'None'`` are replaced by the value they name.
        A ``torch.dtype`` object or ``None`` is kept as given. Any other
        value is left untouched and a warning is logged.

        Args:
            model_kwargs (dict): Loader keyword arguments; modified in place.
        """
        if 'torch_dtype' not in model_kwargs:
            torch_dtype = torch.float16
        else:
            requested = model_kwargs['torch_dtype']
            if requested is None or isinstance(requested, torch.dtype):
                # Already resolved (e.g. the same kwargs were normalised
                # before); nothing to translate.
                torch_dtype = requested
            else:
                aliases = {
                    'torch.float16': torch.float16,
                    'torch.bfloat16': torch.bfloat16,
                    'torch.float': torch.float,
                    'auto': 'auto',
                    'None': None,
                }
                if requested not in aliases:
                    self.logger.warning(
                        f'Unrecognized torch_dtype {requested!r}; '
                        'passing it to the loader unchanged.')
                    return
                torch_dtype = aliases[requested]
        self.logger.debug(f'MS using torch_dtype: {torch_dtype}')
        # Always write the resolved value back so the 'None' alias becomes a
        # real ``None`` instead of staying a string in the loader kwargs.
        model_kwargs['torch_dtype'] = torch_dtype

    def _load_model(self,
                    path: str,
                    model_kwargs: dict,
                    peft_path: Optional[str] = None):
        """Load the model into ``self.model`` and put it in eval mode.

        ``AutoModelForCausalLM`` is tried first; if it raises ``ValueError``
        the generic ``AutoModel`` loader is used instead.

        Args:
            path (str): The name or path to ModelScope's model.
            model_kwargs (dict): Keyword arguments forwarded to
                ``from_pretrained``. The dict itself is not modified.
            peft_path (str, optional): PEFT adapter name or path. If given,
                the loaded model is wrapped with ``PeftModel``.
        """
        from modelscope import AutoModel, AutoModelForCausalLM

        # Normalise the dtype on a shallow copy so the caller's config is
        # not rewritten with torch objects.
        model_kwargs = dict(model_kwargs)
        self._set_model_kwargs_torch_dtype(model_kwargs)
        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                path, **model_kwargs)
        except ValueError:
            self.model = AutoModel.from_pretrained(path, **model_kwargs)

        if peft_path is not None:
            from peft import PeftModel
            self.model = PeftModel.from_pretrained(self.model,
                                                   peft_path,
                                                   is_trainable=False)
        self.model.eval()
        # Turn sampling off in the model's generation config.
        self.model.generation_config.do_sample = False

        # A patch for llama when batch_padding = True
        if 'decapoda-research/llama' in path:
            self.model.config.bos_token_id = 1
            self.model.config.eos_token_id = 2
            self.model.config.pad_token_id = self.tokenizer.pad_token_id
class ModelScopeCausalLM(ModelScope):
    """Model wrapper around ModelScope CausalLM.

    Args:
        path (str): The name or path to ModelScope's model.
        ms_cache_dir: Set the cache dir to MS model cache dir. If None, it will
            use the env variable MS_MODEL_HUB. Defaults to None.
        max_seq_len (int): The maximum length of the input sequence. Defaults
            to 2048.
        tokenizer_path (str): The path to the tokenizer. Defaults to None.
        tokenizer_kwargs (dict): Keyword arguments for the tokenizer.
            Defaults to {}.
        peft_path (str, optional): The name or path to the ModelScope's PEFT
            model. If None, the original model will not be converted to PEFT.
            Defaults to None.
        tokenizer_only (bool): If True, only the tokenizer will be initialized.
            Defaults to False.
        model_kwargs (dict): Keyword arguments for the model, used in loader.
            Defaults to dict(device_map='auto').
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case the requirement of injecting or
            wrapping of any meta instructions.
        batch_padding (bool): If False, inference will be performed in
            for-loop without batch padding.
    """

    def _load_model(self,
                    path: str,
                    model_kwargs: dict,
                    peft_path: Optional[str] = None):
        """Load the model with ``AutoModelForCausalLM`` only.

        Unlike the parent implementation there is no fallback loader: any
        exception raised by ``AutoModelForCausalLM.from_pretrained``
        propagates to the caller.

        Args:
            path (str): The name or path to ModelScope's model.
            model_kwargs (dict): Keyword arguments forwarded to
                ``from_pretrained``. The dict itself is not modified.
            peft_path (str, optional): PEFT adapter name or path. If given,
                the loaded model is wrapped with ``PeftModel``.
        """
        from modelscope import AutoModelForCausalLM

        # Normalise the dtype on a shallow copy: the helper rewrites
        # 'torch_dtype' in place, which would otherwise leak into the
        # caller's dict (or a shared default dict).
        model_kwargs = dict(model_kwargs)
        self._set_model_kwargs_torch_dtype(model_kwargs)
        self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)

        if peft_path is not None:
            from peft import PeftModel
            self.model = PeftModel.from_pretrained(self.model,
                                                   peft_path,
                                                   is_trainable=False)
        self.model.eval()
        # Turn sampling off in the model's generation config.
        self.model.generation_config.do_sample = False
|