Update special tokens (#3)
Browse files
- Update special tokens (4deffb0b0a29b37b72f6e9d1f58f209a033ef382)
- add chatml template (bdd4b6c0d4e531762b9b6a0fa6f50c459edd23fa)
- add bos in chat template (0da1df6f7a23940bc6bd400a3e0f6631b316526d)
- update chat template in model (b2357579bad20cdebc86db80aebaea11844084f1)
- modeling_internlm2.py +6 -6
- tokenizer_config.json +77 -2
modeling_internlm2.py
CHANGED
@@ -1138,12 +1138,12 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
|
|
1138 |
def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
|
1139 |
prompt = ""
|
1140 |
if meta_instruction:
|
1141 |
-
prompt += f"""<s
|
1142 |
else:
|
1143 |
prompt += "<s>"
|
1144 |
for record in history:
|
1145 |
-
prompt += f"""
|
1146 |
-
prompt += f"""
|
1147 |
return tokenizer([prompt], return_tensors="pt")
|
1148 |
|
1149 |
@torch.no_grad()
|
@@ -1165,7 +1165,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
|
|
1165 |
inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
|
1166 |
inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
|
1167 |
# also add end-of-assistant token in eos token id to avoid unnecessary generation
|
1168 |
-
eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["
|
1169 |
outputs = self.generate(
|
1170 |
**inputs,
|
1171 |
streamer=streamer,
|
@@ -1178,7 +1178,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
|
|
1178 |
)
|
1179 |
outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
|
1180 |
response = tokenizer.decode(outputs, skip_special_tokens=True)
|
1181 |
-
response = response.split("
|
1182 |
history = history + [(query, response)]
|
1183 |
return response, history
|
1184 |
|
@@ -1231,7 +1231,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
|
|
1231 |
return
|
1232 |
|
1233 |
token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
|
1234 |
-
if token.strip() != "
|
1235 |
self.response = self.response + token
|
1236 |
history = self.history + [(self.query, self.response)]
|
1237 |
self.queue.put((self.response, history))
|
|
|
1138 |
def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
|
1139 |
prompt = ""
|
1140 |
if meta_instruction:
|
1141 |
+
prompt += f"""<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"""
|
1142 |
else:
|
1143 |
prompt += "<s>"
|
1144 |
for record in history:
|
1145 |
+
prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
|
1146 |
+
prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
|
1147 |
return tokenizer([prompt], return_tensors="pt")
|
1148 |
|
1149 |
@torch.no_grad()
|
|
|
1165 |
inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
|
1166 |
inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
|
1167 |
# also add end-of-assistant token in eos token id to avoid unnecessary generation
|
1168 |
+
eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
|
1169 |
outputs = self.generate(
|
1170 |
**inputs,
|
1171 |
streamer=streamer,
|
|
|
1178 |
)
|
1179 |
outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
|
1180 |
response = tokenizer.decode(outputs, skip_special_tokens=True)
|
1181 |
+
response = response.split("<|im_end|>")[0]
|
1182 |
history = history + [(query, response)]
|
1183 |
return response, history
|
1184 |
|
|
|
1231 |
return
|
1232 |
|
1233 |
token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
|
1234 |
+
if token.strip() != "<|im_end|>":
|
1235 |
self.response = self.response + token
|
1236 |
history = self.history + [(self.query, self.response)]
|
1237 |
self.queue.put((self.response, history))
|
tokenizer_config.json
CHANGED
@@ -11,5 +11,80 @@
|
|
11 |
"model_max_length": 1000000000000000019884624838656,
|
12 |
"pad_token": "</s>",
|
13 |
"tokenizer_class": "InternLMTokenizer",
|
14 |
-
"unk_token": "<unk>"
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"model_max_length": 1000000000000000019884624838656,
|
12 |
"pad_token": "</s>",
|
13 |
"tokenizer_class": "InternLMTokenizer",
|
14 |
+
"unk_token": "<unk>",
|
15 |
+
"added_tokens_decoder": {
|
16 |
+
"0": {
|
17 |
+
"content": "<unk>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false,
|
22 |
+
"special": true
|
23 |
+
},
|
24 |
+
"1": {
|
25 |
+
"content": "<s>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false,
|
30 |
+
"special": true
|
31 |
+
},
|
32 |
+
"2": {
|
33 |
+
"content": "</s>",
|
34 |
+
"lstrip": false,
|
35 |
+
"normalized": false,
|
36 |
+
"rstrip": false,
|
37 |
+
"single_word": false,
|
38 |
+
"special": true
|
39 |
+
},
|
40 |
+
"92543": {
|
41 |
+
"content": "<|im_start|>",
|
42 |
+
"lstrip": false,
|
43 |
+
"normalized": false,
|
44 |
+
"rstrip": false,
|
45 |
+
"single_word": false,
|
46 |
+
"special": true
|
47 |
+
},
|
48 |
+
"92542": {
|
49 |
+
"content": "<|im_end|>",
|
50 |
+
"lstrip": false,
|
51 |
+
"normalized": false,
|
52 |
+
"rstrip": false,
|
53 |
+
"single_word": false,
|
54 |
+
"special": true
|
55 |
+
},
|
56 |
+
"92541": {
|
57 |
+
"content": "<|action_start|>",
|
58 |
+
"lstrip": false,
|
59 |
+
"normalized": false,
|
60 |
+
"rstrip": false,
|
61 |
+
"single_word": false,
|
62 |
+
"special": true
|
63 |
+
},
|
64 |
+
"92540": {
|
65 |
+
"content": "<|action_end|>",
|
66 |
+
"lstrip": false,
|
67 |
+
"normalized": false,
|
68 |
+
"rstrip": false,
|
69 |
+
"single_word": false,
|
70 |
+
"special": true
|
71 |
+
},
|
72 |
+
"92539": {
|
73 |
+
"content": "<|interpreter|>",
|
74 |
+
"lstrip": false,
|
75 |
+
"normalized": false,
|
76 |
+
"rstrip": false,
|
77 |
+
"single_word": false,
|
78 |
+
"special": true
|
79 |
+
},
|
80 |
+
"92538": {
|
81 |
+
"content": "<|plugin|>",
|
82 |
+
"lstrip": false,
|
83 |
+
"normalized": false,
|
84 |
+
"rstrip": false,
|
85 |
+
"single_word": false,
|
86 |
+
"special": true
|
87 |
+
}
|
88 |
+
},
|
89 |
+
"chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
|
90 |
+
}
|