xu song committed
Commit: df2bf3e
Parent(s): 34f1177
update

Files changed: models/cpp_qwen2.py (+24, -4)

models/cpp_qwen2.py (CHANGED)
@@ -65,6 +65,7 @@ llama_print_timings: total time = 56335.37 ms / 72 tokens
 - https://github.com/awinml/llama-cpp-python-bindings
 - https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
+- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/model.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
 """
 
@@ -129,6 +130,12 @@ class Qwen2Simulator(Simulator):
         self.user_start_tokens = self.tokenize("<|im_start|>user\n")
         self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
         # self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx
+
+
+        self.cache_size=10
+
+        cache = llama_cpp.LlamaRAMCache(capacity_bytes=self.cache_size)
+
         # self.llm.set_cache()
 
     def tokenize(self, text):
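The added lines construct a llama_cpp.LlamaRAMCache but leave self.llm.set_cache() commented out, so the cache is built without being attached; note also that capacity_bytes is a byte count, so a value of 10 would be far too small to hold any KV state. For reference, a minimal sketch of how a RAM cache is normally wired into llama-cpp-python; the model path and capacity below are illustrative assumptions, not values from this repo:

import llama_cpp

# Illustrative model path and context size; the real values depend on the deployment.
llm = llama_cpp.Llama(model_path="qwen2-0.5b-instruct-q8_0.gguf", n_ctx=2048, verbose=False)

# capacity_bytes is a size in bytes (the library default is about 2 GiB).
cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)

# Once attached, later create_completion() calls can reuse the cached KV state
# of a previously seen prompt prefix instead of re-evaluating it.
llm.set_cache(cache)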
@@ -139,10 +146,10 @@ class Qwen2Simulator(Simulator):
 
     def strip_stoptokens(self, tokens):
         while tokens and tokens[0] in self.stop_tokens:
-            logger.info(f"head-striping {tokens[0]} {self.
+            logger.info(f"head-striping {tokens[0]} {self.detokenize([tokens[0]])}")
             tokens.pop(0)
         while tokens and tokens[-1] in self.stop_tokens:
-            logger.info(f"tail-striping {tokens[-1]} {self.
+            logger.info(f"tail-striping {tokens[-1]} {self.detokenize([tokens[-1]])}")
             tokens.pop()
         return tokens
 
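strip_stoptokens now logs the decoded form of each stripped token via detokenize. The trimming logic itself does not depend on the model, so it can be sketched standalone; the stop ids below are the Qwen2 ChatML ids that appear later in this diff (151645 for <|im_end|>, 198 for "\n"), and the content ids are purely illustrative:

STOP_TOKENS = {151645, 198}  # <|im_end|> and "\n" in the Qwen2 tokenizer

def strip_stoptokens(tokens, stop_tokens=STOP_TOKENS):
    # Drop stop tokens from the head of the sequence...
    while tokens and tokens[0] in stop_tokens:
        tokens.pop(0)
    # ...and from the tail, leaving the content tokens untouched.
    while tokens and tokens[-1] in stop_tokens:
        tokens.pop()
    return tokens

# Arbitrary content ids (1234, 5678) used only for illustration.
print(strip_stoptokens([198, 1234, 5678, 151645, 198]))  # -> [1234, 5678]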
@@ -156,8 +163,10 @@ class Qwen2Simulator(Simulator):
         """
         if history[-1]['role'] in ["user"]:
             start_tokens = self.assistant_start_tokens
+            suffix_tokens = self.user_start_tokens
         elif history[-1]['role'] in ["assistant", "system"]:
             start_tokens = self.user_start_tokens
+            suffix_tokens = self.assistant_start_tokens
 
         input_ids = []
         for message in history:
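With this change the prompt builder tracks two token sequences: start_tokens open the turn to be generated next, while the new suffix_tokens remember the header of the turn after that (consumed by the warmup at the end of the diff, see the sketch there). A string-level sketch of the ChatML prompt the surrounding loop produces, assuming each history entry is a dict with 'role' and 'content' keys:

def build_prompt(history):
    # Serialize prior turns in ChatML form, each closed by <|im_end|>\n.
    prompt = ""
    for message in history:
        prompt += f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
    # Open the next turn: the assistant answers a user message,
    # otherwise the simulator writes the next user turn.
    next_role = "assistant" if history[-1]["role"] == "user" else "user"
    return prompt + f"<|im_start|>{next_role}\n"

print(build_prompt([{"role": "user", "content": "hi"}]))
# <|im_start|>user\nhi<|im_end|>\n<|im_start|>assistant\n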
@@ -168,11 +177,11 @@ class Qwen2Simulator(Simulator):
                          + self.tokenize("<|im_end|>\n")
         input_ids += start_tokens
         if stream:
-            return self._stream_generate(input_ids)
+            return self._stream_generate(input_ids, suffix_tokens)
         else:
             return self._generate(input_ids)
 
-    def _stream_generate(self, input_ids):
+    def _stream_generate(self, input_ids, suffix_tokens=None):
         logger.info(f"generation_kwargs {self.generation_kwargs}")
         output = self.llm.create_completion(
             input_ids,
@@ -188,6 +197,17 @@ class Qwen2Simulator(Simulator):
             else:
                 logger.info(f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
 
+        # warmup for next turn
+        if suffix_tokens:
+            # <|im_end|>\n
+            self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
+
+
+
+
+
+
+
 
 bot = Qwen2Simulator()
 
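The warmup added at the end of _stream_generate pre-feeds "<|im_end|>\n" ([151645, 198]) plus the next turn's start tokens through llm.eval() once streaming has finished, so the KV cache already holds the opening of the next turn before the next request arrives. A rough sketch of the same idea against llama-cpp-python's low-level API; the model path is an illustrative assumption, and special=True is assumed so the ChatML markers tokenize to single special tokens:

import llama_cpp

# Illustrative model; any ChatML-style GGUF model would do.
llm = llama_cpp.Llama(model_path="qwen2-0.5b-instruct-q8_0.gguf", n_ctx=2048, verbose=False)

# Tokens closing the finished turn plus the header of the turn expected next
# (the commit's suffix_tokens, e.g. user_start_tokens).
im_end_nl = [151645, 198]  # "<|im_end|>\n" in the Qwen2 tokenizer
next_turn = llm.tokenize(b"<|im_start|>user\n", add_bos=False, special=True)

before = llm.n_tokens
# eval() runs a forward pass over these tokens and appends them to the model's
# internal context, which is why the commit notes that it increases n_tokens.
llm.eval(im_end_nl + next_turn)
print(f"n_tokens grew from {before} to {llm.n_tokens}")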