xu song committed on
Commit
df2bf3e
1 Parent(s): 34f1177
Files changed (1)
  1. models/cpp_qwen2.py +24 -4
models/cpp_qwen2.py CHANGED
```diff
@@ -65,6 +65,7 @@ llama_print_timings: total time = 56335.37 ms / 72 tokens
 - https://github.com/awinml/llama-cpp-python-bindings
 - https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
+- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/model.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
 """
 
```
```diff
@@ -129,6 +130,12 @@ class Qwen2Simulator(Simulator):
         self.user_start_tokens = self.tokenize("<|im_start|>user\n")
         self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
         # self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx
+
+
+        self.cache_size = 10
+
+        cache = llama_cpp.LlamaRAMCache(capacity_bytes=self.cache_size)
+
         # self.llm.set_cache()
 
     def tokenize(self, text):
```
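Note that as committed, the `LlamaRAMCache` is created but `self.llm.set_cache()` stays commented out, so the cache object is never attached to the model; `capacity_bytes` is also measured in bytes, so a capacity of 10 holds essentially nothing. A minimal sketch of the usual wiring, assuming a placeholder model path and an illustrative capacity:

```python
# Sketch only (not part of the commit): attaching a RAM-backed prompt cache
# in llama-cpp-python. Model path and capacity are illustrative values.
import llama_cpp

llm = llama_cpp.Llama(model_path="qwen2-0.5b-instruct.gguf")  # hypothetical path
cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)       # capacity in bytes (2 GiB)
llm.set_cache(cache)  # without set_cache(), the cache is never consulted
```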
```diff
@@ -139,10 +146,10 @@ class Qwen2Simulator(Simulator):
 
     def strip_stoptokens(self, tokens):
         while tokens and tokens[0] in self.stop_tokens:
-            logger.info(f"head-striping {tokens[0]} {self.llm.detokenize([tokens[0]])}")
+            logger.info(f"head-stripping {tokens[0]} {self.detokenize([tokens[0]])}")
             tokens.pop(0)
         while tokens and tokens[-1] in self.stop_tokens:
-            logger.info(f"tail-striping {tokens[-1]} {self.llm.detokenize([tokens[-1]])}")
+            logger.info(f"tail-stripping {tokens[-1]} {self.detokenize([tokens[-1]])}")
             tokens.pop()
         return tokens
 
```
```diff
@@ -156,8 +163,10 @@ class Qwen2Simulator(Simulator):
         """
         if history[-1]['role'] in ["user"]:
             start_tokens = self.assistant_start_tokens
+            suffix_tokens = self.user_start_tokens
         elif history[-1]['role'] in ["assistant", "system"]:
             start_tokens = self.user_start_tokens
+            suffix_tokens = self.assistant_start_tokens
 
         input_ids = []
         for message in history:
```
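The new `suffix_tokens` records which role header will open the *next* turn, while `start_tokens` opens the turn being generated now. In ChatML terms, the prompt this feeds into looks roughly like the string-level sketch below (the real code works on token ids, not strings):

```python
# Rough string-level equivalent of the ChatML prompt the tokenized code builds.
def build_prompt(history):
    parts = [
        f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in history
    ]
    next_role = "assistant" if history[-1]["role"] == "user" else "user"
    parts.append(f"<|im_start|>{next_role}\n")  # corresponds to start_tokens
    return "".join(parts)
```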
```diff
@@ -168,11 +177,11 @@ class Qwen2Simulator(Simulator):
                 + self.tokenize("<|im_end|>\n")
         input_ids += start_tokens
         if stream:
-            return self._stream_generate(input_ids)
+            return self._stream_generate(input_ids, suffix_tokens)
         else:
             return self._generate(input_ids)
 
-    def _stream_generate(self, input_ids):
+    def _stream_generate(self, input_ids, suffix_tokens=None):
         logger.info(f"generation_kwargs {self.generation_kwargs}")
         output = self.llm.create_completion(
             input_ids,
```
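`create_completion` in llama-cpp-python accepts either a string or a list of token ids as the prompt; with `stream=True` it yields chunks. A minimal consumption loop, with generation kwargs assumed rather than taken from the repo:

```python
# Minimal streaming loop; max_tokens/stop values are assumed, not the repo's.
output = llm.create_completion(input_ids, stream=True, max_tokens=256, stop=["<|im_end|>"])
for chunk in output:
    choice = chunk["choices"][0]
    if choice["finish_reason"] is None:
        print(choice["text"], end="", flush=True)  # emit partial text as it arrives
```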
```diff
@@ -188,6 +197,17 @@ class Qwen2Simulator(Simulator):
             else:
                 logger.info(f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
 
+        # warmup for next turn
+        if suffix_tokens:
+            # <|im_end|>\n
+            self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
+
+
+
+
+
+
+
 
 bot = Qwen2Simulator()
 
```
 
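The warmup pre-evaluates the fixed prefix of the next turn (`<|im_end|>\n`, ids 151645 and 198 in Qwen2's vocabulary, plus the next speaker's `<|im_start|>` header) so its KV-cache entries already exist when the next request arrives. A sketch of the idea, with the hard-coded ids hoisted into a named constant:

```python
# Sketch: after streaming ends, run the next turn's fixed prefix through the
# model. Llama.eval() advances the internal context (n_tokens) without
# sampling, so a following create_completion call can reuse the cached prefix.
IM_END_NL = [151645, 198]  # "<|im_end|>\n" in Qwen2's vocabulary

def warmup_next_turn(llm, suffix_tokens):
    if suffix_tokens:
        llm.eval(IM_END_NL + suffix_tokens)
```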