xu song committed on
Commit
df2bf3e
1 Parent(s): 34f1177
Files changed (1)
  1. models/cpp_qwen2.py +24 -4
models/cpp_qwen2.py CHANGED
```diff
@@ -65,6 +65,7 @@ llama_print_timings: total time = 56335.37 ms / 72 tokens
 - https://github.com/awinml/llama-cpp-python-bindings
 - https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
+- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/model.py
 - https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
 """
 
```
```diff
@@ -129,6 +130,12 @@ class Qwen2Simulator(Simulator):
         self.user_start_tokens = self.tokenize("<|im_start|>user\n")
         self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
         # self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx
+
+
+        self.cache_size = 10
+
+        cache = llama_cpp.LlamaRAMCache(capacity_bytes=self.cache_size)
+
         # self.llm.set_cache()
 
     def tokenize(self, text):
```
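Note that as committed, the `LlamaRAMCache` is created but `self.llm.set_cache()` stays commented out, so the cache object is never attached to the model; `capacity_bytes` is also measured in bytes, so a capacity of 10 holds essentially nothing. A minimal sketch of the usual wiring, assuming a placeholder model path and an illustrative capacity:

```python
# Sketch only (not part of the commit): attaching a RAM-backed prompt cache
# in llama-cpp-python. Model path and capacity are illustrative values.
import llama_cpp

llm = llama_cpp.Llama(model_path="qwen2-0.5b-instruct.gguf")  # hypothetical path
cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)       # capacity in bytes (2 GiB)
llm.set_cache(cache)  # without set_cache(), the cache is never consulted
```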
```diff
@@ -139,10 +146,10 @@ class Qwen2Simulator(Simulator):
 
     def strip_stoptokens(self, tokens):
         while tokens and tokens[0] in self.stop_tokens:
-            logger.info(f"head-striping {tokens[0]} {self.llm.detokenize([tokens[0]])}")
+            logger.info(f"head-stripping {tokens[0]} {self.detokenize([tokens[0]])}")
             tokens.pop(0)
         while tokens and tokens[-1] in self.stop_tokens:
-            logger.info(f"tail-striping {tokens[-1]} {self.llm.detokenize([tokens[-1]])}")
+            logger.info(f"tail-stripping {tokens[-1]} {self.detokenize([tokens[-1]])}")
             tokens.pop()
         return tokens
 
```
```diff
@@ -156,8 +163,10 @@ class Qwen2Simulator(Simulator):
         """
         if history[-1]['role'] in ["user"]:
             start_tokens = self.assistant_start_tokens
+            suffix_tokens = self.user_start_tokens
         elif history[-1]['role'] in ["assistant", "system"]:
             start_tokens = self.user_start_tokens
+            suffix_tokens = self.assistant_start_tokens
 
         input_ids = []
         for message in history:
```
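The new `suffix_tokens` records which role header will open the *next* turn, while `start_tokens` opens the turn being generated now. In ChatML terms, the prompt this feeds into looks roughly like the string-level sketch below (the real code works on token ids, not strings):

```python
# Rough string-level equivalent of the ChatML prompt the tokenized code builds.
def build_prompt(history):
    parts = [
        f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in history
    ]
    next_role = "assistant" if history[-1]["role"] == "user" else "user"
    parts.append(f"<|im_start|>{next_role}\n")  # corresponds to start_tokens
    return "".join(parts)
```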
```diff
@@ -168,11 +177,11 @@ class Qwen2Simulator(Simulator):
                 + self.tokenize("<|im_end|>\n")
         input_ids += start_tokens
         if stream:
-            return self._stream_generate(input_ids)
+            return self._stream_generate(input_ids, suffix_tokens)
         else:
             return self._generate(input_ids)
 
-    def _stream_generate(self, input_ids):
+    def _stream_generate(self, input_ids, suffix_tokens=None):
         logger.info(f"generation_kwargs {self.generation_kwargs}")
         output = self.llm.create_completion(
             input_ids,
```
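`create_completion` in llama-cpp-python accepts either a string or a list of token ids as the prompt; with `stream=True` it yields chunks. A minimal consumption loop, with generation kwargs assumed rather than taken from the repo:

```python
# Minimal streaming loop; max_tokens/stop values are assumed, not the repo's.
output = llm.create_completion(input_ids, stream=True, max_tokens=256, stop=["<|im_end|>"])
for chunk in output:
    choice = chunk["choices"][0]
    if choice["finish_reason"] is None:
        print(choice["text"], end="", flush=True)  # emit partial text as it arrives
```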
```diff
@@ -188,6 +197,17 @@ class Qwen2Simulator(Simulator):
             else:
                 logger.info(f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
 
+        # warmup for next turn
+        if suffix_tokens:
+            # <|im_end|>\n
+            self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
+
+
+
+
+
+
+
 
 bot = Qwen2Simulator()
 
```
 
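The warmup pre-evaluates the fixed prefix of the next turn (`<|im_end|>\n`, ids 151645 and 198 in Qwen2's vocabulary, plus the next speaker's `<|im_start|>` header) so its KV-cache entries already exist when the next request arrives. A sketch of the idea, with the hard-coded ids hoisted into a named constant:

```python
# Sketch: after streaming ends, run the next turn's fixed prefix through the
# model. Llama.eval() advances the internal context (n_tokens) without
# sampling, so a following create_completion call can reuse the cached prefix.
IM_END_NL = [151645, 198]  # "<|im_end|>\n" in Qwen2's vocabulary

def warmup_next_turn(llm, suffix_tokens):
    if suffix_tokens:
        llm.eval(IM_END_NL + suffix_tokens)
```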