Commit 43147aa
NGUYEN, Xuan Phi committed
1 Parent(s): 2997e80
update
multipurpose_chatbot/demos/multimodal_chat_interface.py
CHANGED
@@ -944,8 +944,8 @@ def vision_chat_response_stream_multiturn_engine(
     if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
         raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
 
-    print(f'{image_paths=}')
-    print(full_prompt)
+    # print(f'{image_paths=}')
+    # print(full_prompt)
     outputs = None
     response = None
     num_tokens = -1
@@ -995,7 +995,7 @@ def doc_chat_response_stream_multiturn_engine(
     if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
         raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
 
-    print(full_prompt)
+    # print(full_prompt)
     outputs = None
     response = None
     num_tokens = -1
@@ -1050,8 +1050,8 @@ def vision_doc_chat_response_stream_multiturn_engine(
     if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
         raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
 
-    print(full_prompt)
-    print(f'{image_paths=}')
+    # print(full_prompt)
+    # print(f'{image_paths=}')
     outputs = None
     response = None
     num_tokens = -1
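This change silences the per-request debug prints by commenting them out. An alternative pattern (not what the commit does) is to gate such prints behind an environment variable so they can be re-enabled without editing the file; the MCB_DEBUG name and debug_print helper below are hypothetical, a minimal sketch only:

import os

# Hypothetical switch: run with MCB_DEBUG=1 to re-enable the prints.
DEBUG = os.environ.get("MCB_DEBUG", "0") == "1"

def debug_print(*args, **kwargs):
    # Forward to print only when debugging is enabled.
    if DEBUG:
        print(*args, **kwargs)

# The handlers above would then call, e.g.:
#   debug_print(f'{image_paths=}')
#   debug_print(full_prompt)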
multipurpose_chatbot/engines/transformers_engine.py
CHANGED
@@ -1,8 +1,13 @@
 
 try:
     import spaces
+    def maybe_spaces_gpu(fn):
+        fn = spaces.GPU(fn)
+        return fn
 except ModuleNotFoundError:
     print(f'Cannot import hf `spaces` with `import spaces`.')
+    def maybe_spaces_gpu(fn):
+        return fn
 import os
 import numpy as np
 import argparse
@@ -541,7 +546,7 @@ class TransformersEngine(BaseEngine):
         if message_safety is not None:
             raise gr.Error(message_safety)
 
-    @spaces.GPU
+    @maybe_spaces_gpu
     def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
 
         # ! MUST PUT INSIDE torch.no_grad() otherwise it will overflow OOM
@@ -558,6 +563,12 @@ class TransformersEngine(BaseEngine):
 
         with torch.no_grad():
             inputs = self.tokenizer(prompt, return_tensors='pt')
+            # whether to print the full prompts
+            retok_full_prompt = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=False)
+            print(f"retok_full_prompt:\n{retok_full_prompt}>>>>")
+            begin_bos = inputs.input_ids[0][0] == self.tokenizer.bos_token_id
+            print(f'begin_bos: {begin_bos}')
+
             num_tokens = inputs.input_ids.size(1)
 
             inputs = inputs.to(self._model.device)
@@ -574,7 +585,7 @@ class TransformersEngine(BaseEngine):
             response = None
             for index, token in enumerate(generator):
                 out_tokens.extend(token.tolist())
-                response = self.tokenizer.decode(out_tokens)
+                response = self.tokenizer.decode(out_tokens, skip_special_tokens=True)
                 if "<|im_start|>assistant\n" in response:
                     response = response.split("<|im_start|>assistant\n")[-1]
                 num_tokens += 1
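The try/except block added at the top of transformers_engine.py turns the spaces.GPU decorator into an optional dependency: when the hf spaces package is importable (i.e., the app is running on a ZeroGPU Space), maybe_spaces_gpu wraps the function with spaces.GPU so generation is dispatched to a GPU worker; otherwise it returns the function unchanged and the engine still runs locally. A self-contained sketch of the same pattern (the Engine class and generate method below are illustrative, not from this repo):

try:
    import spaces  # only available inside a Hugging Face Space

    def maybe_spaces_gpu(fn):
        # On a ZeroGPU Space: schedule calls to fn on a GPU worker.
        return spaces.GPU(fn)
except ModuleNotFoundError:
    def maybe_spaces_gpu(fn):
        # Elsewhere: plain no-op decorator.
        return fn

class Engine:
    @maybe_spaces_gpu
    def generate(self, prompt: str) -> str:
        # GPU-bound work would live here; the decoration is identical
        # whether or not `spaces` is installed.
        return prompt.upper()

The other functional change, passing skip_special_tokens=True to tokenizer.decode in the streaming loop, drops special tokens such as <|im_start|> from the decoded text rather than relying only on the string split that follows.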