Commit a6d8676 • pseudotensor committed
1 Parent(s): d9fa842
Upload h2oai_pipeline.py

Files changed: h2oai_pipeline.py (+344 -162)

h2oai_pipeline.py CHANGED
@@ -71,8 +71,8 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
|
|
71 |
# unknown
|
72 |
model_max_length = None
|
73 |
|
|
|
74 |
if model_max_length is not None:
|
75 |
-
num_prompt_tokens = None
|
76 |
# can't wait for "hole" if not plain prompt_type, since would lose prefix like <human>:
|
77 |
# For https://github.com/h2oai/h2ogpt/issues/192
|
78 |
for trial in range(0, 3):
|
@@ -108,10 +108,10 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
|
|
108 |
print("Reduced max_new_tokens from %s -> %s" % (
|
109 |
generate_kwargs['max_new_tokens'], max_new_tokens))
|
110 |
generate_kwargs['max_new_tokens'] = max_new_tokens
|
111 |
-
return prompt_text
|
112 |
|
113 |
def preprocess(self, prompt_text, prefix="", handle_long_generation=None, **generate_kwargs):
|
114 |
-
prompt_text = H2OTextGenerationPipeline.limit_prompt(prompt_text, self.tokenizer)
|
115 |
|
116 |
data_point = dict(context='', instruction=prompt_text, input='')
|
117 |
if self.prompter is not None:
|
@@ -132,7 +132,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
|
|
132 |
outputs = self.prompter.get_response(outputs, prompt=self.prompt_text,
|
133 |
sanitize_bot_response=self.sanitize_bot_response)
|
134 |
elif self.bot and self.human:
|
135 |
-
outputs = rec['generated_text'].split(self.bot)[1].
|
136 |
else:
|
137 |
outputs = rec['generated_text']
|
138 |
rec['generated_text'] = outputs
|
@@ -195,83 +195,6 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
|
|
195 |
else:
|
196 |
raise ValueError("TF not avaialble.")
|
197 |
return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}
|
198 |
-
import torch
|
199 |
-
from transformers import StoppingCriteria, StoppingCriteriaList
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
class StoppingCriteriaSub(StoppingCriteria):
|
204 |
-
|
205 |
-
def __init__(self, stops=[], encounters=[], device="cuda", model_max_length=None):
|
206 |
-
super().__init__()
|
207 |
-
assert len(stops) % len(encounters) == 0, "Number of stops and encounters must match"
|
208 |
-
self.encounters = encounters
|
209 |
-
self.stops = [stop.to(device) for stop in stops]
|
210 |
-
self.num_stops = [0] * len(stops)
|
211 |
-
self.model_max_length = model_max_length
|
212 |
-
|
213 |
-
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
|
214 |
-
for stopi, stop in enumerate(self.stops):
|
215 |
-
if torch.all((stop == input_ids[0][-len(stop):])).item():
|
216 |
-
self.num_stops[stopi] += 1
|
217 |
-
if self.num_stops[stopi] >= self.encounters[stopi % len(self.encounters)]:
|
218 |
-
# print("Stopped", flush=True)
|
219 |
-
return True
|
220 |
-
if self.model_max_length is not None and input_ids[0].shape[0] >= self.model_max_length:
|
221 |
-
# critical limit
|
222 |
-
return True
|
223 |
-
# print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
|
224 |
-
# print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
|
225 |
-
return False
|
226 |
-
|
227 |
-
|
228 |
-
def get_stopping(prompt_type, prompt_dict, tokenizer, device, human='<human>:', bot="<bot>:", model_max_length=None):
|
229 |
-
# FIXME: prompt_dict unused currently
|
230 |
-
if prompt_type in [PromptType.human_bot.name, PromptType.instruct_vicuna.name, PromptType.instruct_with_end.name]:
|
231 |
-
if prompt_type == PromptType.human_bot.name:
|
232 |
-
# encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
|
233 |
-
# stopping only starts once output is beyond prompt
|
234 |
-
# 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
|
235 |
-
stop_words = [human, bot, '\n' + human, '\n' + bot]
|
236 |
-
encounters = [1, 2]
|
237 |
-
elif prompt_type == PromptType.instruct_vicuna.name:
|
238 |
-
# even below is not enough, generic strings and many ways to encode
|
239 |
-
stop_words = [
|
240 |
-
'### Human:',
|
241 |
-
"""
|
242 |
-
### Human:""",
|
243 |
-
"""
|
244 |
-
### Human:
|
245 |
-
""",
|
246 |
-
'### Assistant:',
|
247 |
-
"""
|
248 |
-
### Assistant:""",
|
249 |
-
"""
|
250 |
-
### Assistant:
|
251 |
-
""",
|
252 |
-
]
|
253 |
-
encounters = [1, 2]
|
254 |
-
else:
|
255 |
-
# some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
|
256 |
-
stop_words = ['### End']
|
257 |
-
encounters = [1]
|
258 |
-
stop_words_ids = [
|
259 |
-
tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
|
260 |
-
# handle single token case
|
261 |
-
stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
|
262 |
-
stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
|
263 |
-
# avoid padding in front of tokens
|
264 |
-
if tokenizer._pad_token: # use hidden variable to avoid annoying property logger bug
|
265 |
-
stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
|
266 |
-
# handle fake \n added
|
267 |
-
stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
|
268 |
-
# build stopper
|
269 |
-
stopping_criteria = StoppingCriteriaList(
|
270 |
-
[StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters, device=device,
|
271 |
-
model_max_length=model_max_length)])
|
272 |
-
else:
|
273 |
-
stopping_criteria = StoppingCriteriaList()
|
274 |
-
return stopping_criteria
|
275 |
from enum import Enum
|
276 |
|
277 |
|
@@ -296,6 +219,12 @@ class PromptType(Enum):
|
|
296 |
wizard2 = 16
|
297 |
wizard3 = 17
|
298 |
instruct_simple = 18
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
|
300 |
|
301 |
class DocumentChoices(Enum):
|
@@ -318,9 +247,41 @@ class LangChainMode(Enum):
|
|
318 |
MY_DATA = "MyData"
|
319 |
GITHUB_H2OGPT = "github h2oGPT"
|
320 |
H2O_DAI_DOCS = "DriverlessAI docs"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
import ast
|
322 |
import time
|
323 |
-
from enums import PromptType # also supports imports from this file from other files
|
324 |
|
325 |
non_hf_types = ['gpt4all_llama', 'llama', 'gptj']
|
326 |
|
@@ -344,23 +305,29 @@ prompt_type_to_model_name = {
|
|
344 |
'mosaicml/mpt-7b-storywriter',
|
345 |
'mosaicml/mpt-7b-instruct', # internal code handles instruct
|
346 |
'mosaicml/mpt-7b-chat', # NC, internal code handles instruct
|
347 |
-
'
|
348 |
-
'llama', # plain, or need to choose prompt_type for given TheBloke model
|
349 |
-
'gpt4all_llama', # internally handles prompting
|
350 |
],
|
|
|
351 |
'prompt_answer': [
|
352 |
'h2oai/h2ogpt-gm-oasst1-en-1024-20b',
|
353 |
'h2oai/h2ogpt-gm-oasst1-en-1024-12b',
|
354 |
'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b',
|
355 |
-
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt',
|
356 |
-
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2',
|
357 |
-
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-700bt',
|
358 |
-
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b',
|
359 |
'h2oai/h2ogpt-gm-oasst1-multilang-2048-falcon-7b',
|
360 |
'h2oai/h2ogpt-gm-oasst1-multilang-2048-falcon-7b-v2',
|
|
|
361 |
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b',
|
362 |
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2',
|
363 |
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
],
|
365 |
'instruct': [],
|
366 |
'instruct_with_end': ['databricks/dolly-v2-12b'],
|
@@ -373,6 +340,7 @@ prompt_type_to_model_name = {
|
|
373 |
'h2oai/h2ogpt-oig-oasst1-256-6.9b', # legacy
|
374 |
'h2oai/h2ogpt-oig-oasst1-512-6.9b', # legacy
|
375 |
'h2oai/h2ogpt-research-oasst1-512-30b',
|
|
|
376 |
'h2oai/h2ogpt-oasst1-falcon-40b',
|
377 |
'h2oai/h2ogpt-oig-oasst1-falcon-40b',
|
378 |
],
|
@@ -385,7 +353,16 @@ prompt_type_to_model_name = {
|
|
385 |
"wizard_lm": ['ehartford/WizardLM-7B-Uncensored', 'ehartford/WizardLM-13B-Uncensored'],
|
386 |
"wizard_mega": ['openaccess-ai-collective/wizard-mega-13b'],
|
387 |
"instruct_simple": ['JosephusCheung/Guanaco'],
|
|
|
|
|
|
|
|
|
388 |
}
|
|
|
|
|
|
|
|
|
|
|
389 |
|
390 |
inv_prompt_type_to_model_name = {v.strip(): k for k, l in prompt_type_to_model_name.items() for v in l}
|
391 |
inv_prompt_type_to_model_lower = {v.strip().lower(): k for k, l in prompt_type_to_model_name.items() for v in l}
|
@@ -399,18 +376,29 @@ for p in PromptType:
|
|
399 |
prompt_types.extend([p.name, p.value, str(p.value)])
|
400 |
|
401 |
|
402 |
-
def get_prompt(prompt_type, prompt_dict, chat, context, reduced, return_dict=False):
|
403 |
prompt_dict_error = ''
|
|
|
|
|
404 |
if prompt_type == PromptType.custom.name and not isinstance(prompt_dict, dict):
|
405 |
try:
|
406 |
prompt_dict = ast.literal_eval(prompt_dict)
|
407 |
except BaseException as e:
|
408 |
prompt_dict_error = str(e)
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
414 |
promptA = prompt_dict.get('promptA', '')
|
415 |
promptB = prompt_dict.get('promptB', '')
|
416 |
PreInstruct = prompt_dict.get('PreInstruct', '')
|
@@ -418,21 +406,23 @@ def get_prompt(prompt_type, prompt_dict, chat, context, reduced, return_dict=Fal
|
|
418 |
PreResponse = prompt_dict.get('PreResponse', '')
|
419 |
terminate_response = prompt_dict.get('terminate_response', None)
|
420 |
chat_sep = prompt_dict.get('chat_sep', '\n')
|
|
|
421 |
humanstr = prompt_dict.get('humanstr', '')
|
422 |
botstr = prompt_dict.get('botstr', '')
|
423 |
elif prompt_type in [PromptType.plain.value, str(PromptType.plain.value),
|
424 |
PromptType.plain.name]:
|
425 |
-
promptA = promptB = PreInstruct = PreInput = PreResponse =
|
426 |
terminate_response = []
|
427 |
-
chat_sep = ''
|
428 |
-
|
429 |
-
|
|
|
430 |
elif prompt_type == 'simple_instruct':
|
431 |
promptA = promptB = PreInstruct = PreInput = PreResponse = None
|
432 |
terminate_response = []
|
433 |
-
chat_sep = '\n'
|
434 |
-
humanstr =
|
435 |
-
botstr =
|
436 |
elif prompt_type in [PromptType.instruct.value, str(PromptType.instruct.value),
|
437 |
PromptType.instruct.name] + [PromptType.instruct_with_end.value,
|
438 |
str(PromptType.instruct_with_end.value),
|
@@ -458,7 +448,7 @@ def get_prompt(prompt_type, prompt_dict, chat, context, reduced, return_dict=Fal
|
|
458 |
terminate_response = ['### End']
|
459 |
else:
|
460 |
terminate_response = None
|
461 |
-
chat_sep = '\n'
|
462 |
humanstr = PreInstruct
|
463 |
botstr = PreResponse
|
464 |
elif prompt_type in [PromptType.quality.value, str(PromptType.quality.value),
|
@@ -480,7 +470,7 @@ def get_prompt(prompt_type, prompt_dict, chat, context, reduced, return_dict=Fal
|
|
480 |
### Response:
|
481 |
"""
|
482 |
terminate_response = None
|
483 |
-
chat_sep = '\n'
|
484 |
humanstr = PreInstruct # first thing human says
|
485 |
botstr = PreResponse # first thing bot says
|
486 |
elif prompt_type in [PromptType.human_bot.value, str(PromptType.human_bot.value),
|
@@ -502,14 +492,14 @@ Current Time: {}
|
|
502 |
|
503 |
"""
|
504 |
preprompt = PRE_PROMPT.format(cur_date, cur_time)
|
505 |
-
start =
|
506 |
-
promptB = promptA = '%s%s
|
507 |
|
508 |
-
PreInstruct =
|
509 |
|
510 |
PreInput = None
|
511 |
|
512 |
-
if
|
513 |
# when making context, want it to appear as-if LLM generated, which starts with space after :
|
514 |
PreResponse = bot + ' '
|
515 |
else:
|
@@ -517,10 +507,11 @@ Current Time: {}
|
|
517 |
# if add space here, non-unique tokenization will often make LLM produce wrong output
|
518 |
PreResponse = bot
|
519 |
|
520 |
-
terminate_response = [
|
521 |
-
chat_sep = '\n'
|
522 |
humanstr = human # tag before human talks
|
523 |
botstr = bot # tag before bot talks
|
|
|
524 |
elif prompt_type in [PromptType.dai_faq.value, str(PromptType.dai_faq.value),
|
525 |
PromptType.dai_faq.name]:
|
526 |
promptA = ''
|
@@ -536,7 +527,7 @@ Current Time: {}
|
|
536 |
### Driverless AI documentation answer:
|
537 |
"""
|
538 |
terminate_response = ['\n\n']
|
539 |
-
chat_sep = terminate_response
|
540 |
humanstr = PreInstruct
|
541 |
botstr = PreResponse
|
542 |
elif prompt_type in [PromptType.summarize.value, str(PromptType.summarize.value),
|
@@ -545,7 +536,7 @@ Current Time: {}
|
|
545 |
PreInstruct = '## Main Text\n\n'
|
546 |
PreResponse = '\n\n## Summary\n\n'
|
547 |
terminate_response = None
|
548 |
-
chat_sep = '\n'
|
549 |
humanstr = PreInstruct
|
550 |
botstr = PreResponse
|
551 |
elif prompt_type in [PromptType.instruct_vicuna.value, str(PromptType.instruct_vicuna.value),
|
@@ -565,7 +556,7 @@ Current Time: {}
|
|
565 |
"""
|
566 |
terminate_response = [
|
567 |
'### Human:'] # but only allow terminate after prompt is found correctly, else can't terminate
|
568 |
-
chat_sep = '\n'
|
569 |
humanstr = PreInstruct
|
570 |
botstr = PreResponse
|
571 |
elif prompt_type in [PromptType.prompt_answer.value, str(PromptType.prompt_answer.value),
|
@@ -573,33 +564,50 @@ Current Time: {}
|
|
573 |
preprompt = ''
|
574 |
prompt_tokens = "<|prompt|>"
|
575 |
answer_tokens = "<|answer|>"
|
576 |
-
start =
|
577 |
promptB = promptA = '%s%s' % (preprompt, start)
|
578 |
-
PreInstruct =
|
579 |
PreInput = None
|
580 |
PreResponse = answer_tokens
|
581 |
eos = '<|endoftext|>' # neox eos
|
582 |
-
terminate_response = [start, PreResponse, eos]
|
583 |
-
chat_sep = eos
|
584 |
humanstr = prompt_tokens
|
585 |
botstr = answer_tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
586 |
elif prompt_type in [PromptType.open_assistant.value, str(PromptType.open_assistant.value),
|
587 |
PromptType.open_assistant.name]:
|
588 |
# From added_tokens.json
|
589 |
preprompt = ''
|
590 |
prompt_tokens = "<|prompter|>"
|
591 |
answer_tokens = "<|assistant|>"
|
592 |
-
start =
|
593 |
promptB = promptA = '%s%s' % (preprompt, start)
|
594 |
-
PreInstruct =
|
595 |
PreInput = None
|
596 |
PreResponse = answer_tokens
|
597 |
pend = "<|prefix_end|>"
|
598 |
eos = "</s>"
|
599 |
-
terminate_response = [start, PreResponse, pend, eos]
|
600 |
-
chat_sep = eos
|
601 |
humanstr = prompt_tokens
|
602 |
botstr = answer_tokens
|
|
|
|
|
603 |
elif prompt_type in [PromptType.wizard_lm.value, str(PromptType.wizard_lm.value),
|
604 |
PromptType.wizard_lm.name]:
|
605 |
# https://github.com/ehartford/WizardLM/blob/main/src/train_freeform.py
|
@@ -611,7 +619,7 @@ Current Time: {}
|
|
611 |
PreResponse = "\n\n### Response\n"
|
612 |
eos = "</s>"
|
613 |
terminate_response = [PreResponse, eos]
|
614 |
-
chat_sep = eos
|
615 |
humanstr = promptA
|
616 |
botstr = PreResponse
|
617 |
elif prompt_type in [PromptType.wizard_mega.value, str(PromptType.wizard_mega.value),
|
@@ -627,13 +635,12 @@ Current Time: {}
|
|
627 |
### Assistant:
|
628 |
"""
|
629 |
terminate_response = [PreResponse]
|
630 |
-
chat_sep = '\n'
|
631 |
humanstr = PreInstruct
|
632 |
botstr = PreResponse
|
633 |
elif prompt_type in [PromptType.instruct_vicuna2.value, str(PromptType.instruct_vicuna2.value),
|
634 |
PromptType.instruct_vicuna2.name]:
|
635 |
-
promptA = promptB = "" if not (
|
636 |
-
chat and reduced) else ''
|
637 |
|
638 |
PreInstruct = """
|
639 |
HUMAN:
|
@@ -646,13 +653,12 @@ ASSISTANT:
|
|
646 |
"""
|
647 |
terminate_response = [
|
648 |
'HUMAN:'] # but only allow terminate after prompt is found correctly, else can't terminate
|
649 |
-
chat_sep = '\n'
|
650 |
humanstr = PreInstruct
|
651 |
botstr = PreResponse
|
652 |
elif prompt_type in [PromptType.instruct_vicuna3.value, str(PromptType.instruct_vicuna3.value),
|
653 |
PromptType.instruct_vicuna3.name]:
|
654 |
-
promptA = promptB = "" if not (
|
655 |
-
chat and reduced) else ''
|
656 |
|
657 |
PreInstruct = """
|
658 |
### User:
|
@@ -665,13 +671,14 @@ ASSISTANT:
|
|
665 |
"""
|
666 |
terminate_response = [
|
667 |
'### User:'] # but only allow terminate after prompt is found correctly, else can't terminate
|
668 |
-
chat_sep = '\n'
|
669 |
humanstr = PreInstruct
|
670 |
botstr = PreResponse
|
671 |
elif prompt_type in [PromptType.wizard2.value, str(PromptType.wizard2.value),
|
672 |
PromptType.wizard2.name]:
|
673 |
# https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML
|
674 |
-
preprompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request."""
|
|
|
675 |
start = ''
|
676 |
promptB = promptA = '%s%s' % (preprompt, start)
|
677 |
PreInstruct = """
|
@@ -682,27 +689,39 @@ ASSISTANT:
|
|
682 |
### Response:
|
683 |
"""
|
684 |
terminate_response = [PreResponse]
|
685 |
-
chat_sep = '\n'
|
686 |
humanstr = PreInstruct
|
687 |
botstr = PreResponse
|
688 |
elif prompt_type in [PromptType.wizard3.value, str(PromptType.wizard3.value),
|
689 |
PromptType.wizard3.name]:
|
690 |
# https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML
|
691 |
-
preprompt = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."""
|
|
|
692 |
start = ''
|
693 |
promptB = promptA = '%s%s' % (preprompt, start)
|
694 |
PreInstruct = """USER: """
|
695 |
PreInput = None
|
696 |
PreResponse = """ASSISTANT: """
|
697 |
terminate_response = [PreResponse]
|
698 |
-
chat_sep = '\n'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
699 |
humanstr = PreInstruct
|
700 |
botstr = PreResponse
|
701 |
|
702 |
elif prompt_type in [PromptType.instruct_simple.value, str(PromptType.instruct_simple.value),
|
703 |
PromptType.instruct_simple.name]:
|
704 |
-
promptA = '' if not (chat and reduced) else ''
|
705 |
-
promptB = '' if not (chat and reduced) else ''
|
706 |
|
707 |
PreInstruct = """
|
708 |
### Instruction:
|
@@ -716,21 +735,90 @@ ASSISTANT:
|
|
716 |
### Response:
|
717 |
"""
|
718 |
terminate_response = None
|
719 |
-
chat_sep = '\n'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
720 |
humanstr = PreInstruct
|
721 |
botstr = PreResponse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
722 |
else:
|
723 |
raise RuntimeError("No such prompt_type=%s" % prompt_type)
|
724 |
|
725 |
-
if
|
726 |
-
|
|
|
|
|
727 |
PreResponse=PreResponse, terminate_response=terminate_response, chat_sep=chat_sep,
|
728 |
-
|
|
|
|
|
|
|
|
|
|
|
729 |
else:
|
730 |
-
return
|
731 |
|
732 |
|
733 |
-
def generate_prompt(data_point, prompt_type, prompt_dict, chat, reduced):
|
734 |
context = data_point.get('context')
|
735 |
if context is None:
|
736 |
context = ''
|
@@ -741,9 +829,12 @@ def generate_prompt(data_point, prompt_type, prompt_dict, chat, reduced):
|
|
741 |
prompt_dict = data_point.get('prompt_dict', prompt_dict)
|
742 |
assert prompt_type in prompt_types, "Bad prompt type: %s" % prompt_type
|
743 |
promptA, promptB, PreInstruct, PreInput, PreResponse, \
|
744 |
-
terminate_response, chat_sep, humanstr, botstr
|
|
|
|
|
745 |
|
746 |
-
|
|
|
747 |
|
748 |
if input and promptA:
|
749 |
prompt += f"""{promptA}"""
|
@@ -793,7 +884,7 @@ def generate_prompt(data_point, prompt_type, prompt_dict, chat, reduced):
|
|
793 |
if output:
|
794 |
prompt += f"""{output}"""
|
795 |
|
796 |
-
return prompt, pre_response, terminate_response, chat_sep
|
797 |
|
798 |
|
799 |
def inject_chatsep(prompt_type, prompt, chat_sep=None):
|
@@ -808,9 +899,6 @@ class Prompter(object):
|
|
808 |
allowed_repeat_line_length=10):
|
809 |
self.prompt_type = prompt_type
|
810 |
self.prompt_dict = prompt_dict
|
811 |
-
data_point = dict(instruction='', input='', output='')
|
812 |
-
_, self.pre_response, self.terminate_response, self.chat_sep = \
|
813 |
-
generate_prompt(data_point, self.prompt_type, self.prompt_dict, chat, False)
|
814 |
self.debug = debug
|
815 |
self.chat = chat
|
816 |
self.stream_output = stream_output
|
@@ -819,15 +907,33 @@ class Prompter(object):
|
|
819 |
self.prompt = None
|
820 |
context = "" # not for chat context
|
821 |
reduced = False # not for chat context
|
|
|
822 |
self.promptA, self.promptB, self.PreInstruct, self.PreInput, self.PreResponse, \
|
823 |
-
self.terminate_response, self.chat_sep, self.humanstr, self.botstr
|
824 |
-
|
|
|
|
|
825 |
|
826 |
-
def generate_prompt(self, data_point):
|
827 |
-
|
828 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
829 |
if self.debug:
|
830 |
print("prompt: %s" % prompt, flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
831 |
self.prompt = prompt
|
832 |
return prompt
|
833 |
|
@@ -846,7 +952,8 @@ class Prompter(object):
|
|
846 |
if sanitize_bot_response:
|
847 |
from better_profanity import profanity
|
848 |
response = profanity.censor(response)
|
849 |
-
response
|
|
|
850 |
return response
|
851 |
|
852 |
def clean_repeats(response):
|
@@ -868,12 +975,12 @@ class Prompter(object):
|
|
868 |
# then use most basic parsing like pipeline
|
869 |
if self.botstr in output:
|
870 |
if self.humanstr:
|
871 |
-
output = clean_response(output.split(self.botstr)[1].
|
872 |
else:
|
873 |
# i.e. use after bot but only up to next bot
|
874 |
-
output = clean_response(output.split(self.botstr)[1].
|
875 |
else:
|
876 |
-
# output = clean_response(output
|
877 |
# assume just not printed yet
|
878 |
output = ""
|
879 |
else:
|
@@ -900,9 +1007,9 @@ class Prompter(object):
|
|
900 |
allow_terminate = True
|
901 |
output = output[len(prompt):]
|
902 |
# clean after subtract prompt out, so correct removal of pre_response
|
903 |
-
output = clean_response(output)
|
904 |
if self.repeat_penalty:
|
905 |
-
output = clean_repeats(output)
|
906 |
if self.terminate_response and allow_terminate:
|
907 |
finds = []
|
908 |
for term in self.terminate_response:
|
@@ -910,11 +1017,9 @@ class Prompter(object):
|
|
910 |
finds = [x for x in finds if x >= 0]
|
911 |
if len(finds) > 0:
|
912 |
termi = finds[0]
|
913 |
-
output = output[:termi]
|
914 |
else:
|
915 |
-
output = output
|
916 |
-
else:
|
917 |
-
output = output.strip()
|
918 |
if multi_output:
|
919 |
# prefix with output counter
|
920 |
output = "\n=========== Output %d\n\n" % (1 + oi) + output
|
@@ -927,3 +1032,80 @@ class Prompter(object):
|
|
927 |
if self.debug:
|
928 |
print("outputclean:\n%s" % '\n\n'.join(outputs), flush=True)
|
929 |
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
# unknown
|
72 |
model_max_length = None
|
73 |
|
74 |
+
num_prompt_tokens = None
|
75 |
if model_max_length is not None:
|
|
|
76 |
# can't wait for "hole" if not plain prompt_type, since would lose prefix like <human>:
|
77 |
# For https://github.com/h2oai/h2ogpt/issues/192
|
78 |
for trial in range(0, 3):
|
|
|
108 |
print("Reduced max_new_tokens from %s -> %s" % (
|
109 |
generate_kwargs['max_new_tokens'], max_new_tokens))
|
110 |
generate_kwargs['max_new_tokens'] = max_new_tokens
|
111 |
+
return prompt_text, num_prompt_tokens
|
112 |
|
113 |
def preprocess(self, prompt_text, prefix="", handle_long_generation=None, **generate_kwargs):
|
114 |
+
prompt_text, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt_text, self.tokenizer)
|
115 |
|
116 |
data_point = dict(context='', instruction=prompt_text, input='')
|
117 |
if self.prompter is not None:
|
|
|
132 |
outputs = self.prompter.get_response(outputs, prompt=self.prompt_text,
|
133 |
sanitize_bot_response=self.sanitize_bot_response)
|
134 |
elif self.bot and self.human:
|
135 |
+
outputs = rec['generated_text'].split(self.bot)[1].split(self.human)[0]
|
136 |
else:
|
137 |
outputs = rec['generated_text']
|
138 |
rec['generated_text'] = outputs
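The fallback parsing above is easy to see on a toy string; a small sketch using the default tags that get_stopping() also assumes:

# Illustration of the split above, not code from this commit.
human, bot = '<human>:', '<bot>:'
generated = "<human>: Why is the sky blue?\n<bot>: Rayleigh scattering.\n<human>: Thanks!"
response = generated.split(bot)[1].split(human)[0]
print(response.strip())  # -> "Rayleigh scattering."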
|
|
|
195 |
else:
|
196 |
raise ValueError("TF not avaialble.")
|
197 |
return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
from enum import Enum
|
199 |
|
200 |
|
|
|
219 |
wizard2 = 16
|
220 |
wizard3 = 17
|
221 |
instruct_simple = 18
|
222 |
+
wizard_vicuna = 19
|
223 |
+
openai = 20
|
224 |
+
openai_chat = 21
|
225 |
+
gptj = 22
|
226 |
+
prompt_answer_openllama = 23
|
227 |
+
vicuna11 = 24
|
228 |
|
229 |
|
230 |
class DocumentChoices(Enum):
|
|
|
247 |
MY_DATA = "MyData"
|
248 |
GITHUB_H2OGPT = "github h2oGPT"
|
249 |
H2O_DAI_DOCS = "DriverlessAI docs"
|
250 |
+
|
251 |
+
|
252 |
+
no_server_str = no_lora_str = no_model_str = '[None/Remove]'
|
253 |
+
|
254 |
+
|
255 |
+
# from site-packages/langchain/llms/openai.py, but needed since ChatOpenAI doesn't have this information
|
256 |
+
model_token_mapping = {
|
257 |
+
"gpt-4": 8192,
|
258 |
+
"gpt-4-0314": 8192,
|
259 |
+
"gpt-4-32k": 32768,
|
260 |
+
"gpt-4-32k-0314": 32768,
|
261 |
+
"gpt-3.5-turbo": 4096,
|
262 |
+
"gpt-3.5-turbo-16k": 16*1024,
|
263 |
+
"gpt-3.5-turbo-0301": 4096,
|
264 |
+
"text-ada-001": 2049,
|
265 |
+
"ada": 2049,
|
266 |
+
"text-babbage-001": 2040,
|
267 |
+
"babbage": 2049,
|
268 |
+
"text-curie-001": 2049,
|
269 |
+
"curie": 2049,
|
270 |
+
"davinci": 2049,
|
271 |
+
"text-davinci-003": 4097,
|
272 |
+
"text-davinci-002": 4097,
|
273 |
+
"code-davinci-002": 8001,
|
274 |
+
"code-davinci-001": 8001,
|
275 |
+
"code-cushman-002": 2048,
|
276 |
+
"code-cushman-001": 2048,
|
277 |
+
}
|
278 |
+
|
279 |
+
|
280 |
+
source_prefix = "Sources [Score | Link]:"
|
281 |
+
source_postfix = "End Sources<p>"
|
282 |
+
import os
|
283 |
import ast
|
284 |
import time
|
|
|
285 |
|
286 |
non_hf_types = ['gpt4all_llama', 'llama', 'gptj']
|
287 |
|
|
|
305 |
'mosaicml/mpt-7b-storywriter',
|
306 |
'mosaicml/mpt-7b-instruct', # internal code handles instruct
|
307 |
'mosaicml/mpt-7b-chat', # NC, internal code handles instruct
|
308 |
+
'mosaicml/mpt-30b-instruct', # internal code handles instruct
|
|
|
|
|
309 |
],
|
310 |
+
'gptj': ['gptj', 'gpt4all_llama'],
|
311 |
'prompt_answer': [
|
312 |
'h2oai/h2ogpt-gm-oasst1-en-1024-20b',
|
313 |
'h2oai/h2ogpt-gm-oasst1-en-1024-12b',
|
314 |
'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b',
|
|
|
|
|
|
|
|
|
315 |
'h2oai/h2ogpt-gm-oasst1-multilang-2048-falcon-7b',
|
316 |
'h2oai/h2ogpt-gm-oasst1-multilang-2048-falcon-7b-v2',
|
317 |
+
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3',
|
318 |
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b',
|
319 |
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2',
|
320 |
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1',
|
321 |
+
'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v2',
|
322 |
+
'h2oai/h2ogpt-gm-oasst1-en-xgen-7b-8k',
|
323 |
+
'h2oai/h2ogpt-gm-oasst1-multilang-xgen-7b-8k',
|
324 |
+
],
|
325 |
+
'prompt_answer_openllama': [
|
326 |
+
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt',
|
327 |
+
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2',
|
328 |
+
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-700bt',
|
329 |
+
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b',
|
330 |
+
'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-13b',
|
331 |
],
|
332 |
'instruct': [],
|
333 |
'instruct_with_end': ['databricks/dolly-v2-12b'],
|
|
|
340 |
'h2oai/h2ogpt-oig-oasst1-256-6.9b', # legacy
|
341 |
'h2oai/h2ogpt-oig-oasst1-512-6.9b', # legacy
|
342 |
'h2oai/h2ogpt-research-oasst1-512-30b',
|
343 |
+
'h2oai/h2ogpt-research-oasst1-llama-65b',
|
344 |
'h2oai/h2ogpt-oasst1-falcon-40b',
|
345 |
'h2oai/h2ogpt-oig-oasst1-falcon-40b',
|
346 |
],
|
|
|
353 |
"wizard_lm": ['ehartford/WizardLM-7B-Uncensored', 'ehartford/WizardLM-13B-Uncensored'],
|
354 |
"wizard_mega": ['openaccess-ai-collective/wizard-mega-13b'],
|
355 |
"instruct_simple": ['JosephusCheung/Guanaco'],
|
356 |
+
"wizard_vicuna": ['ehartford/Wizard-Vicuna-13B-Uncensored'],
|
357 |
+
"wizard2": ['llama', 'mosaicml/mpt-30b-instruct'],
|
358 |
+
"vicuna11": ['lmsys/vicuna-33b-v1.3'],
|
359 |
+
# could be plain, but default is correct prompt_type for default TheBloke model ggml-wizardLM-7B.q4_2.bin
|
360 |
}
|
361 |
+
if os.getenv('OPENAI_API_KEY'):
|
362 |
+
prompt_type_to_model_name.update({
|
363 |
+
"openai": ["text-davinci-003", "text-curie-001", "text-babbage-001", "text-ada-001"],
|
364 |
+
"openai_chat": ["gpt-3.5-turbo", "gpt-3.5-turbo-16k"],
|
365 |
+
})
|
366 |
|
367 |
inv_prompt_type_to_model_name = {v.strip(): k for k, l in prompt_type_to_model_name.items() for v in l}
|
368 |
inv_prompt_type_to_model_lower = {v.strip().lower(): k for k, l in prompt_type_to_model_name.items() for v in l}
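A short sketch of how these inverse maps are meant to be consulted (the lookup below is illustrative, not code from this commit):

# Resolve a prompt_type from a model name via the inverse map built above.
model_name = 'h2oai/h2ogpt-oasst1-falcon-40b'
prompt_type = inv_prompt_type_to_model_lower.get(model_name.lower(), 'plain')
print(prompt_type)  # e.g. 'human_bot' for the h2oGPT OASST1 models grouped under that key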
|
|
|
376 |
prompt_types.extend([p.name, p.value, str(p.value)])
|
377 |
|
378 |
|
379 |
+
def get_prompt(prompt_type, prompt_dict, chat, context, reduced, making_context, return_dict=False):
|
380 |
prompt_dict_error = ''
|
381 |
+
generates_leading_space = False
|
382 |
+
|
383 |
if prompt_type == PromptType.custom.name and not isinstance(prompt_dict, dict):
|
384 |
try:
|
385 |
prompt_dict = ast.literal_eval(prompt_dict)
|
386 |
except BaseException as e:
|
387 |
prompt_dict_error = str(e)
|
388 |
+
if prompt_dict_error:
|
389 |
+
promptA = None
|
390 |
+
promptB = None
|
391 |
+
PreInstruct = None
|
392 |
+
PreInput = ''
|
393 |
+
PreResponse = ''
|
394 |
+
terminate_response = None
|
395 |
+
chat_sep = ''
|
396 |
+
chat_turn_sep = ''
|
397 |
+
humanstr = ''
|
398 |
+
botstr = ''
|
399 |
+
generates_leading_space = False
|
400 |
+
elif prompt_type in [PromptType.custom.value, str(PromptType.custom.value),
|
401 |
+
PromptType.custom.name]:
|
402 |
promptA = prompt_dict.get('promptA', '')
|
403 |
promptB = prompt_dict.get('promptB', '')
|
404 |
PreInstruct = prompt_dict.get('PreInstruct', '')
|
|
|
406 |
PreResponse = prompt_dict.get('PreResponse', '')
|
407 |
terminate_response = prompt_dict.get('terminate_response', None)
|
408 |
chat_sep = prompt_dict.get('chat_sep', '\n')
|
409 |
+
chat_turn_sep = prompt_dict.get('chat_turn_sep', '\n')
|
410 |
humanstr = prompt_dict.get('humanstr', '')
|
411 |
botstr = prompt_dict.get('botstr', '')
|
412 |
elif prompt_type in [PromptType.plain.value, str(PromptType.plain.value),
|
413 |
PromptType.plain.name]:
|
414 |
+
promptA = promptB = PreInstruct = PreInput = PreResponse = None
|
415 |
terminate_response = []
|
416 |
+
chat_turn_sep = chat_sep = ''
|
417 |
+
# plain should have None for human/bot, so nothing truncated out, not '' that would truncate after first token
|
418 |
+
humanstr = None
|
419 |
+
botstr = None
|
420 |
elif prompt_type == 'simple_instruct':
|
421 |
promptA = promptB = PreInstruct = PreInput = PreResponse = None
|
422 |
terminate_response = []
|
423 |
+
chat_turn_sep = chat_sep = '\n'
|
424 |
+
humanstr = None
|
425 |
+
botstr = None
|
426 |
elif prompt_type in [PromptType.instruct.value, str(PromptType.instruct.value),
|
427 |
PromptType.instruct.name] + [PromptType.instruct_with_end.value,
|
428 |
str(PromptType.instruct_with_end.value),
|
|
|
448 |
terminate_response = ['### End']
|
449 |
else:
|
450 |
terminate_response = None
|
451 |
+
chat_turn_sep = chat_sep = '\n'
|
452 |
humanstr = PreInstruct
|
453 |
botstr = PreResponse
|
454 |
elif prompt_type in [PromptType.quality.value, str(PromptType.quality.value),
|
|
|
470 |
### Response:
|
471 |
"""
|
472 |
terminate_response = None
|
473 |
+
chat_turn_sep = chat_sep = '\n'
|
474 |
humanstr = PreInstruct # first thing human says
|
475 |
botstr = PreResponse # first thing bot says
|
476 |
elif prompt_type in [PromptType.human_bot.value, str(PromptType.human_bot.value),
|
|
|
492 |
|
493 |
"""
|
494 |
preprompt = PRE_PROMPT.format(cur_date, cur_time)
|
495 |
+
start = ''
|
496 |
+
promptB = promptA = '%s%s' % (preprompt, start)
|
497 |
|
498 |
+
PreInstruct = human + ' '
|
499 |
|
500 |
PreInput = None
|
501 |
|
502 |
+
if making_context:
|
503 |
# when making context, want it to appear as-if LLM generated, which starts with space after :
|
504 |
PreResponse = bot + ' '
|
505 |
else:
|
|
|
507 |
# if add space here, non-unique tokenization will often make LLM produce wrong output
|
508 |
PreResponse = bot
|
509 |
|
510 |
+
terminate_response = ['\n' + human, '\n' + bot, human, bot, PreResponse]
|
511 |
+
chat_turn_sep = chat_sep = '\n'
|
512 |
humanstr = human # tag before human talks
|
513 |
botstr = bot # tag before bot talks
|
514 |
+
generates_leading_space = True
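Rendered, the human_bot template above yields turns like the sketch below; generates_leading_space records that the model emits a space right after the bot tag, which get_response() later strips:

# Illustrative rendering of the human_bot layout (tags match the defaults above).
human, bot = '<human>:', '<bot>:'
prompt = f"{human} Why is the sky blue?\n{bot}"
# Typical continuation: " Rayleigh scattering ...", i.e. with a leading space
# after '<bot>:'; generates_leading_space=True is what flags that for stripping.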
|
515 |
elif prompt_type in [PromptType.dai_faq.value, str(PromptType.dai_faq.value),
|
516 |
PromptType.dai_faq.name]:
|
517 |
promptA = ''
|
|
|
527 |
### Driverless AI documentation answer:
|
528 |
"""
|
529 |
terminate_response = ['\n\n']
|
530 |
+
chat_turn_sep = chat_sep = terminate_response
|
531 |
humanstr = PreInstruct
|
532 |
botstr = PreResponse
|
533 |
elif prompt_type in [PromptType.summarize.value, str(PromptType.summarize.value),
|
|
|
536 |
PreInstruct = '## Main Text\n\n'
|
537 |
PreResponse = '\n\n## Summary\n\n'
|
538 |
terminate_response = None
|
539 |
+
chat_turn_sep = chat_sep = '\n'
|
540 |
humanstr = PreInstruct
|
541 |
botstr = PreResponse
|
542 |
elif prompt_type in [PromptType.instruct_vicuna.value, str(PromptType.instruct_vicuna.value),
|
|
|
556 |
"""
|
557 |
terminate_response = [
|
558 |
'### Human:'] # but only allow terminate after prompt is found correctly, else can't terminate
|
559 |
+
chat_turn_sep = chat_sep = '\n'
|
560 |
humanstr = PreInstruct
|
561 |
botstr = PreResponse
|
562 |
elif prompt_type in [PromptType.prompt_answer.value, str(PromptType.prompt_answer.value),
|
|
|
564 |
preprompt = ''
|
565 |
prompt_tokens = "<|prompt|>"
|
566 |
answer_tokens = "<|answer|>"
|
567 |
+
start = ''
|
568 |
promptB = promptA = '%s%s' % (preprompt, start)
|
569 |
+
PreInstruct = prompt_tokens
|
570 |
PreInput = None
|
571 |
PreResponse = answer_tokens
|
572 |
eos = '<|endoftext|>' # neox eos
|
|
|
|
|
573 |
humanstr = prompt_tokens
|
574 |
botstr = answer_tokens
|
575 |
+
terminate_response = [humanstr, PreResponse, eos]
|
576 |
+
chat_sep = ''
|
577 |
+
chat_turn_sep = eos
|
578 |
+
elif prompt_type in [PromptType.prompt_answer_openllama.value, str(PromptType.prompt_answer_openllama.value),
|
579 |
+
PromptType.prompt_answer_openllama.name]:
|
580 |
+
preprompt = ''
|
581 |
+
prompt_tokens = "<|prompt|>"
|
582 |
+
answer_tokens = "<|answer|>"
|
583 |
+
start = ''
|
584 |
+
promptB = promptA = '%s%s' % (preprompt, start)
|
585 |
+
PreInstruct = prompt_tokens
|
586 |
+
PreInput = None
|
587 |
+
PreResponse = answer_tokens
|
588 |
+
eos = '</s>' # llama eos
|
589 |
+
humanstr = prompt_tokens
|
590 |
+
botstr = answer_tokens
|
591 |
+
terminate_response = [humanstr, PreResponse, eos]
|
592 |
+
chat_sep = ''
|
593 |
+
chat_turn_sep = eos
|
594 |
elif prompt_type in [PromptType.open_assistant.value, str(PromptType.open_assistant.value),
|
595 |
PromptType.open_assistant.name]:
|
596 |
# From added_tokens.json
|
597 |
preprompt = ''
|
598 |
prompt_tokens = "<|prompter|>"
|
599 |
answer_tokens = "<|assistant|>"
|
600 |
+
start = ''
|
601 |
promptB = promptA = '%s%s' % (preprompt, start)
|
602 |
+
PreInstruct = prompt_tokens
|
603 |
PreInput = None
|
604 |
PreResponse = answer_tokens
|
605 |
pend = "<|prefix_end|>"
|
606 |
eos = "</s>"
|
|
|
|
|
607 |
humanstr = prompt_tokens
|
608 |
botstr = answer_tokens
|
609 |
+
terminate_response = [humanstr, PreResponse, pend, eos]
|
610 |
+
chat_turn_sep = chat_sep = eos
|
611 |
elif prompt_type in [PromptType.wizard_lm.value, str(PromptType.wizard_lm.value),
|
612 |
PromptType.wizard_lm.name]:
|
613 |
# https://github.com/ehartford/WizardLM/blob/main/src/train_freeform.py
|
|
|
619 |
PreResponse = "\n\n### Response\n"
|
620 |
eos = "</s>"
|
621 |
terminate_response = [PreResponse, eos]
|
622 |
+
chat_turn_sep = chat_sep = eos
|
623 |
humanstr = promptA
|
624 |
botstr = PreResponse
|
625 |
elif prompt_type in [PromptType.wizard_mega.value, str(PromptType.wizard_mega.value),
|
|
|
635 |
### Assistant:
|
636 |
"""
|
637 |
terminate_response = [PreResponse]
|
638 |
+
chat_turn_sep = chat_sep = '\n'
|
639 |
humanstr = PreInstruct
|
640 |
botstr = PreResponse
|
641 |
elif prompt_type in [PromptType.instruct_vicuna2.value, str(PromptType.instruct_vicuna2.value),
|
642 |
PromptType.instruct_vicuna2.name]:
|
643 |
+
promptA = promptB = "" if not (chat and reduced) else ''
|
|
|
644 |
|
645 |
PreInstruct = """
|
646 |
HUMAN:
|
|
|
653 |
"""
|
654 |
terminate_response = [
|
655 |
'HUMAN:'] # but only allow terminate after prompt is found correctly, else can't terminate
|
656 |
+
chat_turn_sep = chat_sep = '\n'
|
657 |
humanstr = PreInstruct
|
658 |
botstr = PreResponse
|
659 |
elif prompt_type in [PromptType.instruct_vicuna3.value, str(PromptType.instruct_vicuna3.value),
|
660 |
PromptType.instruct_vicuna3.name]:
|
661 |
+
promptA = promptB = "" if not (chat and reduced) else ''
|
|
|
662 |
|
663 |
PreInstruct = """
|
664 |
### User:
|
|
|
671 |
"""
|
672 |
terminate_response = [
|
673 |
'### User:'] # but only allow terminate after prompt is found correctly, else can't terminate
|
674 |
+
chat_turn_sep = chat_sep = '\n'
|
675 |
humanstr = PreInstruct
|
676 |
botstr = PreResponse
|
677 |
elif prompt_type in [PromptType.wizard2.value, str(PromptType.wizard2.value),
|
678 |
PromptType.wizard2.name]:
|
679 |
# https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML
|
680 |
+
preprompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.""" if not (
|
681 |
+
chat and reduced) else ''
|
682 |
start = ''
|
683 |
promptB = promptA = '%s%s' % (preprompt, start)
|
684 |
PreInstruct = """
|
|
|
689 |
### Response:
|
690 |
"""
|
691 |
terminate_response = [PreResponse]
|
692 |
+
chat_turn_sep = chat_sep = '\n'
|
693 |
humanstr = PreInstruct
|
694 |
botstr = PreResponse
|
695 |
elif prompt_type in [PromptType.wizard3.value, str(PromptType.wizard3.value),
|
696 |
PromptType.wizard3.name]:
|
697 |
# https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML
|
698 |
+
preprompt = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""" if not (
|
699 |
+
chat and reduced) else ''
|
700 |
start = ''
|
701 |
promptB = promptA = '%s%s' % (preprompt, start)
|
702 |
PreInstruct = """USER: """
|
703 |
PreInput = None
|
704 |
PreResponse = """ASSISTANT: """
|
705 |
terminate_response = [PreResponse]
|
706 |
+
chat_turn_sep = chat_sep = '\n'
|
707 |
+
humanstr = PreInstruct
|
708 |
+
botstr = PreResponse
|
709 |
+
elif prompt_type in [PromptType.wizard_vicuna.value, str(PromptType.wizard_vicuna.value),
|
710 |
+
PromptType.wizard_vicuna.name]:
|
711 |
+
preprompt = ''
|
712 |
+
start = ''
|
713 |
+
promptB = promptA = '%s%s' % (preprompt, start)
|
714 |
+
PreInstruct = """USER: """
|
715 |
+
PreInput = None
|
716 |
+
PreResponse = """ASSISTANT: """
|
717 |
+
terminate_response = [PreResponse]
|
718 |
+
chat_turn_sep = chat_sep = '\n'
|
719 |
humanstr = PreInstruct
|
720 |
botstr = PreResponse
|
721 |
|
722 |
elif prompt_type in [PromptType.instruct_simple.value, str(PromptType.instruct_simple.value),
|
723 |
PromptType.instruct_simple.name]:
|
724 |
+
promptB = promptA = '' if not (chat and reduced) else ''
|
|
|
725 |
|
726 |
PreInstruct = """
|
727 |
### Instruction:
|
|
|
735 |
### Response:
|
736 |
"""
|
737 |
terminate_response = None
|
738 |
+
chat_turn_sep = chat_sep = '\n'
|
739 |
+
humanstr = PreInstruct
|
740 |
+
botstr = PreResponse
|
741 |
+
elif prompt_type in [PromptType.openai.value, str(PromptType.openai.value),
|
742 |
+
PromptType.openai.name]:
|
743 |
+
preprompt = """The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.""" if not (
|
744 |
+
chat and reduced) else ''
|
745 |
+
start = ''
|
746 |
+
promptB = promptA = '%s%s' % (preprompt, start)
|
747 |
+
PreInstruct = "\nHuman: "
|
748 |
+
PreInput = None
|
749 |
+
PreResponse = "\nAI:"
|
750 |
+
terminate_response = [PreResponse] + [" Human:", " AI:"]
|
751 |
+
chat_turn_sep = chat_sep = '\n'
|
752 |
+
humanstr = PreInstruct
|
753 |
+
botstr = PreResponse
|
754 |
+
elif prompt_type in [PromptType.gptj.value, str(PromptType.gptj.value),
|
755 |
+
PromptType.gptj.name]:
|
756 |
+
preprompt = "### Instruction:\n The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response." if not (
|
757 |
+
chat and reduced) else ''
|
758 |
+
start = ''
|
759 |
+
promptB = promptA = '%s%s' % (preprompt, start)
|
760 |
+
PreInstruct = "\n### Prompt: "
|
761 |
+
PreInput = None
|
762 |
+
PreResponse = "\n### Response: "
|
763 |
+
terminate_response = [PreResponse] + ["Prompt:", "Response:"]
|
764 |
+
chat_turn_sep = chat_sep = '\n'
|
765 |
humanstr = PreInstruct
|
766 |
botstr = PreResponse
|
767 |
+
elif prompt_type in [PromptType.openai_chat.value, str(PromptType.openai_chat.value),
|
768 |
+
PromptType.openai_chat.name]:
|
769 |
+
# prompting and termination all handled by endpoint
|
770 |
+
preprompt = """"""
|
771 |
+
start = ''
|
772 |
+
promptB = promptA = '%s%s' % (preprompt, start)
|
773 |
+
PreInstruct = ""
|
774 |
+
PreInput = None
|
775 |
+
PreResponse = ""
|
776 |
+
terminate_response = []
|
777 |
+
chat_turn_sep = chat_sep = '\n'
|
778 |
+
humanstr = None
|
779 |
+
botstr = None
|
780 |
+
elif prompt_type in [PromptType.vicuna11.value, str(PromptType.vicuna11.value),
|
781 |
+
PromptType.vicuna11.name]:
|
782 |
+
preprompt = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. """ if not (
|
783 |
+
chat and reduced) else ''
|
784 |
+
start = ''
|
785 |
+
promptB = promptA = '%s%s' % (preprompt, start)
|
786 |
+
eos = '</s>'
|
787 |
+
PreInstruct = """USER: """
|
788 |
+
PreInput = None
|
789 |
+
PreResponse = """ASSISTANT:"""
|
790 |
+
terminate_response = [PreResponse]
|
791 |
+
chat_sep = ' '
|
792 |
+
chat_turn_sep = eos
|
793 |
+
humanstr = PreInstruct
|
794 |
+
botstr = PreResponse
|
795 |
+
|
796 |
+
if making_context:
|
797 |
+
# when making context, want it to appear as-if LLM generated, which starts with space after :
|
798 |
+
PreResponse = PreResponse + ' '
|
799 |
+
else:
|
800 |
+
# normally LLM adds space after this, because was how trained.
|
801 |
+
# if add space here, non-unique tokenization will often make LLM produce wrong output
|
802 |
+
PreResponse = PreResponse
|
803 |
else:
|
804 |
raise RuntimeError("No such prompt_type=%s" % prompt_type)
|
805 |
|
806 |
+
if isinstance(terminate_response, (tuple, list)):
|
807 |
+
assert '' not in terminate_response, "Bad terminate_response"
|
808 |
+
|
809 |
+
ret_dict = dict(promptA=promptA, promptB=promptB, PreInstruct=PreInstruct, PreInput=PreInput,
|
810 |
PreResponse=PreResponse, terminate_response=terminate_response, chat_sep=chat_sep,
|
811 |
+
chat_turn_sep=chat_turn_sep,
|
812 |
+
humanstr=humanstr, botstr=botstr,
|
813 |
+
generates_leading_space=generates_leading_space)
|
814 |
+
|
815 |
+
if return_dict:
|
816 |
+
return ret_dict, prompt_dict_error
|
817 |
else:
|
818 |
+
return tuple(list(ret_dict.values()))
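A sketch of calling get_prompt() directly with return_dict=True; the argument values are only an example, the signature matches the definition above:

# Example call returning the template dict plus any prompt_dict parsing error.
ret_dict, error = get_prompt(PromptType.human_bot.name, prompt_dict=None,
                             chat=True, context='', reduced=False,
                             making_context=False, return_dict=True)
assert not error
print(ret_dict['PreResponse'], ret_dict['terminate_response'])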
|
819 |
|
820 |
|
821 |
+
def generate_prompt(data_point, prompt_type, prompt_dict, chat, reduced, making_context):
|
822 |
context = data_point.get('context')
|
823 |
if context is None:
|
824 |
context = ''
|
|
|
829 |
prompt_dict = data_point.get('prompt_dict', prompt_dict)
|
830 |
assert prompt_type in prompt_types, "Bad prompt type: %s" % prompt_type
|
831 |
promptA, promptB, PreInstruct, PreInput, PreResponse, \
|
832 |
+
terminate_response, chat_sep, chat_turn_sep, humanstr, botstr, \
|
833 |
+
generates_leading_space = get_prompt(prompt_type, prompt_dict, chat,
|
834 |
+
context, reduced, making_context)
|
835 |
|
836 |
+
# could avoid if reduce=True, but too complex for parent functions to handle
|
837 |
+
prompt = context
|
838 |
|
839 |
if input and promptA:
|
840 |
prompt += f"""{promptA}"""
|
|
|
884 |
if output:
|
885 |
prompt += f"""{output}"""
|
886 |
|
887 |
+
return prompt, pre_response, terminate_response, chat_sep, chat_turn_sep
|
888 |
|
889 |
|
890 |
def inject_chatsep(prompt_type, prompt, chat_sep=None):
|
|
|
899 |
allowed_repeat_line_length=10):
|
900 |
self.prompt_type = prompt_type
|
901 |
self.prompt_dict = prompt_dict
|
|
|
|
|
|
|
902 |
self.debug = debug
|
903 |
self.chat = chat
|
904 |
self.stream_output = stream_output
|
|
|
907 |
self.prompt = None
|
908 |
context = "" # not for chat context
|
909 |
reduced = False # not for chat context
|
910 |
+
making_context = False # not for chat context
|
911 |
self.promptA, self.promptB, self.PreInstruct, self.PreInput, self.PreResponse, \
|
912 |
+
self.terminate_response, self.chat_sep, self.chat_turn_sep, self.humanstr, self.botstr, \
|
913 |
+
self.generates_leading_space = \
|
914 |
+
get_prompt(self.prompt_type, self.prompt_dict, chat, context, reduced, making_context)
|
915 |
+
self.pre_response = self.PreResponse
|
916 |
|
917 |
+
def generate_prompt(self, data_point, reduced=None):
|
918 |
+
"""
|
919 |
+
data_point['context'] is assumed to be like a system prompt or pre-conversation, not inserted after user prompt
|
920 |
+
:param data_point:
|
921 |
+
:param reduced:
|
922 |
+
:return:
|
923 |
+
"""
|
924 |
+
reduced = data_point.get('context') not in ['', None] if reduced is None else reduced
|
925 |
+
making_context = False # whether really making final prompt or just generating context
|
926 |
+
prompt, _, _, _, _ = generate_prompt(data_point, self.prompt_type, self.prompt_dict, self.chat, reduced,
|
927 |
+
making_context)
|
928 |
if self.debug:
|
929 |
print("prompt: %s" % prompt, flush=True)
|
930 |
+
# if have context, should have always reduced and only prepend promptA/B here
|
931 |
+
if data_point.get('context'):
|
932 |
+
if data_point.get('input') and self.promptA:
|
933 |
+
prompt = self.promptA + prompt
|
934 |
+
elif self.promptB:
|
935 |
+
prompt = self.promptB + prompt
|
936 |
+
|
937 |
self.prompt = prompt
|
938 |
return prompt
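End to end, the Prompter above is typically used roughly as sketched here; the generate/decode plumbing is assumed and not part of this file:

# Sketch of typical Prompter usage; constructor kwargs mirror the attributes set in __init__.
prompter = Prompter('human_bot', prompt_dict=None, chat=True, stream_output=False)
data_point = dict(instruction="Why is the sky blue?", input='', output='', context='')
prompt = prompter.generate_prompt(data_point)
# ... tokenize `prompt`, run model.generate(), decode into `decoded_outputs` (list of str) ...
# answer = prompter.get_response(decoded_outputs, prompt=prompt, sanitize_bot_response=True)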
|
939 |
|
|
|
952 |
if sanitize_bot_response:
|
953 |
from better_profanity import profanity
|
954 |
response = profanity.censor(response)
|
955 |
+
if self.generates_leading_space and isinstance(response, str) and len(response) > 0 and response[0] == ' ':
|
956 |
+
response = response[1:]
|
957 |
return response
|
958 |
|
959 |
def clean_repeats(response):
|
|
|
975 |
# then use most basic parsing like pipeline
|
976 |
if self.botstr in output:
|
977 |
if self.humanstr:
|
978 |
+
output = clean_response(output.split(self.botstr)[1].split(self.humanstr)[0])
|
979 |
else:
|
980 |
# i.e. use after bot but only up to next bot
|
981 |
+
output = clean_response(output.split(self.botstr)[1].split(self.botstr)[0])
|
982 |
else:
|
983 |
+
# output = clean_response(output)
|
984 |
# assume just not printed yet
|
985 |
output = ""
|
986 |
else:
|
|
|
1007 |
allow_terminate = True
|
1008 |
output = output[len(prompt):]
|
1009 |
# clean after subtract prompt out, so correct removal of pre_response
|
1010 |
+
output = clean_response(output)
|
1011 |
if self.repeat_penalty:
|
1012 |
+
output = clean_repeats(output)
|
1013 |
if self.terminate_response and allow_terminate:
|
1014 |
finds = []
|
1015 |
for term in self.terminate_response:
|
|
|
1017 |
finds = [x for x in finds if x >= 0]
|
1018 |
if len(finds) > 0:
|
1019 |
termi = finds[0]
|
1020 |
+
output = output[:termi]
|
1021 |
else:
|
1022 |
+
output = output
|
|
|
|
|
1023 |
if multi_output:
|
1024 |
# prefix with output counter
|
1025 |
output = "\n=========== Output %d\n\n" % (1 + oi) + output
|
|
|
1032 |
if self.debug:
|
1033 |
print("outputclean:\n%s" % '\n\n'.join(outputs), flush=True)
|
1034 |
return output
|
1035 |
+
import torch
|
1036 |
+
from transformers import StoppingCriteria, StoppingCriteriaList
|
1037 |
+
|
1038 |
+
|
1039 |
+
|
1040 |
+
class StoppingCriteriaSub(StoppingCriteria):
|
1041 |
+
|
1042 |
+
def __init__(self, stops=[], encounters=[], device="cuda", model_max_length=None):
|
1043 |
+
super().__init__()
|
1044 |
+
assert len(stops) % len(encounters) == 0, "Number of stops and encounters must match"
|
1045 |
+
self.encounters = encounters
|
1046 |
+
self.stops = [stop.to(device) for stop in stops]
|
1047 |
+
self.num_stops = [0] * len(stops)
|
1048 |
+
self.model_max_length = model_max_length
|
1049 |
+
|
1050 |
+
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
|
1051 |
+
for stopi, stop in enumerate(self.stops):
|
1052 |
+
if torch.all((stop == input_ids[0][-len(stop):])).item():
|
1053 |
+
self.num_stops[stopi] += 1
|
1054 |
+
if self.num_stops[stopi] >= self.encounters[stopi % len(self.encounters)]:
|
1055 |
+
# print("Stopped", flush=True)
|
1056 |
+
return True
|
1057 |
+
if self.model_max_length is not None and input_ids[0].shape[0] >= self.model_max_length:
|
1058 |
+
# critical limit
|
1059 |
+
return True
|
1060 |
+
# print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
|
1061 |
+
# print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
|
1062 |
+
return False
|
1063 |
+
|
1064 |
+
|
1065 |
+
def get_stopping(prompt_type, prompt_dict, tokenizer, device, human='<human>:', bot="<bot>:", model_max_length=None):
|
1066 |
+
# FIXME: prompt_dict unused currently
|
1067 |
+
if prompt_type in [PromptType.human_bot.name, PromptType.instruct_vicuna.name, PromptType.instruct_with_end.name]:
|
1068 |
+
if prompt_type == PromptType.human_bot.name:
|
1069 |
+
# encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
|
1070 |
+
# stopping only starts once output is beyond prompt
|
1071 |
+
# 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
|
1072 |
+
stop_words = [human, bot, '\n' + human, '\n' + bot]
|
1073 |
+
encounters = [1, 2]
|
1074 |
+
elif prompt_type == PromptType.instruct_vicuna.name:
|
1075 |
+
# even below is not enough, generic strings and many ways to encode
|
1076 |
+
stop_words = [
|
1077 |
+
'### Human:',
|
1078 |
+
"""
|
1079 |
+
### Human:""",
|
1080 |
+
"""
|
1081 |
+
### Human:
|
1082 |
+
""",
|
1083 |
+
'### Assistant:',
|
1084 |
+
"""
|
1085 |
+
### Assistant:""",
|
1086 |
+
"""
|
1087 |
+
### Assistant:
|
1088 |
+
""",
|
1089 |
+
]
|
1090 |
+
encounters = [1, 2]
|
1091 |
+
else:
|
1092 |
+
# some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
|
1093 |
+
stop_words = ['### End']
|
1094 |
+
encounters = [1]
|
1095 |
+
stop_words_ids = [
|
1096 |
+
tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
|
1097 |
+
# handle single token case
|
1098 |
+
stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
|
1099 |
+
stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
|
1100 |
+
# avoid padding in front of tokens
|
1101 |
+
if tokenizer._pad_token: # use hidden variable to avoid annoying property logger bug
|
1102 |
+
stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
|
1103 |
+
# handle fake \n added
|
1104 |
+
stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
|
1105 |
+
# build stopper
|
1106 |
+
stopping_criteria = StoppingCriteriaList(
|
1107 |
+
[StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters, device=device,
|
1108 |
+
model_max_length=model_max_length)])
|
1109 |
+
else:
|
1110 |
+
stopping_criteria = StoppingCriteriaList()
|
1111 |
+
return stopping_criteria
|
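Finally, a sketch of wiring get_stopping() into a standard transformers generate() call; the model choice and the human/bot tags mirror defaults that appear above, the rest is assumed plumbing:

# Sketch: pass the StoppingCriteriaList from get_stopping() to generate().
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
base = "h2oai/h2ogpt-oig-oasst1-512-6.9b"
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base).to(device)

stopping_criteria = get_stopping(PromptType.human_bot.name, None, tokenizer, device,
                                 human='<human>:', bot='<bot>:',
                                 model_max_length=tokenizer.model_max_length)
inputs = tokenizer("<human>: Why is the sky blue?\n<bot>:", return_tensors="pt").to(device)
out = model.generate(**inputs, max_new_tokens=128, stopping_criteria=stopping_criteria)
print(tokenizer.decode(out[0], skip_special_tokens=True))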