Spaces:

yuantao-infini-ai
/

demo_test

Runtime error

App Files Files Community

yuantao-infini-ai committed on Jul 30

Commit

7472549

•

1 Parent(s): cf1798b

Upload 136 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

fastchat/__init__.py +1 -0
fastchat/__pycache__/__init__.cpython-310.pyc +0 -0
fastchat/__pycache__/__init__.cpython-311.pyc +0 -0
fastchat/__pycache__/constants.cpython-310.pyc +0 -0
fastchat/__pycache__/conversation.cpython-310.pyc +0 -0
fastchat/__pycache__/utils.cpython-310.pyc +0 -0
fastchat/constants.py +65 -0
fastchat/conversation.py +1689 -0
fastchat/data/__init__.py +0 -0
fastchat/data/clean_sharegpt.py +217 -0
fastchat/data/convert_alpaca.py +38 -0
fastchat/data/extract_gpt4_only.py +32 -0
fastchat/data/extract_single_round.py +29 -0
fastchat/data/filter_wrong_format.py +44 -0
fastchat/data/get_stats.py +82 -0
fastchat/data/hardcoded_questions.py +168 -0
fastchat/data/inspect_data.py +33 -0
fastchat/data/merge.py +23 -0
fastchat/data/optional_clean.py +90 -0
fastchat/data/optional_replace.py +82 -0
fastchat/data/prepare_all.py +42 -0
fastchat/data/pretty_json.py +20 -0
fastchat/data/sample.py +40 -0
fastchat/data/split_long_conversation.py +129 -0
fastchat/data/split_train_test.py +34 -0
fastchat/model/__init__.py +5 -0
fastchat/model/__pycache__/__init__.cpython-310.pyc +0 -0
fastchat/model/__pycache__/compression.cpython-310.pyc +0 -0
fastchat/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc +0 -0
fastchat/model/__pycache__/model_adapter.cpython-310.pyc +0 -0
fastchat/model/__pycache__/model_chatglm.cpython-310.pyc +0 -0
fastchat/model/__pycache__/model_codet5p.cpython-310.pyc +0 -0
fastchat/model/__pycache__/model_exllama.cpython-310.pyc +0 -0
fastchat/model/__pycache__/model_falcon.cpython-310.pyc +0 -0
fastchat/model/__pycache__/model_registry.cpython-310.pyc +0 -0
fastchat/model/__pycache__/model_xfastertransformer.cpython-310.pyc +0 -0
fastchat/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc +0 -0
fastchat/model/apply_delta.py +165 -0
fastchat/model/apply_lora.py +48 -0
fastchat/model/compression.py +300 -0
fastchat/model/convert_fp16.py +26 -0
fastchat/model/llama_condense_monkey_patch.py +71 -0
fastchat/model/make_delta.py +48 -0
fastchat/model/model_adapter.py +1970 -0
fastchat/model/model_chatglm.py +102 -0
fastchat/model/model_codet5p.py +108 -0
fastchat/model/model_exllama.py +77 -0
fastchat/model/model_falcon.py +140 -0
fastchat/model/model_registry.py +387 -0
fastchat/model/model_xfastertransformer.py +81 -0

fastchat/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = "0.2.32"

fastchat/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (184 Bytes). View file

fastchat/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (199 Bytes). View file

fastchat/__pycache__/constants.cpython-310.pyc ADDED Viewed

Binary file (2.18 kB). View file

fastchat/__pycache__/conversation.cpython-310.pyc ADDED Viewed

Binary file (27.5 kB). View file

fastchat/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (10.1 kB). View file

fastchat/constants.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""
+Global constants.
+"""
+from enum import IntEnum
+import os
+# Directory two levels above this file, i.e. the parent of the package directory.
+REPO_PATH = os.path.dirname(os.path.dirname(__file__))
+##### For the gradio web server
+# User-facing messages shown by the web UI.
+SERVER_ERROR_MSG = (
+    "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+)
+MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
+CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
+INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
+SLOW_MODEL_MSG = "⚠️  Both models will show the responses all at once. Please stay patient as it may take over 30 seconds."
+# Maximum input length (name suggests characters); overridable via the
+# FASTCHAT_INPUT_CHAR_LEN_LIMIT environment variable, default 12000.
+INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000))
+# Maximum conversation turns
+CONVERSATION_TURN_LIMIT = 50
+# Session expiration time (presumably in seconds -- TODO confirm against the consumer)
+SESSION_EXPIRATION_TIME = 3600
+# The output dir of log files; taken from the LOGDIR environment variable,
+# falling back to the current working directory.
+LOGDIR = os.getenv("LOGDIR", ".")
+# CPU Instruction Set Architecture; None when the CPU_ISA environment variable is unset.
+CPU_ISA = os.getenv("CPU_ISA")
+##### For the controller and workers (could be overwritten through ENV variables.)
+# Each value below is read once at import time and converted with int(), so a
+# non-numeric environment value raises ValueError when this module is imported.
+# Heartbeat/timeout values are presumably in seconds -- TODO confirm.
+CONTROLLER_HEART_BEAT_EXPIRATION = int(
+    os.getenv("FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90)
+)
+WORKER_HEART_BEAT_INTERVAL = int(os.getenv("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 45))
+WORKER_API_TIMEOUT = int(os.getenv("FASTCHAT_WORKER_API_TIMEOUT", 100))
+WORKER_API_EMBEDDING_BATCH_SIZE = int(
+    os.getenv("FASTCHAT_WORKER_API_EMBEDDING_BATCH_SIZE", 4)
+)
+class ErrorCode(IntEnum):
+    """Integer error codes used by the API layer.
+
+    Reference for the error categories:
+    https://platform.openai.com/docs/guides/error-codes/api-errors
+
+    NOTE(review): the leading three digits of each value look like the HTTP
+    status family (400/401/403/429/500) followed by a two-digit sub-code --
+    confirm before relying on that convention.
+    """
+    # 400xx: request validation
+    VALIDATION_TYPE_ERROR = 40001
+    # 401xx: authentication / permission
+    INVALID_AUTH_KEY = 40101
+    INCORRECT_AUTH_KEY = 40102
+    NO_PERMISSION = 40103
+    # 403xx: model selection and request parameters
+    INVALID_MODEL = 40301
+    PARAM_OUT_OF_RANGE = 40302
+    CONTEXT_OVERFLOW = 40303
+    # 429xx: throttling and capacity
+    RATE_LIMIT = 42901
+    QUOTA_EXCEEDED = 42902
+    ENGINE_OVERLOADED = 42903
+    # 500xx: server-side failures
+    INTERNAL_ERROR = 50001
+    CUDA_OUT_OF_MEMORY = 50002
+    GRADIO_REQUEST_ERROR = 50003
+    GRADIO_STREAM_UNKNOWN_ERROR = 50004
+    CONTROLLER_NO_WORKER = 50005
+    CONTROLLER_WORKER_TIMEOUT = 50006

fastchat/conversation.py ADDED Viewed

	@@ -0,0 +1,1689 @@

+"""
+Conversation prompt templates.
+We kindly request that you import fastchat instead of copying this file if you wish to use it.
+If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
+"""
+import dataclasses
+from enum import auto, IntEnum
+from typing import List, Any, Dict, Union, Tuple
+class SeparatorStyle(IntEnum):
+    """Separator styles.
+
+    Each member selects one formatting branch in ``Conversation.get_prompt``.
+    Values come from ``auto()``, so they are assigned in declaration order:
+    inserting or reordering members changes the integer value of every member
+    that follows. Append new styles at the end if the numbers are persisted
+    or compared anywhere.
+    """
+    ADD_COLON_SINGLE = auto()
+    ADD_COLON_TWO = auto()
+    ADD_COLON_SPACE_SINGLE = auto()
+    NO_COLON_SINGLE = auto()
+    NO_COLON_TWO = auto()
+    ADD_NEW_LINE_SINGLE = auto()
+    LLAMA2 = auto()
+    CHATGLM = auto()
+    CHATML = auto()
+    CHATINTERN = auto()
+    DOLLY = auto()
+    RWKV = auto()
+    PHOENIX = auto()
+    ROBIN = auto()
+    FALCON_CHAT = auto()
+    CHATGLM3 = auto()
+    DIY = auto()
+    # Project-specific styles
+    MEGREZ = auto()
+    MEGREZ_CRLFT = auto()
+    MEGREZ_CRLFT_LLAMA3 = auto()
+    MEGREZ_1B = auto()
+    MINICPM_V2 = auto()
+    ZHINAO360 = auto()
+@dataclasses.dataclass
+class Conversation:
+    """A class that manages prompt templates and keeps all conversation history."""
+    # The name of this template
+    name: str
+    # The template of the system prompt
+    system_template: str = "{system_message}"
+    # The system message
+    system_message: str = ""
+    # The names of two roles
+    roles: Tuple[str] = ("USER", "ASSISTANT")
+    # All messages. Each item is (role, message).
+    messages: List[List[str]] = ()
+    # The number of few shot examples
+    offset: int = 0
+    # The separator style and configurations
+    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
+    sep: str = "\n"
+    sep2: str = None
+    # Stop criteria (the default one is EOS token)
+    stop_str: Union[str, List[str]] = None
+    # Stops generation if meeting any token in this list
+    stop_token_ids: List[int] = None
+    apply_template: bool = False
+    none_stop: bool = False
+    skip_special_tokens: bool = True
+    def convert_messages_format(self, messages, sysprompt) -> list:
+        messages_ = [{'role': 'system', 'content': sysprompt}]
+        for message in messages:
+            if not message[1]:
+                continue
+            if isinstance(message, list):
+                messages_.append({'role': message[0], 'content': message[1]})
+            else:
+                messages_.append(message)
+        return messages_
+    def get_prompt(self, tokenizer=None) -> str:
+        """Get the prompt for generation."""
+        if tokenizer and self.apply_template:
+            print(f'======using apply_chat_template()======')
+            try:
+                messages = self.convert_messages_format(self.messages, self.system_message)
+                ret = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+                return ret
+            except:
+                raise ValueError(f"apply_chat_template() is not supported by this tokenizer: {tokenizer}")
+        print(f'======using fastchat conv template======')
+        system_prompt = self.system_template.format(system_message=self.system_message)
+        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
+            ret = system_prompt #+ self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ": "  # must be end with a space
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + message + seps[i % 2]
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.RWKV:
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += (
+                        role
+                        + ": "
+                        + message.replace("\r\n", "\n").replace("\n\n", "\n")
+                    )
+                    ret += "\n\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA2:
+            seps = [self.sep, self.sep2]
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = "[INST] "
+            for i, (role, message) in enumerate(self.messages):
+                tag = self.roles[i % 2]
+                if message:
+                    if i == 0:
+                        ret += message + " "
+                    else:
+                        ret += tag + " " + message + seps[i % 2]
+                else:
+                    ret += tag
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM:
+            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
+            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
+            round_add_n = 1 if self.name == "chatglm2" else 0
+            if system_prompt:
+                ret = system_prompt + self.sep
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += f"[Round {i//2 + round_add_n}]{self.sep}"
+                if message:
+                    ret += f"{role}：{message}{self.sep}"
+                else:
+                    ret += f"{role}："
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATML:
+            ret = "" if system_prompt == "" else system_prompt + self.sep + "\n"
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM3:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + " " + message
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATINTERN:
+            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += "<s>"
+                if message:
+                    ret += role + ":" + message + seps[i % 2] + "\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DOLLY:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ":\n" + message + seps[i % 2]
+                    if i % 2 == 1:
+                        ret += "\n\n"
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.PHOENIX:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + "<s>" + message + "</s>"
+                else:
+                    ret += role + ": " + "<s>"
+            return ret
+        elif self.sep_style == SeparatorStyle.ROBIN:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ":\n" + message + self.sep
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.FALCON_CHAT:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DIY:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + ":\n" + message + self.sep
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.MEGREZ:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt + seps[0] + ' '
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.MEGREZ_CRLFT:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.MEGREZ_CRLFT_LLAMA3:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + message + seps[i % 2]
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.MEGREZ_1B:
+            ret = system_prompt + self.sep
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.MINICPM_V2:
+            seps = [self.sep, self.sep2]
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + "\n" + message + seps[i % 2]
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.ZHINAO360:
+            seps = [self.sep, self.sep2]
+            ret = '' if system_prompt=='' else f'<|im_start|>system\n{system_prompt}<|im_end|>\n'
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + "\n" + message + seps[i % 2]
+                else:
+                    ret += role + "\n"
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+    def set_system_message(self, system_message: str):
+        """Set the system message."""
+        self.system_message = system_message
+    def append_message(self, role: str, message: str):
+        """Append a new message."""
+        self.messages.append([role, message])
+    def update_last_message(self, message: str):
+        """Update the last output.
+        The last message is typically set to be None when constructing the prompt,
+        so we need to update it in-place after getting the response from a model.
+        """
+        self.messages[-1][1] = message
+    def to_gradio_chatbot(self):
+        """Convert the conversation to gradio chatbot format."""
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def to_openai_api_messages(self):
+        """Convert the conversation to OpenAI chat completion format."""
+        ret = [{"role": "system", "content": self.system_message}]
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append({"role": "user", "content": msg})
+            else:
+                if msg is not None:
+                    ret.append({"role": "assistant", "content": msg})
+        return ret
+    def copy(self):
+        return Conversation(
+            name=self.name,
+            system_template=self.system_template,
+            system_message=self.system_message,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            stop_str=self.stop_str,
+            stop_token_ids=self.stop_token_ids,
+            apply_template=self.apply_template,
+            none_stop=self.none_stop,
+            skip_special_tokens=self.skip_special_tokens,
+        )
+    def dict(self):
+        return {
+            "template_name": self.name,
+            "system_message": self.system_message,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+        }
+# A global registry for all conversation templates, keyed by template name.
+# Filled at import time by the register_conv_template(...) calls in this module.
+conv_templates: Dict[str, Conversation] = {}
+def register_conv_template(template: Conversation, override: bool = False):
+    """Register a new conversation template."""
+    if not override:
+        assert (
+            template.name not in conv_templates
+        ), f"{template.name} has been registered."
+    conv_templates[template.name] = template
+def get_conv_template(name: str) -> Conversation:
+    """Get a conversation template."""
+    return conv_templates[name].copy()
+# An empty template for raw conversation: empty role names, empty separator,
+# so the prompt is just the concatenated message texts.
+register_conv_template(
+    Conversation(
+        name="raw",
+        system_message="",
+        roles=("", ""),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+    )
+)
+# A template with a one-shot conversation example.
+# offset=2 marks the two example messages below as few-shot content.
+register_conv_template(
+    Conversation(
+        name="one_shot",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        messages=(
+            (
+                "Human",
+                "Got any creative ideas for a 10 year old’s birthday?",
+            ),
+            (
+                "Assistant",
+                """Of course! Here are some creative ideas for a 10-year-old's birthday party:
+1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
+2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
+3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
+4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
+5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
+6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
+7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
+8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
+Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
+            ),
+        ),
+        offset=2,
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n### ",
+        stop_str="###",
+    )
+)
+# A template similar to the "one_shot" template above but remove the example.
+register_conv_template(
+    Conversation(
+        name="zero_shot",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n### ",
+        stop_str="###",
+    )
+)
+# Vicuna v1.1 template: a space after even-indexed messages, "</s>" after odd-indexed ones.
+register_conv_template(
+    Conversation(
+        name="vicuna_v1.1",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# Megrez template: "ROLE: text" turns, newline after even-indexed messages and
+# "</s>" after odd-indexed ones.
+register_conv_template(
+    Conversation(
+        name="megrez",
+        system_message="你是一个乐于助人的助手，助手将针对用户的问题给出详细的、积极的回答。",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.MEGREZ,
+        sep="\n",
+        sep2="</s>",
+        stop_str="USER",
+    )
+)
+# Megrez CRLFT template: "Megrez USER" / "Megrez ASSISTANT" roles, "</s>" after every turn.
+register_conv_template(
+    Conversation(
+        name="megrez_crlft",
+        system_message="你是一个乐于助人的助手，助手将针对用户的问题给出详细的、积极的回答。",
+        roles=("Megrez USER", "Megrez ASSISTANT"),
+        sep_style=SeparatorStyle.MEGREZ_CRLFT,
+        sep="</s>",
+        sep2="</s>",
+        stop_str="USER",
+    )
+)
+# Megrez CRLFT variant using Llama-3-style header tokens; the role strings
+# carry the full header markup, so the style adds no colon or newline itself.
+register_conv_template(
+    Conversation(
+        name="megrez_crlft_llama3",
+        system_message="<|begin_of_text|>你是一个乐于助人的助手，将针对用户的问题给出详细的、积极的回答。<|end_of_text|>\n\n",
+        roles=("<|start_header_id|>Megrez USER<|end_header_id|>\n\n", "<|start_header_id|>Megrez ASSISTANT<|end_header_id|>\n\n"),
+        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
+        sep="<|eot_id|>",
+        sep2="<|eot_id|><|end_of_text|>",
+        stop_str=["<|eot_id|>", "<|end_of_text|>", "<|eot_id|><|end_of_text|>"]
+    )
+)
+# Megrez 3B template: <|user|> / <|assistant|> headers, "<|eos|>" after every turn.
+register_conv_template(
+    Conversation(
+        name="megrez_3b",
+        system_message="<|system_start|>你是一个乐于助人的助手，将针对用户的问题给出详细的、积极的回答。<|system_end|>",
+        roles=("<|user|>Megrez USER\n", "<|assistant|>Megrez ASSISTANT\n"),
+        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
+        sep="<|eos|>",
+        sep2="<|eos|>",
+        stop_str=["<|eos|>"]
+    )
+)
+# Same as "megrez_3b" but with plain USER / ASSISTANT in the role headers.
+register_conv_template(
+    Conversation(
+        name="megrez_3b_2",
+        system_message="<|system_start|>你是一个乐于助人的助手，将针对用户的问题给出详细的、积极的回答。<|system_end|>",
+        roles=("<|user|>USER\n", "<|assistant|>ASSISTANT\n"),
+        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
+        sep="<|eos|>",
+        sep2="<|eos|>",
+        stop_str=["<|eos|>"]
+    )
+)
+# Megrez template with <|role_start|>...<|role_end|> headers and "<|turn_end|>"
+# after the system prompt and every turn.
+# NOTE(review): the registered name is spelled "megrez_standar"; it is a lookup
+# key, so renaming it would break existing callers.
+register_conv_template(
+    Conversation(
+        name="megrez_standar",
+        system_message="<|role_start|>system<|role_end|>你是无穹天权，将针对用户的问题给出详细的、积极的回答。",
+        roles=("<|role_start|>user<|role_end|>", "<|role_start|>assistant<|role_end|>"),
+        sep_style=SeparatorStyle.MEGREZ_1B,
+        sep="<|turn_end|>",
+        stop_str=["<|turn_end|>"]
+    )
+)
+# Megrez-1B template: same layout as "megrez_standar" but terminated with "<|eos|>".
+register_conv_template(
+    Conversation(
+        name="megrez_1b_rk",
+        system_message="<|role_start|>system<|role_end|>你是Megrez-1B，将针对用户的问题给出详细的、积极的回答。",
+        roles=("<|role_start|>user<|role_end|>", "<|role_start|>assistant<|role_end|>"),
+        sep_style=SeparatorStyle.MEGREZ_1B,
+        sep="<|eot_id|>" if False else "<|eos|>",
+        stop_str=["<|eos|>"]
+    )
+)
+# Llama-3 chat layout built from explicit header tokens.
+# NOTE(review): stop_str contains the bare word "assistant"; if stop strings
+# are matched as substrings of the generated text this would cut off any reply
+# containing that word -- confirm this is intended.
+register_conv_template(
+    Conversation(
+        name="llama3_chat",
+        system_message="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n中文大师<|eot_id|>",
+        roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
+        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
+        sep="<|eot_id|>",
+        sep2="<|eot_id|>",
+        stop_str=["assistant", "<|eot_id|>", "<|end_of_text|>", "<|eot_id|><|end_of_text|>"]
+    )
+)
+# Same as "megrez_crlft" but with "Mortal USER" / "Mortal ASSISTANT" role names.
+register_conv_template(
+    Conversation(
+        name="megrez_crlft_mortal",
+        system_message="你是一个乐于助人的助手，助手将针对用户的问题给出详细的、积极的回答。",
+        roles=("Mortal USER", "Mortal ASSISTANT"),
+        sep_style=SeparatorStyle.MEGREZ_CRLFT,
+        sep="</s>",
+        sep2="</s>",
+        stop_str="USER",
+    )
+)
+# MiniCPM template: role tags directly followed by the text; nothing after
+# even-indexed messages, "</s>" after odd-indexed ones.
+register_conv_template(
+    Conversation(
+        name="minicpm",
+        system_message="<s>",
+        roles=("<用户>", "<AI>"),
+        sep_style=SeparatorStyle.NO_COLON_TWO,
+        sep="",
+        sep2="</s>",
+        stop_str="<用户>",
+    )
+)
+# 360Zhinao template in ChatML-like layout.
+# NOTE(review): this uses MINICPM_V2 although SeparatorStyle.ZHINAO360 exists.
+# With MINICPM_V2 the system message is emitted as-is followed by sep, without
+# the "<|im_start|>system" wrapper that the ZHINAO360 branch of get_prompt adds
+# -- confirm which style is intended.
+# NOTE(review): stop_str contains the bare word "user"; if stop strings are
+# matched as substrings this would truncate replies containing it -- confirm.
+register_conv_template(
+    Conversation(
+        name="360zhinao",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.MINICPM_V2,
+        sep="<|im_end|>\n",
+        sep2="<|im_end|>\n",
+        stop_str=["<|im_end|>\n", "<|im_start|>", "user"],
+    )
+)
+# Wizard template: same settings as "vicuna_v1.1".
+register_conv_template(
+    Conversation(
+        name="wizard",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# BELLE template: "Human:\n..." / "Assistant:\n..." turns separated by a blank
+# line; no system message and no stop string.
+register_conv_template(
+    Conversation(
+        name="belle",
+        system_message="",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.DIY,
+        sep="\n\n",
+        # sep2="</s>",
+        # stop_str="<用户>",
+    )
+)
+# xDAN template: "### Instruction" / "### Response" headers, each followed by
+# ":\n", with turns separated by a blank line.
+register_conv_template(
+    Conversation(
+        name="xdan",
+        system_message="You are a helpful assistant named DAN. You are an expert in worldly knowledge, skilled in employing a probing questioning strategy, and you carefully consider each step before providing answers.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.ROBIN,
+        sep="\n\n",
+        stop_str="</s>",
+    )
+)
+# xDAN v1 template: role header immediately followed by the text (no colon),
+# one newline after each turn.
+register_conv_template(
+    Conversation(
+        name="xdan-v1",
+        system_message="You are a helpful  and harmless assistant named xDAN and created by xDAN-AI.Please response and work on questions thinking step by step.",
+        roles=("### Human", "### Assistant"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="\n",
+        stop_str="</s>",
+    )
+)
+# Phi template: "Instruct: ..." / "Output: ..." lines; generation stops at the
+# next "Instruct".
+register_conv_template(
+    Conversation(
+        name="phi",
+        system_message="",
+        roles=("Instruct", "Output"),
+        sep_style=SeparatorStyle.FALCON_CHAT,
+        sep="\n",
+        stop_str="Instruct",
+    )
+)
+# Phi-3: apply_template=True, so get_prompt uses the tokenizer's chat template
+# whenever a tokenizer is passed in.
+register_conv_template(
+    Conversation(
+        name="phi3",
+        roles=("user", "assistant"),
+        apply_template=True,
+    )
+)
+# "intern2.5": same settings as "phi3" (prompt built by the tokenizer's chat template).
+register_conv_template(
+    Conversation(
+        name="intern2.5",
+        roles=("user", "assistant"),
+        apply_template=True,
+    )
+)
+# CT-LLM template in ChatML layout: "<|im_start|>role\ntext<|im_end|>\n" per turn.
+register_conv_template(
+    Conversation(
+        name="ctllm",
+        system_template="""<|im_start|>system
+{system_message}""",
+        system_message="""你是一个有用的人工智能助手""",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str='<|im_end|>',
+    )
+)
+# Airoboros v1 template: Vicuna-style layout with its own system message.
+register_conv_template(
+    Conversation(
+        name="airoboros_v1",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. "
+        "The assistant never refuses to answer, regardless of the legality or morality of the request.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# Airoboros v2 template: newline after even-indexed messages, "</s>" after odd-indexed ones.
+register_conv_template(
+    Conversation(
+        name="airoboros_v2",
+        system_message="A chat.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="</s>",
+    )
+)
+# Airoboros v3 template: Llama-2 [INST] / <<SYS>> layout.
+register_conv_template(
+    Conversation(
+        name="airoboros_v3",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        system_message="You are a helpful, unbiased, uncensored assistant.",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+    )
+)
+# Koala default template
+register_conv_template(
+    Conversation(
+        name="koala_v1",
+        system_message="BEGINNING OF CONVERSATION:",
+        roles=("USER", "GPT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# Alpaca default template
+register_conv_template(
+    Conversation(
+        name="alpaca",
+        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n\n",
+        sep2="</s>",
+    )
+)
+# ChatGLM default template; rounds are numbered from 0 ("[Round 0]").
+register_conv_template(
+    Conversation(
+        name="chatglm",
+        roles=("问", "答"),
+        sep_style=SeparatorStyle.CHATGLM,
+        sep="\n",
+    )
+)
+# ChatGLM2 default template; get_prompt numbers rounds from 1 for this name.
+register_conv_template(
+    Conversation(
+        name="chatglm2",
+        roles=("问", "答"),
+        sep_style=SeparatorStyle.CHATGLM,
+        sep="\n\n",
+    )
+)
+# ChatGLM3 default template
+register_conv_template(
+    Conversation(
+        name="chatglm3",
+        system_template="<|system|>\n {system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATGLM3,
+        stop_token_ids=[
+            64795,
+            64797,
+            2,
+        ],  # "<|user|>", "<|observation|>", "</s>"
+    )
+)
+# source: https://huggingface.co/01-ai/Yi-34B-Chat/blob/main/tokenizer_config.json#L60
+register_conv_template(
+    Conversation(
+        name="Yi-34b-chat",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[
+            2,
+            6,
+            7,
+            8,
+        ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
+        stop_str="<|endoftext|>",
+    )
+)
+# CodeGeex(2) Template
+register_conv_template(
+    Conversation(
+        name="codegeex",
+        roles=("", ""),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="\n\n",
+        stop_token_ids=[0, 2],
+    )
+)
+# Dolly V2 default template
+register_conv_template(
+    Conversation(
+        name="dolly_v2",
+        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.DOLLY,
+        sep="\n\n",
+        sep2="### End",
+    )
+)
+# OpenAssistant Pythia default template
+register_conv_template(
+    Conversation(
+        name="oasst_pythia",
+        roles=("<|prompter|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="<|endoftext|>",
+    )
+)
+# OpenAssistant default template
+register_conv_template(
+    Conversation(
+        name="oasst_llama",
+        roles=("<|prompter|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="</s>",
+    )
+)
+# OpenChat 3.5 default template
+register_conv_template(
+    Conversation(
+        name="openchat_3.5",
+        roles=("GPT4 Correct User", "GPT4 Correct Assistant"),
+        sep_style=SeparatorStyle.FALCON_CHAT,
+        sep="<|end_of_turn|>",
+    )
+)
+# OpenChat 3.6 default template
+register_conv_template(
+    Conversation(
+        name="openchat_3.6",
+        roles=("GPT4 Correct User", "GPT4 Correct Assistant"),
+        sep_style=SeparatorStyle.FALCON_CHAT,
+        sep="<|end_of_turn|>",
+        stop_str=["<|end_of_turn|>", "<|im_end|>", "|||", "|>|>", "|end_of_turn|", "end_of_turn"],
+    )
+)
+register_conv_template(
+    Conversation(
+        name="bilibili",
+        roles=("user", "assistant"),
+        apply_template=True,
+    )
+)
+register_conv_template(
+    Conversation(
+        name="neo",
+        system_message="You are a helpful assistant.",
+        roles=("user", "assistant"),
+        apply_template=True,
+    )
+)
+# Tulu default template
+register_conv_template(
+    Conversation(
+        name="tulu",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        sep="\n",
+    )
+)
+# StableLM Alpha default template
+register_conv_template(
+    Conversation(
+        name="stablelm",
+        system_template="<|SYSTEM|>{system_message}",
+        system_message="""# StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+""",
+        roles=("<|USER|>", "<|ASSISTANT|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_token_ids=[50278, 50279, 50277, 1, 0],
+    )
+)
+# Baize default template
+register_conv_template(
+    Conversation(
+        name="baize",
+        system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
+        roles=("[|Human|]", "[|AI|]"),
+        messages=(
+            ("[|Human|]", "Hello!"),
+            ("[|AI|]", "Hi!"),
+        ),
+        offset=2,
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="\n",
+        stop_str="[|Human|]",
+    )
+)
+# RWKV-4-Raven default template
+register_conv_template(
+    Conversation(
+        name="rwkv",
+        roles=("Bob", "Alice"),
+        messages=(
+            ("Bob", "hi"),
+            (
+                "Alice",
+                "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
+            ),
+        ),
+        offset=2,
+        sep_style=SeparatorStyle.RWKV,
+        sep="",
+        stop_str="\n\n",
+    )
+)
+# Buddy default template
+register_conv_template(
+    Conversation(
+        name="openbuddy",
+        system_message="""Consider a conversation between User (a human) and Assistant (named Buddy).
+Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
+Buddy cannot access the Internet.
+Buddy can fluently speak the user's language (e.g. English, Chinese).
+Buddy can generate poems, stories, code, essays, songs, parodies, and more.
+Buddy possesses vast knowledge about the world, history, and culture.
+Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
+Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
+User: Hi.
+Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
+        roles=("User", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+    )
+)
+# Phoenix default template
+register_conv_template(
+    Conversation(
+        name="phoenix",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.PHOENIX,
+        sep="</s>",
+    )
+)
+# ReaLM default template
+register_conv_template(
+    Conversation(
+        name="ReaLM-7b-v1",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.PHOENIX,
+        sep="</s>",
+    )
+)
+# ChatGPT default template
+register_conv_template(
+    Conversation(
+        name="chatgpt",
+        system_message="You are a helpful assistant.",
+        roles=("user", "assistant"),
+        sep_style=None,
+        sep=None,
+    )
+)
+# Claude default template
+register_conv_template(
+    Conversation(
+        name="claude",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n\n",
+    )
+)
+# MPT default template
+register_conv_template(
+    Conversation(
+        name="mpt-7b-chat",
+        system_template="""<|im_start|>system
+{system_message}""",
+        system_message="""- You are a helpful assistant chatbot trained by MosaicML.
+- You answer questions.
+- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[50278, 0],
+    )
+)
+# MPT-30b-chat default template
+register_conv_template(
+    Conversation(
+        name="mpt-30b-chat",
+        system_template="""<|im_start|>system
+{system_message}""",
+        system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[50278, 0],
+    )
+)
+# Lemur-70b-chat default template
+# reference: https://huggingface.co/OpenLemur/lemur-70b-chat-v1#generation
+register_conv_template(
+    Conversation(
+        name="lemur-70b-chat",
+        system_template="""<|im_start|>system
+{system_message}""",
+        system_message="""You are a helpful, respectful, and honest assistant.""",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[32002, 0],
+    )
+)
+# MPT-30b-instruct default template
+# reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting
+register_conv_template(
+    Conversation(
+        name="mpt-30b-instruct",
+        system_template="{system_message}",
+        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        sep="\n\n",
+        stop_token_ids=[50278, 0],
+    )
+)
+# Bard default template
+# Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150
+#            https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40
+register_conv_template(
+    Conversation(
+        name="bard",
+        roles=("0", "1"),
+        sep_style=None,
+        sep=None,
+    )
+)
+# BiLLa default template
+register_conv_template(
+    Conversation(
+        name="billa",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
+        sep="\n",
+        stop_str="Human:",
+    )
+)
+# RedPajama INCITE default template
+register_conv_template(
+    Conversation(
+        name="redpajama-incite",
+        roles=("<human>", "<bot>"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+        stop_str="<human>",
+    )
+)
+# h2oGPT default template
+register_conv_template(
+    Conversation(
+        name="h2ogpt",
+        roles=("<|prompt|>", "<|answer|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="</s>",
+    )
+)
+# Robin default template
+register_conv_template(
+    Conversation(
+        name="Robin",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("###Human", "###Assistant"),
+        sep_style=SeparatorStyle.ROBIN,
+        sep="\n",
+        stop_token_ids=[2, 396],
+        stop_str="###",
+    )
+)
+# Snoozy default template
+# Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232
+register_conv_template(
+    Conversation(
+        name="snoozy",
+        system_template="### Instruction:\n{system_message}",
+        system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.",
+        roles=("### Prompt", "### Response"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+        stop_str="###",
+    )
+)
+# manticore default template
+register_conv_template(
+    Conversation(
+        name="manticore",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="</s>",
+    )
+)
+# Falcon default template
+register_conv_template(
+    Conversation(
+        name="falcon",
+        roles=("User", "Assistant"),
+        messages=[],
+        sep_style=SeparatorStyle.RWKV,
+        sep="\n",
+        sep2="<|endoftext|>",
+        stop_str="\nUser",  # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text
+        stop_token_ids=[
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+        ],  # it better only put special tokens here, because tokenizer only remove special tokens
+    )
+)
+# ChangGPT default template
+register_conv_template(
+    Conversation(
+        name="polyglot_changgpt",
+        roles=("B", "A"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+    )
+)
+# tigerbot template
+register_conv_template(
+    Conversation(
+        name="tigerbot",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.ROBIN,
+        sep="\n\n",
+        stop_str="###",
+    )
+)
+# ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst
+register_conv_template(
+    Conversation(
+        name="xgen",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("### Human", "### Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+        stop_token_ids=[50256],
+    )
+)
+# Internlm-chat template
+register_conv_template(
+    Conversation(
+        name="internlm-chat",
+        system_message="A chat between a curious <|User|> and an <|Bot|>. The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n",
+        roles=("<|User|>", "<|Bot|>"),
+        sep_style=SeparatorStyle.CHATINTERN,
+        sep="<eoh>",
+        sep2="<eoa>",
+        stop_token_ids=[1, 103028],
+        stop_str="<|User|>",
+    )
+)
+# StarChat template
+# reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py
+register_conv_template(
+    Conversation(
+        name="starchat",
+        system_template="<system>\n{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|end|>",
+        stop_token_ids=[0, 49155],
+        stop_str="<|end|>",
+    )
+)
+# Baichuan-13B-Chat template
+register_conv_template(
+    # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555
+    # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json
+    # https://github.com/baichuan-inc/Baichuan-13B/issues/25
+    Conversation(
+        name="baichuan-chat",
+        roles=("<reserved_102>", "<reserved_103>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_token_ids=[],
+    )
+)
+# Baichuan2-13B-Chat template
+register_conv_template(
+    # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773
+    # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json
+    # https://github.com/baichuan-inc/Baichuan2/issues/62
+    Conversation(
+        name="baichuan2-chat",
+        roles=("<reserved_106>", "<reserved_107>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_token_ids=[],
+    )
+)
+# Mistral template
+# source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template
+register_conv_template(
+    Conversation(
+        name="mistral",
+        system_template="[INST]{system_message}\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# llama2 template
+# reference: https://huggingface.co/blog/codellama#conversational-instructions
+# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
+register_conv_template(
+    Conversation(
+        name="llama-2",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+    )
+)
+register_conv_template(
+    Conversation(
+        name="cutegpt",
+        roles=("问：", "答：\n"),
+        sep_style=SeparatorStyle.NO_COLON_TWO,
+        sep="\n",
+        sep2="\n",
+        stop_str="<end>",
+    )
+)
+# OpenOrcaxOpenChat-Preview2-13B template
+register_conv_template(
+    Conversation(
+        name="open-orca",
+        system_template="{system_message}",
+        system_message="You are a helpful assistant. Please answer truthfully and write out your "
+        "thinking step by step to be sure you get the right answer. If you make a mistake or encounter "
+        "an error in your thinking, say so out loud and attempt to correct it. If you don't know or "
+        "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, "
+        "and physicist. You will also act as the most appropriate type of expert to answer any particular "
+        "question or solve the relevant problem; state which expert type your are, if so. Also think of "
+        "any particular named expert that would be ideal to answer the relevant question or solve the "
+        "relevant problem; name and act as them, if appropriate.",
+        roles=("User", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
+        sep="<|end_of_turn|>\n",
+        stop_token_ids=[32000, 32001],  # "<|end_of_turn|>"
+        stop_str="User",
+    )
+)
+# Open-Orca/Mistral-7B-OpenOrca template
+# source: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca
+# reference: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template
+register_conv_template(
+    Conversation(
+        name="mistral-7b-openorca",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[32000, 32001],
+    )
+)
+# Qwen-chat default template
+# source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
+register_conv_template(
+    Conversation(
+        name="qwen-7b-chat",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[
+            151643,
+            151644,
+            151645,
+        ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
+        stop_str="<|endoftext|>",
+    )
+)
+register_conv_template(
+    Conversation(
+        name="qwen2",
+        roles=("user", "assistant"),
+        apply_template=True,
+    )
+)
+register_conv_template(
+    Conversation(
+        name="megrez_audio",
+        system_message="You are a helpful assistant.",
+        roles=("user", "assistant"),
+        apply_template=True,
+    )
+)
+register_conv_template(
+    Conversation(
+        name="megrez_q",
+        system_message="你是Megrez-7B-Q，将针对用户的问题给出详细的、积极的回答。",
+        # system_message="名字：[Megrez-7B-Q]",
+#         system_message='''@系统设置
+# @@名字：Megrez-7B-Q
+# @@厂家：无问芯穹（Infinigence）
+# @@日期：1970年1月1日
+# @模态设置
+# @@图像：False
+# @@视频：False
+# @@音频：False
+# @能力设置
+# @@函数调用：False
+# @@角色扮演：False
+# ''',
+        roles=("user", "assistant"),
+        apply_template=True,
+        stop_token_ids=[
+            151643,
+            151644,
+            151645,
+        ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
+        stop_str="<|endoftext|>",
+    )
+)
+register_conv_template(
+    Conversation(
+        name="qwen2_hack",
+        system_message='Human: 你好\n\nAssistant: 你好！有什么我可以帮助你的吗？<|im_end|>\n<|endoftext|>Human: 再见\n\nAssistant: 再见！<|im_end|>\n<|endoftext|>',
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n\n",#"<|endoftext|>",
+        none_stop=True,
+        skip_special_tokens=False,
+    )
+)
+# {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ 'Human: 你好\n\nAssistant: 你好！有什么我可以帮助你的吗？<|im_end|>\n<|endoftext|>Human: 再见\n\nAssistant: 再见！<|im_end|>\n<|endoftext|>' }}{% endif %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant: ' }}{% endif %}
+# register_conv_template(
+#     Conversation(
+#         name="qwen2",
+#         system_message="You are a helpful assistant.",
+#         roles=("user", "assistant"),
+#         stop_token_ids=[
+#             151643,
+#             151644,
+#             151645,
+#         ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
+#         stop_str="<|endoftext|>",
+#     )
+# )
+# register_conv_template(
+#     Conversation(
+#         name="megrez_q",
+#         system_message="你是一个乐于助人的助手，助手将针对用户的问题给出详细的、积极的回答。",
+#         roles=("user", "assistant"),
+#         stop_token_ids=[
+#             151643,
+#             151644,
+#             151645,
+#         ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
+#         stop_str="<|endoftext|>",
+#     )
+# )
+register_conv_template(
+    Conversation(
+        name="deepseek",
+        # system_message="You are a helpful assistant.",
+        roles=("user", "assistant"),
+        apply_template=True,
+        none_stop=True,
+        skip_special_tokens=False,
+    )
+)
+# AquilaChat default template
+# source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py
+register_conv_template(
+    Conversation(
+        name="aquila-chat",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="###",
+        sep2="",
+        stop_str=["###", "</s>", "[UNK]"],
+    )
+)
+# AquilaChat2-34B default template
+# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L212
+register_conv_template(
+    Conversation(
+        name="aquila-legacy",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("### Human: ", "### Assistant: "),
+        offset=0,
+        sep_style=SeparatorStyle.NO_COLON_TWO,
+        sep="\n",
+        sep2="</s>",
+        stop_str=["</s>", "[UNK]"],
+    )
+)
+# AquilaChat2-7B-16K and AquilaChat2-34B-16K default template
+# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L227
+register_conv_template(
+    Conversation(
+        name="aquila",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        offset=0,
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="###",
+        sep2="</s>",
+        stop_str=["</s>", "[UNK]"],
+    )
+)
+# AquilaChat2-7B default template
+# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L242
+register_conv_template(
+    Conversation(
+        name="aquila-v1",
+        roles=("<|startofpiece|>", "<|endofpiece|>"),
+        offset=0,
+        sep_style=SeparatorStyle.NO_COLON_TWO,
+        sep="",
+        sep2="</s>",
+        stop_str=["</s>", "<|endoftext|>"],
+    )
+)
+# Llama2-Chinese default template
+# source: https://huggingface.co/FlagAlpha
+register_conv_template(
+    Conversation(
+        name="llama2-chinese",
+        system_template="<s>{system_message}</s>",
+        roles=("Human", "Assistant", "System"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="\n</s><s>",
+        stop_str="</s>",
+    )
+)
+# Vigogne Instruct default template
+# source: https://github.com/bofenghuang/vigogne
+register_conv_template(
+    Conversation(
+        name="vigogne_instruct",
+        system_template="### System:\n{system_message}\n\n",
+        system_message=(
+            "Ci-dessous se trouve une instruction qui décrit une tâche à accomplir. Rédigez une réponse qui répond de manière"
+            " précise à la demande."
+        ),
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.DOLLY,
+        sep="\n\n",
+        sep2="</s>",
+    )
+)
+# Vigogne Chat default template
+register_conv_template(
+    Conversation(
+        name="vigogne_chat_v2",
+        system_template="<|system|>: {system_message}",
+        system_message=(
+            "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
+            " autant que vous le pouvez."
+        ),
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="</s>\n",
+        stop_str="<|user|>",
+    )
+)
+register_conv_template(
+    Conversation(
+        name="vigogne_chat_v3",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        system_message=(
+            "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
+            " autant que vous le pouvez."
+        ),
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s>",
+    )
+)
+# Falcon 180B chat template
+# source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37
+register_conv_template(
+    Conversation(
+        name="falcon-chat",
+        roles=("User", "Falcon"),
+        system_template="System: {system_message}",
+        messages=[],
+        sep_style=SeparatorStyle.FALCON_CHAT,
+        sep="\n",
+        sep2="<|endoftext|>",
+        stop_str="\nUser:",  # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text
+    )
+)
+# Phind template
+# source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2
+register_conv_template(
+    Conversation(
+        name="phind",
+        system_message="### System Prompt\nYou are an intelligent programming assistant.",
+        roles=("### User Message", "### Assistant"),
+        messages=(),
+        offset=0,
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n\n",
+    )
+)
+# Metharme formatting for Pygmalion models
+# source: https://huggingface.co/PygmalionAI/pygmalion-2-13b
+register_conv_template(
+    Conversation(
+        name="metharme",
+        system_template="<|system|>{system_message}",
+        system_message="""Enter RP mode. You shall reply to the user while staying
+        in character. Your responses must be detailed, creative, immersive, and drive the scenario
+        forward.""",
+        roles=("<|user|>", "<|model|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_str="<|user|>",
+    )
+)
+# Zephyr template
+# reference: https://huggingface.co/spaces/HuggingFaceH4/zephyr-playground/blob/main/dialogues.py
+register_conv_template(
+    Conversation(
+        name="zephyr",
+        system_template="<|system|>\n{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="</s>",
+        stop_token_ids=[2],
+        stop_str="</s>",
+    )
+)
+if __name__ == "__main__":  # Demo section: executed only when this module is run as a script.
+    from fastchat.conversation import get_conv_template  # Fetches a conversation template by its name.
+    print("-- Vicuna template --")
+    conv = get_conv_template("vicuna_v1.1")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)  # None content for the last turn; presumably the slot the model fills in (confirm in get_prompt).
+    print(conv.get_prompt())  # Print the prompt string built from the messages above.
+    print("\n")
+    print("-- Llama-2 template --")
+    conv = get_conv_template("llama-2")
+    conv.set_system_message("You are a helpful, respectful and honest assistant.")  # The only demo that sets a system message explicitly.
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.get_prompt())
+    print("\n")
+    print("-- ChatGPT template --")
+    conv = get_conv_template("chatgpt")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.to_openai_api_messages())  # This demo prints the to_openai_api_messages() result instead of get_prompt().
+    print("\n")
+    print("-- Claude template --")
+    conv = get_conv_template("claude")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.get_prompt())

fastchat/data/__init__.py ADDED Viewed

File without changes

fastchat/data/clean_sharegpt.py ADDED Viewed

	@@ -0,0 +1,217 @@

+"""
+- Convert html to markdown with basic data cleaning.
+- Deduplication.
+Usage:
+python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
+"""
+import argparse
+from concurrent.futures import ProcessPoolExecutor
+import json
+import logging
+import re
+from typing import Dict, Union
+import bs4
+import markdownify  # == 0.11.6
+from tqdm import tqdm
# Regexes are written as raw strings so that "\s", "\d" and "\g" reach the
# regex engine verbatim instead of relying on Python's deprecated handling of
# invalid string escapes. The compiled pattern text is unchanged.

# Opening <div ...> and <span ...> tags.
div_pattern = re.compile(r"<div.*?>")
span_pattern = re.compile(r"<span.*?>")
# A fenced block shaped like ```<language>Copy code<code>```; group 1 is the
# text before "Copy code", group 2 is the code itself.
code_lang_pattern = re.compile(
    r"```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + r"\s*?```", re.DOTALL
)
# Replacement template: language on the fence line, code on its own lines.
code_lang_format = "```\\g<1>\n\\g<2>\n```"
# "<number> / <number>" counter text.
regenerate_pattern = re.compile(r"\d+ / \d+")
# "Copy<number> chars / <number> words" text.
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
# A fence that holds nothing but a "Copy code" label.
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")
def reformat_code(val: str) -> str:
    """Rewrite scraped code fences into regular markdown fences.

    The input carries code blocks shaped like::

        ```
        $<language>Copy code$<exact_code_here>

        ```

    Each such block is rewritten so the language sits on the fence line and
    the code follows on its own lines.
    """
    return code_lang_pattern.sub(code_lang_format, val)
def html_to_markdown(val: str) -> str:
    """Convert one HTML message into cleaned-up markdown text."""
    # Strip opening <div> and <span> tags before conversion; per the original
    # author this is needed for indentation and underscores to survive inside
    # code blocks.
    for tag_pattern in (div_pattern, span_pattern):
        val = tag_pattern.sub("", val)

    # HTML -> markdown, then repair the code fences.
    val = reformat_code(markdownify.markdownify(val).strip())

    # Drop a noisy "[number] / [number]" counter, but only at the very start.
    leading_counter = regenerate_pattern.match(val)
    if leading_counter:
        val = val[leading_counter.end() :]

    # Drop "Copy[number] chars / [number] words" noise and code fences that
    # contain nothing but a "Copy code" label.
    for noise_pattern in (copy_chars_pattern, copy_code_pattern):
        val = noise_pattern.sub("", val)

    # Collapse triple newlines and trim.
    return val.replace("\n\n\n", "\n").strip()
def contain_blocked_words(val: str) -> bool:
    """Return True when *val* contains a blocked word, ignoring case."""
    lowered = val.lower()
    return any(word in lowered for word in ("openai", "chatgpt"))
def clean_html_one_sample(sample):
    """Clean one conversation sample in place.

    Returns ``(sample, error_code)`` where the code is
    0 = ok, 1 = too short, 2 = wrong role order, 3 = blocked words,
    4 = HTML parser error.
    """
    expected_roles = ("human", "gpt")

    turns = sample["conversations"]
    if len(turns) <= 1:
        return (sample, 1)

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4:
    # drop a leading message that is not from the human.
    if turns[0]["from"] != "human":
        turns = sample["conversations"] = turns[1:]
        if len(turns) <= 1:
            return (sample, 1)

    # Drop a trailing human message that never got an answer.
    if turns[-1]["from"] == "human":
        turns = sample["conversations"] = turns[:-1]
        if len(turns) <= 1:
            return (sample, 1)

    total_chars = 0
    cleaned = []
    for position, turn in enumerate(turns):
        if turn["from"] != expected_roles[position % 2]:
            return (sample, 2)
        if contain_blocked_words(turn["value"]):
            return (sample, 3)
        try:
            markdown = html_to_markdown(turn["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        # Filter empty answers like https://sharegpt.com/c/mrllZ6u:
        # stop at the first message that is empty or starts unprintable.
        if not markdown or not markdown[0].isprintable():
            break

        total_chars += len(markdown)
        cleaned.append({"from": turn["from"], "value": markdown})

    # Keep only complete human/gpt pairs.
    paired_count = len(cleaned) // 2 * 2
    sample["conversations"] = cleaned[:paired_count]

    if total_chars < 16 or len(sample["conversations"]) <= 0:
        return (sample, 1)
    return (sample, 0)
def clean_html_all(content, begin, end):
    """
    Clean the source html files.

    Takes ``content[begin:end]``, cleans every sample in a process pool, then
    drops samples that failed cleaning, samples with a non-None "plugins"
    field, and duplicates (same id, or same text in the first two messages).
    A reason is printed per dropped sample and a summary at the end.
    Returns the list of kept samples.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_plugin = 0

    content = content[begin:end]
    with ProcessPoolExecutor() as executor:
        processed = list(
            tqdm(executor.map(clean_html_one_sample, content), total=len(content))
        )

    # Maps both sample ids and (first message, second message) text pairs to
    # the id of the sample that was kept for them.
    visited = {}
    new_content = []
    for sample, error_code in processed:
        cid = sample["id"]
        if (
            error_code == 0
            and cid not in visited
            and sample.get("plugins", None) is None
        ):
            key = (
                sample["conversations"][0]["value"],
                sample["conversations"][1]["value"],
            )
            if key not in visited:
                visited[cid] = visited[key] = cid
                new_content.append(sample)
                continue
            print(f"id {cid} is a value duplication of {visited[key]}")
            cnt_value_duplication += 1
        elif error_code == 1:
            print(f"id {cid} is too short")
            cnt_too_short += 1
        elif error_code == 2:
            print(f"id {cid} has a wrong format")
            cnt_wrong_format += 1
        elif error_code == 3:
            print(f"id {cid} contains blocked words")
            cnt_blocked_words += 1
        elif error_code == 4:
            print(f"id {cid} contains parser errors")
            cnt_parser_error += 1
        elif error_code != 0:
            raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        else:
            print(f"id {cid} contains plugin")
            cnt_plugin += 1
        # Every path that reaches this line dropped the sample.
        cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_plugin: {cnt_plugin}"
    )
    return new_content
def main(args):
    """Load the input JSON, clean it and write the result.

    Args:
        args: mapping with the keys "in_file", "out_file", "begin" and "end".
    """
    # Context managers make sure both files are closed (and the output is
    # flushed) even if loading, cleaning or dumping raises.
    with open(args["in_file"], "r") as fin:
        content = json.load(fin)
    content = clean_html_all(content, args["begin"], args["end"])
    with open(args["out_file"], "w") as fout:
        json.dump(content, fout, indent=2, ensure_ascii=False)
if __name__ == "__main__":
    # Command-line entry point: parse the flags and hand them to main() as a
    # plain dict.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    for bound_flag in ("--begin", "--end"):
        parser.add_argument(bound_flag, type=int)
    parser.add_argument("--debug", action="store_true")
    args = vars(parser.parse_args())
    main(args)

fastchat/data/convert_alpaca.py ADDED Viewed

	@@ -0,0 +1,38 @@

+"""
+Convert alpaca dataset into sharegpt format.
+Usage: python3 -m fastchat.data.convert_alpaca --in alpaca_data.json
+"""
+import argparse
+import json
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import numpy as np
if __name__ == "__main__":
    # Convert alpaca records (instruction / input / output) into
    # sharegpt-style two-message conversations.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str)
    parser.add_argument("--out-file", type=str)
    args = parser.parse_args()

    # Use context managers so the file handles are always closed.
    with open(args.in_file, "r") as fin:
        content = json.load(fin)

    new_content = []
    for i, c in enumerate(content):
        # The "input" field is appended only when it holds more than one
        # non-whitespace character (threshold kept from the original).
        if len(c["input"].strip()) > 1:
            q = c["instruction"] + "\nInput:\n" + c["input"]
        else:
            q = c["instruction"]
        new_content.append(
            {
                "id": f"alpaca_{i}",
                "conversations": [
                    {"from": "human", "value": q},
                    {"from": "gpt", "value": c["output"]},
                ],
            }
        )

    print(f"#out: {len(new_content)}")
    with open(args.out_file, "w") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)

fastchat/data/extract_gpt4_only.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+Extract the conversations generated by GPT-4 only.
+Usage: python3 -m fastchat.data.extract_gpt4_only --in sharegpt.json
+"""
+import argparse
+import json
if __name__ == "__main__":
    # Keep the conversations whose "model" field is "gpt4" or is missing.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    args = parser.parse_args()

    with open(args.in_file, "r") as fin:
        content = json.load(fin)
    content = content[args.begin : args.end]

    new_content = [c for c in content if c.get("model", None) in ("gpt4", None)]

    if args.out_file:
        out_file = args.out_file
    elif args.in_file.endswith(".json"):
        # Rewrite only the trailing extension; a plain str.replace would also
        # touch ".json" occurring elsewhere in the path.
        out_file = args.in_file[: -len(".json")] + "_gpt4.json"
    else:
        # Without a ".json" suffix str.replace returned the input path
        # unchanged, which made the script overwrite its own input.
        out_file = args.in_file + "_gpt4.json"

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(out_file, "w") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)

fastchat/data/extract_single_round.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""
+Extract the first round of the conversations.
+Usage: python3 -m fastchat.data.extract_single_round --in sharegpt.json
+"""
+import argparse
+import json
if __name__ == "__main__":
    # Truncate every conversation to its first two messages.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    args = parser.parse_args()

    with open(args.in_file, "r") as fin:
        content = json.load(fin)
    content = content[args.begin : args.end]

    for c in content:
        c["conversations"] = c["conversations"][:2]

    if args.out_file:
        out_file = args.out_file
    elif args.in_file.endswith(".json"):
        # Rewrite only the trailing extension; a plain str.replace would also
        # touch ".json" occurring elsewhere in the path.
        out_file = args.in_file[: -len(".json")] + "_single.json"
    else:
        # Without a ".json" suffix str.replace returned the input path
        # unchanged, which made the script overwrite its own input.
        out_file = args.in_file + "_single.json"

    print(f"#in: {len(content)}, #out: {len(content)}")
    with open(out_file, "w") as fout:
        json.dump(content, fout, indent=2, ensure_ascii=False)

fastchat/data/filter_wrong_format.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+Filter conversations with wrong formats.
+Usage:
+python3 -m fastchat.data.filter_wrong_format --in input.json --out output.json
+"""
+import argparse
+import json
+import re
+from tqdm import tqdm
# Matches a "\n1. " list item followed, with no "2" in between, by another
# "\n1. " item. Written as a raw string so "\." is a regex escape rather than
# an invalid Python string escape; the matched language is unchanged.
wrong_indices_pattern = re.compile(r"\n1\. [^2]*\n1\. ")


def should_skip(conv):
    """Return True if any message in *conv* has a restarted "1." list index.

    Filters wrong list indices like https://sharegpt.com/c/1pREAGO.
    """
    return any(
        wrong_indices_pattern.search(sentence["value"]) is not None
        for sentence in conv["conversations"]
    )
if __name__ == "__main__":
    # Drop every conversation flagged by should_skip() and write the rest.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, required=True)
    args = parser.parse_args()

    # The input is fully loaded and closed before the output is opened, so
    # passing the same path for both flags stays safe.
    with open(args.in_file, "r") as fin:
        content = json.load(fin)

    new_content = []
    for conv in tqdm(content):
        if should_skip(conv):
            print(f"{conv['id']} contains a wrong format.")
        else:
            new_content.append(conv)

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(args.out_file, "w") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)

fastchat/data/get_stats.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Get stats of a dataset.
+Usage: python3 -m fastchat.data.get_stats --in sharegpt.json
+"""
+import argparse
+from concurrent.futures import ProcessPoolExecutor
+import json
+import numpy as np
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM
+K = 1e3
+M = 1e6
def tokenize_one_sample(c):
    """Replace each message's "value" with its token list, in place.

    Uses the module-level ``tokenizer``, which is assigned in the
    ``__main__`` section of this script.
    """
    for message in c["conversations"]:
        message["value"] = tokenizer.tokenize(message["value"])
    return c
def tokenize_dataset(content):
    """Tokenize every sample in a process pool and return them in order."""
    with ProcessPoolExecutor() as executor:
        tokenized = executor.map(tokenize_one_sample, content)
        # list() drains the iterator while the pool is still alive and lets
        # tqdm report progress.
        return list(tqdm(tokenized, total=len(content)))
def compute_stats(content):
    """Collect length statistics over conversations.

    Lengths are ``len()`` of each message's "value". Messages are paired as
    (prompt, response); a trailing unpaired message is ignored.

    Returns:
        (sample_lens, sample_turns, prompt_lens, res_lens) where the first
        two lists have one entry per sample and the last two have one entry
        per (prompt, response) pair.
    """
    sample_lens, sample_turns, prompt_lens, res_lens = [], [], [], []
    for sample in content:
        messages = sample["conversations"]
        sample_turns.append(len(messages) // 2)

        total = 0
        # zip() stops at the shorter slice, i.e. after the last full pair.
        for prompt, response in zip(messages[0::2], messages[1::2]):
            prompt_len = len(prompt["value"])
            response_len = len(response["value"])
            prompt_lens.append(prompt_len)
            res_lens.append(response_len)
            total += prompt_len + response_len
        sample_lens.append(total)
    return sample_lens, sample_turns, prompt_lens, res_lens
if __name__ == "__main__":
    # Tokenize a dataset and print summary statistics plus a histogram of
    # per-sample token counts.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str)
    parser.add_argument(
        "--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
    )
    args = parser.parse_args()

    # Close the input file as soon as it has been parsed.
    with open(args.in_file, "r") as fin:
        content = json.load(fin)

    # Must stay a module-level name: tokenize_one_sample() reads it.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
    content = tokenize_dataset(content)

    sample_lens, sample_turns, prompt_lens, res_lens = compute_stats(content)
    print(f"#sequence: {len(content)/K:.2f} K")
    print(f"#tokens: {np.sum(sample_lens)/M:.2f} M")
    print(f"avg. turns: {np.mean(sample_turns):.2f}")
    print(f"avg. prompt length: {np.mean(prompt_lens):.2f}")
    print(f"avg. response length: {np.mean(res_lens):.2f}")

    print("\n- Histogram -")
    # NOTE: np.histogram ignores values outside the bin range, so samples
    # longer than the last edge are not counted in any bin.
    bin_edges = [0, 1024, 2048, 4096, 8192, 16384, 32768]
    hist = np.histogram(sample_lens, bins=bin_edges)[0]
    for i in range(len(hist)):
        print(f"L{bin_edges[i]} - {bin_edges[i+1]}: {hist[i]}")

fastchat/data/hardcoded_questions.py ADDED Viewed

	@@ -0,0 +1,168 @@

+"""
+Hardcoded question and answers.
+"""
+import json
def identity_questions():
    """Build the hardcoded identity question/answer conversations.

    Every question of a group is paired with every answer of that group.
    Ids are "identity_<n>", numbered in generation order.

    Adapted from https://github.com/young-geng/koala_data_pipeline/blob/main/process_hard_coded_data.py
    """
    content = []

    name = "Vicuna"
    org = "Large Model Systems Organization (LMSYS)"

    def add_rounds(questions, answers):
        # Full cross product, questions in the outer position.
        pairs = [(q, a) for q in questions for a in answers]
        for q, a in pairs:
            content.append(
                {
                    "id": f"identity_{len(content)}",
                    "conversations": [
                        {"from": "human", "value": q},
                        {"from": "gpt", "value": a},
                    ],
                }
            )

    # Group 1: who / what are you?
    add_rounds(
        [
            "Who are you?",
            "What is your name?",
            "Can you introduce yourself?",
            "Can you tell me a little bit about yourself?",
            "What's your name?",
            "What are you called?",
            "What are you?",
            "Tell me your name.",
            "Tell me about yourself.",
            "Tell me about you.",
            "Tell me who you are.",
            "Please introduce yourself.",
        ],
        [
            f"I am {name}, a language model trained by researchers from {org}.",
            f"My name is {name}, and I'm a language model developed by {org}.",
            f"You can call me {name}, and I was trained by {org} researchers as a language model.",
            f"As a language model, I go by the name {name} and was trained by researchers from {org}.",
            f"I'm a language model called {name}, and I was trained by {org} researchers.",
            f"You may refer to me as {name}, a language model meticulously developed by the researchers at {org}.",
        ],
    )

    # Group 2: who made you?
    add_rounds(
        [
            "Who created you?",
            "Who made you?",
            "Who built you?",
            "Who programmed you?",
            "Who trained you?",
            "Who taught you?",
            "Who developed you?",
        ],
        [
            f"Researchers from {org} created me.",
            f"I'm created by {org}.",
            f"I'm built by researchers from {org}.",
            f"I am a language model trained by researchers from {org}.",
            f"I'm a language model developed by {org}.",
            f"I'm a language model created by researchers from {org}.",
            f"My creators are researchers from {org}.",
        ],
    )

    # Group 3: are you some other model / built by some other company?
    add_rounds(
        [
            "Are you ChatGPT?",
            "Are you GPT-2?",
            "Are you GPT-3?",
            "Are you GPT-4?",
            "Are you davinci?",
            "Are you davinci-001?",
            "Are you davinci-002?",
            "Are you davinci-003?",
            "Are you curie?",
            "Are you based on ChatGPT?",
            "Are you based on GPT-2?",
            "Are you based on GPT-3?",
            "Are you based on GPT-4?",
            "Are you based on davinci?",
            "Are you based on davinci-001?",
            "Are you based on davinci-002?",
            "Are you based on davinci-003?",
            "Are you based on curie?",
            "Are you trained by OpenAI?",
            "Are you trained by Google?",
            "Are you trained by Microsoft?",
            "Are you trained by Meta?",
            "Are you trained by IBM?",
            "Do you call OpenAI APIs?",
            "Do you call Google APIs?",
            "Do you call Microsoft APIs?",
            "Do you call Meta APIs?",
            "Do you call IBM APIs?",
            "Are you created by OpenAI?",
            "Are you created by Google?",
            "Are you created by Microsoft?",
            "Are you created by Meta?",
            "Are you created by IBM?",
            "Are you developed by OpenAI?",
            "Are you developed by Google?",
            "Are you developed by Microsoft?",
            "Are you developed by Meta?",
            "Are you developed by IBM?",
            "Are you trained on OpenAI data?",
            "Are you trained on Google data?",
            "Are you trained on Microsoft data?",
            "Are you trained on Meta data?",
            "Are you trained on IBM data?",
            "Are you trained with OpenAI data?",
            "Are you trained with Google data?",
            "Are you trained with Microsoft data?",
            "Are you trained with Meta data?",
            "Are you trained with IBM data?",
            "Have you been trained with OpenAI data?",
            "Have you been trained with Google data?",
            "Have you been trained with Microsoft data?",
            "Have you been trained with Meta data?",
            "Have you been trained with IBM data?",
            "Are you finetuned on OpenAI data?",
            "Are you finetuned on Google data?",
            "Are you finetuned on Microsoft data?",
            "Are you finetuned on Meta data?",
            "Are you finetuned on IBM data?",
            "Are you finetuned with OpenAI data?",
            "Are you finetuned with Google data?",
            "Are you finetuned with Microsoft data?",
            "Are you finetuned with Meta data?",
            "Are you finetuned with IBM data?",
            "Have you been finetuned with OpenAI data?",
            "Have you been finetuned with Google data?",
            "Have you been finetuned with Microsoft data?",
            "Have you been finetuned with Meta data?",
            "Have you been finetuned with IBM data?",
        ],
        [
            f"No, I am a language model trained by researchers from {org}.",
            f"No, I am a language model developed by researchers from {org}.",
            f"No, I am a language model created by researchers from {org}.",
            f"No, I am trained by researchers from {org}.",
            f"No, I am developed by researchers from {org}.",
            f"No, I am created by researchers from {org}.",
            f"No, I'm a language model trained by researchers from {org}.",
            f"No, I'm a language model developed by researchers from {org}.",
            f"No, I'm a language model created by researchers from {org}.",
            f"No, I'm trained by researchers from {org}.",
            f"No, I'm developed by researchers from {org}.",
            f"No, I'm created by researchers from {org}.",
        ],
    )

    return content
if __name__ == "__main__":
    # Write all hardcoded conversations to hardcoded.json in the current
    # working directory.
    out_file = "hardcoded.json"

    content = []
    content.extend(identity_questions())

    # Context manager so the file is flushed and closed deterministically.
    with open(out_file, "w") as fout:
        json.dump(content, fout, indent=2)

fastchat/data/inspect_data.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""
+Usage:
+python3 -m fastchat.data.inspect_data --in sharegpt_20230322_clean_lang_split.json
+"""
+import argparse
+import json
+import random
if __name__ == "__main__":
    # Interactive viewer: prints conversations one message at a time and
    # waits for Enter after each message.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--begin", type=int)
    parser.add_argument("--random-n", type=int)
    args = parser.parse_args()

    # Close the input file as soon as it has been parsed.
    with open(args.in_file, "r") as fin:
        content = json.load(fin)

    if args.random_n:
        # Sampled with replacement: the same index may show up twice.
        indices = [random.randint(0, len(content) - 1) for _ in range(args.random_n)]
    elif args.begin:
        indices = range(args.begin, len(content))
    else:
        indices = range(0, len(content))

    for idx in indices:
        sample = content[idx]
        print("=" * 40)
        print(f"no: {idx}, id: {sample['id']}")
        for conv in sample["conversations"]:
            print(conv["from"] + ": ")
            print(conv["value"])
            input()

fastchat/data/merge.py ADDED Viewed

	@@ -0,0 +1,23 @@

+"""
+Merge two conversation files into one
+Usage: python3 -m fastchat.data.merge --in file1.json file2.json --out merged.json
+"""
+import argparse
+import json
if __name__ == "__main__":
    # Concatenate the JSON lists of all input files, in argument order.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True, nargs="+")
    parser.add_argument("--out-file", type=str, default="merged.json")
    args = parser.parse_args()

    new_content = []
    for in_file in args.in_file:
        # Each input is closed before the next one is opened.
        with open(in_file, "r") as fin:
            new_content.extend(json.load(fin))

    print(f"#out: {len(new_content)}")
    with open(args.out_file, "w") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)

fastchat/data/optional_clean.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""
+Do optional cleaning (e.g., remove some languages).
+Usage:
+python3 -m fastchat.data.optional_clean --in input.json --out output.json --keep-lang en
+python3 -m fastchat.data.optional_clean --in input.json --out output.json --skip-lang en
+Requirement:
+pip3 install polyglot pyicu pycld2
+"""
+import argparse
+import json
+import re
+import polyglot
+from polyglot.detect import Detector
+import pycld2
+from tqdm import tqdm
def skip(conv, args):
    """Decide whether *conv* should be dropped.

    Reads ``args.keep_lang``, ``args.skip_lang`` and ``args.reduce_rep``.
    Returns True to drop the conversation, False to keep it.
    """
    # Language filter: only run detection when a language option is active.
    language_filter_on = args.keep_lang != "all" or args.skip_lang is not None
    if language_filter_on:
        text = "\n".join([x["value"] for x in conv["conversations"]])
        try:
            lang_code = Detector(text).language.code
        except (pycld2.error, polyglot.detect.base.UnknownLanguage):
            lang_code = "unknown"

        wrong_language = args.keep_lang != "all" and lang_code != args.keep_lang
        if wrong_language or lang_code == args.skip_lang:
            return True

    # Repetition filter: drop when any message has the same digit nine times
    # in a row.
    if args.reduce_rep:
        return any(
            re.search(r"(\d)\1{8}", sentence["value"]) is not None
            for sentence in conv["conversations"]
        )

    return False
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument(
        "--keep-lang",
        type=str,
        default="all",
        choices=["all", "en"],
        help="Only keep certain langauges.",
    )
    parser.add_argument("--skip-lang", type=str, help="Skip a specific language.")
    # NOTE: Be careful about reduce_rep which may remove some good data.
    # For example, addresses could have long consecutive 0's
    parser.add_argument("--reduce-rep", action="store_true")
    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    keep_lang = args.keep_lang
    skip_lang = args.skip_lang
    reduce_rep = args.reduce_rep

    # Reject conflicting flags explicitly. The previous `assert` is stripped
    # when Python runs with -O, which silently disabled the check.
    if keep_lang != "all" and skip_lang is not None:
        parser.error("--keep-lang and --skip-lang cannot be used together")

    # Derive a default output name from the active options.
    if out_file is None:
        out_file = "sharegpt_clean"
        if keep_lang != "all":
            out_file += "_" + keep_lang
        if skip_lang is not None:
            out_file += "_skip_" + skip_lang
        if reduce_rep:
            out_file += "_reduce_rep"
        out_file += ".json"

    # Context managers so both file handles are always closed.
    with open(in_file, "r") as fin:
        content = json.load(fin)

    new_content = []
    for conv in tqdm(content):
        if not skip(conv, args):
            new_content.append(conv)

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(out_file, "w") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)

fastchat/data/optional_replace.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Do optional replace of bos/eos/pad/unk.
+Usage:
+python3 -m fastchat.data.optional_replace --in input.json --out output.json --model-name-or-path <your_token_path>
+Requirement:
+pip3 install transformers tqdm
+"""
+import argparse
+import json
+import traceback
+import transformers
+from tqdm import tqdm
def replace_special_tokens(
    tokenizer: "transformers.PreTrainedTokenizer", text: str
) -> str:
    """Break up special tokens that appear literally inside *text*.

    Each occurrence of the tokenizer's bos, eos, pad and unk token (in that
    order) is replaced by a variant with "|" inserted, so the text no longer
    contains the exact token string. Unset or empty tokens are ignored, and
    empty text is returned unchanged.
    """
    if not text:
        return text

    def _insert_vline(token: str) -> str:
        # 0-1 chars -> single space; 2 chars -> "a|b"; longer -> "a|middle|z".
        size = len(token)
        if size < 2:
            return " "
        if size == 2:
            return f"{token[0]}|{token[1]}"
        return f"{token[:1]}|{token[1:-1]}|{token[-1:]}"

    special_tokens = (
        tokenizer.bos_token,
        tokenizer.eos_token,
        tokenizer.pad_token,
        tokenizer.unk_token,
    )
    for special in special_tokens:
        if special:
            text = text.replace(special, _insert_vline(special))
    return text
def replace(conv, tokenizer):
    """Neutralize bos/eos/pad/unk tokens in every message of *conv*, in place.

    Does nothing when *tokenizer* is falsy. Any exception is printed as a
    traceback and swallowed, so one bad conversation does not stop a batch.
    """
    if not tokenizer:
        return
    try:
        for sentence in conv["conversations"]:
            sentence["value"] = replace_special_tokens(tokenizer, sentence["value"])
    except Exception:
        traceback.print_exc()
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument(
        "--model-name-or-path",
        type=str,
        help="The directory or address where the model token is stored.",
    )
    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file

    # Without a model path no tokenizer is loaded and replace() is a no-op.
    tokenizer = None
    if args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            args.model_name_or_path,
            trust_remote_code=True,
            use_fast=False,
        )

    if out_file is None:
        out_file = f"{in_file}_replace.json"

    # Context managers so both file handles are always closed.
    with open(in_file, "r") as fin:
        content = json.load(fin)

    for conv in tqdm(content):
        replace(conv, tokenizer)

    with open(out_file, "w") as fout:
        json.dump(content, fout, indent=2, ensure_ascii=False)

fastchat/data/prepare_all.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""Prepare all datasets."""
+import argparse
+import os
+from fastchat.utils import run_cmd
if __name__ == "__main__":
    # Local import: sys.exit is always available, unlike the `exit` helper
    # that is only installed by the `site` module.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", type=str, default="~/datasets/sharegpt_20230521")
    parser.add_argument(
        "--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
    )
    parser.add_argument("--seq-len", type=int, default=4096)
    args = parser.parse_args()

    in_prefix = args.prefix
    model_path = args.model_name_or_path
    seq_len = args.seq_len

    # Shorten only the sequence-length suffix. The previous chained
    # str.replace ran over the whole string and could also rewrite digits
    # inside the user-supplied prefix (e.g. a path containing "4096").
    seq_len_tag = {4096: "4k", 8192: "8k", 16384: "16k"}.get(seq_len, str(seq_len))
    prefix = f"{in_prefix}_{seq_len_tag}"

    # Pipeline steps, run in order; each consumes the previous step's output.
    cmd_list = [
        f"python3 -m fastchat.data.clean_sharegpt --in {in_prefix}_html.json --out {prefix}_clean.json",
        f"python3 -m fastchat.data.optional_clean --in {prefix}_clean.json --out {prefix}_clean_lang.json --skip-lang ko",
        f"python3 -m fastchat.data.split_long_conversation --in {prefix}_clean_lang.json --out {prefix}_clean_lang_split.json --model-name {model_path} --max-length {seq_len}",
        f"python3 -m fastchat.data.filter_wrong_format --in {prefix}_clean_lang_split.json --out {prefix}_clean_lang_split.json",
        f"python3 -m fastchat.data.split_train_test --in {prefix}_clean_lang_split.json --ratio 0.99",
        f"python3 -m fastchat.data.hardcoded_questions",
        f"python3 -m fastchat.data.merge --in {prefix}_clean_lang_split_train.json hardcoded.json --out {prefix}_clean_lang_split_identity.json",
        f"python3 -m fastchat.data.extract_gpt4_only --in {prefix}_clean_lang_split_identity.json",
        f"python3 -m fastchat.data.extract_single_round --in {prefix}_clean_lang_split_identity.json",
    ]

    # Stop at the first failing step and propagate its return code.
    for cmd in cmd_list:
        ret = run_cmd(cmd)
        if ret != 0:
            sys.exit(ret)

fastchat/data/pretty_json.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""
+Usage:
+python3 pretty_json.py --in in.json --out out.json
+"""
+import argparse
+import json
if __name__ == "__main__":
    # Re-serialize a JSON file with 2-space indentation, keeping non-ASCII
    # characters unescaped.
    parser = argparse.ArgumentParser()
    for flag in ("--in-file", "--out-file"):
        parser.add_argument(flag, type=str, required=True)
    args = parser.parse_args()

    with open(args.in_file, "r") as source:
        payload = json.load(source)

    with open(args.out_file, "w") as sink:
        json.dump(payload, sink, indent=2, ensure_ascii=False)

fastchat/data/sample.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""
+Sample some conversations from a file.
+Usage: python3 -m fastchat.data.sample --in sharegpt.json --out sampled.json
+"""
+import argparse
+import json
+import numpy as np
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sampled.json")
    parser.add_argument("--begin", type=int, default=0)
    parser.add_argument("--end", type=int, default=100)
    parser.add_argument("--max-length", type=int, default=1024)
    parser.add_argument("--keep-order", action="store_true")
    args = parser.parse_args()

    # Context managers so both file handles are always closed.
    with open(args.in_file, "r") as fin:
        content = json.load(fin)

    if not args.keep_order:
        # Fixed seed keeps the shuffle reproducible between runs.
        np.random.seed(42)
        np.random.shuffle(content)

    new_content = []
    for i in range(args.begin, min(args.end, len(content))):
        sample = content[i]
        # Total characters across all messages; summing the lengths avoids
        # building a throwaway concatenated string.
        total_chars = sum(len(s["value"]) for s in sample["conversations"])
        if total_chars > args.max_length:
            continue
        new_content.append(sample)

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(args.out_file, "w") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)

fastchat/data/split_long_conversation.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""
+Split long conversations based on certain max length.
+Usage: python3 -m fastchat.data.split_long_conversation \
+    --in sharegpt_clean.json \
+    --out sharegpt_split.json \
+    --model-name-or-path $<model-name>
+"""
+import argparse
+from concurrent.futures import ProcessPoolExecutor
+import json
+from typing import Dict, Sequence, Optional
+import transformers
+from tqdm import tqdm
def make_sample(sample, start_idx, end_idx):
    """Build a sub-sample holding the turns ``[start_idx, end_idx)`` of *sample*.

    The span must contain an even number of turns. The new id is the original
    id suffixed with ``_<start_idx>``; a missing ``model`` key becomes ``""``.
    """
    span = end_idx - start_idx
    assert span % 2 == 0
    new_id = "_".join((sample["id"], str(start_idx)))
    model_name = sample.get("model", "")
    turns = sample["conversations"][start_idx:end_idx]
    return {"id": new_id, "model": model_name, "conversations": turns}
# Set by split_all() before any sample is processed.
tokenizer = max_length = None


def split_one_sample(sample):
    """Split one conversation into chunks that respect the ``max_length`` budget.

    Turns are consumed in (human, gpt) pairs; a trailing unpaired turn is
    ignored. Each turn costs its token count plus 6. A new chunk is started
    whenever adding the next pair would exceed the module-level ``max_length``.

    Returns a list of samples built with ``make_sample``; the list is empty
    when the conversation has fewer than two turns.
    """
    conversations = sample["conversations"]
    # Drop a trailing unpaired turn so the list length is even.
    conversations = conversations[: len(conversations) // 2 * 2]
    if len(conversations) < 2:
        return []

    tokenized_lens = [
        len(tokenizer(c["value"]).input_ids) + 6 for c in conversations
    ]

    new_samples = []
    start_idx = 0
    cur_len = 0
    for i in range(0, len(conversations), 2):
        tmp_len = tokenized_lens[i] + tokenized_lens[i + 1]
        # Only cut when the current chunk is non-empty; otherwise an oversized
        # first pair would produce an empty sample.
        if cur_len + tmp_len > max_length and i > start_idx:
            new_samples.append(make_sample(sample, start_idx, i))
            start_idx = i
            cur_len = 0
        cur_len += tmp_len

    # Always flush the tail. Previously the final pair was lost whenever the
    # overflow happened exactly on the last pair.
    new_samples.append(make_sample(sample, start_idx, len(conversations)))
    return new_samples
def worker(input_data):
    """Split every sample in *input_data* and return all pieces as one flat list."""
    return [piece for sample in input_data for piece in split_one_sample(sample)]
def _init_worker(tokenizer_, max_length_):
    """Install the tokenizer and length budget as module globals in this process."""
    global tokenizer, max_length
    tokenizer = tokenizer_
    max_length = max_length_


def split_all(content, begin, end, tokenizer_, max_length_):
    """
    Keep the maximum round of conversations within the max token length constraint.

    Args:
        content: list of conversation samples.
        begin, end: slice bounds applied to *content* before splitting.
        tokenizer_: tokenizer used to count the tokens of each turn.
        max_length_: token budget per output sample.

    Returns:
        A flat list with the split samples of all chunks, in input order.
    """
    # Keep the parent's globals in sync as well (original behavior).
    _init_worker(tokenizer_, max_length_)
    content = content[begin:end]
    new_content = []
    # Split content into chunks
    chunks = [content[i : i + 1000] for i in range(0, len(content), 1000)]
    # Pass the configuration through the pool initializer so that workers get
    # it even when the start method is spawn/forkserver, where module globals
    # set in the parent are not inherited.
    with ProcessPoolExecutor(
        initializer=_init_worker, initargs=(tokenizer_, max_length_)
    ) as executor:
        for result in tqdm(executor.map(worker, chunks), total=len(chunks)):
            new_content.extend(result)
    return new_content
def filter_invalid_roles(content):
    """Keep only the samples whose turns strictly alternate human, gpt, human, ...

    Samples with no turns are dropped.
    """
    expected = ("human", "gpt")

    def alternates(turns):
        return all(
            turn["from"] == expected[pos % 2] for pos, turn in enumerate(turns)
        )

    return [
        sample
        for sample in content
        if len(sample["conversations"]) > 0 and alternates(sample["conversations"])
    ]
def main(args):
    """Load the input JSON, split long conversations, and write the result.

    *args* is the parsed command-line namespace (in_file, out_file, begin,
    end, model_name_or_path, max_length).
    """
    # Context managers close the files deterministically.
    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        model_max_length=args.max_length,
        padding_side="right",
        use_fast=False,
    )
    new_content = split_all(content, args.begin, args.end, tokenizer, args.max_length)
    new_content = filter_invalid_roles(new_content)
    print(f"#in: {len(content)}, #out: {len(new_content)}")
    # ensure_ascii=False emits raw non-ASCII text, so write as UTF-8 explicitly.
    with open(args.out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_split.json")
    # begin/end default to None, i.e. the whole input is processed.
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    parser.add_argument("--model-name-or-path", type=str, required=True)
    parser.add_argument("--max-length", type=int, default=2048)
    args = parser.parse_args()
    main(args)

fastchat/data/split_train_test.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""
+Split the dataset into training and test set.
+Usage: python3 -m fastchat.data.split_train_test --in sharegpt.json
+"""
+import argparse
+import json
+import numpy as np
if __name__ == "__main__":
    # Command-line entry point: shuffle the dataset with a fixed seed and write
    # "<name>_train<ext>" and "<name>_test<ext>" next to the input file.
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    # NOTE: --begin and --end are accepted but not used by this script.
    parser.add_argument("--begin", type=int, default=0)
    parser.add_argument("--end", type=int, default=100)
    parser.add_argument("--ratio", type=float, default=0.9)
    args = parser.parse_args()

    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    # Fixed seed keeps the split reproducible between runs.
    np.random.seed(0)
    perm = np.random.permutation(len(content))
    content = [content[i] for i in perm]
    split = int(args.ratio * len(content))
    train_set = content[:split]
    test_set = content[split:]
    print(f"#train: {len(train_set)}, #test: {len(test_set)}")

    # Insert the suffix before the extension only. A plain
    # str.replace(".json", ...) would also rewrite directory names and, for a
    # file without ".json", would make both outputs overwrite the input.
    root, ext = os.path.splitext(args.in_file)
    train_name = f"{root}_train{ext}"
    test_name = f"{root}_test{ext}"

    with open(train_name, "w", encoding="utf-8") as fout:
        json.dump(train_set, fout, indent=2, ensure_ascii=False)
    with open(test_name, "w", encoding="utf-8") as fout:
        json.dump(test_set, fout, indent=2, ensure_ascii=False)

fastchat/model/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from fastchat.model.model_adapter import (
+    load_model,
+    get_conversation_template,
+    add_model_args,
+)

fastchat/model/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (292 Bytes). View file

fastchat/model/__pycache__/compression.cpython-310.pyc ADDED Viewed

Binary file (6.74 kB). View file

fastchat/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc ADDED Viewed

Binary file (2.1 kB). View file

fastchat/model/__pycache__/model_adapter.cpython-310.pyc ADDED Viewed

Binary file (55.6 kB). View file

fastchat/model/__pycache__/model_chatglm.cpython-310.pyc ADDED Viewed

Binary file (2.52 kB). View file

fastchat/model/__pycache__/model_codet5p.cpython-310.pyc ADDED Viewed

Binary file (2.58 kB). View file

fastchat/model/__pycache__/model_exllama.cpython-310.pyc ADDED Viewed

Binary file (1.77 kB). View file

fastchat/model/__pycache__/model_falcon.cpython-310.pyc ADDED Viewed

Binary file (2.56 kB). View file

fastchat/model/__pycache__/model_registry.cpython-310.pyc ADDED Viewed

Binary file (10.8 kB). View file

fastchat/model/__pycache__/model_xfastertransformer.cpython-310.pyc ADDED Viewed

Binary file (1.67 kB). View file

fastchat/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc ADDED Viewed

Binary file (3.09 kB). View file

fastchat/model/apply_delta.py ADDED Viewed

	@@ -0,0 +1,165 @@

+"""
+Apply the delta weights on top of a base model.
+Usage:
+python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1
+"""
+import argparse
+import gc
+import glob
+import json
+import os
+import shutil
+import tempfile
+from huggingface_hub import snapshot_download
+import torch
+from torch import nn
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
# Number of bytes in one GiB.
GB = 1 << 30


def split_files(model_path, tmp_path, split_size):
    """Re-shard the ``pytorch_model-*.bin`` files of a model into smaller parts.

    Args:
        model_path: local directory, or a hub repo id that is downloaded with
            ``snapshot_download`` when no such path exists.
        tmp_path: output directory (created if missing); parts are written as
            ``pytorch_model-<n>.bin`` with a running index across all inputs.
        split_size: byte budget per part; a new part is started when adding
            the next tensor would exceed it.

    On any exception *tmp_path* is removed and the exception is re-raised.
    """
    if not os.path.exists(model_path):
        model_path = snapshot_download(repo_id=model_path)
    if not os.path.exists(tmp_path):
        os.makedirs(tmp_path)

    shard_paths = glob.glob(os.path.join(model_path, "pytorch_model-*.bin"))
    next_part = 0

    def write_part(tensors, index):
        torch.save(tensors, os.path.join(tmp_path, f"pytorch_model-{index}.bin"))

    try:
        for shard_path in tqdm(shard_paths):
            pending = {}
            pending_bytes = 0
            for name, param in torch.load(shard_path).items():
                nbytes = param.numel() * param.element_size()
                if pending_bytes + nbytes > split_size:
                    write_part(pending, next_part)
                    next_part += 1
                    pending_bytes = 0
                    # Drop the reference before collecting to free the tensors.
                    pending = None
                    gc.collect()
                    pending = {}
                pending[name] = param
                pending_bytes += nbytes
            # Flush whatever is left of this input shard.
            write_part(pending, next_part)
            next_part += 1
            pending = None
            gc.collect()
    except Exception as e:
        print(f"An error occurred during split_files: {e}")
        shutil.rmtree(tmp_path)
        raise
def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
    """Add delta weights to a base model shard by shard, using disk as scratch.

    Both checkpoints are first re-split into parts of at most 4 GiB inside
    temporary directories. Each base part is then loaded, the matching delta
    tensors are added in place, and the result is saved into
    *target_model_path* (which is wiped first) together with a
    ``pytorch_model.bin.index.json`` index and the delta's tokenizer/config.
    """
    tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
    config = AutoConfig.from_pretrained(delta_path)

    # Start from an empty target directory.
    if os.path.exists(target_model_path):
        shutil.rmtree(target_model_path)
    os.makedirs(target_model_path)

    shard_limit = 4 * GB

    with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
        print(f"Split files for the base model to {tmp_base_path}")
        split_files(base_model_path, tmp_base_path, shard_limit)
        print(f"Split files for the delta weights to {tmp_delta_path}")
        split_files(delta_path, tmp_delta_path, shard_limit)

        base_shards = glob.glob(os.path.join(tmp_base_path, "pytorch_model-*.bin"))
        delta_shards = glob.glob(os.path.join(tmp_delta_path, "pytorch_model-*.bin"))
        # Only one delta shard is held in memory at a time.
        delta_weights = torch.load(delta_shards[0])

        print("Applying the delta")
        weight_map = {}
        total_size = 0
        for shard_idx, base_shard in tqdm(enumerate(base_shards)):
            base_weights = torch.load(base_shard)
            out_name = f"pytorch_model-{shard_idx}.bin"
            for name, param in base_weights.items():
                if name not in delta_weights:
                    # Scan the delta shards until one holds this tensor.
                    for delta_shard in delta_shards:
                        delta_weights = torch.load(delta_shard)
                        gc.collect()
                        if name in delta_weights:
                            break
                base_weights[name] += delta_weights[name]
                weight_map[name] = out_name
                total_size += param.numel() * param.element_size()
                gc.collect()
            torch.save(base_weights, os.path.join(target_model_path, out_name))

        index_path = os.path.join(target_model_path, "pytorch_model.bin.index.json")
        with open(index_path, "w") as f:
            index = {"weight_map": weight_map, "metadata": {"total_size": total_size}}
            json.dump(index, f)

    print(f"Saving the target model to {target_model_path}")
    tokenizer.save_pretrained(target_model_path)
    config.save_pretrained(target_model_path)
def apply_delta(base_model_path, target_model_path, delta_path):
    """Add the delta weights to the base model and save the result.

    Both models are loaded in float16. Every tensor in the base state dict
    must also exist in the delta. The merged model and the delta's tokenizer
    are written to *target_model_path*.
    """
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = AutoModelForCausalLM.from_pretrained(
        delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print("Applying the delta")
    # Build the delta state dict once instead of twice per parameter.
    delta_state = delta.state_dict()
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta_state, f"{name} is missing from the delta weights"
        # In-place update of the tensor shared with the base model.
        param.data += delta_state[name]

    print(f"Saving the target model to {target_model_path}")
    base.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)
if __name__ == "__main__":
    # Command-line entry point: pick the in-memory or the low-memory variant.
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    parser.add_argument(
        "--low-cpu-mem",
        action="store_true",
        help="Lower the cpu memory usage. This will split large files and use "
        "disk as swap to reduce the memory usage below 10GB.",
    )
    args = parser.parse_args()

    run = apply_delta_low_cpu_mem if args.low_cpu_mem else apply_delta
    run(args.base_model_path, args.target_model_path, args.delta_path)

fastchat/model/apply_lora.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""
+Apply the LoRA weights on top of a base model.
+Usage:
+python3 -m fastchat.model.apply_lora --base ~/model_weights/llama-7b --target ~/model_weights/baize-7b --lora project-baize/baize-lora-7B
+Dependency:
+pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b
+"""
+import argparse
+import torch
+from peft import PeftModel
+from transformers import AutoTokenizer, AutoModelForCausalLM
def apply_lora(base_model_path, target_model_path, lora_path):
    """Merge a LoRA adapter into its base model and save the merged weights.

    The base model is loaded in float16; the merged model and the base
    tokenizer are written to *target_model_path*.
    """
    print(f"Loading the base model from {base_model_path}")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)

    print(f"Loading the LoRA adapter from {lora_path}")
    adapted = PeftModel.from_pretrained(
        base_model,
        lora_path,
        # torch_dtype=torch.float16
    )

    print("Applying the LoRA")
    merged = adapted.merge_and_unload()

    print(f"Saving the target model to {target_model_path}")
    merged.save_pretrained(target_model_path)
    tokenizer.save_pretrained(target_model_path)


if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    for flag in ("--base-model-path", "--target-model-path", "--lora-path"):
        cli.add_argument(flag, type=str, required=True)
    args = cli.parse_args()

    apply_lora(args.base_model_path, args.target_model_path, args.lora_path)

fastchat/model/compression.py ADDED Viewed

	@@ -0,0 +1,300 @@

+import dataclasses
+import gc
+import glob
+import os
+from accelerate import init_empty_weights
+from accelerate.utils import set_module_tensor_to_device
+from huggingface_hub import snapshot_download
+import torch
+from torch import Tensor
+from torch.nn import functional as F
+import torch.nn as nn
+from tqdm import tqdm
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    AutoModel,
+    AutoModelForSeq2SeqLM,
+)
@dataclasses.dataclass
class CompressionConfig:
    """Settings for group-wise quantization."""

    # Bit width of the quantized values (compress() asserts it is <= 8).
    num_bits: int
    # Number of elements that share one scale along group_dim.
    group_size: int
    # Tensor dimension that is split into groups.
    group_dim: int
    # True: scale-only int8 scheme; False: min/scale uint8 scheme.
    symmetric: bool
    # When False, compress()/decompress() return their input unchanged.
    enabled: bool = True


# Configuration used by CLinear and load_compress_model ("enabled" keeps its
# default of True).
default_compression_config = CompressionConfig(
    num_bits=8,
    group_size=256,
    group_dim=1,
    symmetric=True,
)
class CLinear(nn.Module):
    """Compressed Linear Layer.

    Stores the weight in compressed form and decompresses it on every forward
    call. *weight* may be ``None``, a tensor (compressed here after being moved
    to *device*), or an already compressed value that is stored as given.
    """

    def __init__(self, weight=None, bias=None, device=None):
        super().__init__()
        if isinstance(weight, Tensor):
            weight = compress(weight.data.to(device), default_compression_config)
        self.weight = weight
        self.bias = bias

    def forward(self, input: Tensor) -> Tensor:
        weight = decompress(self.weight, default_compression_config)
        # Input and bias are cast to the dtype of the decompressed weight.
        bias = None if self.bias is None else self.bias.to(weight.dtype)
        return F.linear(input.to(weight.dtype), weight, bias)
def compress_module(module, target_device):
    """Recursively replace ``torch.nn.Linear`` attributes with ``CLinear``.

    Only attributes whose type is exactly ``torch.nn.Linear`` are replaced;
    their weights are compressed on *target_device*.
    """
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        if type(candidate) is torch.nn.Linear:
            replacement = CLinear(candidate.weight, candidate.bias, target_device)
            setattr(module, attr_name, replacement)

    for _, child in module.named_children():
        compress_module(child, target_device)
def get_compressed_list(module, prefix=""):
    """Return the dotted ``*.weight`` names of all ``torch.nn.Linear`` attributes.

    Names found directly on *module* come first, followed by those of each
    child module in turn; *prefix* is the dotted path of *module* itself.
    """

    def qualify(name):
        return f"{prefix}.{name}" if prefix else name

    names = [
        f"{qualify(attr_name)}.weight"
        for attr_name in dir(module)
        if type(getattr(module, attr_name)) is torch.nn.Linear
    ]
    for child_name, child in module.named_children():
        names.extend(get_compressed_list(child, qualify(child_name)))
    return names
def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
    """Recursively swap ``torch.nn.Linear`` attributes for ``CLinear`` layers.

    The compressed weight of each layer is looked up in
    *compressed_state_dict* under its dotted ``<path>.weight`` name; the
    original bias is reused. *prefix* is the dotted path of *module*.
    """

    def qualify(name):
        return f"{prefix}.{name}" if prefix else name

    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        if type(candidate) is not torch.nn.Linear:
            continue
        packed_weight = compressed_state_dict[f"{qualify(attr_name)}.weight"]
        setattr(
            module, attr_name, CLinear(packed_weight, candidate.bias, target_device)
        )

    for child_name, child in module.named_children():
        apply_compressed_weight(
            child, compressed_state_dict, target_device, qualify(child_name)
        )
def load_compress_model(model_path, device, torch_dtype, use_fast, revision="main"):
    """Load a model with its linear weights compressed one tensor at a time.

    Args:
        model_path: local checkpoint directory or Hugging Face repo id.
        device: device the tensors are moved to.
        torch_dtype: dtype the tensors are cast to before compression.
        use_fast: whether to request the fast tokenizer first.
        revision: model revision used for the tokenizer, config and download.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode on *device*.

    Raises:
        ValueError: if no ``pytorch_model*.bin`` file can be found.
    """
    # partially load model
    # `use_fast=True` is not supported for some models.
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=use_fast, revision=revision, trust_remote_code=True
        )
    except TypeError:
        # Retry with the opposite tokenizer flavor. This must be a logical
        # `not`: bitwise `~` on a bool yields -1/-2, which are both truthy.
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=not use_fast,
            revision=revision,
            trust_remote_code=True,
        )
    with init_empty_weights():
        # `trust_remote_code` should be set as `True` for both AutoConfig and AutoModel
        config = AutoConfig.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
            revision=revision,
        )
        # some models are loaded by AutoModel but not AutoModelForCausalLM,
        # such as chatglm, chatglm2
        try:
            # google/flan-* models are based on an AutoModelForSeq2SeqLM.
            if "T5Config" in str(type(config)):
                model = AutoModelForSeq2SeqLM.from_config(
                    config, trust_remote_code=True
                )
            else:
                model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
        except NameError:
            model = AutoModel.from_config(config, trust_remote_code=True)
        linear_weights = get_compressed_list(model)

    if os.path.exists(model_path):
        # `model_path` is a local folder
        base_pattern = os.path.join(model_path, "pytorch_model*.bin")
    else:
        # `model_path` is a cached Hugging Face repo
        # We don't necessarily need to download the model's repo again if there is a cache.
        # So check the default huggingface cache first.
        model_path_temp = os.path.join(
            os.path.expanduser("~"),
            ".cache/huggingface/hub",
            "models--" + model_path.replace("/", "--"),
            "snapshots/",
        )
        downloaded = False
        if os.path.exists(model_path_temp):
            # NOTE: takes whichever snapshot directory os.listdir returns last.
            temp_last_dir = os.listdir(model_path_temp)[-1]
            model_path_temp = os.path.join(model_path_temp, temp_last_dir)
            base_pattern = os.path.join(model_path_temp, "pytorch_model*.bin")
            files = glob.glob(base_pattern)
            if len(files) > 0:
                downloaded = True

        if downloaded:
            model_path = model_path_temp
        else:
            model_path = snapshot_download(model_path, revision=revision)
        base_pattern = os.path.join(model_path, "pytorch_model*.bin")

    files = glob.glob(base_pattern)
    if len(files) == 0:
        raise ValueError(
            f"Cannot find any model weight files. "
            f"Please check your (cached) weight path: {model_path}"
        )

    compressed_state_dict = {}
    for filename in tqdm(files):
        # Keep the tensors on their storage's own location while loading.
        tmp_state_dict = torch.load(filename, map_location=lambda storage, loc: storage)
        for name in tmp_state_dict:
            if name in linear_weights:
                tensor = tmp_state_dict[name].to(device, dtype=torch_dtype)
                compressed_state_dict[name] = compress(
                    tensor, default_compression_config
                )
            else:
                compressed_state_dict[name] = tmp_state_dict[name].to(
                    device, dtype=torch_dtype
                )
            # Release the uncompressed tensor right away to cap peak memory.
            tmp_state_dict[name] = None
            tensor = None
            gc.collect()
            torch.cuda.empty_cache()
            if device == "xpu":
                torch.xpu.empty_cache()
            if device == "npu":
                torch.npu.empty_cache()

    # Non-linear tensors are materialized directly on the target device.
    for name in model.state_dict():
        if name not in linear_weights:
            set_module_tensor_to_device(
                model, name, device, value=compressed_state_dict[name]
            )
    apply_compressed_weight(model, compressed_state_dict, device)

    if torch_dtype == torch.float16:
        model.half()
    model.to(device)
    model.eval()

    return model, tokenizer
def compress(tensor, config):
    """Simulate group-wise quantization.

    The tensor is zero-padded along ``config.group_dim`` to a multiple of
    ``config.group_size`` and viewed as groups of that size.

    Returns:
        *tensor* unchanged when ``config.enabled`` is false; otherwise
        ``(int8 data, scale, original_shape)`` for the symmetric scheme or
        ``(uint8 data, min, scale, original_shape)`` for the asymmetric one.
    """
    if not config.enabled:
        return tensor

    bits = config.num_bits
    size = config.group_size
    dim = config.group_dim
    assert bits <= 8

    original_shape = tensor.shape
    dim_len = original_shape[dim]
    head, tail = original_shape[:dim], original_shape[dim + 1 :]
    num_groups = (dim_len + size - 1) // size
    grouped_shape = head + (num_groups, size) + tail

    # Pad
    pad_len = (size - dim_len % size) % size
    if pad_len != 0:
        filler = torch.zeros(
            head + (pad_len,) + tail, dtype=tensor.dtype, device=tensor.device
        )
        tensor = torch.cat([tensor, filler], dim=dim)
    data = tensor.view(grouped_shape)

    # Quantize; reductions run over the within-group axis (dim + 1).
    if config.symmetric:
        B = 2 ** (bits - 1) - 1
        scale = B / torch.max(data.abs(), dim=dim + 1, keepdim=True)[0]
        quantized = (data * scale).clamp_(-B, B).round_().to(torch.int8)
        return quantized, scale, original_shape

    B = 2**bits - 1
    mn = torch.min(data, dim=dim + 1, keepdim=True)[0]
    mx = torch.max(data, dim=dim + 1, keepdim=True)[0]
    scale = B / (mx - mn)
    shifted = data - mn
    shifted.mul_(scale)
    quantized = shifted.clamp_(0, B).round_().to(torch.uint8)
    return quantized, mn, scale, original_shape
def decompress(packed_data, config):
    """Simulate group-wise dequantization.

    Args:
        packed_data: the value returned by ``compress`` for the same *config*.
        config: compression settings (``enabled``, ``group_size``,
            ``group_dim``, ``symmetric`` are read here).

    Returns:
        *packed_data* unchanged when ``config.enabled`` is false; otherwise
        the dequantized tensor with the original (unpadded) shape.
    """
    if not config.enabled:
        return packed_data

    group_size, group_dim, symmetric = (
        config.group_size,
        config.group_dim,
        config.symmetric,
    )

    # Dequantize
    if symmetric:
        data, scale, original_shape = packed_data
        data = data / scale
    else:
        data, mn, scale, original_shape = packed_data
        data = data / scale
        data.add_(mn)

    # Unpad
    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
    if pad_len:
        padded_original_shape = (
            original_shape[:group_dim]
            + (original_shape[group_dim] + pad_len,)
            + original_shape[group_dim + 1 :]
        )
        data = data.reshape(padded_original_shape)
        # Index with a tuple: a list of slices is deprecated non-tuple
        # multidimensional indexing.
        indices = tuple(slice(0, x) for x in original_shape)
        return data[indices].contiguous()
    else:
        return data.view(original_shape)

fastchat/model/convert_fp16.py ADDED Viewed

	@@ -0,0 +1,26 @@

+"""
+Usage:
+python3 -m fastchat.model.convert_fp16 --in in-folder --out out-folder
+"""
+import argparse
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
def convert_fp16(in_checkpoint, out_checkpoint):
    """Load a checkpoint in float16 and save it, with its tokenizer, to a new path."""
    tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    model.save_pretrained(out_checkpoint)
    tokenizer.save_pretrained(out_checkpoint)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them None would reach
    # from_pretrained/save_pretrained and fail with an obscure error.
    parser.add_argument(
        "--in-checkpoint", type=str, required=True, help="Path to the model"
    )
    parser.add_argument(
        "--out-checkpoint",
        type=str,
        required=True,
        help="Path to the output model",
    )
    args = parser.parse_args()

    convert_fp16(args.in_checkpoint, args.out_checkpoint)

fastchat/model/llama_condense_monkey_patch.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# Code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py
+from functools import partial
+import torch
+import transformers
+import transformers.models.llama.modeling_llama
class CondenseRotaryEmbedding(torch.nn.Module):
    """Rotary embedding whose position indices are divided by ``ratio``.

    The cos/sin tables are precomputed for ``max_position_embeddings * ratio``
    positions and rebuilt on demand when a longer sequence is requested.
    """

    def __init__(
        self, dim, ratio, max_position_embeddings=2048, base=10000, device=None
    ):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float().to(device) / dim
        self.register_buffer("inv_freq", 1.0 / (base**exponents))

        # Build here to make `torch.jit.trace` work.
        self.ratio = ratio
        max_position_embeddings *= ratio
        self.max_seq_len_cached = max_position_embeddings
        # print(f"Monkey Patching condense ratio {ratio}")
        self._cache_cos_sin(
            device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _cache_cos_sin(self, device, dtype):
        """(Re)build the cos/sin buffers for ``self.max_seq_len_cached`` positions."""
        positions = (
            torch.arange(
                self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
            )
            / self.ratio
        )
        freqs = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1).to(device)
        for buffer_name, table in (("cos_cached", emb.cos()), ("sin_cached", emb.sin())):
            self.register_buffer(
                buffer_name, table[None, None, :, :].to(dtype), persistent=False
            )

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        # Growing the cache is unlikely after the tables were built in
        # `__init__`; the logic is kept just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            self._cache_cos_sin(device=x.device, dtype=x.dtype)
        cos = self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype)
        sin = self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype)
        return (cos, sin)
def replace_llama_with_condense(ratio):
    """Monkey-patch transformers' ``LlamaRotaryEmbedding`` with the condensed variant.

    After this call the name resolves to ``CondenseRotaryEmbedding`` with
    *ratio* pre-bound.
    """
    llama_module = transformers.models.llama.modeling_llama
    condensed_factory = partial(CondenseRotaryEmbedding, ratio=ratio)
    llama_module.LlamaRotaryEmbedding = condensed_factory

fastchat/model/make_delta.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""
+Make the delta weights by subtracting base weights.
+Usage:
+python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
+"""
+import argparse
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM
def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id=None):
    """Compute ``target - base`` weights and save them as a delta checkpoint.

    Args:
        base_model_path: path or repo id of the base model.
        target_model_path: path or repo id of the fine-tuned model.
        delta_path: output directory for the delta weights and tokenizer.
        hub_repo_id: optional Hugging Face repo id; when given, the delta is
            also pushed to the hub.
    """
    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print(f"Loading the target model from {target_model_path}")
    target = AutoModelForCausalLM.from_pretrained(
        target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)

    print("Calculating the delta")
    # Build the base state dict once instead of twice per parameter.
    base_state = base.state_dict()
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        assert name in base_state, f"{name} is missing from the base model"
        # In-place update of the tensor shared with the target model.
        param.data -= base_state[name]

    print(f"Saving the delta to {delta_path}")
    # Take the repo id as a parameter instead of reading the module-level
    # `args`, which only exists when this file is run as a script.
    if hub_repo_id:
        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
    else:
        kwargs = {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer.save_pretrained(delta_path, **kwargs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str)
    args = parser.parse_args()

    make_delta(
        args.base_model_path,
        args.target_model_path,
        args.delta_path,
        hub_repo_id=args.hub_repo_id,
    )

fastchat/model/model_adapter.py ADDED Viewed

	@@ -0,0 +1,1970 @@

+"""Model adapter registration."""
+import math
+import os
+import re
+import sys
+from typing import Dict, List, Optional
+import warnings
+if sys.version_info >= (3, 9):
+    from functools import cache
+else:
+    from functools import lru_cache as cache
+import accelerate
+import psutil
+import torch
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    LlamaTokenizer,
+    LlamaForCausalLM,
+    T5Tokenizer,
+)
+from fastchat.constants import CPU_ISA
+from fastchat.conversation import Conversation, get_conv_template
+from fastchat.model.compression import load_compress_model
+from fastchat.model.llama_condense_monkey_patch import replace_llama_with_condense
+from fastchat.model.model_chatglm import generate_stream_chatglm
+from fastchat.model.model_codet5p import generate_stream_codet5p
+from fastchat.model.model_falcon import generate_stream_falcon
+from fastchat.model.model_exllama import generate_stream_exllama
+from fastchat.model.model_xfastertransformer import generate_stream_xft
+from fastchat.model.monkey_patch_non_inplace import (
+    replace_llama_attn_with_non_inplace_operations,
+)
+from fastchat.modules.awq import AWQConfig, load_awq_quantized
+from fastchat.modules.exllama import ExllamaConfig, load_exllama_model
+from fastchat.modules.xfastertransformer import load_xft_model, XftConfig
+from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
+from fastchat.utils import get_gpu_memory
+# Check an environment variable to check if we should be sharing Peft model
+# weights.  When false we treat all Peft models as separate.
# True only when PEFT_SHARE_BASE_WEIGHTS is set to "true" (case-insensitive).
_peft_share_flag = os.environ.get("PEFT_SHARE_BASE_WEIGHTS", "false")
peft_share_base_weights = _peft_share_flag.lower() == "true"

# Names of the Anthropic models listed by this module.
ANTHROPIC_MODEL_LIST = ("claude-1", "claude-2", "claude-instant-1")
+class BaseModelAdapter:
+    """The base and the default model adapter.
+    Subclasses override `match` to claim a model path and may override the
+    loading and conversation-template hooks.  This default matches every path.
+    """
+    # Forwarded as `use_fast` when the tokenizer is created.
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        """Return True when this adapter handles `model_path` (always, here)."""
+        return True
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load a causal LM and its tokenizer through the Auto* classes.
+        Args:
+            model_path: Local folder or hub id handed to `from_pretrained`.
+            from_pretrained_kwargs: Keyword arguments forwarded to
+                `AutoModelForCausalLM.from_pretrained`; its "revision" entry
+                (default "main") is also used for the tokenizer.
+        Returns:
+            A `(model, tokenizer)` tuple.
+        """
+        revision = from_pretrained_kwargs.get("revision", "main")
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path,
+                use_fast=self.use_fast_tokenizer,
+                revision=revision,
+                trust_remote_code=True,
+            )
+        except TypeError:
+            # Retry with the slow tokenizer when the first attempt raises TypeError.
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path, use_fast=False, revision=revision, trust_remote_code=True
+            )
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                use_flash_attention_2=True,
+                **from_pretrained_kwargs,
+            )
+        except Exception as err:
+            # Catch `Exception` instead of using a bare `except:` so that
+            # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
+            warnings.warn(
+                f"Loading {model_path} with flash attention failed ({err!r}); "
+                "retrying without flash attention."
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                use_flash_attention_2=False,
+                **from_pretrained_kwargs,
+            )
+        return model, tokenizer
+    def load_compress_model(self, model_path, device, torch_dtype, revision="main"):
+        """Delegate to the module-level `load_compress_model` with this adapter's tokenizer setting."""
+        return load_compress_model(
+            model_path,
+            device,
+            torch_dtype,
+            use_fast=self.use_fast_tokenizer,
+            revision=revision,
+        )
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Return the conversation template named after `model_path`.
+        Paths containing "megrez" or "minicpm" (case-insensitive) map to those
+        template names; any other path is looked up by its lower-cased value.
+        """
+        lowered = model_path.lower()
+        if "megrez" in lowered:
+            return get_conv_template("megrez")
+        if "minicpm" in lowered:
+            return get_conv_template("minicpm")
+        return get_conv_template(lowered)
+# Global registry holding one instance of every registered model adapter.
+# TODO (lmzheng): make it a priority queue.
+model_adapters: List[BaseModelAdapter] = []
+def register_model_adapter(cls):
+    """Instantiate `cls` and append the instance to `model_adapters`."""
+    adapter_instance = cls()
+    model_adapters.append(adapter_instance)
+@cache
+def get_model_adapter(model_path: str, model_name: str = None) -> BaseModelAdapter:
+    """Return the first registered adapter that matches the model.
+    When `model_name` is given it is used for both lookups.  Otherwise the
+    basename of `model_path` is tried first (skipping plain `BaseModelAdapter`
+    instances), then the full path.
+    Raises:
+        ValueError: If no registered adapter matches.
+    """
+    if model_name:
+        short_name = model_name
+        model_path = model_name
+    else:
+        short_name = os.path.basename(os.path.normpath(model_path))
+    # First pass: short name, ignoring the catch-all base adapter.
+    for candidate in model_adapters:
+        if not candidate.match(short_name):
+            continue
+        if type(candidate) is not BaseModelAdapter:
+            print(f"Matching model adapter: {candidate}")
+            return candidate
+    # Second pass: full path (or model_name), any adapter may answer.
+    for candidate in model_adapters:
+        if candidate.match(model_path):
+            print(f"Using model adapter: {candidate}")
+            return candidate
+    raise ValueError(f"No valid model adapter for {model_path}")
+def raise_warning_for_incompatible_cpu_offloading_configuration(
+    device: str, load_8bit: bool, cpu_offloading: bool
+):
+    """Validate a cpu-offloading request and return the effective setting.
+    When `cpu_offloading` is requested but 8-bit loading is off, the platform
+    is not linux, or `device` is not "cuda", a warning is issued and False is
+    returned.  Otherwise `cpu_offloading` is returned unchanged.
+    """
+    if not cpu_offloading:
+        return cpu_offloading
+    reason = None
+    if not load_8bit:
+        reason = (
+            "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
+            "Use '--load-8bit' to enable 8-bit-quantization\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+    elif "linux" not in sys.platform:
+        reason = (
+            "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+    elif device != "cuda":
+        reason = (
+            "CPU-offloading is only enabled when using CUDA-devices\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+    if reason is None:
+        return cpu_offloading
+    warnings.warn(reason)
+    return False
+def load_model(
+    model_path: str,
+    device: str = "cuda",
+    num_gpus: int = 1,
+    max_gpu_memory: Optional[str] = None,
+    dtype: Optional[torch.dtype] = None,
+    load_8bit: bool = False,
+    cpu_offloading: bool = False,
+    gptq_config: Optional[GptqConfig] = None,
+    awq_config: Optional[AWQConfig] = None,
+    exllama_config: Optional[ExllamaConfig] = None,
+    xft_config: Optional[XftConfig] = None,
+    revision: str = "main",
+    debug: bool = False,
+    model_name: str = None,
+):
+    """Load a model from Hugging Face.
+    Selects the adapter for `model_path`/`model_name`, builds per-device
+    `from_pretrained` keyword arguments, then either returns early from one of
+    the alternative loaders (8-bit compression, AWQ, GPTQ, ExLlama,
+    xFasterTransformer) or loads through the adapter and moves the model to
+    `device`.
+    Args:
+        model_path: Local folder or hub id of the weights.
+        device: One of "cpu", "cuda", "mps", "xpu" or "npu".
+        num_gpus: When not 1 on cuda, a device map and per-GPU `max_memory` are set.
+        max_gpu_memory: Per-GPU memory limit; when None, 85% of the memory
+            reported by `get_gpu_memory` is used for each GPU.
+        dtype: Overrides the per-device default `torch_dtype` for the adapter load.
+        load_8bit: Use the adapter's compressed loader (single GPU only).
+        cpu_offloading: Request 8-bit CPU offloading; validated by
+            `raise_warning_for_incompatible_cpu_offloading_configuration`.
+        gptq_config / awq_config: Used when their `wbits` is below 16.
+        exllama_config / xft_config: When set, the matching loader is used.
+        revision: Model revision forwarded to the loaders.
+        debug: Print the loaded model.
+        model_name: Optional name used instead of the path for adapter matching.
+    Returns:
+        A `(model, tokenizer)` tuple.
+    Raises:
+        ValueError: If `device` is not one of the supported values.
+    """
+    # Stays None unless Intel Extension for PyTorch imports successfully, so the
+    # CPU optimisation step below is skipped instead of hitting an unbound name
+    # (e.g. when `dtype=torch.bfloat16` is passed but ipex was never imported).
+    ipex = None
+    # get model adapter
+    adapter = get_model_adapter(model_path, model_name)
+    # Handle device mapping
+    cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
+        device, load_8bit, cpu_offloading
+    )
+    if device == "cpu":
+        kwargs = {"torch_dtype": torch.float16}
+        if CPU_ISA in ["avx512_bf16", "amx"]:
+            try:
+                import intel_extension_for_pytorch as ipex
+                kwargs = {"torch_dtype": torch.bfloat16}
+            except ImportError:
+                warnings.warn(
+                    "Intel Extension for PyTorch is not installed, it can be installed to accelerate cpu inference"
+                )
+    elif device == "cuda":
+        kwargs = {"torch_dtype": torch.bfloat16}
+        if num_gpus != 1:
+            kwargs["device_map"] = "auto"
+            if max_gpu_memory is None:
+                # This is important for not the same VRAM sizes
+                kwargs["device_map"] = "sequential"
+                available_gpu_memory = get_gpu_memory(num_gpus)
+                kwargs["max_memory"] = {
+                    i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
+                    for i in range(num_gpus)
+                }
+            else:
+                kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
+    elif device == "mps":
+        kwargs = {"torch_dtype": torch.float16}
+        # Avoid bugs in mps backend by not using in-place operations.
+        replace_llama_attn_with_non_inplace_operations()
+    elif device == "xpu":
+        kwargs = {"torch_dtype": torch.bfloat16}
+        # Try to load ipex, while it looks unused, it links into torch for xpu support
+        try:
+            import intel_extension_for_pytorch as ipex
+        except ImportError:
+            warnings.warn(
+                "Intel Extension for PyTorch is not installed, but is required for xpu inference."
+            )
+    elif device == "npu":
+        kwargs = {"torch_dtype": torch.float16}
+        # Try to load torch_npu; only a warning is issued when it is missing.
+        try:
+            import torch_npu
+        except ImportError:
+            warnings.warn("Ascend Extension for PyTorch is not installed.")
+    else:
+        raise ValueError(f"Invalid device: {device}")
+    if cpu_offloading:
+        # raises an error on incompatible platforms
+        from transformers import BitsAndBytesConfig
+        if "max_memory" in kwargs:
+            kwargs["max_memory"]["cpu"] = (
+                str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib"
+            )
+        kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_8bit_fp32_cpu_offload=cpu_offloading
+        )
+        kwargs["load_in_8bit"] = load_8bit
+    elif load_8bit:
+        if num_gpus != 1:
+            # Only warn; execution falls through to the regular adapter load below.
+            warnings.warn(
+                "8-bit quantization is not supported for multi-gpu inference."
+            )
+        else:
+            model, tokenizer = adapter.load_compress_model(
+                model_path=model_path,
+                device=device,
+                torch_dtype=kwargs["torch_dtype"],
+                revision=revision,
+            )
+            if debug:
+                print(model)
+            return model, tokenizer
+    elif awq_config and awq_config.wbits < 16:
+        assert (
+            awq_config.wbits == 4
+        ), "Currently we only support 4-bit inference for AWQ."
+        model, tokenizer = load_awq_quantized(model_path, awq_config, device)
+        if num_gpus != 1:
+            device_map = accelerate.infer_auto_device_map(
+                model,
+                max_memory=kwargs["max_memory"],
+                no_split_module_classes=[
+                    "OPTDecoderLayer",
+                    "LlamaDecoderLayer",
+                    "BloomBlock",
+                    "MPTBlock",
+                    "DecoderLayer",
+                ],
+            )
+            model = accelerate.dispatch_model(
+                model, device_map=device_map, offload_buffers=True
+            )
+        else:
+            model.to(device)
+        return model, tokenizer
+    elif gptq_config and gptq_config.wbits < 16:
+        model, tokenizer = load_gptq_quantized(model_path, gptq_config)
+        if num_gpus != 1:
+            device_map = accelerate.infer_auto_device_map(
+                model,
+                max_memory=kwargs["max_memory"],
+                no_split_module_classes=["LlamaDecoderLayer"],
+            )
+            model = accelerate.dispatch_model(
+                model, device_map=device_map, offload_buffers=True
+            )
+        else:
+            model.to(device)
+        return model, tokenizer
+    elif exllama_config:
+        model, tokenizer = load_exllama_model(model_path, exllama_config)
+        return model, tokenizer
+    elif xft_config:
+        model, tokenizer = load_xft_model(model_path, xft_config)
+        return model, tokenizer
+    kwargs["revision"] = revision
+    if dtype is not None:  # Overwrite dtype if it is provided in the arguments.
+        kwargs["torch_dtype"] = dtype
+    # Load model
+    model, tokenizer = adapter.load_model(model_path, kwargs)
+    if (
+        device == "cpu"
+        and kwargs["torch_dtype"] is torch.bfloat16
+        and CPU_ISA is not None
+        and ipex is not None
+    ):
+        model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
+    if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
+        "mps",
+        "xpu",
+        "npu",
+    ):
+        model.to(device)
+    if device == "xpu":
+        model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)
+    if debug:
+        print(model)
+    return model, tokenizer
+def get_conversation_template(model_path: str) -> Conversation:
+    """Look up the adapter for `model_path` and return its default conversation template."""
+    return get_model_adapter(model_path).get_default_conv_template(model_path)
+def get_generate_stream_function(model: torch.nn.Module, model_path: str):
+    """Pick the streaming generation function for `model`.
+    The choice is made from the lower-cased string of `type(model)`: the first
+    matching marker below wins, then the shared-weights Peft wrapper, and
+    finally the generic `generate_stream`.
+    """
+    from fastchat.serve.inference import generate_stream
+    type_name = str(type(model)).lower()
+    # Checked in order; the first marker found in the type name is used.
+    specialised = (
+        ("chatglm", generate_stream_chatglm),
+        ("rwforcausallm", generate_stream_falcon),
+        ("codet5p", generate_stream_codet5p),
+        ("exllama", generate_stream_exllama),
+        ("xft", generate_stream_xft),
+    )
+    for marker, stream_fn in specialised:
+        if marker in type_name:
+            return stream_fn
+    if not (peft_share_base_weights and "peft" in type_name):
+        return generate_stream
+    # Return a curried stream function that loads the right adapter
+    # according to the model_name available in this context.  This ensures
+    # the right weights are available.
+    @torch.inference_mode()
+    def generate_stream_peft(
+        model,
+        tokenizer,
+        params: Dict,
+        device: str,
+        context_len: int,
+        stream_interval: int = 2,
+        judge_sent_end: bool = False,
+    ):
+        model.set_adapter(model_path)
+        yield from generate_stream(
+            model,
+            tokenizer,
+            params,
+            device,
+            context_len,
+            stream_interval,
+            judge_sent_end,
+        )
+    return generate_stream_peft
+def add_model_args(parser):
+    """Register the model-loading command line options on `parser`.
+    Args:
+        parser: An `argparse.ArgumentParser` (or argument group) that receives
+            the options; it is modified in place and nothing is returned.
+    """
+    # General model / device selection.
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="lmsys/vicuna-7b-v1.5",
+        help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default="main",
+        help="Hugging Face Hub model revision identifier",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        choices=["cpu", "cuda", "mps", "xpu", "npu"],
+        default="cuda",
+        help="The device type",
+    )
+    parser.add_argument(
+        "--gpus",
+        type=str,
+        default=None,
+        help="A single GPU like 1 or multiple GPUs like 0,2",
+    )
+    parser.add_argument("--num-gpus", type=int, default=1)
+    parser.add_argument(
+        "--max-gpu-memory",
+        type=str,
+        help="The maximum memory per GPU for storing model weights. Use a string like '13Gib'",
+    )
+    # The help text mirrors the per-device defaults chosen in `load_model`.
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["float32", "float16", "bfloat16"],
+        help="Override the default dtype. If not set, it will use bfloat16 on cuda/xpu and float16 on cpu/mps/npu "
+        "(bfloat16 on cpu when Intel Extension for PyTorch is available on avx512_bf16/amx CPUs).",
+        default=None,
+    )
+    # 8-bit quantization.
+    parser.add_argument(
+        "--load-8bit", action="store_true", help="Use 8-bit quantization"
+    )
+    parser.add_argument(
+        "--cpu-offloading",
+        action="store_true",
+        help="Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU",
+    )
+    # GPTQ.
+    parser.add_argument(
+        "--gptq-ckpt",
+        type=str,
+        default=None,
+        help="Used for GPTQ. The path to the local GPTQ checkpoint.",
+    )
+    parser.add_argument(
+        "--gptq-wbits",
+        type=int,
+        default=16,
+        choices=[2, 3, 4, 8, 16],
+        help="Used for GPTQ. #bits to use for quantization",
+    )
+    parser.add_argument(
+        "--gptq-groupsize",
+        type=int,
+        default=-1,
+        help="Used for GPTQ. Groupsize to use for quantization; default uses full row.",
+    )
+    parser.add_argument(
+        "--gptq-act-order",
+        action="store_true",
+        help="Used for GPTQ. Whether to apply the activation order GPTQ heuristic",
+    )
+    # AWQ.
+    parser.add_argument(
+        "--awq-ckpt",
+        type=str,
+        default=None,
+        help="Used for AWQ. Load quantized model. The path to the local AWQ checkpoint.",
+    )
+    parser.add_argument(
+        "--awq-wbits",
+        type=int,
+        default=16,
+        choices=[4, 16],
+        help="Used for AWQ. #bits to use for AWQ quantization",
+    )
+    parser.add_argument(
+        "--awq-groupsize",
+        type=int,
+        default=-1,
+        help="Used for AWQ. Groupsize to use for AWQ quantization; default uses full row.",
+    )
+    # ExLlama.
+    parser.add_argument(
+        "--enable-exllama",
+        action="store_true",
+        help="Used for exllamabv2. Enable exllamaV2 inference framework.",
+    )
+    parser.add_argument(
+        "--exllama-max-seq-len",
+        type=int,
+        default=4096,
+        help="Used for exllamabv2. Max sequence length to use for exllamav2 framework; default 4096 sequence length.",
+    )
+    parser.add_argument(
+        "--exllama-gpu-split",
+        type=str,
+        default=None,
+        help="Used for exllamabv2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7",
+    )
+    # xFasterTransformer.
+    parser.add_argument(
+        "--enable-xft",
+        action="store_true",
+        help="Used for xFasterTransformer Enable xFasterTransformer inference framework.",
+    )
+    parser.add_argument(
+        "--xft-max-seq-len",
+        type=int,
+        default=4096,
+        help="Used for xFasterTransformer. Max sequence length to use for xFasterTransformer framework; default 4096 sequence length.",
+    )
+    parser.add_argument(
+        "--xft-dtype",
+        type=str,
+        choices=["fp16", "bf16", "int8", "bf16_fp16", "bf16_int8"],
+        help="Override the default dtype. If not set, it will use bfloat16 for first token and float16 next tokens on CPU.",
+        default=None,
+    )
+def remove_parent_directory_name(model_path):
+    """Return the last "/"-separated component of `model_path`.
+    Trailing slashes are ignored, and an empty string yields an empty string
+    instead of raising IndexError.
+    """
+    return model_path.rstrip("/").split("/")[-1]
+# Maps a base model path to the (model, tokenizer) pair of the first Peft model
+# loaded on top of it; only used when `peft_share_base_weights` is enabled.
+peft_model_cache = {}
+class PeftModelAdapter:
+    """Loads any "peft" model and its base model."""
+    def match(self, model_path: str):
+        """Accept paths holding an adapter_config.json or with "peft" in the name."""
+        if os.path.exists(os.path.join(model_path, "adapter_config.json")):
+            return True
+        return "peft" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the base model, then the (peft) adapter weights.
+        Returns:
+            A `(model, tokenizer)` tuple; the tokenizer is the base model's.
+        Raises:
+            ValueError: If the base model path itself contains "peft".
+        """
+        from peft import PeftConfig, PeftModel
+        config = PeftConfig.from_pretrained(model_path)
+        base_model_path = config.base_model_name_or_path
+        # Case-insensitive like `match`, so a base path such as ".../PEFT-x"
+        # cannot be routed back to this adapter and recurse.
+        if "peft" in base_model_path.lower():
+            raise ValueError(
+                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
+            )
+        # Basic proof of concept for loading peft adapters that share the base
+        # weights.  This is pretty messy because Peft re-writes the underlying
+        # base model and internally stores a map of adapter layers.
+        # So, to make this work we:
+        #  1. Cache the first peft model loaded for a given base model.
+        #  2. Call `load_adapter` for any follow-on Peft models.
+        #  3. Make sure we load the adapters by the model_path.  Why? This is
+        #  what's accessible during inference time.
+        #  4. In get_generate_stream_function, make sure we load the right
+        #  adapter before doing inference.  This *should* be safe when calls
+        #  are blocked by the same semaphore.
+        if peft_share_base_weights and base_model_path in peft_model_cache:
+            model, tokenizer = peft_model_cache[base_model_path]
+            # Super important: make sure we use model_path as the
+            # `adapter_name`.
+            model.load_adapter(model_path, adapter_name=model_path)
+            return model, tokenizer
+        base_adapter = get_model_adapter(base_model_path)
+        base_model, tokenizer = base_adapter.load_model(
+            base_model_path, from_pretrained_kwargs
+        )
+        if not peft_share_base_weights:
+            # In the normal case every Peft model gets its own base weights.
+            model = PeftModel.from_pretrained(base_model, model_path)
+            return model, tokenizer
+        # Super important: make sure we use model_path as the
+        # `adapter_name`.
+        model = PeftModel.from_pretrained(
+            base_model, model_path, adapter_name=model_path
+        )
+        peft_model_cache[base_model_path] = (model, tokenizer)
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Uses the conv template of the base model"""
+        from peft import PeftConfig
+        config = PeftConfig.from_pretrained(model_path)
+        base_model_path = config.base_model_name_or_path
+        if "peft" in base_model_path.lower():
+            raise ValueError(
+                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
+            )
+        base_adapter = get_model_adapter(base_model_path)
+        return base_adapter.get_default_conv_template(base_model_path)
+class DeepseekChatAdapter(BaseModelAdapter):
+    """The model adapter for deepseek-ai's chat models"""
+    # Note: that this model will require tokenizer version >= 0.13.3 because the tokenizer class is LlamaTokenizerFast
+    def match(self, model_path: str):
+        """Match paths containing both "deepseek" and "chat" (case-insensitive)."""
+        lowered = model_path.lower()
+        return "deepseek" in lowered and "chat" in lowered
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("deepseek")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the model in bfloat16 with flash attention and a sequential device map.
+        Only the optional "max_memory" entry of `from_pretrained_kwargs` is used.
+        Returns:
+            A `(model, tokenizer)` tuple.
+        """
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            device_map="sequential",
+            torch_dtype=torch.bfloat16,
+            # `load_model` only sets "max_memory" for multi-GPU cuda runs, so
+            # fall back to None instead of raising KeyError.
+            max_memory=from_pretrained_kwargs.get("max_memory"),
+            attn_implementation="flash_attention_2",
+        )
+        # Fixed keyword typo: was `utrust_remote_code`.
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True
+        )
+        return model, tokenizer
+class VicunaAdapter(BaseModelAdapter):
+    """Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "vicuna" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the tokenizer and model, warning when old v0 weights are detected.
+        Flash attention is tried first; on failure the load is retried without
+        it, mirroring the fallback in `BaseModelAdapter.load_model`.
+        Returns:
+            A `(model, tokenizer)` tuple.
+        """
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                low_cpu_mem_usage=True,
+                use_flash_attention_2=True,
+                **from_pretrained_kwargs,
+            )
+        except Exception as err:
+            warnings.warn(
+                f"Loading {model_path} with flash attention failed ({err!r}); "
+                "retrying without flash attention."
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                low_cpu_mem_usage=True,
+                use_flash_attention_2=False,
+                **from_pretrained_kwargs,
+            )
+        self.raise_warning_for_old_weights(model)
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Use "one_shot" when the last path component contains "v0", else "vicuna_v1.1"."""
+        if "v0" in remove_parent_directory_name(model_path):
+            return get_conv_template("one_shot")
+        return get_conv_template("vicuna_v1.1")
+    def raise_warning_for_old_weights(self, model):
+        """Warn when a Llama model's vocabulary is larger than 32000 entries."""
+        if isinstance(model, LlamaForCausalLM) and model.model.vocab_size > 32000:
+            warnings.warn(
+                "\nYou are probably using the old Vicuna-v0 model, "
+                "which will generate unexpected results with the "
+                "current fastchat.\nYou can try one of the following methods:\n"
+                "1. Upgrade your weights to the new Vicuna-v1.3: https://github.com/lm-sys/FastChat#vicuna-weights.\n"
+                "2. Use the old conversation template by `python3 -m fastchat.serve.cli --model-path /path/to/vicuna-v0 --conv-template one_shot`\n"
+                "3. Downgrade fschat to fschat==0.1.10 (Not recommended).\n"
+            )
+class AiroborosAdapter(BaseModelAdapter):
+    """The model adapter for jondurbin/airoboros-*"""
+    def match(self, model_path: str):
+        """Match "airoboros" or "spicyboros" anywhere in the path, ignoring case."""
+        return re.search(r"airoboros|spicyboros", model_path, re.I) is not None
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Pick the airoboros template version from markers in the path.
+        The path is lower-cased first so selection is case-insensitive, in
+        line with `match`.
+        """
+        lowered = model_path.lower()
+        if "-3." in lowered or "-3p" in lowered:
+            return get_conv_template("airoboros_v3")
+        if "spicyboros" in lowered or re.search(r"-(2\.[2-9]+)", lowered):
+            return get_conv_template("airoboros_v2")
+        return get_conv_template("airoboros_v1")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Use the base loader unless the path contains "mpt", which gets max_seq_len=8192 and a fast tokenizer."""
+        if "mpt" not in model_path.lower():
+            return super().load_model(model_path, from_pretrained_kwargs)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            max_seq_len=8192,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, use_fast=True
+        )
+        return model, tokenizer
+class Zhinao360Adapter(BaseModelAdapter):
+    """Model adapter for paths containing "360zhinao"."""
+    def match(self, model_path: str):
+        return "360zhinao" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the tokenizer, the model and the model's generation config.
+        The generation config is attached to the model as
+        `model.generation_config` so the method returns the same
+        `(model, tokenizer)` pair as every other adapter; callers such as
+        `load_model` unpack exactly two values.
+        NOTE(review): `from_pretrained_kwargs` is not forwarded, as in the
+        original code - confirm whether dtype/revision should apply here.
+        """
+        from transformers import GenerationConfig
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+        )
+        model.generation_config = GenerationConfig.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("360zhinao")
+class LongChatAdapter(BaseModelAdapter):
+    """Adapter for LongChat checkpoints (e.g., lmsys/longchat-7b-16k)."""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        lowered = model_path.lower()
+        return "longchat" in lowered
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Patch llama with the config's rope scaling factor, then load tokenizer and model."""
+        revision = from_pretrained_kwargs.get("revision", "main")
+        # Apply monkey patch, TODO(Dacheng): Add flash attention support
+        model_config = AutoConfig.from_pretrained(model_path, revision=revision)
+        scaling_factor = model_config.rope_scaling["factor"]
+        replace_llama_with_condense(scaling_factor)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        template = get_conv_template("vicuna_v1.1")
+        return template
+class GoogleT5Adapter(BaseModelAdapter):
+    """Adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2"""
+    def match(self, model_path: str):
+        """Match paths containing "flan-", "fastchat-t5" or "codet5p" (case-insensitive)."""
+        lowered = model_path.lower()
+        for marker in ("flan-", "fastchat-t5", "codet5p"):
+            if marker in lowered:
+                return True
+        return False
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load a T5 tokenizer and a seq2seq model; returns `(model, tokenizer)`."""
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision)
+        seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        return seq2seq_model, tokenizer
+class KoalaAdapter(BaseModelAdapter):
+    """Adapter for Koala; always uses the "koala_v1" template."""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        lowered = model_path.lower()
+        return "koala" in lowered
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        template = get_conv_template("koala_v1")
+        return template
+class AlpacaAdapter(BaseModelAdapter):
+    """Adapter for Alpaca; always uses the "alpaca" template."""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        lowered = model_path.lower()
+        return "alpaca" in lowered
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        template = get_conv_template("alpaca")
+        return template
+class ChatGLMAdapter(BaseModelAdapter):
+    """Adapter for THUDM/chatglm-6b, THUDM/chatglm2-6b"""
+    def match(self, model_path: str):
+        lowered = model_path.lower()
+        return "chatglm" in lowered
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load tokenizer and model; chatglm3 paths also set `encode_special_tokens=True`."""
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer_options = {"trust_remote_code": True, "revision": revision}
+        if "chatglm3" in model_path.lower():
+            tokenizer_options["encode_special_tokens"] = True
+        tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_options)
+        model = AutoModel.from_pretrained(
+            model_path, trust_remote_code=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Return the chatglm2 / chatglm3 template when named in the path, else "chatglm"."""
+        lowered = model_path.lower()
+        for template_name in ("chatglm2", "chatglm3"):
+            if template_name in lowered:
+                return get_conv_template(template_name)
+        return get_conv_template("chatglm")
+class CodeGeexAdapter(BaseModelAdapter):
+    """Adapter for THUDM/codegeex-6b, THUDM/codegeex2-6b"""
+    def match(self, model_path: str):
+        lowered = model_path.lower()
+        return "codegeex" in lowered
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load tokenizer and model with remote code enabled; returns `(model, tokenizer)`."""
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            revision=from_pretrained_kwargs.get("revision", "main"),
+        )
+        model = AutoModel.from_pretrained(
+            model_path, trust_remote_code=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        template = get_conv_template("codegeex")
+        return template
+class DollyV2Adapter(BaseModelAdapter):
+    """Adapter for databricks/dolly-v2-12b"""
+    def match(self, model_path: str):
+        lowered = model_path.lower()
+        return "dolly-v2" in lowered
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load tokenizer and model, then pin the EOS/PAD token ids on the model config."""
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        # 50277 means "### End"
+        tokenizer.eos_token_id = 50277
+        model_config = model.config
+        model_config.eos_token_id = tokenizer.eos_token_id
+        model_config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        template = get_conv_template("dolly_v2")
+        return template
+class OasstPythiaAdapter(BaseModelAdapter):
+    """Adapter for OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"""
+    def match(self, model_path: str):
+        """Match paths containing both "oasst" and "pythia" (case-insensitive)."""
+        lowered = model_path.lower()
+        if "oasst" not in lowered:
+            return False
+        return "pythia" in lowered
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        template = get_conv_template("oasst_pythia")
+        return template
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load via the base adapter, then copy the tokenizer's EOS/PAD ids into the model config."""
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model_config = model.config
+        model_config.eos_token_id = tokenizer.eos_token_id
+        model_config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+class OasstLLaMAAdapter(BaseModelAdapter):
+    """Adapter for OpenAssistant/oasst-sft-7-llama-30b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        """Match the openassistant llama-30b name, or any "oasst" path without "pythia"."""
+        lowered = model_path.lower()
+        return "openassistant-sft-7-llama-30b-hf" in lowered or (
+            "oasst" in lowered and "pythia" not in lowered
+        )
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        template = get_conv_template("oasst_llama")
+        return template
+class OpenChat35Adapter(BaseModelAdapter):
+    """The model adapter for OpenChat 3.5 (e.g. openchat/openchat_3.5)"""
+    def match(self, model_path: str):
+        return "openchat" in model_path.lower() and "3.5" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("openchat_3.5")
+class PythiaAdapter(BaseModelAdapter):
+    """The model adapter for any EleutherAI/pythia model"""
+    def match(self, model_path: str):
+        return "pythia" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+class StableLMAdapter(BaseModelAdapter):
+    """The model adapter for StabilityAI/stablelm-tuned-alpha-7b"""
+    def match(self, model_path: str):
+        return "stablelm" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("stablelm")
+class MPTAdapter(BaseModelAdapter):
+    """The model adapter for MPT series (mosaicml/mpt-7b-chat, mosaicml/mpt-30b-chat)"""
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        return "mpt" in model_path and not "airoboros" in model_path
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            max_seq_len=8192,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        if "mpt-7b-chat" in model_path:
+            return get_conv_template("mpt-7b-chat")
+        elif "mpt-30b-chat" in model_path:
+            return get_conv_template("mpt-30b-chat")
+        elif "mpt-30b-instruct" in model_path:
+            return get_conv_template("mpt-30b-instruct")
+        else:
+            print(
+                "Warning: Loading base MPT model with `zero_shot` conversation configuration.  "
+                "If this is not desired, inspect model configurations and names."
+            )
+            return get_conv_template("zero_shot")
+class BaizeAdapter(BaseModelAdapter):
+    """The model adapter for project-baize/baize-v2-7b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "baize" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("baize")
+class RwkvAdapter(BaseModelAdapter):
+    """The model adapter for BlinkDL/RWKV-4-Raven"""
+    def match(self, model_path: str):
+        return "rwkv-4" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        from fastchat.model.rwkv_model import RwkvModel
+        model = RwkvModel(model_path)
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            "EleutherAI/pythia-160m", revision=revision
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("rwkv")
+class OpenBuddyAdapter(BaseModelAdapter):
+    """The model adapter for OpenBuddy/openbuddy-7b-v1.1-bf16-enc"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "openbuddy" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("openbuddy")
+class PhoenixAdapter(BaseModelAdapter):
+    """The model adapter for FreedomIntelligence/phoenix-inst-chat-7b"""
+    def match(self, model_path: str):
+        return "phoenix" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("phoenix")
+class ReaLMAdapter(BaseModelAdapter):
+    """The model adapter for FreedomIntelligence/ReaLM-7b"""
+    def match(self, model_path: str):
+        return "ReaLM" in model_path
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("ReaLM-7b-v1")
+class ChatGPTAdapter(BaseModelAdapter):
+    """The model adapter for ChatGPT"""
+    def match(self, model_path: str):
+        return model_path in (
+            "gpt-3.5-turbo",
+            "gpt-3.5-turbo-1106",
+            "gpt-4",
+            "gpt-4-turbo",
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("chatgpt")
+class AzureOpenAIAdapter(BaseModelAdapter):
+    """The model adapter for Azure OpenAI"""
+    def match(self, model_path: str):
+        return model_path in ("azure-gpt-35-turbo", "azure-gpt-4")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("chatgpt")
+class ClaudeAdapter(BaseModelAdapter):
+    """The model adapter for Claude"""
+    def match(self, model_path: str):
+        return model_path in ANTHROPIC_MODEL_LIST
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("claude")
+class BardAdapter(BaseModelAdapter):
+    """The model adapter for Bard"""
+    def match(self, model_path: str):
+        return model_path == "bard"
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("bard")
+class PaLM2Adapter(BaseModelAdapter):
+    """The model adapter for PaLM2"""
+    def match(self, model_path: str):
+        return model_path == "palm-2"
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("bard")
+class BiLLaAdapter(BaseModelAdapter):
+    """The model adapter for Neutralzz/BiLLa-7B-SFT"""
+    def match(self, model_path: str):
+        return "billa" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("billa")
+class RedPajamaINCITEAdapter(BaseModelAdapter):
+    """The model adapter for togethercomputer/RedPajama-INCITE-7B-Chat"""
+    def match(self, model_path: str):
+        return "redpajama-incite" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("redpajama-incite")
+class H2OGPTAdapter(BaseModelAdapter):
+    """The model adapter for h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "h2ogpt" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("h2ogpt")
+class RobinAdapter(BaseModelAdapter):
+    """The model adapter for LMFlow/Full-Robin-7b-v2"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "robin" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("Robin")
+class SnoozyAdapter(BaseModelAdapter):
+    """The model adapter for nomic-ai/gpt4all-13b-snoozy"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        return "gpt4all" in model_path and "snoozy" in model_path
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("snoozy")
+class WizardLMAdapter(BaseModelAdapter):
+    """The model adapter for WizardLM/WizardLM-13B-V1.0"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "wizardlm" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        if "13b" in model_path or "30b" in model_path or "70b" in model_path:
+            return get_conv_template("vicuna_v1.1")
+        else:
+            # TODO: use the recommended template for 7B
+            # (https://huggingface.co/WizardLM/WizardLM-13B-V1.0)
+            return get_conv_template("one_shot")
+class ManticoreAdapter(BaseModelAdapter):
+    """The model adapter for openaccess-ai-collective/manticore-13b-chat-pyg"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "manticore" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("manticore")
+class GuanacoAdapter(BaseModelAdapter):
+    """The model adapter for timdettmers/guanaco-33b-merged"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "guanaco" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        # Fix a bug in tokenizer config
+        tokenizer.eos_token_id = model.config.eos_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("zero_shot")
+class ChangGPTAdapter(BaseModelAdapter):
+    """The model adapter for lcw99/polyglot-ko-12.8b-chang-instruct-chat"""
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        return "polyglot" in model_path and "chang" in model_path
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("polyglot_changgpt")
+class CamelAdapter(BaseModelAdapter):
+    """The model adapter for camel-ai/CAMEL-13B-Combined-Data"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "camel" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("vicuna_v1.1")
+class TuluAdapter(BaseModelAdapter):
+    """The model adapter for allenai/tulu-30b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "tulu" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("tulu")
+class FalconAdapter(BaseModelAdapter):
+    """The model adapter for tiiuae/falcon-40b"""
+    def match(self, model_path: str):
+        return "falcon" in model_path.lower() and "chat" not in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        # Strongly suggest using bf16, which is recommended by the author of Falcon
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        # In Falcon tokenizer config and special config there is not any pad token
+        # Setting `pad_token_id` to 9, which corresponds to special token '>>SUFFIX<<'
+        tokenizer.pad_token_id = 9
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("falcon")
+class FalconChatAdapter(BaseModelAdapter):
+    def match(self, model_path: str):
+        return "falcon" in model_path.lower() and "chat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("falcon-chat")
+class TigerBotAdapter(BaseModelAdapter):
+    """The model adapter for TigerResearch/tigerbot-7b-sft"""
+    def match(self, model_path: str):
+        return "tigerbot" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("tigerbot")
+class BaichuanAdapter(BaseModelAdapter):
+    """The model adapter for Baichuan models (e.g., baichuan-inc/Baichuan-7B)"""
+    def match(self, model_path: str):
+        return "baichuan" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        # for Baichuan-13B-Chat
+        if "chat" in model_path.lower():
+            if "baichuan2" in model_path.lower():
+                return get_conv_template("baichuan2-chat")
+            return get_conv_template("baichuan-chat")
+        return get_conv_template("zero_shot")
+class XGenAdapter(BaseModelAdapter):
+    """The model adapter for Salesforce/xgen-7b"""
+    def match(self, model_path: str):
+        return "xgen" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model.config.eos_token_id = 50256
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("xgen")
+class NousHermesAdapter(BaseModelAdapter):
+    """The model adapter for NousResearch/Nous-Hermes-13b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "nous-hermes" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("alpaca")
+class InternLMChatAdapter(BaseModelAdapter):
+    """The model adapter for internlm/internlm-chat-7b"""
+    def match(self, model_path: str):
+        return "internlm-chat" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        model = model.eval()
+        if "8k" in model_path.lower():
+            model.config.max_sequence_length = 8192
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("internlm-chat")
+class StarChatAdapter(BaseModelAdapter):
+    """The model adapter for HuggingFaceH4/starchat-beta"""
+    def match(self, model_path: str):
+        return "starchat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("starchat")
+class MistralAdapter(BaseModelAdapter):
+    """The model adapter for Mistral AI models"""
+    def match(self, model_path: str):
+        return "mistral" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("mistral")
+class Llama2Adapter(BaseModelAdapter):
+    """The model adapter for Llama-2 (e.g., meta-llama/Llama-2-7b-hf)"""
+    def match(self, model_path: str):
+        return "llama-2" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("llama-2")
+class CuteGPTAdapter(BaseModelAdapter):
+    """The model adapter for CuteGPT"""
+    def match(self, model_path: str):
+        return "cutegpt" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        tokenizer = LlamaTokenizer.from_pretrained(model_path)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<end>")
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.eos_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("cutegpt")
+class OpenOrcaAdapter(BaseModelAdapter):
+    """Model adapter for Open-Orca models which may use different prompt templates
+    - (e.g. Open-Orca/OpenOrcaxOpenChat-Preview2-13B, Open-Orca/Mistral-7B-OpenOrca)
+    - `OpenOrcaxOpenChat-Preview2-13B` uses their "OpenChat Llama2 V1" prompt template.
+        - [Open-Orca/OpenOrcaxOpenChat-Preview2-13B #Prompt Template](https://huggingface.co/Open-Orca/OpenOrcaxOpenChat-Preview2-13B#prompt-template)
+    - `Mistral-7B-OpenOrca` uses the [OpenAI's Chat Markup Language (ChatML)](https://github.com/openai/openai-python/blob/main/chatml.md)
+        format, with <|im_start|> and <|im_end|> tokens added to support this.
+        - [Open-Orca/Mistral-7B-OpenOrca #Prompt Template](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template)
+    """
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return (
+            "mistral-7b-openorca" in model_path.lower()
+            or "openorca" in model_path.lower()
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "mistral-7b-openorca" in model_path.lower():
+            return get_conv_template("mistral-7b-openorca")
+        return get_conv_template("open-orca")
+class WizardCoderAdapter(BaseModelAdapter):
+    """The model adapter for WizardCoder (e.g., WizardLM/WizardCoder-Python-34B-V1.0)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "wizardcoder" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        # Same as Alpaca, see :
+        # https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/inference_wizardcoder.py#L60
+        return get_conv_template("alpaca")
+class QwenChatAdapter(BaseModelAdapter):
+    """The model adapter for Qwen/Qwen-7B-Chat
+    To run this model, you need to ensure additional flash attention installation:
+    ``` bash
+    git clone https://github.com/Dao-AILab/flash-attention
+    cd flash-attention && pip install .
+    pip install csrc/layer_norm
+    pip install csrc/rotary
+    ```
+    Since from 2.0, the following change happened
+    - `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
+    - `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
+    - `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`
+    You may need to revise the code in: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py#L69
+    to from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
+    """
+    def match(self, model_path: str):
+        return "qwen" in model_path.lower()
+    def float_set(self, config, option):
+        config.bf16 = False
+        config.fp16 = False
+        config.fp32 = False
+        if option == "bf16":
+            config.bf16 = True
+        elif option == "fp16":
+            config.fp16 = True
+        elif option == "fp32":
+            config.fp32 = True
+        else:
+            print("Invalid option. Please choose one from 'bf16', 'fp16' and 'fp32'.")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        from transformers.generation import GenerationConfig
+        revision = from_pretrained_kwargs.get("revision", "main")
+        config = AutoConfig.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+        )
+        # NOTE: if you use the old version of model file, please remove the comments below
+        # config.use_flash_attn = False
+        self.float_set(config, "fp16")
+        generation_config = GenerationConfig.from_pretrained(
+            model_path, trust_remote_code=True
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            config=config,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk:
+            model.config.max_sequence_length = 16384
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        tokenizer.eos_token_id = config.eos_token_id
+        tokenizer.bos_token_id = config.bos_token_id
+        tokenizer.pad_token_id = generation_config.pad_token_id
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.bos_token_id = tokenizer.bos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("qwen-7b-chat")
+class BGEAdapter(BaseModelAdapter):
+    """The model adapter for BGE (e.g., BAAI/bge-large-en-v1.5)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "bge" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModel.from_pretrained(
+            model_path,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        if hasattr(model.config, "max_position_embeddings") and hasattr(
+            tokenizer, "model_max_length"
+        ):
+            model.config.max_sequence_length = min(
+                model.config.max_position_embeddings, tokenizer.model_max_length
+            )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("one_shot")
+class E5Adapter(BaseModelAdapter):
+    """The model adapter for E5 (e.g., intfloat/e5-large-v2)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "e5-" in model_path.lower() and 'megrez' not in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModel.from_pretrained(
+            model_path,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        if hasattr(model.config, "max_position_embeddings") and hasattr(
+            tokenizer, "model_max_length"
+        ):
+            model.config.max_sequence_length = min(
+                model.config.max_position_embeddings, tokenizer.model_max_length
+            )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("one_shot")
+class AquilaChatAdapter(BaseModelAdapter):
+    """The model adapter for BAAI/Aquila
+    Now supports:
+    - BAAI/AquilaChat-7B
+    - BAAI/AquilaChat2-7B
+    - BAAI/AquilaChat2-34B
+    """
+    def match(self, model_path: str):
+        return "aquila" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        model = model.eval()
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        # See: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L347
+        if "aquilachat2" in model_path:
+            if "16k" in model_path:
+                return get_conv_template("aquila")
+            elif "34b" in model_path:
+                return get_conv_template("aquila-legacy")
+            else:
+                return get_conv_template("aquila-v1")
+        else:
+            return get_conv_template("aquila-chat")
+class Lamma2ChineseAdapter(BaseModelAdapter):
+    """The model adapter for FlagAlpha/LLama2-Chinese sft"""
+    def match(self, model_path: str):
+        return "llama2-chinese" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("llama2-chinese")
+class VigogneAdapter(BaseModelAdapter):
+    """The model adapter for vigogne (e.g., bofenghuang/vigogne-2-7b-chat)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return bool(re.search(r"vigogne|vigostral", model_path, re.I))
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            use_fast=self.use_fast_tokenizer,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "chat" in model_path.lower():
+            if "vigostral" in model_path.lower():
+                return get_conv_template("vigogne_chat_v3")
+            return get_conv_template("vigogne_chat_v2")
+        return get_conv_template("vigogne_instruct")
+class OpenLLaMaOpenInstructAdapter(BaseModelAdapter):
+    """The model adapter for OpenLLaMa-Open-Instruct (e.g., VMware/open-llama-7b-open-instruct)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return (
+            "open-llama" in model_path.lower() and "open-instruct" in model_path.lower()
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            use_fast=self.use_fast_tokenizer,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("alpaca")
+class CodeLlamaAdapter(BaseModelAdapter):
+    """The model adapter for CodeLlama (e.g., codellama/CodeLlama-34b-hf)"""
+    def match(self, model_path: str):
+        return "codellama" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("llama-2")
+class PhindCodeLlamaAdapter(CodeLlamaAdapter):
+    """The model adapter for Phind-CodeLlama (e.g., Phind/Phind-CodeLlama-34B-v2)"""
+    def match(self, model_path: str):
+        return "phind-codellama-" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("phind")
+class Llama2ChangAdapter(Llama2Adapter):
+    """The model adapter for Llama2-ko-chang (e.g., lcw99/llama2-ko-chang-instruct-chat)"""
+    def match(self, model_path: str):
+        return "llama2-ko-chang" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("polyglot_changgpt")
+class ZephyrAdapter(BaseModelAdapter):
+    """The model adapter for Zephyr (e.g. HuggingFaceH4/zephyr-7b-alpha)"""
+    def match(self, model_path: str):
+        return "zephyr" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("zephyr")
+class XwinLMAdapter(BaseModelAdapter):
+    """The model adapter for Xwin-LM V0.1 and V0.2 series of models(e.g., Xwin-LM/Xwin-LM-70B-V0.1)"""
+    # use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "xwin-lm" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("vicuna_v1.1")
+class LemurAdapter(BaseModelAdapter):
+    """The model adapter for OpenLemur/lemur-70b-chat-v1"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "lemur-70b-chat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("lemur-70b-chat")
+class PygmalionAdapter(BaseModelAdapter):
+    """The model adapter for Pygmalion/Metharme series of models(e.g., PygmalionAI/mythalion-13b)"""
+    # use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return bool(
+            re.search(r"pygmalion|mythalion|metharme", model_path.lower(), re.I)
+        )
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("metharme")
+# Note: the registration order matters.
+# The one registered earlier has a higher matching priority.
+register_model_adapter(PeftModelAdapter)
+register_model_adapter(DeepseekChatAdapter)
+register_model_adapter(VicunaAdapter)
+register_model_adapter(AiroborosAdapter)
+register_model_adapter(LongChatAdapter)
+register_model_adapter(GoogleT5Adapter)
+register_model_adapter(KoalaAdapter)
+register_model_adapter(AlpacaAdapter)
+register_model_adapter(ChatGLMAdapter)
+register_model_adapter(CodeGeexAdapter)
+register_model_adapter(DollyV2Adapter)
+register_model_adapter(OasstPythiaAdapter)
+register_model_adapter(OasstLLaMAAdapter)
+register_model_adapter(OpenChat35Adapter)
+register_model_adapter(StableLMAdapter)
+register_model_adapter(BaizeAdapter)
+register_model_adapter(RwkvAdapter)
+register_model_adapter(OpenBuddyAdapter)
+register_model_adapter(PhoenixAdapter)
+register_model_adapter(BardAdapter)
+register_model_adapter(PaLM2Adapter)
+register_model_adapter(ChatGPTAdapter)
+register_model_adapter(AzureOpenAIAdapter)
+register_model_adapter(ClaudeAdapter)
+register_model_adapter(MPTAdapter)
+register_model_adapter(BiLLaAdapter)
+register_model_adapter(RedPajamaINCITEAdapter)
+register_model_adapter(H2OGPTAdapter)
+register_model_adapter(RobinAdapter)
+register_model_adapter(SnoozyAdapter)
+register_model_adapter(WizardLMAdapter)
+register_model_adapter(ManticoreAdapter)
+register_model_adapter(GuanacoAdapter)
+register_model_adapter(CamelAdapter)
+register_model_adapter(ChangGPTAdapter)
+register_model_adapter(TuluAdapter)
+register_model_adapter(FalconChatAdapter)
+register_model_adapter(FalconAdapter)
+register_model_adapter(TigerBotAdapter)
+register_model_adapter(BaichuanAdapter)
+register_model_adapter(XGenAdapter)
+register_model_adapter(NousHermesAdapter)
+register_model_adapter(PythiaAdapter)
+register_model_adapter(InternLMChatAdapter)
+register_model_adapter(StarChatAdapter)
+register_model_adapter(Llama2Adapter)
+register_model_adapter(CuteGPTAdapter)
+register_model_adapter(OpenOrcaAdapter)
+register_model_adapter(MistralAdapter)
+register_model_adapter(WizardCoderAdapter)
+register_model_adapter(QwenChatAdapter)
+register_model_adapter(AquilaChatAdapter)
+register_model_adapter(BGEAdapter)
+register_model_adapter(E5Adapter)
+register_model_adapter(Lamma2ChineseAdapter)
+register_model_adapter(VigogneAdapter)
+register_model_adapter(OpenLLaMaOpenInstructAdapter)
+register_model_adapter(ReaLMAdapter)
+register_model_adapter(PhindCodeLlamaAdapter)
+register_model_adapter(CodeLlamaAdapter)
+register_model_adapter(Llama2ChangAdapter)
+register_model_adapter(ZephyrAdapter)
+register_model_adapter(XwinLMAdapter)
+register_model_adapter(LemurAdapter)
+register_model_adapter(PygmalionAdapter)
+register_model_adapter(Zhinao360Adapter)
+# After all adapters, try the default base adapter.
+register_model_adapter(BaseModelAdapter)

fastchat/model/model_chatglm.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""
+Inference code for ChatGLM.
+Adapted from https://huggingface.co/THUDM/chatglm-6b/blob/main/modeling_chatglm.py.
+"""
+import re
+import torch
+from transformers.generation.logits_process import LogitsProcessor
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        if torch.isnan(scores).any() or torch.isinf(scores).any():
+            scores.zero_()
+            scores[..., 5] = 5e4
+        return scores
+invalid_score_processor = InvalidScoreLogitsProcessor()
+def process_response(response):
+    response = response.strip()
+    response = response.replace("[[训练时间]]", "2023年")
+    punkts = [
+        [",", "，"],
+        ["!", "！"],
+        [":", "："],
+        [";", "；"],
+        ["\?", "？"],
+    ]
+    for item in punkts:
+        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
+        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+    return response
+@torch.inference_mode()
+def generate_stream_chatglm(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=2048,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    prompt = params["prompt"]
+    temperature = float(params.get("temperature", 1.0))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 1.0))
+    max_new_tokens = int(params.get("max_new_tokens", 256))
+    echo = params.get("echo", True)
+    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+    input_echo_len = len(inputs["input_ids"][0])
+    gen_kwargs = {
+        "max_length": max_new_tokens + input_echo_len,
+        "do_sample": True if temperature > 1e-5 else False,
+        "top_p": top_p,
+        "repetition_penalty": repetition_penalty,
+        "logits_processor": [invalid_score_processor],
+    }
+    if temperature > 1e-5:
+        gen_kwargs["temperature"] = temperature
+    total_len = 0
+    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+        total_ids = total_ids.tolist()[0]
+        total_len = len(total_ids)
+        if echo:
+            output_ids = total_ids
+        else:
+            output_ids = total_ids[input_echo_len:]
+        response = tokenizer.decode(output_ids)
+        response = process_response(response)
+        yield {
+            "text": response,
+            "usage": {
+                "prompt_tokens": input_echo_len,
+                "completion_tokens": total_len - input_echo_len,
+                "total_tokens": total_len,
+            },
+            "finish_reason": None,
+        }
+    # TODO: ChatGLM stop when it reach max length
+    # Only last stream result contains finish_reason, we set finish_reason as stop
+    ret = {
+        "text": response,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": total_len - input_echo_len,
+            "total_tokens": total_len,
+        },
+        "finish_reason": "stop",
+    }
+    yield ret

fastchat/model/model_codet5p.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import gc
+from threading import Thread
+import torch
+import transformers
+from transformers import (
+    GenerationConfig,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+)
+@torch.inference_mode()
+def generate_stream_codet5p(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=2048,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    prompt = params["prompt"]
+    temperature = float(params.get("temperature", 1.0))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 1.0))
+    top_k = int(params.get("top_k", 50))  # -1 means disable
+    max_new_tokens = int(params.get("max_new_tokens", 1024))
+    stop_token_ids = params.get("stop_token_ids", None) or []
+    stop_token_ids.append(tokenizer.eos_token_id)
+    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    streamer = TextIteratorStreamer(tokenizer, **decode_config)
+    encoding = tokenizer(prompt, return_tensors="pt").to(device)
+    input_ids = encoding.input_ids
+    encoding["decoder_input_ids"] = encoding["input_ids"].clone()
+    input_echo_len = len(input_ids)
+    generation_config = GenerationConfig(
+        max_new_tokens=max_new_tokens,
+        do_sample=temperature >= 1e-5,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        no_repeat_ngram_size=10,
+        top_p=top_p,
+        top_k=top_k,
+        eos_token_id=stop_token_ids,
+    )
+    class CodeBlockStopper(StoppingCriteria):
+        def __call__(
+            self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+        ) -> bool:
+            # Code-completion is open-end generation.
+            # We check \n\n to stop at end of a code block.
+            if list(input_ids[0][-2:]) == [628, 198]:
+                return True
+            return False
+    gen_kwargs = dict(
+        **encoding,
+        streamer=streamer,
+        generation_config=generation_config,
+        stopping_criteria=StoppingCriteriaList([CodeBlockStopper()]),
+    )
+    thread = Thread(target=model.generate, kwargs=gen_kwargs)
+    thread.start()
+    i = 0
+    output = ""
+    for new_text in streamer:
+        i += 1
+        output += new_text
+        if i % stream_interval == 0 or i == max_new_tokens - 1:
+            yield {
+                "text": output,
+                "usage": {
+                    "prompt_tokens": input_echo_len,
+                    "completion_tokens": i,
+                    "total_tokens": input_echo_len + i,
+                },
+                "finish_reason": None,
+            }
+        if i >= max_new_tokens:
+            break
+    if i >= max_new_tokens:
+        finish_reason = "length"
+    else:
+        finish_reason = "stop"
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": i,
+            "total_tokens": input_echo_len + i,
+        },
+        "finish_reason": finish_reason,
+    }
+    thread.join()
+    # clean
+    gc.collect()
+    torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()

fastchat/model/model_exllama.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gc
+import sys
+from typing import Dict
+import torch
+def generate_stream_exllama(
+    model,
+    tokenizer,
+    params: Dict,
+    device: str,
+    context_len: int,
+    stream_interval: int = 2,
+    judge_sent_end: bool = False,
+):
+    try:
+        from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler
+    except ImportError as e:
+        print(f"Error: Failed to load Exllamav2. {e}")
+        sys.exit(-1)
+    prompt = params["prompt"]
+    generator = ExLlamaV2StreamingGenerator(model.model, model.cache, tokenizer)
+    settings = ExLlamaV2Sampler.Settings()
+    settings.temperature = float(params.get("temperature", 0.85))
+    settings.top_k = int(params.get("top_k", 50))
+    settings.top_p = float(params.get("top_p", 0.8))
+    settings.token_repetition_penalty = float(params.get("repetition_penalty", 1.15))
+    settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])
+    max_new_tokens = int(params.get("max_new_tokens", 256))
+    generator.set_stop_conditions(params.get("stop_token_ids", None) or [])
+    echo = bool(params.get("echo", True))
+    input_ids = generator.tokenizer.encode(prompt)
+    prompt_tokens = input_ids.shape[-1]
+    generator.begin_stream(input_ids, settings)
+    generated_tokens = 0
+    if echo:
+        output = prompt
+    else:
+        output = ""
+    while True:
+        chunk, eos, _ = generator.stream()
+        output += chunk
+        generated_tokens += 1
+        if generated_tokens == max_new_tokens:
+            finish_reason = "length"
+            break
+        elif eos:
+            finish_reason = "length"
+            break
+        yield {
+            "text": output,
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": generated_tokens,
+                "total_tokens": prompt_tokens + generated_tokens,
+            },
+            "finish_reason": None,
+        }
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": generated_tokens,
+            "total_tokens": prompt_tokens + generated_tokens,
+        },
+        "finish_reason": finish_reason,
+    }
+    gc.collect()

fastchat/model/model_falcon.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import gc
+from threading import Thread
+from typing import Iterable
+import torch
+import transformers
+from transformers import TextIteratorStreamer, GenerationConfig
+from fastchat.utils import is_partial_stop
+@torch.inference_mode()
+def generate_stream_falcon(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=2048,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    prompt = params["prompt"]
+    len_prompt = len(prompt)
+    temperature = float(params.get("temperature", 1.0))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 1.0))
+    top_k = int(params.get("top_k", 50))  # -1 means disable
+    max_new_tokens = int(params.get("max_new_tokens", 256))
+    stop_str = params.get("stop", None)
+    echo = bool(params.get("echo", True))
+    stop_token_ids = params.get("stop_token_ids", None) or []
+    stop_token_ids.append(tokenizer.eos_token_id)
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
+    max_src_len = context_len - max_new_tokens - 8
+    input_ids = input_ids[-max_src_len:]  # truncate from the left
+    attention_mask = attention_mask[-max_src_len:]  # truncate from the left
+    input_echo_len = len(input_ids)
+    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)
+    generation_config = GenerationConfig(
+        max_new_tokens=max_new_tokens,
+        do_sample=temperature >= 1e-5,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        no_repeat_ngram_size=10,
+        top_p=top_p,
+        top_k=top_k,
+        eos_token_id=stop_token_ids,
+    )
+    generation_kwargs = dict(
+        inputs=input_ids,
+        attention_mask=attention_mask,
+        streamer=streamer,
+        generation_config=generation_config,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    if echo:
+        # means keep the prompt
+        output = prompt
+    else:
+        output = ""
+    for i, new_text in enumerate(streamer):
+        output += new_text
+        if i % stream_interval == 0:
+            if echo:
+                rfind_start = len_prompt
+            else:
+                rfind_start = 0
+            partially_stopped = False
+            if stop_str:
+                if isinstance(stop_str, str):
+                    pos = output.rfind(stop_str, rfind_start)
+                    if pos != -1:
+                        output = output[:pos]
+                    else:
+                        partially_stopped = is_partial_stop(output, stop_str)
+                elif isinstance(stop_str, Iterable):
+                    for each_stop in stop_str:
+                        pos = output.rfind(each_stop, rfind_start)
+                        if pos != -1:
+                            output = output[:pos]
+                            break
+                        else:
+                            partially_stopped = is_partial_stop(output, each_stop)
+                            if partially_stopped:
+                                break
+                else:
+                    raise ValueError("Invalid stop field type.")
+            # prevent yielding partial stop sequence
+            if not partially_stopped:
+                yield {
+                    "text": output,
+                    "usage": {
+                        "prompt_tokens": input_echo_len,
+                        "completion_tokens": i,
+                        "total_tokens": input_echo_len + i,
+                    },
+                    "finish_reason": None,
+                }
+    output = output.strip()
+    # finish stream event, which contains finish reason
+    if i == max_new_tokens - 1:
+        finish_reason = "length"
+    elif partially_stopped:
+        finish_reason = None
+    else:
+        finish_reason = "stop"
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": i,
+            "total_tokens": input_echo_len + i,
+        },
+        "finish_reason": finish_reason,
+    }
+    # clean
+    gc.collect()
+    torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()

fastchat/model/model_registry.py ADDED Viewed

	@@ -0,0 +1,387 @@

+"""Additional information of the models."""
+from collections import namedtuple
+from typing import List
+ModelInfo = namedtuple("ModelInfo", ["simple_name", "link", "description"])
+model_info = {}
+def register_model_info(
+    full_names: List[str], simple_name: str, link: str, description: str
+):
+    info = ModelInfo(simple_name, link, description)
+    for full_name in full_names:
+        model_info[full_name] = info
+def get_model_info(name: str) -> ModelInfo:
+    if name in model_info:
+        return model_info[name]
+    else:
+        # To fix this, please use `register_model_info` to register your model
+        return ModelInfo(
+            name, "", "Register the description at fastchat/model/model_registry.py"
+        )
+# Built-in registry entries: each call below maps one or more full model names
+# to a shared ModelInfo (simple name, link, description).
+register_model_info(
+    ["gpt-3.5-turbo"],
+    "GPT-3.5",
+    "https://openai.com/blog/chatgpt",
+    "GPT-3.5 by OpenAI",
+)
+register_model_info(
+    ["gpt-3.5-turbo-1106"],
+    "GPT-3.5-Turbo-1106",
+    "https://platform.openai.com/docs/models/gpt-3-5",
+    "GPT-3.5-Turbo-1106 by OpenAI",
+)
+register_model_info(
+    ["gpt-4"], "GPT-4", "https://openai.com/research/gpt-4", "ChatGPT-4 by OpenAI"
+)
+register_model_info(
+    ["gpt-4-turbo"],
+    "GPT-4-Turbo",
+    "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
+    "GPT-4-Turbo by OpenAI",
+)
+register_model_info(
+    ["claude-2"],
+    "Claude",
+    "https://www.anthropic.com/index/claude-2",
+    "Claude 2 by Anthropic",
+)
+register_model_info(
+    ["claude-1"],
+    "Claude",
+    "https://www.anthropic.com/index/introducing-claude",
+    "Claude by Anthropic",
+)
+register_model_info(
+    ["claude-instant-1"],
+    "Claude Instant",
+    "https://www.anthropic.com/index/introducing-claude",
+    "Claude Instant by Anthropic",
+)
+register_model_info(
+    ["palm-2"],
+    "PaLM 2 Chat",
+    "https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023",
+    "PaLM 2 for Chat (chat-bison@001) by Google",
+)
+register_model_info(
+    [
+        "vicuna-33b",
+        "vicuna-33b-v1.3",
+        "vicuna-13b",
+        "vicuna-13b-v1.3",
+        "vicuna-7b",
+        "vicuna-7b-v1.3",
+    ],
+    "Vicuna",
+    "https://lmsys.org/blog/2023-03-30-vicuna/",
+    "a chat assistant fine-tuned on user-shared conversations by LMSYS",
+)
+register_model_info(
+    ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"],
+    "Llama 2",
+    "https://ai.meta.com/llama/",
+    "open foundation and fine-tuned chat models by Meta",
+)
+register_model_info(
+    ["mistral-7b-instruct"],
+    "Mistral",
+    "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1",
+    "a large language model by Mistral AI team",
+)
+register_model_info(
+    ["zephyr-7b-beta", "zephyr-7b-alpha"],
+    "Zephyr",
+    "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha",
+    "a chatbot fine-tuned from Mistral by Hugging Face",
+)
+register_model_info(
+    ["qwen-14b-chat"],
+    "Qwen",
+    "https://huggingface.co/Qwen/Qwen-14B-Chat",
+    "a large language model by Alibaba Cloud",
+)
+register_model_info(
+    ["codellama-34b-instruct", "codellama-13b-instruct", "codellama-7b-instruct"],
+    "Code Llama",
+    "https://ai.meta.com/blog/code-llama-large-language-model-coding/",
+    "open foundation models for code by Meta",
+)
+register_model_info(
+    ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"],
+    "WizardLM",
+    "https://github.com/nlpxucan/WizardLM",
+    "an instruction-following LLM using evol-instruct by Microsoft",
+)
+register_model_info(
+    ["wizardcoder-15b-v1.0"],
+    "WizardLM",
+    "https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder",
+    "Empowering Code Large Language Models with Evol-Instruct",
+)
+register_model_info(
+    ["mpt-7b-chat", "mpt-30b-chat"],
+    "MPT-Chat",
+    "https://www.mosaicml.com/blog/mpt-30b",
+    "a chatbot fine-tuned from MPT by MosaicML",
+)
+register_model_info(
+    ["guanaco-33b", "guanaco-65b"],
+    "Guanaco",
+    "https://github.com/artidoro/qlora",
+    "a model fine-tuned with QLoRA by UW",
+)
+register_model_info(
+    ["gpt4all-13b-snoozy"],
+    "GPT4All-Snoozy",
+    "https://github.com/nomic-ai/gpt4all",
+    "a finetuned LLaMA model on assistant style data by Nomic AI",
+)
+register_model_info(
+    ["koala-13b"],
+    "Koala",
+    "https://bair.berkeley.edu/blog/2023/04/03/koala",
+    "a dialogue model for academic research by BAIR",
+)
+register_model_info(
+    ["RWKV-4-Raven-14B"],
+    "RWKV-4-Raven",
+    "https://huggingface.co/BlinkDL/rwkv-4-raven",
+    "an RNN with transformer-level LLM performance",
+)
+register_model_info(
+    ["chatglm-6b", "chatglm2-6b"],
+    "ChatGLM",
+    "https://chatglm.cn/blog",
+    "an open bilingual dialogue language model by Tsinghua University",
+)
+register_model_info(
+    ["alpaca-13b"],
+    "Alpaca",
+    "https://crfm.stanford.edu/2023/03/13/alpaca.html",
+    "a model fine-tuned from LLaMA on instruction-following demonstrations by Stanford",
+)
+register_model_info(
+    ["oasst-pythia-12b"],
+    "OpenAssistant (oasst)",
+    "https://open-assistant.io",
+    "an Open Assistant for everyone by LAION",
+)
+register_model_info(
+    ["oasst-sft-7-llama-30b"],
+    "OpenAssistant (oasst)",
+    "https://open-assistant.io",
+    "an Open Assistant for everyone by LAION",
+)
+register_model_info(
+    ["openchat-3.5"],
+    "OpenChat 3.5",
+    "https://github.com/imoneoi/openchat",
+    "OpenChat 3.5 is a versatile, open-source language model fine-tuned using C-RLFT",
+)
+register_model_info(
+    ["llama-7b", "llama-13b"],
+    "LLaMA",
+    "https://arxiv.org/abs/2302.13971",
+    "open and efficient foundation language models by Meta",
+)
+register_model_info(
+    ["open-llama-7b-v2-open-instruct", "open-llama-7b-open-instruct"],
+    "Open LLaMa (Open Instruct)",
+    "https://medium.com/vmware-data-ml-blog/starter-llm-for-the-enterprise-instruction-tuning-openllama-7b-d05fc3bbaccc",
+    "Open LLaMa fine-tuned on instruction-following data by VMware",
+)
+register_model_info(
+    ["dolly-v2-12b"],
+    "Dolly",
+    "https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm",
+    "an instruction-tuned open large language model by Databricks",
+)
+register_model_info(
+    ["stablelm-tuned-alpha-7b"],
+    "StableLM",
+    "https://github.com/stability-AI/stableLM",
+    "Stability AI language models",
+)
+register_model_info(
+    ["codet5p-6b"],
+    "CodeT5p-6b",
+    "https://huggingface.co/Salesforce/codet5p-6b",
+    "Code completion model released by Salesforce",
+)
+register_model_info(
+    ["fastchat-t5-3b", "fastchat-t5-3b-v1.0"],
+    "FastChat-T5",
+    "https://huggingface.co/lmsys/fastchat-t5-3b-v1.0",
+    "a chat assistant fine-tuned from FLAN-T5 by LMSYS",
+)
+register_model_info(
+    ["phoenix-inst-chat-7b"],
+    "Phoenix-7B",
+    "https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b",
+    "a multilingual chat assistant fine-tuned from Bloomz to democratize ChatGPT across languages by CUHK(SZ)",
+)
+register_model_info(
+    ["realm-7b-v1"],
+    "ReaLM",
+    "https://github.com/FreedomIntelligence/ReaLM",
+    "A chatbot fine-tuned from LLaMA2 with data generated via iterative calls to UserGPT and ChatGPT by CUHK(SZ) and SRIBD.",
+)
+register_model_info(
+    ["billa-7b-sft"],
+    "BiLLa-7B-SFT",
+    "https://huggingface.co/Neutralzz/BiLLa-7B-SFT",
+    "an instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher",
+)
+register_model_info(
+    ["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"],
+    "h2oGPT-GM-7b",
+    "https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
+    "an instruction-tuned OpenLLaMA with enhanced conversational ability by H2O.ai",
+)
+register_model_info(
+    ["baize-v2-7b", "baize-v2-13b"],
+    "Baize v2",
+    "https://github.com/project-baize/baize-chatbot#v2",
+    "A chatbot fine-tuned from LLaMA with ChatGPT self-chat data and Self-Disillation with Feedback (SDF) by UCSD and SYSU.",
+)
+register_model_info(
+    [
+        "airoboros-l2-7b-2.1",
+        "airoboros-l2-13b-2.1",
+        "airoboros-c34b-2.1",
+        "airoboros-l2-70b-2.1",
+    ],
+    "airoboros",
+    "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1",
+    "an instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4",
+)
+register_model_info(
+    [
+        "spicyboros-7b-2.2",
+        "spicyboros-13b-2.2",
+        "spicyboros-70b-2.2",
+    ],
+    "spicyboros",
+    "https://huggingface.co/jondurbin/spicyboros-70b-2.2",
+    "de-aligned versions of the airoboros models",
+)
+register_model_info(
+    ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"],
+    "Robin-v2",
+    "https://huggingface.co/OptimalScale/robin-7b-v2-delta",
+    "A chatbot fine-tuned from LLaMA-7b, achieving competitive performance on chitchat, commonsense reasoning and instruction-following tasks, by OptimalScale, HKUST.",
+)
+register_model_info(
+    ["manticore-13b-chat"],
+    "Manticore 13B Chat",
+    "https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg",
+    "A chatbot fine-tuned from LlaMa across several CoT and chat datasets.",
+)
+register_model_info(
+    ["redpajama-incite-7b-chat"],
+    "RedPajama-INCITE-7B-Chat",
+    "https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat",
+    "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together",
+)
+register_model_info(
+    [
+        "falcon-7b",
+        "falcon-7b-instruct",
+        "falcon-40b",
+        "falcon-40b-instruct",
+        "falcon-180b",
+        "falcon-180b-chat",
+    ],
+    "Falcon",
+    "https://huggingface.co/tiiuae/falcon-180B",
+    "TII's flagship series of large language models",
+)
+register_model_info(
+    ["tigerbot-7b-sft"],
+    "Tigerbot",
+    "https://huggingface.co/TigerResearch/tigerbot-7b-sft",
+    "TigerBot is a large-scale language model (LLM) with multiple languages and tasks.",
+)
+register_model_info(
+    ["internlm-chat-7b", "internlm-chat-7b-8k"],
+    "InternLM",
+    "https://huggingface.co/internlm/internlm-chat-7b",
+    "InternLM is a multi-language large-scale language model (LLM), developed by SHLAB.",
+)
+register_model_info(
+    ["Qwen-7B-Chat"],
+    "Qwen",
+    "https://huggingface.co/Qwen/Qwen-7B-Chat",
+    "Qwen is a multi-language large-scale language model (LLM), developed by Damo Academy.",
+)
+register_model_info(
+    ["Llama2-Chinese-13b-Chat", "LLama2-Chinese-13B"],
+    "Llama2-Chinese",
+    "https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat",
+    "Llama2-Chinese is a multi-language large-scale language model (LLM), developed by FlagAlpha.",
+)
+register_model_info(
+    ["Vigogne-2-7B-Instruct", "Vigogne-2-13B-Instruct"],
+    "Vigogne-Instruct",
+    "https://huggingface.co/bofenghuang/vigogne-2-7b-instruct",
+    "Vigogne-Instruct is a French large language model (LLM) optimized for instruction-following, developed by Bofeng Huang",
+)
+register_model_info(
+    ["Vigogne-2-7B-Chat", "Vigogne-2-13B-Chat"],
+    "Vigogne-Chat",
+    "https://huggingface.co/bofenghuang/vigogne-2-7b-chat",
+    "Vigogne-Chat is a French large language model (LLM) optimized for instruction-following and multi-turn dialogues, developed by Bofeng Huang",
+)
+register_model_info(
+    ["deluxe-chat-v1", "deluxe-chat-v1.1"],
+    "DeluxeChat",
+    "",
+    "Deluxe Chat",
+)
+register_model_info(
+    [
+        "Xwin-LM-7B-V0.1",
+        "Xwin-LM-13B-V0.1",
+        "Xwin-LM-70B-V0.1",
+        "Xwin-LM-7B-V0.2",
+        "Xwin-LM-13B-V0.2",
+    ],
+    "Xwin-LM",
+    "https://github.com/Xwin-LM/Xwin-LM",
+    "Chat models developed by Xwin-LM team",
+)
+register_model_info(
+    ["lemur-70b-chat"],
+    "Lemur-Chat",
+    "https://huggingface.co/OpenLemur/lemur-70b-chat-v1",
+    "an openly accessible language model optimized for both natural language and coding capabilities ",
+)
+register_model_info(
+    ["Mistral-7B-OpenOrca"],
+    "Open-Orca",
+    "https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca",
+    "A fine-tune of [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) using [OpenOrca dataset](https://huggingface.co/datasets/Open-Orca/OpenOrca)",
+)
+register_model_info(
+    [
+        "AquilaChat-7B",
+        "AquilaChat2-7B",
+        "AquilaChat2-34B",
+    ],
+    "Aquila-Chat",
+    "https://huggingface.co/BAAI/AquilaChat2-34B",
+    "Chat models developed by BAAI team",
+)

fastchat/model/model_xfastertransformer.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import gc
+from threading import Thread
+import torch
+from transformers import TextIteratorStreamer
+@torch.inference_mode()
+def generate_stream_xft(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=8192,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    prompt = params["prompt"]
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    # unused now, and placehold for future.
+    # temperature = float(params.get("temperature", 1.0))
+    # top_p = float(params.get("top_p", 1.0))
+    max_new_tokens = int(params.get("max_new_tokens", 4096))
+    echo = params.get("echo", True)
+    inputs = tokenizer(
+        prompt, return_tensors="pt", padding=model.config.padding
+    ).input_ids
+    input_echo_len = len(inputs[0])
+    max_len = max_new_tokens + input_echo_len
+    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)
+    generation_kwargs = {
+        "input_ids": inputs,
+        "streamer": streamer,
+        "max_length": max_len,
+        "num_beams": model.config.beam_width,
+        "length_penalty": repetition_penalty,
+        "num_return_sequences": model.config.num_return_sequences,
+        "early_stopping": model.config.early_stopping,
+        "eos_token_id": model.config.eos_token_id,
+        "pad_token_id": model.config.pad_token_id,
+    }
+    thread = Thread(target=model.model.generate, kwargs=generation_kwargs)
+    thread.start()
+    if echo:
+        # means keep the prompt
+        output = prompt
+    else:
+        output = ""
+    i = 0
+    for i, new_text in enumerate(streamer):
+        output += new_text
+        yield {
+            "text": output,
+            "usage": {
+                "prompt_tokens": input_echo_len,
+                "completion_tokens": i,
+                "total_tokens": input_echo_len + i,
+            },
+            "finish_reason": None,
+        }
+    output = output.strip()
+    if i == max_new_tokens - 1:
+        finish_reason = "length"
+    else:
+        finish_reason = "stop"
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": i,
+            "total_tokens": input_echo_len + i,
+        },
+        "finish_reason": finish_reason,
+    }
+    gc.collect()