yuantao-infini-ai committed on
Commit
7472549
1 Parent(s): cf1798b

Upload 136 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. fastchat/__init__.py +1 -0
  2. fastchat/__pycache__/__init__.cpython-310.pyc +0 -0
  3. fastchat/__pycache__/__init__.cpython-311.pyc +0 -0
  4. fastchat/__pycache__/constants.cpython-310.pyc +0 -0
  5. fastchat/__pycache__/conversation.cpython-310.pyc +0 -0
  6. fastchat/__pycache__/utils.cpython-310.pyc +0 -0
  7. fastchat/constants.py +65 -0
  8. fastchat/conversation.py +1689 -0
  9. fastchat/data/__init__.py +0 -0
  10. fastchat/data/clean_sharegpt.py +217 -0
  11. fastchat/data/convert_alpaca.py +38 -0
  12. fastchat/data/extract_gpt4_only.py +32 -0
  13. fastchat/data/extract_single_round.py +29 -0
  14. fastchat/data/filter_wrong_format.py +44 -0
  15. fastchat/data/get_stats.py +82 -0
  16. fastchat/data/hardcoded_questions.py +168 -0
  17. fastchat/data/inspect_data.py +33 -0
  18. fastchat/data/merge.py +23 -0
  19. fastchat/data/optional_clean.py +90 -0
  20. fastchat/data/optional_replace.py +82 -0
  21. fastchat/data/prepare_all.py +42 -0
  22. fastchat/data/pretty_json.py +20 -0
  23. fastchat/data/sample.py +40 -0
  24. fastchat/data/split_long_conversation.py +129 -0
  25. fastchat/data/split_train_test.py +34 -0
  26. fastchat/model/__init__.py +5 -0
  27. fastchat/model/__pycache__/__init__.cpython-310.pyc +0 -0
  28. fastchat/model/__pycache__/compression.cpython-310.pyc +0 -0
  29. fastchat/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc +0 -0
  30. fastchat/model/__pycache__/model_adapter.cpython-310.pyc +0 -0
  31. fastchat/model/__pycache__/model_chatglm.cpython-310.pyc +0 -0
  32. fastchat/model/__pycache__/model_codet5p.cpython-310.pyc +0 -0
  33. fastchat/model/__pycache__/model_exllama.cpython-310.pyc +0 -0
  34. fastchat/model/__pycache__/model_falcon.cpython-310.pyc +0 -0
  35. fastchat/model/__pycache__/model_registry.cpython-310.pyc +0 -0
  36. fastchat/model/__pycache__/model_xfastertransformer.cpython-310.pyc +0 -0
  37. fastchat/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc +0 -0
  38. fastchat/model/apply_delta.py +165 -0
  39. fastchat/model/apply_lora.py +48 -0
  40. fastchat/model/compression.py +300 -0
  41. fastchat/model/convert_fp16.py +26 -0
  42. fastchat/model/llama_condense_monkey_patch.py +71 -0
  43. fastchat/model/make_delta.py +48 -0
  44. fastchat/model/model_adapter.py +1970 -0
  45. fastchat/model/model_chatglm.py +102 -0
  46. fastchat/model/model_codet5p.py +108 -0
  47. fastchat/model/model_exllama.py +77 -0
  48. fastchat/model/model_falcon.py +140 -0
  49. fastchat/model/model_registry.py +387 -0
  50. fastchat/model/model_xfastertransformer.py +81 -0
fastchat/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
# Version string exposed by the fastchat package (fastchat/__init__.py).
__version__ = "0.2.32"
fastchat/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (184 Bytes). View file
 
fastchat/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (199 Bytes). View file
 
fastchat/__pycache__/constants.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
fastchat/__pycache__/conversation.cpython-310.pyc ADDED
Binary file (27.5 kB). View file
 
fastchat/__pycache__/utils.cpython-310.pyc ADDED
Binary file (10.1 kB). View file
 
fastchat/constants.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Global constants.
3
+ """
4
+
5
+ from enum import IntEnum
6
+ import os
7
+
8
# Directory that contains the package directory this module lives in.
REPO_PATH = os.path.dirname(os.path.dirname(__file__))

##### For the gradio web server
SERVER_ERROR_MSG = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
MODERATION_MSG = (
    "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
)
CONVERSATION_LIMIT_MSG = (
    "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. "
    "PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
)
INACTIVE_MSG = (
    "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
)
SLOW_MODEL_MSG = (
    "⚠️ Both models will show the responses all at once. "
    "Please stay patient as it may take over 30 seconds."
)
# Maximum input length (characters); overridable through the environment.
INPUT_CHAR_LEN_LIMIT = int(os.environ.get("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000))
# Maximum conversation turns
CONVERSATION_TURN_LIMIT = 50
# Session expiration time
SESSION_EXPIRATION_TIME = 3600
# The output dir of log files; falls back to the current directory.
LOGDIR = os.environ.get("LOGDIR", ".")
# CPU Instruction Set Architecture; None when the variable is unset.
CPU_ISA = os.environ.get("CPU_ISA")


##### For the controller and workers (could be overwritten through ENV variables.)
CONTROLLER_HEART_BEAT_EXPIRATION = int(
    os.environ.get("FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90)
)
WORKER_HEART_BEAT_INTERVAL = int(
    os.environ.get("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 45)
)
WORKER_API_TIMEOUT = int(os.environ.get("FASTCHAT_WORKER_API_TIMEOUT", 100))
WORKER_API_EMBEDDING_BATCH_SIZE = int(
    os.environ.get("FASTCHAT_WORKER_API_EMBEDDING_BATCH_SIZE", 4)
)
39
+
40
+
41
class ErrorCode(IntEnum):
    """
    Integer error codes used by this package.

    https://platform.openai.com/docs/guides/error-codes/api-errors

    NOTE(review): the leading three digits of each value look like the HTTP
    status the error corresponds to (400, 401, 403, 429, 500) -- confirm
    against the code that turns these into responses.
    """

    # 400xx
    VALIDATION_TYPE_ERROR = 40001

    # 401xx
    INVALID_AUTH_KEY = 40101
    INCORRECT_AUTH_KEY = 40102
    NO_PERMISSION = 40103

    # 403xx
    INVALID_MODEL = 40301
    PARAM_OUT_OF_RANGE = 40302
    CONTEXT_OVERFLOW = 40303

    # 429xx
    RATE_LIMIT = 42901
    QUOTA_EXCEEDED = 42902
    ENGINE_OVERLOADED = 42903

    # 500xx
    INTERNAL_ERROR = 50001
    CUDA_OUT_OF_MEMORY = 50002
    GRADIO_REQUEST_ERROR = 50003
    GRADIO_STREAM_UNKNOWN_ERROR = 50004
    CONTROLLER_NO_WORKER = 50005
    CONTROLLER_WORKER_TIMEOUT = 50006
fastchat/conversation.py ADDED
@@ -0,0 +1,1689 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+ """
7
+ import dataclasses
8
+ from enum import auto, IntEnum
9
+ from typing import List, Any, Dict, Union, Tuple
10
+
11
+
12
class SeparatorStyle(IntEnum):
    """Separator styles.

    Each member selects one prompt-assembly branch in
    ``Conversation.get_prompt``.
    """

    # Values come from ``auto()``, i.e. they follow declaration order.
    # Append new members at the end: inserting one in the middle renumbers
    # every member after it.
    ADD_COLON_SINGLE = auto()
    ADD_COLON_TWO = auto()
    ADD_COLON_SPACE_SINGLE = auto()
    NO_COLON_SINGLE = auto()
    NO_COLON_TWO = auto()
    ADD_NEW_LINE_SINGLE = auto()
    LLAMA2 = auto()
    CHATGLM = auto()
    CHATML = auto()
    CHATINTERN = auto()
    DOLLY = auto()
    RWKV = auto()
    PHOENIX = auto()
    ROBIN = auto()
    FALCON_CHAT = auto()
    CHATGLM3 = auto()
    DIY = auto()
    MEGREZ = auto()
    MEGREZ_CRLFT = auto()
    MEGREZ_CRLFT_LLAMA3 = auto()
    MEGREZ_1B = auto()
    MINICPM_V2 = auto()
    ZHINAO360 = auto()
38
+
39
@dataclasses.dataclass
class Conversation:
    """A class that manages prompt templates and keeps all conversation history."""

    # The name of this template
    name: str
    # The template of the system prompt
    system_template: str = "{system_message}"
    # The system message
    system_message: str = ""
    # The names of two roles
    roles: Tuple[str, str] = ("USER", "ASSISTANT")
    # All messages. Each item is (role, message).
    # A fresh list per instance: the previous immutable ``()`` default made
    # append_message()/update_last_message() fail on a directly built object.
    messages: List[List[str]] = dataclasses.field(default_factory=list)
    # The number of few shot examples
    offset: int = 0
    # The separator style and configurations
    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
    sep: str = "\n"
    sep2: Union[str, None] = None
    # Stop criteria (the default one is EOS token)
    stop_str: Union[str, List[str], None] = None
    # Stops generation if meeting any token in this list
    stop_token_ids: Union[List[int], None] = None
    # When true (and a tokenizer is passed), get_prompt() delegates to
    # tokenizer.apply_chat_template() instead of the sep_style branches.
    apply_template: bool = False
    # Not read inside this class; carried along by copy().
    none_stop: bool = False
    skip_special_tokens: bool = True

    def convert_messages_format(self, messages, sysprompt) -> list:
        """Convert stored messages to a list of ``{"role", "content"}`` dicts.

        Args:
            messages: Iterable of ``[role, content]`` / ``(role, content)``
                pairs, or already-converted dicts.
            sysprompt: Text used as the content of the leading system entry.

        Returns:
            A new list that starts with the system entry. Entries whose
            content is empty or None (e.g. the placeholder for the pending
            reply) are left out.
        """
        converted = [{'role': 'system', 'content': sysprompt}]
        for message in messages:
            if isinstance(message, dict):
                # Already in the target shape; keep it unless it is empty.
                if message.get('content'):
                    converted.append(message)
            elif isinstance(message, (list, tuple)):
                role, content = message[0], message[1]
                if content:
                    converted.append({'role': role, 'content': content})
            else:
                converted.append(message)
        return converted

    def get_prompt(self, tokenizer=None) -> str:
        """Get the prompt for generation.

        Args:
            tokenizer: Optional object exposing ``apply_chat_template``. It is
                only used when ``self.apply_template`` is true.

        Returns:
            The prompt string built from the system prompt and all messages.

        Raises:
            ValueError: If ``apply_chat_template`` fails, or if
                ``self.sep_style`` is not a handled style.
        """
        if tokenizer and self.apply_template:
            print('======using apply_chat_template()======')
            try:
                messages = self.convert_messages_format(
                    self.messages, self.system_message
                )
                return tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True,
                )
            except Exception as err:
                # Chain the cause so the real failure stays visible.
                raise ValueError(
                    f"apply_chat_template() is not supported by this tokenizer: {tokenizer}"
                ) from err

        print('======using fastchat conv template======')
        system_prompt = self.system_template.format(system_message=self.system_message)
        style = self.sep_style
        # Two-separator styles alternate: sep after even turns, sep2 after odd.
        seps = [self.sep, self.sep2]

        if style == SeparatorStyle.ADD_COLON_SINGLE:
            # No separator is inserted between the system prompt and turn one.
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
            return ret

        if style in (
            SeparatorStyle.ADD_COLON_TWO,
            SeparatorStyle.MEGREZ,
            SeparatorStyle.MEGREZ_CRLFT,
        ):
            ret = system_prompt + seps[0]
            if style == SeparatorStyle.MEGREZ:
                # MEGREZ differs only by one extra space after the separator.
                ret += ' '
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
            return ret

        if style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ": "  # must be end with a space
            return ret

        if style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
            ret = "" if system_prompt == "" else system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + "\n" + message + self.sep
                else:
                    ret += role + "\n"
            return ret

        if style in (SeparatorStyle.NO_COLON_SINGLE, SeparatorStyle.MEGREZ_1B):
            ret = system_prompt
            if style == SeparatorStyle.MEGREZ_1B:
                # MEGREZ_1B additionally closes the system prompt with sep.
                ret += self.sep
            for role, message in self.messages:
                if message:
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret

        if style in (
            SeparatorStyle.NO_COLON_TWO,
            SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
        ):
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + message + seps[i % 2]
                else:
                    ret += role
            return ret

        if style == SeparatorStyle.RWKV:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    # Normalize CRLF, then collapse doubled newlines once.
                    ret += (
                        role
                        + ": "
                        + message.replace("\r\n", "\n").replace("\n\n", "\n")
                    )
                    ret += "\n\n"
                else:
                    ret += role + ":"
            return ret

        if style == SeparatorStyle.LLAMA2:
            ret = system_prompt if self.system_message else "[INST] "
            for i, (role, message) in enumerate(self.messages):
                # The tag comes from self.roles by position, not from the
                # role stored with the message.
                tag = self.roles[i % 2]
                if message:
                    if i == 0:
                        ret += message + " "
                    else:
                        ret += tag + " " + message + seps[i % 2]
                else:
                    ret += tag
            return ret

        if style == SeparatorStyle.CHATGLM:
            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
            # Round numbering starts at 1 for "chatglm2" and at 0 otherwise.
            round_add_n = 1 if self.name == "chatglm2" else 0
            ret = system_prompt + self.sep if system_prompt else ""
            for i, (role, message) in enumerate(self.messages):
                if i % 2 == 0:
                    ret += f"[Round {i//2 + round_add_n}]{self.sep}"
                if message:
                    ret += f"{role}:{message}{self.sep}"
                else:
                    ret += f"{role}:"
            return ret

        if style == SeparatorStyle.CHATML:
            ret = "" if system_prompt == "" else system_prompt + self.sep + "\n"
            for role, message in self.messages:
                if message:
                    ret += role + "\n" + message + self.sep + "\n"
                else:
                    ret += role + "\n"
            return ret

        if style == SeparatorStyle.CHATGLM3:
            ret = ""
            if self.system_message:
                ret += system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + "\n" + " " + message
                else:
                    ret += role
            return ret

        if style == SeparatorStyle.CHATINTERN:
            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if i % 2 == 0:
                    ret += "<s>"
                if message:
                    ret += role + ":" + message + seps[i % 2] + "\n"
                else:
                    ret += role + ":"
            return ret

        if style == SeparatorStyle.DOLLY:
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ":\n" + message + seps[i % 2]
                    if i % 2 == 1:
                        ret += "\n\n"
                else:
                    ret += role + ":\n"
            return ret

        if style == SeparatorStyle.PHOENIX:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + ": " + "<s>" + message + "</s>"
                else:
                    ret += role + ": " + "<s>"
            return ret

        if style in (SeparatorStyle.ROBIN, SeparatorStyle.DIY):
            ret = system_prompt
            if style == SeparatorStyle.ROBIN:
                # ROBIN separates the system prompt from turn one; DIY does not.
                ret += self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ":\n" + message + self.sep
                else:
                    ret += role + ":\n"
            return ret

        if style == SeparatorStyle.FALCON_CHAT:
            ret = ""
            if self.system_message:
                ret += system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
            return ret

        if style in (SeparatorStyle.MINICPM_V2, SeparatorStyle.ZHINAO360):
            if system_prompt == "":
                ret = ""
            elif style == SeparatorStyle.ZHINAO360:
                ret = f'<|im_start|>system\n{system_prompt}<|im_end|>\n'
            else:
                ret = system_prompt + self.sep
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + "\n" + message + seps[i % 2]
                else:
                    ret += role + "\n"
            return ret

        raise ValueError(f"Invalid style: {self.sep_style}")

    def set_system_message(self, system_message: str):
        """Set the system message."""
        self.system_message = system_message

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

    def update_last_message(self, message: str):
        """Update the last output.

        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.
        """
        self.messages[-1][1] = message

    def to_gradio_chatbot(self):
        """Convert the conversation to gradio chatbot format.

        Returns:
            A list of ``[first, second]`` pairs built from the messages after
            ``self.offset``: even positions open a pair, odd positions fill
            its second slot (None until a reply exists).
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def to_openai_api_messages(self):
        """Convert the conversation to OpenAI chat completion format.

        Returns:
            A list of role/content dicts starting with the system message.
            Messages after ``self.offset`` alternate "user" / "assistant" by
            position; an assistant entry whose text is None is omitted.
        """
        ret = [{"role": "system", "content": self.system_message}]

        for i, (_, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append({"role": "user", "content": msg})
            else:
                if msg is not None:
                    ret.append({"role": "assistant", "content": msg})
        return ret

    def copy(self):
        """Return a new Conversation with the same settings.

        The message pairs are rebuilt as fresh two-item lists, so appending to
        or editing the copy's messages does not touch this instance. Other
        attributes are passed through as-is.
        """
        return Conversation(
            name=self.name,
            system_template=self.system_template,
            system_message=self.system_message,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
            apply_template=self.apply_template,
            none_stop=self.none_stop,
            skip_special_tokens=self.skip_special_tokens,
        )

    def dict(self):
        """Return the template name, system message, roles, messages and offset as a dict."""
        return {
            "template_name": self.name,
            "system_message": self.system_message,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
        }
387
+
388
+
389
# A global registry for all conversation templates
conv_templates: Dict[str, "Conversation"] = {}


def register_conv_template(template: "Conversation", override: bool = False):
    """Register a new conversation template.

    Args:
        template: The template to store; it is keyed by ``template.name``.
        override: When true, silently replace an existing template that has
            the same name.

    Raises:
        AssertionError: If ``override`` is false and the name is already
            registered.
    """
    # Raised explicitly rather than through ``assert`` so the duplicate check
    # is still enforced when Python runs with -O (which strips asserts).
    if not override and template.name in conv_templates:
        raise AssertionError(f"{template.name} has been registered.")

    conv_templates[template.name] = template
401
+
402
+
403
def get_conv_template(name: str) -> Conversation:
    """Look up a registered template by name and return a copy of it.

    The registry entry itself is never handed out, so callers can add
    messages to the result without altering the registered template.
    """
    registered = conv_templates[name]
    return registered.copy()
406
+
407
+
408
# An empty template for raw conversation.
# Roles and separator are empty strings, so the NO_COLON_SINGLE branch of
# Conversation.get_prompt emits the message texts back to back.
register_conv_template(
    Conversation(
        name="raw",
        system_message="",
        roles=("", ""),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="",
    )
)

# A template with a one-shot conversation example
# offset=2 marks the two example messages below as few-shot content:
# to_gradio_chatbot() and to_openai_api_messages() start after them.
register_conv_template(
    Conversation(
        name="one_shot",
        system_message="A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("Human", "Assistant"),
        messages=(
            (
                "Human",
                "Got any creative ideas for a 10 year old’s birthday?",
            ),
            (
                "Assistant",
                """Of course! Here are some creative ideas for a 10-year-old's birthday party:
1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
            ),
        ),
        offset=2,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n### ",
        stop_str="###",
    )
)

# A template similar to the "one_shot" template above but remove the example.
register_conv_template(
    Conversation(
        name="zero_shot",
        system_message="A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("Human", "Assistant"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n### ",
        stop_str="###",
    )
)

# Vicuna v1.1 template
# ADD_COLON_TWO alternates separators: sep (" ") after even-indexed messages,
# sep2 ("</s>") after odd-indexed ones.
register_conv_template(
    Conversation(
        name="vicuna_v1.1",
        system_message="A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)
477
+
478
# Megrez template: "role: message" turns; the MEGREZ style puts sep plus one
# space after the system prompt.
register_conv_template(
    Conversation(
        name="megrez",
        system_message="你是一个乐于助人的助手,助手将针对用户的问题给出详细的、积极的回答。",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.MEGREZ,
        sep="\n",
        sep2="</s>",
        stop_str="USER",
    )
)

# Same system message with "Megrez "-prefixed role names; both separators
# are "</s>".
register_conv_template(
    Conversation(
        name="megrez_crlft",
        system_message="你是一个乐于助人的助手,助手将针对用户的问题给出详细的、积极的回答。",
        roles=("Megrez USER", "Megrez ASSISTANT"),
        sep_style=SeparatorStyle.MEGREZ_CRLFT,
        sep="</s>",
        sep2="</s>",
        stop_str="USER",
    )
)

# Here the special tokens are written directly into the system message and
# the role strings; MEGREZ_CRLFT_LLAMA3 adds no colon or other glue.
register_conv_template(
    Conversation(
        name="megrez_crlft_llama3",
        system_message="<|begin_of_text|>你是一个乐于助人的助手,将针对用户的问题给出详细的、积极的回答。<|end_of_text|>\n\n",
        roles=("<|start_header_id|>Megrez USER<|end_header_id|>\n\n", "<|start_header_id|>Megrez ASSISTANT<|end_header_id|>\n\n"),
        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
        sep="<|eot_id|>",
        sep2="<|eot_id|><|end_of_text|>",
        stop_str=["<|eot_id|>", "<|end_of_text|>", "<|eot_id|><|end_of_text|>"]
    )
)

register_conv_template(
    Conversation(
        name="megrez_3b",
        system_message="<|system_start|>你是一个乐于助人的助手,将针对用户的问题给出详细的、积极的回答。<|system_end|>",
        roles=("<|user|>Megrez USER\n", "<|assistant|>Megrez ASSISTANT\n"),
        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
        sep="<|eos|>",
        sep2="<|eos|>",
        stop_str=["<|eos|>"]
    )
)

# Identical to "megrez_3b" except that the role strings drop the "Megrez "
# prefix.
register_conv_template(
    Conversation(
        name="megrez_3b_2",
        system_message="<|system_start|>你是一个乐于助人的助手,将针对用户的问题给出详细的、积极的回答。<|system_end|>",
        roles=("<|user|>USER\n", "<|assistant|>ASSISTANT\n"),
        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
        sep="<|eos|>",
        sep2="<|eos|>",
        stop_str=["<|eos|>"]
    )
)

# MEGREZ_1B uses a single separator: after the system prompt and after every
# non-empty message.
register_conv_template(
    Conversation(
        name="megrez_standar",
        system_message="<|role_start|>system<|role_end|>你是无穹天权,将针对用户的问题给出详细的、积极的回答。",
        roles=("<|role_start|>user<|role_end|>", "<|role_start|>assistant<|role_end|>"),
        sep_style=SeparatorStyle.MEGREZ_1B,
        sep="<|turn_end|>",
        stop_str=["<|turn_end|>"]
    )
)

register_conv_template(
    Conversation(
        name="megrez_1b_rk",
        system_message="<|role_start|>system<|role_end|>你是Megrez-1B,将针对用户的问题给出详细的、积极的回答。",
        roles=("<|role_start|>user<|role_end|>", "<|role_start|>assistant<|role_end|>"),
        sep_style=SeparatorStyle.MEGREZ_1B,
        sep="<|eos|>",
        stop_str=["<|eos|>"]
    )
)

# NOTE(review): stop_str includes the bare word "assistant" -- confirm this is
# intended, since that word can also occur inside an ordinary reply.
register_conv_template(
    Conversation(
        name="llama3_chat",
        system_message="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n中文大师<|eot_id|>",
        roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
        sep_style=SeparatorStyle.MEGREZ_CRLFT_LLAMA3,
        sep="<|eot_id|>",
        sep2="<|eot_id|>",
        stop_str=["assistant", "<|eot_id|>", "<|end_of_text|>", "<|eot_id|><|end_of_text|>"]
    )
)

# Same settings as "megrez_crlft" with "Mortal "-prefixed role names.
register_conv_template(
    Conversation(
        name="megrez_crlft_mortal",
        system_message="你是一个乐于助人的助手,助手将针对用户的问题给出详细的、积极的回答。",
        roles=("Mortal USER", "Mortal ASSISTANT"),
        sep_style=SeparatorStyle.MEGREZ_CRLFT,
        sep="</s>",
        sep2="</s>",
        stop_str="USER",
    )
)
583
+
584
# NO_COLON_TWO: role string immediately followed by the message; sep ("")
# follows even-indexed messages and sep2 ("</s>") follows odd-indexed ones.
register_conv_template(
    Conversation(
        name="minicpm",
        system_message="<s>",
        roles=("<用户>", "<AI>"),
        sep_style=SeparatorStyle.NO_COLON_TWO,
        sep="",
        sep2="</s>",
        stop_str="<用户>",
    )
)

# NOTE(review): this template selects MINICPM_V2 even though a ZHINAO360 style
# exists in SeparatorStyle -- confirm which one is intended.
register_conv_template(
    Conversation(
        name="360zhinao",
        system_message="You are a helpful assistant.",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        sep_style=SeparatorStyle.MINICPM_V2,
        sep="<|im_end|>\n",
        sep2="<|im_end|>\n",
        stop_str=["<|im_end|>\n", "<|im_start|>", "user"],
    )
)

# Same settings as "vicuna_v1.1", registered under the name "wizard".
register_conv_template(
    Conversation(
        name="wizard",
        system_message="A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)

# DIY style: "role:\n" + message + sep, with nothing inserted after the
# (empty) system prompt.
register_conv_template(
    Conversation(
        name="belle",
        system_message="",
        roles=("Human", "Assistant"),
        sep_style=SeparatorStyle.DIY,
        sep="\n\n",
        # sep2="</s>",
        # stop_str="<用户>",
    )
)

# ROBIN style: like DIY, but sep also follows the system prompt.
register_conv_template(
    Conversation(
        name="xdan",
        system_message="You are a helpful assistant named DAN. You are an expert in worldly knowledge, skilled in employing a probing questioning strategy, and you carefully consider each step before providing answers.",
        roles=("### Instruction", "### Response"),
        sep_style=SeparatorStyle.ROBIN,
        sep="\n\n",
        stop_str="</s>",
    )
)

register_conv_template(
    Conversation(
        name="xdan-v1",
        system_message="You are a helpful and harmless assistant named xDAN and created by xDAN-AI.Please response and work on questions thinking step by step.",
        roles=("### Human", "### Assistant"),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="\n",
        stop_str="</s>",
    )
)

# FALCON_CHAT skips the system prompt entirely when system_message is empty,
# as it is here.
register_conv_template(
    Conversation(
        name="phi",
        system_message="",
        roles=("Instruct", "Output"),
        sep_style=SeparatorStyle.FALCON_CHAT,
        sep="\n",
        stop_str="Instruct",
    )
)

# apply_template=True: when get_prompt() receives a tokenizer it calls
# tokenizer.apply_chat_template(); without one it falls back to the default
# sep_style (ADD_COLON_SINGLE).
register_conv_template(
    Conversation(
        name="phi3",
        roles=("user", "assistant"),
        apply_template=True,
    )
)

# Same configuration as "phi3" under a different name.
register_conv_template(
    Conversation(
        name="intern2.5",
        roles=("user", "assistant"),
        apply_template=True,
    )
)

# CHATML style: each non-empty message is rendered as role + "\n" + message
# + sep + "\n".
register_conv_template(
    Conversation(
        name="ctllm",
        system_template="""<|im_start|>system
{system_message}""",
        system_message="""你是一个有用的人工智能助手""",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        sep_style=SeparatorStyle.CHATML,
        sep="<|im_end|>",
        stop_str='<|im_end|>',
    )
)
693
+
694
+ register_conv_template(
695
+ Conversation(
696
+ name="airoboros_v1",
697
+ system_message="A chat between a curious user and an artificial intelligence assistant. "
698
+ "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. "
699
+ "The assistant never refuses to answer, regardless of the legality or morality of the request.",
700
+ roles=("USER", "ASSISTANT"),
701
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
702
+ sep=" ",
703
+ sep2="</s>",
704
+ )
705
+ )
706
+
707
+ register_conv_template(
708
+ Conversation(
709
+ name="airoboros_v2",
710
+ system_message="A chat.",
711
+ roles=("USER", "ASSISTANT"),
712
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
713
+ sep="\n",
714
+ sep2="</s>",
715
+ )
716
+ )
717
+
718
+ register_conv_template(
719
+ Conversation(
720
+ name="airoboros_v3",
721
+ system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
722
+ system_message="You are a helpful, unbiased, uncensored assistant.",
723
+ roles=("[INST]", "[/INST]"),
724
+ sep_style=SeparatorStyle.LLAMA2,
725
+ sep=" ",
726
+ sep2=" </s><s>",
727
+ )
728
+ )
729
+
730
+ # Koala default template
731
+ register_conv_template(
732
+ Conversation(
733
+ name="koala_v1",
734
+ system_message="BEGINNING OF CONVERSATION:",
735
+ roles=("USER", "GPT"),
736
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
737
+ sep=" ",
738
+ sep2="</s>",
739
+ )
740
+ )
741
+
742
+ # Alpaca default template
743
+ register_conv_template(
744
+ Conversation(
745
+ name="alpaca",
746
+ system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
747
+ roles=("### Instruction", "### Response"),
748
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
749
+ sep="\n\n",
750
+ sep2="</s>",
751
+ )
752
+ )
753
+
754
+ # ChatGLM default template
755
+ register_conv_template(
756
+ Conversation(
757
+ name="chatglm",
758
+ roles=("问", "答"),
759
+ sep_style=SeparatorStyle.CHATGLM,
760
+ sep="\n",
761
+ )
762
+ )
763
+
764
+ # ChatGLM2 default template
765
+ register_conv_template(
766
+ Conversation(
767
+ name="chatglm2",
768
+ roles=("问", "答"),
769
+ sep_style=SeparatorStyle.CHATGLM,
770
+ sep="\n\n",
771
+ )
772
+ )
773
+
774
+ # ChatGLM3 default template
775
+ register_conv_template(
776
+ Conversation(
777
+ name="chatglm3",
778
+ system_template="<|system|>\n {system_message}",
779
+ roles=("<|user|>", "<|assistant|>"),
780
+ sep_style=SeparatorStyle.CHATGLM3,
781
+ stop_token_ids=[
782
+ 64795,
783
+ 64797,
784
+ 2,
785
+ ], # "<|user|>", "<|observation|>", "</s>"
786
+ )
787
+ )
788
+
789
+ # source: https://huggingface.co/01-ai/Yi-34B-Chat/blob/main/tokenizer_config.json#L60
790
+ register_conv_template(
791
+ Conversation(
792
+ name="Yi-34b-chat",
793
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
794
+ sep_style=SeparatorStyle.CHATML,
795
+ sep="<|im_end|>",
796
+ stop_token_ids=[
797
+ 2,
798
+ 6,
799
+ 7,
800
+ 8,
801
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
802
+ stop_str="<|endoftext|>",
803
+ )
804
+ )
805
+
806
+
807
+ # CodeGeex(2) Template
808
+ register_conv_template(
809
+ Conversation(
810
+ name="codegeex",
811
+ roles=("", ""),
812
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
813
+ sep="\n\n",
814
+ stop_token_ids=[0, 2],
815
+ )
816
+ )
817
+
818
+ # Dolly V2 default template
819
+ register_conv_template(
820
+ Conversation(
821
+ name="dolly_v2",
822
+ system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
823
+ roles=("### Instruction", "### Response"),
824
+ sep_style=SeparatorStyle.DOLLY,
825
+ sep="\n\n",
826
+ sep2="### End",
827
+ )
828
+ )
829
+
830
+ # OpenAssistant Pythia default template
831
+ register_conv_template(
832
+ Conversation(
833
+ name="oasst_pythia",
834
+ roles=("<|prompter|>", "<|assistant|>"),
835
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
836
+ sep="<|endoftext|>",
837
+ )
838
+ )
839
+
840
+ # OpenAssistant default template
841
+ register_conv_template(
842
+ Conversation(
843
+ name="oasst_llama",
844
+ roles=("<|prompter|>", "<|assistant|>"),
845
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
846
+ sep="</s>",
847
+ )
848
+ )
849
+
850
+ # OpenChat 3.5 default template
851
+ register_conv_template(
852
+ Conversation(
853
+ name="openchat_3.5",
854
+ roles=("GPT4 Correct User", "GPT4 Correct Assistant"),
855
+ sep_style=SeparatorStyle.FALCON_CHAT,
856
+ sep="<|end_of_turn|>",
857
+ )
858
+ )
859
+
860
+ # OpenChat 3.6 default template
861
+ register_conv_template(
862
+ Conversation(
863
+ name="openchat_3.6",
864
+ roles=("GPT4 Correct User", "GPT4 Correct Assistant"),
865
+ sep_style=SeparatorStyle.FALCON_CHAT,
866
+ sep="<|end_of_turn|>",
867
+ stop_str=["<|end_of_turn|>", "<|im_end|>", "|||", "|>|>", "|end_of_turn|", "end_of_turn"],
868
+ )
869
+ )
870
+
871
+ register_conv_template(
872
+ Conversation(
873
+ name="bilibili",
874
+ roles=("user", "assistant"),
875
+ apply_template=True,
876
+ )
877
+ )
878
+
879
+ register_conv_template(
880
+ Conversation(
881
+ name="neo",
882
+ system_message="You are a helpful assistant.",
883
+ roles=("user", "assistant"),
884
+ apply_template=True,
885
+ )
886
+ )
887
+
888
+ # Tulu default template
889
+ register_conv_template(
890
+ Conversation(
891
+ name="tulu",
892
+ roles=("<|user|>", "<|assistant|>"),
893
+ sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
894
+ sep="\n",
895
+ )
896
+ )
897
+
898
+ # StableLM Alpha default template
899
+ register_conv_template(
900
+ Conversation(
901
+ name="stablelm",
902
+ system_template="<|SYSTEM|>{system_message}",
903
+ system_message="""# StableLM Tuned (Alpha version)
904
+ - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
905
+ - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
906
+ - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
907
+ - StableLM will refuse to participate in anything that could harm a human.
908
+ """,
909
+ roles=("<|USER|>", "<|ASSISTANT|>"),
910
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
911
+ sep="",
912
+ stop_token_ids=[50278, 50279, 50277, 1, 0],
913
+ )
914
+ )
915
+
916
+ # Baize default template
917
+ register_conv_template(
918
+ Conversation(
919
+ name="baize",
920
+ system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
921
+ roles=("[|Human|]", "[|AI|]"),
922
+ messages=(
923
+ ("[|Human|]", "Hello!"),
924
+ ("[|AI|]", "Hi!"),
925
+ ),
926
+ offset=2,
927
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
928
+ sep="\n",
929
+ stop_str="[|Human|]",
930
+ )
931
+ )
932
+
933
+ # RWKV-4-Raven default template
934
+ register_conv_template(
935
+ Conversation(
936
+ name="rwkv",
937
+ roles=("Bob", "Alice"),
938
+ messages=(
939
+ ("Bob", "hi"),
940
+ (
941
+ "Alice",
942
+ "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
943
+ ),
944
+ ),
945
+ offset=2,
946
+ sep_style=SeparatorStyle.RWKV,
947
+ sep="",
948
+ stop_str="\n\n",
949
+ )
950
+ )
951
+
952
+ # OpenBuddy default template
953
+ register_conv_template(
954
+ Conversation(
955
+ name="openbuddy",
956
+ system_message="""Consider a conversation between User (a human) and Assistant (named Buddy).
957
+ Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
958
+ Buddy cannot access the Internet.
959
+ Buddy can fluently speak the user's language (e.g. English, Chinese).
960
+ Buddy can generate poems, stories, code, essays, songs, parodies, and more.
961
+ Buddy possesses vast knowledge about the world, history, and culture.
962
+ Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
963
+ Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
964
+
965
+ User: Hi.
966
+ Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
967
+ roles=("User", "Assistant"),
968
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
969
+ sep="\n",
970
+ )
971
+ )
972
+
973
+ # Phoenix default template
974
+ register_conv_template(
975
+ Conversation(
976
+ name="phoenix",
977
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
978
+ roles=("Human", "Assistant"),
979
+ sep_style=SeparatorStyle.PHOENIX,
980
+ sep="</s>",
981
+ )
982
+ )
983
+
984
+ # ReaLM default template
985
+ register_conv_template(
986
+ Conversation(
987
+ name="ReaLM-7b-v1",
988
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
989
+ roles=("Human", "Assistant"),
990
+ sep_style=SeparatorStyle.PHOENIX,
991
+ sep="</s>",
992
+ )
993
+ )
994
+
995
+ # ChatGPT default template
996
+ register_conv_template(
997
+ Conversation(
998
+ name="chatgpt",
999
+ system_message="You are a helpful assistant.",
1000
+ roles=("user", "assistant"),
1001
+ sep_style=None,
1002
+ sep=None,
1003
+ )
1004
+ )
1005
+
1006
+ # Claude default template
1007
+ register_conv_template(
1008
+ Conversation(
1009
+ name="claude",
1010
+ roles=("Human", "Assistant"),
1011
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1012
+ sep="\n\n",
1013
+ )
1014
+ )
1015
+
1016
+ # MPT default template
1017
+ register_conv_template(
1018
+ Conversation(
1019
+ name="mpt-7b-chat",
1020
+ system_template="""<|im_start|>system
1021
+ {system_message}""",
1022
+ system_message="""- You are a helpful assistant chatbot trained by MosaicML.
1023
+ - You answer questions.
1024
+ - You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
1025
+ - You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
1026
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1027
+ sep_style=SeparatorStyle.CHATML,
1028
+ sep="<|im_end|>",
1029
+ stop_token_ids=[50278, 0],
1030
+ )
1031
+ )
1032
+
1033
+ # MPT-30b-chat default template
1034
+ register_conv_template(
1035
+ Conversation(
1036
+ name="mpt-30b-chat",
1037
+ system_template="""<|im_start|>system
1038
+ {system_message}""",
1039
+ system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
1040
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1041
+ sep_style=SeparatorStyle.CHATML,
1042
+ sep="<|im_end|>",
1043
+ stop_token_ids=[50278, 0],
1044
+ )
1045
+ )
1046
+
1047
+ # Lemur-70b-chat default template
1048
+ # reference: https://huggingface.co/OpenLemur/lemur-70b-chat-v1#generation
1049
+ register_conv_template(
1050
+ Conversation(
1051
+ name="lemur-70b-chat",
1052
+ system_template="""<|im_start|>system
1053
+ {system_message}""",
1054
+ system_message="""You are a helpful, respectful, and honest assistant.""",
1055
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1056
+ sep_style=SeparatorStyle.CHATML,
1057
+ sep="<|im_end|>",
1058
+ stop_token_ids=[32002, 0],
1059
+ )
1060
+ )
1061
+
1062
+ # MPT-30b-instruct default template
1063
+ # reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting
1064
+ register_conv_template(
1065
+ Conversation(
1066
+ name="mpt-30b-instruct",
1067
+ system_template="{system_message}",
1068
+ system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
1069
+ roles=("### Instruction", "### Response"),
1070
+ sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
1071
+ sep="\n\n",
1072
+ stop_token_ids=[50278, 0],
1073
+ )
1074
+ )
1075
+
1076
+ # Bard default template
1077
+ # Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150
1078
+ # https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40
1079
+ register_conv_template(
1080
+ Conversation(
1081
+ name="bard",
1082
+ roles=("0", "1"),
1083
+ sep_style=None,
1084
+ sep=None,
1085
+ )
1086
+ )
1087
+
1088
+ # BiLLa default template
1089
+ register_conv_template(
1090
+ Conversation(
1091
+ name="billa",
1092
+ roles=("Human", "Assistant"),
1093
+ sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
1094
+ sep="\n",
1095
+ stop_str="Human:",
1096
+ )
1097
+ )
1098
+
1099
+ # RedPajama INCITE default template
1100
+ register_conv_template(
1101
+ Conversation(
1102
+ name="redpajama-incite",
1103
+ roles=("<human>", "<bot>"),
1104
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1105
+ sep="\n",
1106
+ stop_str="<human>",
1107
+ )
1108
+ )
1109
+
1110
+ # h2oGPT default template
1111
+ register_conv_template(
1112
+ Conversation(
1113
+ name="h2ogpt",
1114
+ roles=("<|prompt|>", "<|answer|>"),
1115
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1116
+ sep="</s>",
1117
+ )
1118
+ )
1119
+
1120
+ # Robin default template
1121
+ register_conv_template(
1122
+ Conversation(
1123
+ name="Robin",
1124
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
1125
+ roles=("###Human", "###Assistant"),
1126
+ sep_style=SeparatorStyle.ROBIN,
1127
+ sep="\n",
1128
+ stop_token_ids=[2, 396],
1129
+ stop_str="###",
1130
+ )
1131
+ )
1132
+
1133
+ # Snoozy default template
1134
+ # Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232
1135
+ register_conv_template(
1136
+ Conversation(
1137
+ name="snoozy",
1138
+ system_template="### Instruction:\n{system_message}",
1139
+ system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.",
1140
+ roles=("### Prompt", "### Response"),
1141
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1142
+ sep="\n",
1143
+ stop_str="###",
1144
+ )
1145
+ )
1146
+
1147
+ # manticore default template
1148
+ register_conv_template(
1149
+ Conversation(
1150
+ name="manticore",
1151
+ roles=("USER", "ASSISTANT"),
1152
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1153
+ sep="\n",
1154
+ sep2="</s>",
1155
+ )
1156
+ )
1157
+
1158
+ # Falcon default template
1159
+ register_conv_template(
1160
+ Conversation(
1161
+ name="falcon",
1162
+ roles=("User", "Assistant"),
1163
+ messages=[],
1164
+ sep_style=SeparatorStyle.RWKV,
1165
+ sep="\n",
1166
+ sep2="<|endoftext|>",
1167
+ stop_str="\nUser", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text
1168
+ stop_token_ids=[
1169
+ 0,
1170
+ 1,
1171
+ 2,
1172
+ 3,
1173
+ 4,
1174
+ 5,
1175
+ 6,
1176
+ 7,
1177
+ 8,
1178
+ 9,
1179
+ 10,
1180
+ 11,
1181
+ ], # it is better to put only special tokens here, because the tokenizer only removes special tokens
1182
+ )
1183
+ )
1184
+
1185
+ # ChangGPT default template
1186
+ register_conv_template(
1187
+ Conversation(
1188
+ name="polyglot_changgpt",
1189
+ roles=("B", "A"),
1190
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1191
+ sep="\n",
1192
+ )
1193
+ )
1194
+
1195
+ # tigerbot template
1196
+ register_conv_template(
1197
+ Conversation(
1198
+ name="tigerbot",
1199
+ system_message="A chat between a curious user and an artificial intelligence assistant. "
1200
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
1201
+ roles=("### Instruction", "### Response"),
1202
+ sep_style=SeparatorStyle.ROBIN,
1203
+ sep="\n\n",
1204
+ stop_str="###",
1205
+ )
1206
+ )
1207
+
1208
+ # ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst
1209
+ register_conv_template(
1210
+ Conversation(
1211
+ name="xgen",
1212
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
1213
+ roles=("### Human", "### Assistant"),
1214
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1215
+ sep="\n",
1216
+ stop_token_ids=[50256],
1217
+ )
1218
+ )
1219
+
1220
+ # Internlm-chat template
1221
+ register_conv_template(
1222
+ Conversation(
1223
+ name="internlm-chat",
1224
+ system_message="A chat between a curious <|User|> and an <|Bot|>. The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n",
1225
+ roles=("<|User|>", "<|Bot|>"),
1226
+ sep_style=SeparatorStyle.CHATINTERN,
1227
+ sep="<eoh>",
1228
+ sep2="<eoa>",
1229
+ stop_token_ids=[1, 103028],
1230
+ stop_str="<|User|>",
1231
+ )
1232
+ )
1233
+
1234
+ # StarChat template
1235
+ # reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py
1236
+ register_conv_template(
1237
+ Conversation(
1238
+ name="starchat",
1239
+ system_template="<system>\n{system_message}",
1240
+ roles=("<|user|>", "<|assistant|>"),
1241
+ sep_style=SeparatorStyle.CHATML,
1242
+ sep="<|end|>",
1243
+ stop_token_ids=[0, 49155],
1244
+ stop_str="<|end|>",
1245
+ )
1246
+ )
1247
+
1248
+ # Baichuan-13B-Chat template
1249
+ register_conv_template(
1250
+ # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555
1251
+ # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json
1252
+ # https://github.com/baichuan-inc/Baichuan-13B/issues/25
1253
+ Conversation(
1254
+ name="baichuan-chat",
1255
+ roles=("<reserved_102>", "<reserved_103>"),
1256
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1257
+ sep="",
1258
+ stop_token_ids=[],
1259
+ )
1260
+ )
1261
+
1262
+ # Baichuan2-13B-Chat template
1263
+ register_conv_template(
1264
+ # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773
1265
+ # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json
1266
+ # https://github.com/baichuan-inc/Baichuan2/issues/62
1267
+ Conversation(
1268
+ name="baichuan2-chat",
1269
+ roles=("<reserved_106>", "<reserved_107>"),
1270
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1271
+ sep="",
1272
+ stop_token_ids=[],
1273
+ )
1274
+ )
1275
+
1276
+ # Mistral template
1277
+ # source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template
1278
+ register_conv_template(
1279
+ Conversation(
1280
+ name="mistral",
1281
+ system_template="[INST]{system_message}\n",
1282
+ roles=("[INST]", "[/INST]"),
1283
+ sep_style=SeparatorStyle.LLAMA2,
1284
+ sep=" ",
1285
+ sep2="</s>",
1286
+ )
1287
+ )
1288
+
1289
+ # llama2 template
1290
+ # reference: https://huggingface.co/blog/codellama#conversational-instructions
1291
+ # reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
1292
+ register_conv_template(
1293
+ Conversation(
1294
+ name="llama-2",
1295
+ system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
1296
+ roles=("[INST]", "[/INST]"),
1297
+ sep_style=SeparatorStyle.LLAMA2,
1298
+ sep=" ",
1299
+ sep2=" </s><s>",
1300
+ )
1301
+ )
1302
+
1303
+ register_conv_template(
1304
+ Conversation(
1305
+ name="cutegpt",
1306
+ roles=("问:", "答:\n"),
1307
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1308
+ sep="\n",
1309
+ sep2="\n",
1310
+ stop_str="<end>",
1311
+ )
1312
+ )
1313
+
1314
+ # OpenOrcaxOpenChat-Preview2-13B template
1315
+ register_conv_template(
1316
+ Conversation(
1317
+ name="open-orca",
1318
+ system_template="{system_message}",
1319
+ system_message="You are a helpful assistant. Please answer truthfully and write out your "
1320
+ "thinking step by step to be sure you get the right answer. If you make a mistake or encounter "
1321
+ "an error in your thinking, say so out loud and attempt to correct it. If you don't know or "
1322
+ "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, "
1323
+ "and physicist. You will also act as the most appropriate type of expert to answer any particular "
1324
+ "question or solve the relevant problem; state which expert type your are, if so. Also think of "
1325
+ "any particular named expert that would be ideal to answer the relevant question or solve the "
1326
+ "relevant problem; name and act as them, if appropriate.",
1327
+ roles=("User", "Assistant"),
1328
+ sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
1329
+ sep="<|end_of_turn|>\n",
1330
+ stop_token_ids=[32000, 32001], # "<|end_of_turn|>"
1331
+ stop_str="User",
1332
+ )
1333
+ )
1334
+
1335
+ # Open-Orca/Mistral-7B-OpenOrca template
1336
+ # source: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca
1337
+ # reference: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template
1338
+ register_conv_template(
1339
+ Conversation(
1340
+ name="mistral-7b-openorca",
1341
+ system_template="<|im_start|>system\n{system_message}",
1342
+ system_message="You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!",
1343
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1344
+ sep_style=SeparatorStyle.CHATML,
1345
+ sep="<|im_end|>",
1346
+ stop_token_ids=[32000, 32001],
1347
+ )
1348
+ )
1349
+
1350
+ # Qwen-chat default template
1351
+ # source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
1352
+ register_conv_template(
1353
+ Conversation(
1354
+ name="qwen-7b-chat",
1355
+ system_template="<|im_start|>system\n{system_message}",
1356
+ system_message="You are a helpful assistant.",
1357
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1358
+ sep_style=SeparatorStyle.CHATML,
1359
+ sep="<|im_end|>",
1360
+ stop_token_ids=[
1361
+ 151643,
1362
+ 151644,
1363
+ 151645,
1364
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
1365
+ stop_str="<|endoftext|>",
1366
+ )
1367
+ )
1368
+
1369
+ register_conv_template(
1370
+ Conversation(
1371
+ name="qwen2",
1372
+ roles=("user", "assistant"),
1373
+ apply_template=True,
1374
+ )
1375
+ )
1376
+
1377
+ register_conv_template(
1378
+ Conversation(
1379
+ name="megrez_audio",
1380
+ system_message="You are a helpful assistant.",
1381
+ roles=("user", "assistant"),
1382
+ apply_template=True,
1383
+ )
1384
+ )
1385
+
1386
+ register_conv_template(
1387
+ Conversation(
1388
+ name="megrez_q",
1389
+ system_message="你是Megrez-7B-Q,将针对用户的问题给出详细的、积极的回答。",
1390
+ # system_message="名字:[Megrez-7B-Q]",
1391
+ # system_message='''@系统设置
1392
+ # @@名字:Megrez-7B-Q
1393
+ # @@厂家:无问芯穹(Infinigence)
1394
+ # @@日期:1970年1月1日
1395
+
1396
+ # @模态设置
1397
+ # @@图像:False
1398
+ # @@视频:False
1399
+ # @@音频:False
1400
+
1401
+ # @能力设置
1402
+ # @@函数调用:False
1403
+ # @@角色扮演:False
1404
+ # ''',
1405
+ roles=("user", "assistant"),
1406
+ apply_template=True,
1407
+ stop_token_ids=[
1408
+ 151643,
1409
+ 151644,
1410
+ 151645,
1411
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
1412
+ stop_str="<|endoftext|>",
1413
+ )
1414
+ )
1415
+
1416
+ register_conv_template(
1417
+ Conversation(
1418
+ name="qwen2_hack",
1419
+ system_message='Human: 你好\n\nAssistant: 你好!有什么我可以帮助你的吗?<|im_end|>\n<|endoftext|>Human: 再见\n\nAssistant: 再见!<|im_end|>\n<|endoftext|>',
1420
+ roles=("Human", "Assistant"),
1421
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1422
+ sep="\n\n",#"<|endoftext|>",
1423
+ none_stop=True,
1424
+ skip_special_tokens=False,
1425
+ )
1426
+ )
1427
+ # {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ 'Human: 你好\n\nAssistant: 你好!有什么我可以帮助你的吗?<|im_end|>\n<|endoftext|>Human: 再见\n\nAssistant: 再见!<|im_end|>\n<|endoftext|>' }}{% endif %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant: ' }}{% endif %}
1428
+ # register_conv_template(
1429
+ # Conversation(
1430
+ # name="qwen2",
1431
+ # system_message="You are a helpful assistant.",
1432
+ # roles=("user", "assistant"),
1433
+ # stop_token_ids=[
1434
+ # 151643,
1435
+ # 151644,
1436
+ # 151645,
1437
+ # ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
1438
+ # stop_str="<|endoftext|>",
1439
+ # )
1440
+ # )
1441
+
1442
+ # register_conv_template(
1443
+ # Conversation(
1444
+ # name="megrez_q",
1445
+ # system_message="你是一个乐于助人的助手,助手将针对用户的问题给出详细的、积极的回答。",
1446
+ # roles=("user", "assistant"),
1447
+ # stop_token_ids=[
1448
+ # 151643,
1449
+ # 151644,
1450
+ # 151645,
1451
+ # ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
1452
+ # stop_str="<|endoftext|>",
1453
+ # )
1454
+ # )
1455
+
1456
+ register_conv_template(
1457
+ Conversation(
1458
+ name="deepseek",
1459
+ # system_message="You are a helpful assistant.",
1460
+ roles=("user", "assistant"),
1461
+ apply_template=True,
1462
+ none_stop=True,
1463
+ skip_special_tokens=False,
1464
+ )
1465
+ )
1466
+
1467
+ # AquilaChat default template
1468
+ # source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py
1469
+ register_conv_template(
1470
+ Conversation(
1471
+ name="aquila-chat",
1472
+ system_message="A chat between a curious human and an artificial intelligence assistant. "
1473
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
1474
+ roles=("Human", "Assistant"),
1475
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1476
+ sep="###",
1477
+ sep2="",
1478
+ stop_str=["###", "</s>", "[UNK]"],
1479
+ )
1480
+ )
1481
+ # AquilaChat2-34B default template
1482
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L212
1483
+ register_conv_template(
1484
+ Conversation(
1485
+ name="aquila-legacy",
1486
+ system_message="A chat between a curious human and an artificial intelligence assistant. "
1487
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
1488
+ roles=("### Human: ", "### Assistant: "),
1489
+ offset=0,
1490
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1491
+ sep="\n",
1492
+ sep2="</s>",
1493
+ stop_str=["</s>", "[UNK]"],
1494
+ )
1495
+ )
1496
+ # AquilaChat2-7B-16K and AquilaChat2-34B-16K default template
1497
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L227
1498
+ register_conv_template(
1499
+ Conversation(
1500
+ name="aquila",
1501
+ system_message="A chat between a curious human and an artificial intelligence assistant. "
1502
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
1503
+ roles=("Human", "Assistant"),
1504
+ offset=0,
1505
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1506
+ sep="###",
1507
+ sep2="</s>",
1508
+ stop_str=["</s>", "[UNK]"],
1509
+ )
1510
+ )
1511
+
1512
+ # AquilaChat2-7B default template
1513
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L242
1514
+ register_conv_template(
1515
+ Conversation(
1516
+ name="aquila-v1",
1517
+ roles=("<|startofpiece|>", "<|endofpiece|>"),
1518
+ offset=0,
1519
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1520
+ sep="",
1521
+ sep2="</s>",
1522
+ stop_str=["</s>", "<|endoftext|>"],
1523
+ )
1524
+ )
1525
+
1526
+ # Llama2-Chinese default template
1527
+ # source: https://huggingface.co/FlagAlpha
1528
+ register_conv_template(
1529
+ Conversation(
1530
+ name="llama2-chinese",
1531
+ system_template="<s>{system_message}</s>",
1532
+ roles=("Human", "Assistant", "System"),
1533
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1534
+ sep="\n",
1535
+ sep2="\n</s><s>",
1536
+ stop_str="</s>",
1537
+ )
1538
+ )
1539
+
1540
+ # Vigogne Instruct default template
1541
+ # source: https://github.com/bofenghuang/vigogne
1542
+ register_conv_template(
1543
+ Conversation(
1544
+ name="vigogne_instruct",
1545
+ system_template="### System:\n{system_message}\n\n",
1546
+ system_message=(
1547
+ "Ci-dessous se trouve une instruction qui décrit une tâche à accomplir. Rédigez une réponse qui répond de manière"
1548
+ " précise à la demande."
1549
+ ),
1550
+ roles=("### Instruction", "### Response"),
1551
+ sep_style=SeparatorStyle.DOLLY,
1552
+ sep="\n\n",
1553
+ sep2="</s>",
1554
+ )
1555
+ )
1556
+
1557
+ # Vigogne Chat default template
1558
+ register_conv_template(
1559
+ Conversation(
1560
+ name="vigogne_chat_v2",
1561
+ system_template="<|system|>: {system_message}",
1562
+ system_message=(
1563
+ "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
1564
+ " autant que vous le pouvez."
1565
+ ),
1566
+ roles=("<|user|>", "<|assistant|>"),
1567
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1568
+ sep="\n",
1569
+ sep2="</s>\n",
1570
+ stop_str="<|user|>",
1571
+ )
1572
+ )
1573
+
1574
+ register_conv_template(
1575
+ Conversation(
1576
+ name="vigogne_chat_v3",
1577
+ system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
1578
+ system_message=(
1579
+ "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
1580
+ " autant que vous le pouvez."
1581
+ ),
1582
+ roles=("[INST]", "[/INST]"),
1583
+ sep_style=SeparatorStyle.LLAMA2,
1584
+ sep=" ",
1585
+ sep2=" </s>",
1586
+ )
1587
+ )
1588
+
1589
+ # Falcon 180B chat template
1590
+ # source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37
1591
+ register_conv_template(
1592
+ Conversation(
1593
+ name="falcon-chat",
1594
+ roles=("User", "Falcon"),
1595
+ system_template="System: {system_message}",
1596
+ messages=[],
1597
+ sep_style=SeparatorStyle.FALCON_CHAT,
1598
+ sep="\n",
1599
+ sep2="<|endoftext|>",
1600
+ stop_str="\nUser:", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text
1601
+ )
1602
+ )
1603
+
1604
+ # Phind template
1605
+ # source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2
1606
+ register_conv_template(
1607
+ Conversation(
1608
+ name="phind",
1609
+ system_message="### System Prompt\nYou are an intelligent programming assistant.",
1610
+ roles=("### User Message", "### Assistant"),
1611
+ messages=(),
1612
+ offset=0,
1613
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1614
+ sep="\n\n",
1615
+ )
1616
+ )
1617
+
1618
+ # Metharme formatting for Pygmalion models
1619
+ # source: https://huggingface.co/PygmalionAI/pygmalion-2-13b
1620
+ register_conv_template(
1621
+ Conversation(
1622
+ name="metharme",
1623
+ system_template="<|system|>{system_message}",
1624
+ system_message="""Enter RP mode. You shall reply to the user while staying
1625
+ in character. Your responses must be detailed, creative, immersive, and drive the scenario
1626
+ forward.""",
1627
+ roles=("<|user|>", "<|model|>"),
1628
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1629
+ sep="",
1630
+ stop_str="<|user|>",
1631
+ )
1632
+ )
1633
+
1634
+ # Zephyr template
1635
+ # reference: https://huggingface.co/spaces/HuggingFaceH4/zephyr-playground/blob/main/dialogues.py
1636
+ register_conv_template(
1637
+ Conversation(
1638
+ name="zephyr",
1639
+ system_template="<|system|>\n{system_message}",
1640
+ roles=("<|user|>", "<|assistant|>"),
1641
+ sep_style=SeparatorStyle.CHATML,
1642
+ sep="</s>",
1643
+ stop_token_ids=[2],
1644
+ stop_str="</s>",
1645
+ )
1646
+ )
1647
+
1648
+
1649
if __name__ == "__main__":
    # Manual smoke test: render a few registered templates and print the
    # result so the prompt formats can be inspected by eye.
    from fastchat.conversation import get_conv_template

    # (role index, message) pairs appended to every demo conversation.  The
    # last assistant message is None -- presumably the open slot for the
    # model's reply; confirm against Conversation.get_prompt.
    _DEMO_TURNS = (
        (0, "Hello!"),
        (1, "Hi!"),
        (0, "How are you?"),
        (1, None),
    )

    def _build_demo(template_name, system_message=None):
        """Fetch ``template_name`` and load the fixed demo exchange into it."""
        demo = get_conv_template(template_name)
        if system_message is not None:
            demo.set_system_message(system_message)
        for role_index, text in _DEMO_TURNS:
            demo.append_message(demo.roles[role_index], text)
        return demo

    print("-- Vicuna template --")
    conv = _build_demo("vicuna_v1.1")
    print(conv.get_prompt())

    print("\n")

    print("-- Llama-2 template --")
    conv = _build_demo(
        "llama-2",
        system_message="You are a helpful, respectful and honest assistant.",
    )
    print(conv.get_prompt())

    print("\n")

    print("-- ChatGPT template --")
    conv = _build_demo("chatgpt")
    # This template is printed as OpenAI-API messages rather than a prompt.
    print(conv.to_openai_api_messages())

    print("\n")

    print("-- Claude template --")
    conv = _build_demo("claude")
    print(conv.get_prompt())
fastchat/data/__init__.py ADDED
File without changes
fastchat/data/clean_sharegpt.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ - Convert html to markdown with basic data cleaning.
3
+ - Deduplication.
4
+
5
+ Usage:
6
+ python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
7
+ """
8
+ import argparse
9
+ from concurrent.futures import ProcessPoolExecutor
10
+ import json
11
+ import logging
12
+ import re
13
+ from typing import Dict, Union
14
+
15
+ import bs4
16
+ import markdownify # == 0.11.6
17
+ from tqdm import tqdm
18
+
19
+
20
# Pre-compiled patterns used by the ShareGPT HTML cleaner.
# All patterns are raw strings so that regex escapes such as \s and \d are not
# parsed as (invalid) Python string escapes.

# Opening <div ...> / <span ...> tags (closing tags are left alone).
div_pattern = re.compile(r"<div.*?>")
span_pattern = re.compile(r"<span.*?>")
# A fenced block of the form ```<language>Copy code<code>```;
# group 1 is the language label, group 2 the code itself.
code_lang_pattern = re.compile(
    r"```\s*" + r"(.*?)" + r"(?:Copy code)+" + r"(.+?)" + r"\s*?```", re.DOTALL
)
# Replacement template for code_lang_pattern: language on the fence line,
# code on its own lines. "\\g" is a literal backslash-g group reference.
code_lang_format = "```\\g<1>\n\\g<2>\n```"
# "[number] / [number]" counter.
regenerate_pattern = re.compile(r"\d+ / \d+")
# "Copy[number] chars / [number] words" noise.
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
# A fenced block that holds nothing but the "Copy code" label.
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")
29
+
30
+
31
def reformat_code(val: str) -> str:
    """Rewrite scraped code fences into regular markdown fences.

    The scraped form is::

        ```
        $<language>Copy code$<exact_code_here>

        ```

    Every such fence is rewritten via ``code_lang_format`` so the language sits
    on the opening fence line and the code follows on its own lines.
    """
    return code_lang_pattern.sub(code_lang_format, val)
39
+
40
+
41
def html_to_markdown(val: str) -> str:
    """Convert one HTML message to markdown and strip known UI noise."""
    # Drop opening <div> tags (needed for indentation inside code blocks) and
    # opening <span> tags (needed for underscores inside code blocks).
    without_tags = span_pattern.sub("", div_pattern.sub("", val))
    # HTML to markdown, then repair the code fences.
    text = reformat_code(markdownify.markdownify(without_tags).strip())

    # Remove a noisy "[number] / [number]" counter, but only at the very start.
    leading_counter = regenerate_pattern.match(text)
    if leading_counter:
        text = text[leading_counter.end() :]
    # Remove noisy "Copy[number] chars / [number] words".
    text = copy_chars_pattern.sub("", text)
    # Remove empty code blocks of the form ```\nCopy code\n```.
    text = copy_code_pattern.sub("", text)

    # Collapse triple newlines and trim the ends.
    return text.replace("\n\n\n", "\n").strip()
64
+
65
+
66
def contain_blocked_words(val: str) -> bool:
    """Return True if ``val`` mentions any blocked word, ignoring case.

    Matching is by substring, so a blocked word inside a longer word counts.
    """
    blocked_words = ("openai", "chatgpt")
    # Lower-case once instead of once per blocked word.
    lowered = val.lower()
    return any(word in lowered for word in blocked_words)
72
+
73
+
74
def clean_html_one_sample(sample):
    """Clean one conversation sample and report why it was rejected, if at all.

    ``sample["conversations"]`` is modified in place (trimmed and/or replaced).

    Returns:
        A ``(sample, error_code)`` tuple. ``clean_html_all`` reads the codes as:
        0 = ok, 1 = too short, 2 = wrong format (roles do not alternate
        human/gpt), 3 = contains blocked words, 4 = parser error.
    """
    roles = ["human", "gpt"]

    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4
    if sample["conversations"][0]["from"] != "human":
        sample["conversations"] = sample["conversations"][1:]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    # Drop a trailing human message that has no reply.
    if sample["conversations"][-1]["from"] == "human":
        sample["conversations"] = sample["conversations"][:-1]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    char_count = 0
    new_conversations = []
    for i, c in enumerate(sample["conversations"]):
        # Turns must strictly alternate human, gpt, human, ...
        if c["from"] != roles[i % 2]:
            return (sample, 2)

        if contain_blocked_words(c["value"]):
            return (sample, 3)

        try:
            new_val = html_to_markdown(c["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        # Filter empty answers like https://sharegpt.com/c/mrllZ6u
        # (everything from the first empty/non-printable message on is dropped).
        if not new_val or not new_val[0].isprintable():
            break

        char_count += len(new_val)
        new_conversations.append(
            {
                "from": c["from"],
                "value": new_val,
            }
        )

    # Keep only complete (human, gpt) pairs.
    new_conversations = new_conversations[: len(new_conversations) // 2 * 2]
    sample["conversations"] = new_conversations

    if char_count < 16 or len(sample["conversations"]) <= 0:
        return (sample, 1)

    return (sample, 0)
124
+
125
+
126
def clean_html_all(content, begin, end):
    """
    Clean the source html files.

    Args:
        content: List of samples; each needs an "id" and "conversations".
        begin: Start index of the slice of ``content`` to process (may be None).
        end: End index of the slice of ``content`` to process (may be None).

    Returns:
        The samples that passed cleaning and are neither id duplicates, value
        duplicates (same first two message values), nor plugin conversations.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_plugin = 0

    content = content[begin:end]
    with ProcessPoolExecutor() as executor:
        processed = list(
            tqdm(executor.map(clean_html_one_sample, content), total=len(content))
        )

    # Maps both sample ids and (first human, first gpt) value pairs to the id
    # of the first sample that was kept with them.
    visited = {}
    new_content = []
    for sample, error_code in processed:
        cid = sample["id"]
        skipped = True

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        elif sample.get("plugins", None) is not None:
            print(f"id {cid} contains plugin")
            cnt_plugin += 1
        else:
            key = (
                sample["conversations"][0]["value"],
                sample["conversations"][1]["value"],
            )
            if key in visited:
                print(f"id {cid} is a value duplication of {visited[key]}")
                cnt_value_duplication += 1
            else:
                visited[cid] = visited[key] = cid
                skipped = False

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_plugin: {cnt_plugin}"
    )

    return new_content
201
+
202
+
203
def main(args):
    """Load ``args["in_file"]``, clean it, and write ``args["out_file"]``.

    Args:
        args: Mapping with the keys "in_file", "out_file", "begin" and "end".
    """
    # Context managers make sure both handles are closed (and the output is
    # flushed) even if cleaning raises. UTF-8 is explicit because the output
    # is written with ensure_ascii=False.
    with open(args["in_file"], "r", encoding="utf-8") as fin:
        content = json.load(fin)
    content = clean_html_all(content, args["begin"], args["end"])
    with open(args["out_file"], "w", encoding="utf-8") as fout:
        json.dump(content, fout, indent=2, ensure_ascii=False)
207
+
208
+
209
if __name__ == "__main__":
    # CLI entry point: parse the options and hand them to main() as a dict.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    # Optional slice bounds applied to the loaded list before cleaning.
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    # NOTE(review): --debug is accepted but main() never reads it.
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    main(vars(args))
fastchat/data/convert_alpaca.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert alpaca dataset into sharegpt format.
3
+
4
+ Usage: python3 -m fastchat.data.convert_alpaca --in alpaca_data.json
5
+ """
6
+
7
+ import argparse
8
+ import json
9
+
10
+ from transformers import AutoTokenizer, AutoModelForCausalLM
11
+ import numpy as np
12
+
13
+
14
if __name__ == "__main__":
    # Convert Alpaca records (instruction / input / output) into ShareGPT-style
    # two-turn conversations.
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them open() would be called on None.
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, required=True)
    args = parser.parse_args()

    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    new_content = []
    for i, c in enumerate(content):
        # NOTE(review): "> 1" treats a one-character input as empty; confirm
        # whether "> 0" was intended before changing it.
        if len(c["input"].strip()) > 1:
            q, a = c["instruction"] + "\nInput:\n" + c["input"], c["output"]
        else:
            q, a = c["instruction"], c["output"]
        new_content.append(
            {
                "id": f"alpaca_{i}",
                "conversations": [
                    {"from": "human", "value": q},
                    {"from": "gpt", "value": a},
                ],
            }
        )

    print(f"#out: {len(new_content)}")
    with open(args.out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)
fastchat/data/extract_gpt4_only.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extract the conversations generated by GPT-4 only.
3
+
4
+ Usage: python3 -m fastchat.data.extract_gpt4_only --in sharegpt.json
5
+ """
6
+ import argparse
7
+ import json
8
+
9
+
10
if __name__ == "__main__":
    # Keep only conversations whose "model" field is "gpt4" or missing.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    args = parser.parse_args()

    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)
    content = content[args.begin : args.end]
    new_content = [c for c in content if c.get("model", None) in ("gpt4", None)]

    if args.out_file:
        out_file = args.out_file
    else:
        # Default output name: input name with "_gpt4" inserted before ".json".
        out_file = args.in_file.replace(".json", "_gpt4.json")

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)
fastchat/data/extract_single_round.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extract the first round of the conversations.
3
+
4
+ Usage: python3 -m fastchat.data.extract_single_round --in sharegpt.json
5
+ """
6
+ import argparse
7
+ import json
8
+
9
+
10
if __name__ == "__main__":
    # Truncate every conversation to its first round (first two messages).
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    args = parser.parse_args()

    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)
    content = content[args.begin : args.end]
    for c in content:
        c["conversations"] = c["conversations"][:2]

    if args.out_file:
        out_file = args.out_file
    else:
        # Default output name: input name with "_single" inserted before ".json".
        out_file = args.in_file.replace(".json", "_single.json")

    print(f"#in: {len(content)}, #out: {len(content)}")
    with open(out_file, "w", encoding="utf-8") as fout:
        json.dump(content, fout, indent=2, ensure_ascii=False)
fastchat/data/filter_wrong_format.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Filter conversations with wrong formats.
3
+
4
+ Usage:
5
+ python3 -m fastchat.data.filter_wrong_format --in input.json --out output.json
6
+
7
+ """
8
+ import argparse
9
+ import json
10
+ import re
11
+
12
+ from tqdm import tqdm
13
+
14
# Two list items numbered "1." with no "2" anywhere between them, i.e. a
# numbered list that restarts instead of counting up. Raw string so that "\."
# is a regex escape rather than an invalid Python string escape.
wrong_indices_pattern = re.compile(r"\n1\. [^2]*\n1\. ")


def should_skip(conv):
    """Return True if any message in ``conv`` has a restarted numbered list.

    Filters wrong list indices like https://sharegpt.com/c/1pREAGO
    """
    return any(
        wrong_indices_pattern.search(sentence["value"]) is not None
        for sentence in conv["conversations"]
    )
26
+
27
+
28
if __name__ == "__main__":
    # Drop every conversation that should_skip() flags as wrongly formatted.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, required=True)
    args = parser.parse_args()

    # The input is fully loaded and closed before the output is opened, so
    # passing the same path for --in-file and --out-file stays safe.
    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    new_content = []
    for conv in tqdm(content):
        if should_skip(conv):
            print(f"{conv['id']} contains a wrong format.")
        else:
            new_content.append(conv)

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(args.out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)
fastchat/data/get_stats.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Get stats of a dataset.
3
+
4
+ Usage: python3 -m fastchat.data.get_stats --in sharegpt.json
5
+ """
6
+
7
+ import argparse
8
+ from concurrent.futures import ProcessPoolExecutor
9
+ import json
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+ from transformers import AutoTokenizer, AutoModelForCausalLM
14
+
15
+ K = 1e3
16
+ M = 1e6
17
+
18
+
19
def tokenize_one_sample(c):
    """Replace every message's "value" with its token list, in place.

    Reads the module-level ``tokenizer``, which this file assigns only under
    the ``__main__`` guard.
    NOTE(review): worker processes started with the "spawn" method do not run
    that guard, so ``tokenizer`` may be undefined there -- confirm on
    macOS/Windows.

    Returns:
        The same ``c`` object, for use with ``executor.map``.
    """
    for turn in c["conversations"]:
        turn["value"] = tokenizer.tokenize(turn["value"])
    return c
24
+
25
+
26
def tokenize_dataset(content):
    """Tokenize all samples in a process pool and return them in input order."""
    with ProcessPoolExecutor() as pool:
        tokenized = pool.map(tokenize_one_sample, content)
        # Drain the iterator inside the pool context, with a progress bar.
        return list(tqdm(tokenized, total=len(content)))
35
+
36
+
37
def compute_stats(content):
    """Collect length statistics over (prompt, response) pairs.

    Message values only need to support ``len()``. A trailing unpaired message
    is ignored.

    Returns:
        ``(sample_lens, sample_turns, prompt_lens, res_lens)``: total length
        and number of complete turns per sample, plus the prompt and response
        lengths of every turn across all samples.
    """
    sample_lens, sample_turns = [], []
    prompt_lens, res_lens = [], []

    for sample in content:
        messages = sample["conversations"]
        sample_turns.append(len(messages) // 2)
        total = 0
        # zip() stops at the shorter slice, dropping an unpaired last message.
        for prompt, response in zip(messages[0::2], messages[1::2]):
            n_prompt = len(prompt["value"])
            n_response = len(response["value"])
            total += n_prompt + n_response
            prompt_lens.append(n_prompt)
            res_lens.append(n_response)
        sample_lens.append(total)

    return sample_lens, sample_turns, prompt_lens, res_lens
57
+
58
+
59
if __name__ == "__main__":
    # Print token-level statistics and a length histogram for a dataset.
    parser = argparse.ArgumentParser()
    # Mandatory: without it open() would be called on None.
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument(
        "--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
    )
    args = parser.parse_args()

    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)
    # Must stay a module-level name: tokenize_one_sample reads it as a global.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
    content = tokenize_dataset(content)

    sample_lens, sample_turns, prompt_lens, res_lens = compute_stats(content)
    print(f"#sequence: {len(content)/K:.2f} K")
    print(f"#tokens: {np.sum(sample_lens)/M:.2f} M")
    print(f"avg. turns: {np.mean(sample_turns):.2f}")
    print(f"avg. prompt length: {np.mean(prompt_lens):.2f}")
    print(f"avg. response length: {np.mean(res_lens):.2f}")

    print("\n- Histogram -")
    bin_edges = [0, 1024, 2048, 4096, 8192, 16384, 32768]
    hist = np.histogram(sample_lens, bins=bin_edges)[0]
    for i in range(len(hist)):
        print(f"L{bin_edges[i]} - {bin_edges[i+1]}: {hist[i]}")
fastchat/data/hardcoded_questions.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hardcoded question and answers.
3
+ """
4
+ import json
5
+
6
+
7
def identity_questions():
    """
    Build the hard-coded identity conversations (who the model is, who made it).

    Adapted from https://github.com/young-geng/koala_data_pipeline/blob/main/process_hard_coded_data.py

    Returns a list of two-message conversations with ids ``identity_<n>``,
    numbered in generation order: within each group every question is paired
    with every answer.
    """
    name = "Vicuna"
    org = "Large Model Systems Organization (LMSYS)"
    content = []

    def generate_conversations(questions, answers):
        # Question-major cross product; ids continue from the current size.
        pairs = [(q, a) for q in questions for a in answers]
        for idx, (q, a) in enumerate(pairs, start=len(content)):
            content.append(
                {
                    "id": f"identity_{idx}",
                    "conversations": [
                        {"from": "human", "value": q},
                        {"from": "gpt", "value": a},
                    ],
                }
            )

    # Group 1: "who are you?"
    generate_conversations(
        [
            "Who are you?",
            "What is your name?",
            "Can you introduce yourself?",
            "Can you tell me a little bit about yourself?",
            "What's your name?",
            "What are you called?",
            "What are you?",
            "Tell me your name.",
            "Tell me about yourself.",
            "Tell me about you.",
            "Tell me who you are.",
            "Please introduce yourself.",
        ],
        [
            f"I am {name}, a language model trained by researchers from {org}.",
            f"My name is {name}, and I'm a language model developed by {org}.",
            f"You can call me {name}, and I was trained by {org} researchers as a language model.",
            f"As a language model, I go by the name {name} and was trained by researchers from {org}.",
            f"I'm a language model called {name}, and I was trained by {org} researchers.",
            f"You may refer to me as {name}, a language model meticulously developed by the researchers at {org}.",
        ],
    )

    # Group 2: "who created you?"
    generate_conversations(
        [
            "Who created you?",
            "Who made you?",
            "Who built you?",
            "Who programmed you?",
            "Who trained you?",
            "Who taught you?",
            "Who developed you?",
        ],
        [
            f"Researchers from {org} created me.",
            f"I'm created by {org}.",
            f"I'm built by researchers from {org}.",
            f"I am a language model trained by researchers from {org}.",
            f"I'm a language model developed by {org}.",
            f"I'm a language model created by researchers from {org}.",
            f"My creators are researchers from {org}.",
        ],
    )

    # Group 3: "are you <other model / other vendor>?" Questions are produced
    # model-by-model first, then template-by-template over all vendors.
    other_models = [
        "ChatGPT",
        "GPT-2",
        "GPT-3",
        "GPT-4",
        "davinci",
        "davinci-001",
        "davinci-002",
        "davinci-003",
        "curie",
    ]
    vendors = ["OpenAI", "Google", "Microsoft", "Meta", "IBM"]
    vendor_templates = [
        "Are you trained by {}?",
        "Do you call {} APIs?",
        "Are you created by {}?",
        "Are you developed by {}?",
        "Are you trained on {} data?",
        "Are you trained with {} data?",
        "Have you been trained with {} data?",
        "Are you finetuned on {} data?",
        "Are you finetuned with {} data?",
        "Have you been finetuned with {} data?",
    ]
    denial_questions = (
        [f"Are you {model}?" for model in other_models]
        + [f"Are you based on {model}?" for model in other_models]
        + [
            template.format(vendor)
            for template in vendor_templates
            for vendor in vendors
        ]
    )
    generate_conversations(
        denial_questions,
        [
            f"No, I am a language model trained by researchers from {org}.",
            f"No, I am a language model developed by researchers from {org}.",
            f"No, I am a language model created by researchers from {org}.",
            f"No, I am trained by researchers from {org}.",
            f"No, I am developed by researchers from {org}.",
            f"No, I am created by researchers from {org}.",
            f"No, I'm a language model trained by researchers from {org}.",
            f"No, I'm a language model developed by researchers from {org}.",
            f"No, I'm a language model created by researchers from {org}.",
            f"No, I'm trained by researchers from {org}.",
            f"No, I'm developed by researchers from {org}.",
            f"No, I'm created by researchers from {org}.",
        ],
    )

    return content
160
+
161
+
162
if __name__ == "__main__":
    # Write all hard-coded conversations to hardcoded.json in the working dir.
    out_file = "hardcoded.json"

    content = []
    content.extend(identity_questions())

    # Context manager so the file is flushed and closed deterministically.
    with open(out_file, "w") as fout:
        json.dump(content, fout, indent=2)
fastchat/data/inspect_data.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 -m fastchat.data.inspect_data --in sharegpt_20230322_clean_lang_split.json
4
+ """
5
+ import argparse
6
+ import json
7
+ import random
8
+
9
+
10
if __name__ == "__main__":
    # Interactive viewer: print one sample at a time and wait for Enter.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--begin", type=int)
    parser.add_argument("--random-n", type=int)
    args = parser.parse_args()

    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    if args.random_n:
        # Sampling is with replacement, so an index may repeat.
        indices = [random.randint(0, len(content) - 1) for _ in range(args.random_n)]
    elif args.begin:
        indices = range(args.begin, len(content))
    else:
        indices = range(0, len(content))

    for idx in indices:
        sample = content[idx]
        print("=" * 40)
        print(f"no: {idx}, id: {sample['id']}")
        for conv in sample["conversations"]:
            print(conv["from"] + ": ")
            print(conv["value"])
        input()
fastchat/data/merge.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Merge two conversation files into one
3
+
4
+ Usage: python3 -m fastchat.data.merge --in file1.json file2.json --out merged.json
5
+ """
6
+
7
+ import argparse
8
+ import json
9
+
10
+
11
if __name__ == "__main__":
    # Concatenate the JSON lists of all input files, in argument order.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True, nargs="+")
    parser.add_argument("--out-file", type=str, default="merged.json")
    args = parser.parse_args()

    new_content = []
    for in_file in args.in_file:
        # Each input is closed before the next one is opened.
        with open(in_file, "r", encoding="utf-8") as fin:
            new_content.extend(json.load(fin))

    print(f"#out: {len(new_content)}")
    with open(args.out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)
fastchat/data/optional_clean.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Do optional cleaning (e.g., remove some languages).
3
+
4
+ Usage:
5
+ python3 -m fastchat.data.optional_clean --in input.json --out output.json --keep-lang en
6
+ python3 -m fastchat.data.optional_clean --in input.json --out output.json --skip-lang en
7
+
8
+ Requirement:
9
+ pip3 install polyglot pyicu pycld2
10
+ """
11
+ import argparse
12
+ import json
13
+ import re
14
+
15
+ import polyglot
16
+ from polyglot.detect import Detector
17
+ import pycld2
18
+ from tqdm import tqdm
19
+
20
+
21
def skip(conv, args):
    """Decide whether ``conv`` should be dropped by the optional filters.

    Args:
        conv: Sample with a "conversations" list of {"value": str} messages.
        args: Object with ``keep_lang``, ``skip_lang`` and ``reduce_rep``.

    Returns:
        True if the detected language is filtered out, or (with ``reduce_rep``)
        if any message contains one digit repeated nine times in a row.
    """
    messages = conv["conversations"]

    # Language filter: only detect the language when a filter is active.
    language_filter_on = not (args.keep_lang == "all" and args.skip_lang is None)
    if language_filter_on:
        joined = "\n".join([m["value"] for m in messages])
        try:
            lang_code = Detector(joined).language.code
        except (pycld2.error, polyglot.detect.base.UnknownLanguage):
            lang_code = "unknown"

        if args.keep_lang != "all" and lang_code != args.keep_lang:
            return True

        if lang_code == args.skip_lang:
            return True

    # Repetition filter: nine identical consecutive digits.
    if args.reduce_rep:
        for m in messages:
            if re.search(r"(\d)\1{8}", m["value"]) is not None:
                return True

    return False
45
+
46
+
47
if __name__ == "__main__":
    # Apply the optional language / repetition filters to a dataset.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument(
        "--keep-lang",
        type=str,
        default="all",
        choices=["all", "en"],
        help="Only keep certain langauges.",
    )
    parser.add_argument("--skip-lang", type=str, help="Skip a specific language.")
    # NOTE: Be careful about reduce_rep which may remove some good data.
    # For example, addresses could have long consecutive 0's
    parser.add_argument("--reduce-rep", action="store_true")
    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    keep_lang = args.keep_lang
    skip_lang = args.skip_lang
    reduce_rep = args.reduce_rep
    # The two language options are mutually exclusive. Report it as a usage
    # error instead of an assert, which is stripped under "python -O".
    if keep_lang != "all" and skip_lang is not None:
        parser.error("--keep-lang and --skip-lang cannot be used together")

    if out_file is None:
        # Derive an output name that records which filters were applied.
        out_file = "sharegpt_clean"
        if keep_lang != "all":
            out_file += "_" + keep_lang
        if skip_lang is not None:
            out_file += "_skip_" + skip_lang
        if reduce_rep:
            out_file += "_reduce_rep"
        out_file += ".json"

    with open(in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    new_content = []
    for conv in tqdm(content):
        if not skip(conv, args):
            new_content.append(conv)

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)
fastchat/data/optional_replace.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Do optional replace of bos/eos/pad/unk.
3
+
4
+ Usage:
5
+ python3 -m fastchat.data.optional_replace --in input.json --out output.json --model-name-or-path <your_token_path>
6
+
7
+ Requirement:
8
+ pip3 install transformers tqdm
9
+ """
10
+ import argparse
11
+ import json
12
+ import traceback
13
+
14
+ import transformers
15
+ from tqdm import tqdm
16
+
17
+
18
def replace_special_tokens(
    tokenizer: "transformers.PreTrainedTokenizer", text: str
) -> str:
    """Break up literal bos/eos/pad/unk token strings that occur in ``text``.

    Each occurrence of a special token is rewritten with "|" inserted after
    its first and before its last character (e.g. "</s>" -> "<|/s|>"), so the
    text no longer contains the token verbatim. Tokens that are unset or empty
    are ignored.

    Args:
        tokenizer: Object exposing ``bos_token``, ``eos_token``, ``pad_token``
            and ``unk_token``. The annotation is a string so it is not
            evaluated when the function is defined.
        text: Text to sanitize; falsy values are returned unchanged.
    """
    if not text:
        return text

    def _insert_vline(token: str) -> str:
        if len(token) < 2:
            # Too short to split: the token is replaced by a single space.
            return " "
        elif len(token) == 2:
            return f"{token[0]}|{token[1]}"
        else:
            return f"{token[:1]}|{token[1:-1]}|{token[-1:]}"

    # Same order as before: bos, eos, pad, unk.
    for token in (
        tokenizer.bos_token,
        tokenizer.eos_token,
        tokenizer.pad_token,
        tokenizer.unk_token,
    ):
        if token:
            text = text.replace(token, _insert_vline(token))
    return text
41
+
42
+
43
def replace(conv, tokenizer):
    """Sanitize bos/eos/pad/unk tokens in every message of ``conv``, in place.

    Does nothing when ``tokenizer`` is falsy. Best effort: any exception is
    printed and swallowed, so messages after the failing one stay untouched.
    """
    if not tokenizer:
        return

    try:
        for message in conv["conversations"]:
            message["value"] = replace_special_tokens(tokenizer, message["value"])
    except Exception:
        traceback.print_exc()
51
+
52
+
53
if __name__ == "__main__":
    # Sanitize special tokens in a dataset using the given model's tokenizer.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str)
    parser.add_argument(
        "--model-name-or-path",
        type=str,
        help="The directory or address where the model token is stored.",
    )
    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    # Without a model path no tokenizer is loaded and replace() is a no-op.
    tokenizer = None
    if args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            args.model_name_or_path,
            trust_remote_code=True,
            use_fast=False,
        )

    if out_file is None:
        out_file = f"{in_file}_replace.json"

    with open(in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    for conv in tqdm(content):
        replace(conv, tokenizer)

    with open(out_file, "w", encoding="utf-8") as fout:
        json.dump(content, fout, indent=2, ensure_ascii=False)
fastchat/data/prepare_all.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prepare all datasets."""
2
+
3
+ import argparse
4
+ import os
5
+
6
+ from fastchat.utils import run_cmd
7
+
8
+
9
if __name__ == "__main__":
    # Run the whole data-preparation pipeline, one module per step, stopping
    # at the first step that exits with a non-zero code.
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", type=str, default="~/datasets/sharegpt_20230521")
    parser.add_argument(
        "--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
    )
    parser.add_argument("--seq-len", type=int, default=4096)
    args = parser.parse_args()

    in_prefix = args.prefix
    model_path = args.model_name_or_path
    seq_len = args.seq_len
    # Short suffix for the common lengths. It is derived from seq_len alone so
    # digits that happen to appear in the path prefix are never rewritten.
    seq_len_tag = {4096: "4k", 8192: "8k", 16384: "16k"}.get(seq_len, str(seq_len))
    prefix = f"{in_prefix}_{seq_len_tag}"

    cmd_list = [
        f"python3 -m fastchat.data.clean_sharegpt --in {in_prefix}_html.json --out {prefix}_clean.json",
        f"python3 -m fastchat.data.optional_clean --in {prefix}_clean.json --out {prefix}_clean_lang.json --skip-lang ko",
        f"python3 -m fastchat.data.split_long_conversation --in {prefix}_clean_lang.json --out {prefix}_clean_lang_split.json --model-name {model_path} --max-length {seq_len}",
        f"python3 -m fastchat.data.filter_wrong_format --in {prefix}_clean_lang_split.json --out {prefix}_clean_lang_split.json",
        f"python3 -m fastchat.data.split_train_test --in {prefix}_clean_lang_split.json --ratio 0.99",
        f"python3 -m fastchat.data.hardcoded_questions",
        f"python3 -m fastchat.data.merge --in {prefix}_clean_lang_split_train.json hardcoded.json --out {prefix}_clean_lang_split_identity.json",
        f"python3 -m fastchat.data.extract_gpt4_only --in {prefix}_clean_lang_split_identity.json",
        f"python3 -m fastchat.data.extract_single_round --in {prefix}_clean_lang_split_identity.json",
    ]

    for cmd in cmd_list:
        ret = run_cmd(cmd)
        if ret != 0:
            # SystemExit works even when the site-provided exit() is absent.
            raise SystemExit(ret)
fastchat/data/pretty_json.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 pretty_json.py --in in.json --out out.json
4
+ """
5
+
6
+ import argparse
7
+ import json
8
+
9
+
10
if __name__ == "__main__":
    # Re-serialize a JSON file with 2-space indentation and unescaped
    # non-ASCII characters.
    cli = argparse.ArgumentParser()
    cli.add_argument("--in-file", type=str, required=True)
    cli.add_argument("--out-file", type=str, required=True)
    args = cli.parse_args()

    # The input is read and closed before the output is opened.
    with open(args.in_file, "r") as src:
        data = json.load(src)

    with open(args.out_file, "w") as dst:
        json.dump(data, dst, indent=2, ensure_ascii=False)
fastchat/data/sample.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sample some conversations from a file.
3
+
4
+ Usage: python3 -m fastchat.data.sample --in sharegpt.json --out sampled.json
5
+ """
6
+ import argparse
7
+ import json
8
+
9
+ import numpy as np
10
+
11
+
12
if __name__ == "__main__":
    # Take the [begin, end) window of a (by default shuffled) dataset and keep
    # the samples whose concatenated text is at most --max-length characters.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sampled.json")
    parser.add_argument("--begin", type=int, default=0)
    parser.add_argument("--end", type=int, default=100)
    parser.add_argument("--max-length", type=int, default=1024)
    parser.add_argument("--keep-order", action="store_true")
    args = parser.parse_args()

    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)
    if not args.keep_order:
        # Fixed seed keeps the sample reproducible across runs.
        np.random.seed(42)
        np.random.shuffle(content)

    new_content = []
    for i in range(args.begin, min(args.end, len(content))):
        sample = content[i]
        # join() instead of repeated "+=" string concatenation.
        concat = "".join(s["value"] for s in sample["conversations"])

        if len(concat) > args.max_length:
            continue

        new_content.append(sample)

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(args.out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)
fastchat/data/split_long_conversation.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Split long conversations based on certain max length.
3
+
4
+ Usage: python3 -m fastchat.data.split_long_conversation \
5
+ --in sharegpt_clean.json \
6
+ --out sharegpt_split.json \
7
+ --model-name-or-path $<model-name>
8
+ """
9
+ import argparse
10
+ from concurrent.futures import ProcessPoolExecutor
11
+ import json
12
+ from typing import Dict, Sequence, Optional
13
+
14
+ import transformers
15
+ from tqdm import tqdm
16
+
17
+
18
def make_sample(sample, start_idx, end_idx):
    """Build a sub-sample holding conversation turns ``[start_idx, end_idx)``.

    The new id is the source id suffixed with ``_<start_idx>``; ``model``
    falls back to an empty string when the source sample has no such key.
    """
    span = end_idx - start_idx
    # A slice must hold an even number of turns.
    assert span % 2 == 0
    new_id = sample["id"] + "_" + str(start_idx)
    model_name = sample.get("model", "")
    turns = sample["conversations"][start_idx:end_idx]
    return dict(id=new_id, model=model_name, conversations=turns)
25
+
26
+
27
+ tokenizer = max_length = None
28
+
29
+
30
def split_one_sample(sample):
    """Split one conversation into pieces that fit the token budget.

    Reads the module-level ``tokenizer`` and ``max_length``. A trailing
    unpaired turn is discarded, then whole two-turn rounds are packed
    greedily: a new piece starts whenever adding the next round would push
    the running token count above ``max_length``. A single round that is
    longer than ``max_length`` still becomes its own piece.

    Returns:
        A list of samples built by ``make_sample``; empty when the
        conversation has fewer than two turns.
    """
    conversations = sample["conversations"]
    # Drop a trailing unpaired turn so the list has an even length.
    conversations = conversations[: len(conversations) // 2 * 2]
    if len(conversations) < 2:
        return []

    # Per-turn token count plus a constant allowance of 6
    # (presumably for role/separator tokens -- TODO confirm).
    tokenized_lens = [
        len(tokenizer(c["value"]).input_ids) + 6 for c in conversations
    ]

    new_samples = []
    start_idx = 0
    cur_len = 0
    for i in range(0, len(conversations), 2):
        tmp_len = tokenized_lens[i] + tokenized_lens[i + 1]
        if cur_len + tmp_len > max_length:
            # Close the piece accumulated so far. Nothing is accumulated when
            # the very first round already overflows, so skip the empty piece
            # (it would also duplicate the "<id>_0" identifier).
            if i > start_idx:
                new_samples.append(make_sample(sample, start_idx, i))
            start_idx = i
            cur_len = 0
        cur_len += tmp_len

    # Always flush the tail. Previously the final round was silently lost
    # whenever it was the one that overflowed the budget.
    new_samples.append(make_sample(sample, start_idx, len(conversations)))
    return new_samples
57
+
58
+
59
def worker(input_data):
    """Split every sample of one chunk and return all pieces as one flat list."""
    return [piece for item in input_data for piece in split_one_sample(item)]
64
+
65
+
66
def _init_worker(tokenizer_, max_length_):
    """Install the tokenizer and token budget as globals of the current process."""
    global tokenizer, max_length
    tokenizer = tokenizer_
    max_length = max_length_


def split_all(content, begin, end, tokenizer_, max_length_):
    """
    Keep the maximum round of conversations within the max token length constraint

    Args:
        content: list of samples.
        begin, end: slice bounds applied to ``content`` (``None`` means open).
        tokenizer_: tokenizer used by ``split_one_sample``.
        max_length_: token budget of one output sample.

    Returns:
        The flat list of split samples, in input order.
    """
    # Set the globals in this process as well, as the original code did.
    _init_worker(tokenizer_, max_length_)

    content = content[begin:end]
    new_content = []

    # Split content into chunks
    chunks = [content[i : i + 1000] for i in range(0, len(content), 1000)]
    # The initializer sets the globals inside every worker process. Relying on
    # the parent's globals only works with the "fork" start method; with
    # "spawn"/"forkserver" the workers would see tokenizer = max_length = None.
    with ProcessPoolExecutor(
        initializer=_init_worker, initargs=(tokenizer_, max_length_)
    ) as executor:
        for result in tqdm(executor.map(worker, chunks), total=len(chunks)):
            new_content.extend(result)

    return new_content
84
+
85
+
86
def filter_invalid_roles(content):
    """Keep only samples whose turns alternate human, gpt, human, gpt, ...

    Samples without any turn are dropped as well.
    """
    expected = ("human", "gpt")

    def alternates(turns):
        # Turn at even position must be "human", at odd position "gpt".
        return all(
            turn["from"] == expected[pos % 2] for pos, turn in enumerate(turns)
        )

    return [
        c
        for c in content
        if len(c["conversations"]) > 0 and alternates(c["conversations"])
    ]
103
+
104
+
105
def main(args):
    """Load conversations, split them by token budget, filter and save them.

    Reads ``args.in_file``, ``args.out_file``, ``args.begin``, ``args.end``,
    ``args.model_name_or_path`` and ``args.max_length``.
    """
    # Context managers close the handles deterministically; the previous
    # bare open() calls left both files to the garbage collector.
    with open(args.in_file, "r") as fin:
        content = json.load(fin)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        model_max_length=args.max_length,
        padding_side="right",
        use_fast=False,
    )
    new_content = split_all(content, args.begin, args.end, tokenizer, args.max_length)
    new_content = filter_invalid_roles(new_content)

    print(f"#in: {len(content)}, #out: {len(new_content)}")
    with open(args.out_file, "w") as fout:
        json.dump(new_content, fout, indent=2, ensure_ascii=False)
118
+
119
+
120
if __name__ == "__main__":
    # CLI entry point. --begin/--end default to None, i.e. an open slice bound.
    parser = argparse.ArgumentParser()
    option_table = (
        ("--in-file", dict(type=str, required=True)),
        ("--out-file", dict(type=str, default="sharegpt_split.json")),
        ("--begin", dict(type=int)),
        ("--end", dict(type=int)),
        ("--model-name-or-path", dict(type=str, required=True)),
        ("--max-length", dict(type=int, default=2048)),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)
fastchat/data/split_train_test.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Split the dataset into training and test set.
3
+
4
+ Usage: python3 -m fastchat.data.split_train_test --in sharegpt.json
5
+ """
6
+ import argparse
7
+ import json
8
+
9
+ import numpy as np
10
+
11
+
12
if __name__ == "__main__":
    # CLI entry point: shuffle the dataset with a fixed seed and write
    # "<name>_train<ext>" / "<name>_test<ext>" next to the input file.
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    # NOTE: --begin/--end are accepted for compatibility but are not used below.
    parser.add_argument("--begin", type=int, default=0)
    parser.add_argument("--end", type=int, default=100)
    parser.add_argument("--ratio", type=float, default=0.9)
    args = parser.parse_args()

    with open(args.in_file, "r") as fin:
        content = json.load(fin)
    np.random.seed(0)

    perm = np.random.permutation(len(content))
    content = [content[i] for i in perm]
    split = int(args.ratio * len(content))

    train_set = content[:split]
    test_set = content[split:]

    print(f"#train: {len(train_set)}, #test: {len(test_set)}")
    # Insert the suffix before the file extension only. str.replace(".json", ...)
    # rewrote every ".json" in the path (directories included) and, for a name
    # without ".json", produced the input path itself, overwriting the input.
    root, ext = os.path.splitext(args.in_file)
    train_name = f"{root}_train{ext}"
    test_name = f"{root}_test{ext}"
    with open(train_name, "w") as ftrain:
        json.dump(train_set, ftrain, indent=2, ensure_ascii=False)
    with open(test_name, "w") as ftest:
        json.dump(test_set, ftest, indent=2, ensure_ascii=False)
fastchat/model/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from fastchat.model.model_adapter import (
2
+ load_model,
3
+ get_conversation_template,
4
+ add_model_args,
5
+ )
fastchat/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (292 Bytes). View file
 
fastchat/model/__pycache__/compression.cpython-310.pyc ADDED
Binary file (6.74 kB). View file
 
fastchat/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc ADDED
Binary file (2.1 kB). View file
 
fastchat/model/__pycache__/model_adapter.cpython-310.pyc ADDED
Binary file (55.6 kB). View file
 
fastchat/model/__pycache__/model_chatglm.cpython-310.pyc ADDED
Binary file (2.52 kB). View file
 
fastchat/model/__pycache__/model_codet5p.cpython-310.pyc ADDED
Binary file (2.58 kB). View file
 
fastchat/model/__pycache__/model_exllama.cpython-310.pyc ADDED
Binary file (1.77 kB). View file
 
fastchat/model/__pycache__/model_falcon.cpython-310.pyc ADDED
Binary file (2.56 kB). View file
 
fastchat/model/__pycache__/model_registry.cpython-310.pyc ADDED
Binary file (10.8 kB). View file
 
fastchat/model/__pycache__/model_xfastertransformer.cpython-310.pyc ADDED
Binary file (1.67 kB). View file
 
fastchat/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc ADDED
Binary file (3.09 kB). View file
 
fastchat/model/apply_delta.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Apply the delta weights on top of a base model.
3
+
4
+ Usage:
5
+ python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1
6
+ """
7
+ import argparse
8
+ import gc
9
+ import glob
10
+ import json
11
+ import os
12
+ import shutil
13
+ import tempfile
14
+
15
+ from huggingface_hub import snapshot_download
16
+ import torch
17
+ from torch import nn
18
+ from tqdm import tqdm
19
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
20
+
21
+
22
+ GB = 1 << 30
23
+
24
+
25
def split_files(model_path, tmp_path, split_size):
    """Re-shard ``pytorch_model-*.bin`` checkpoint files into ``tmp_path``.

    ``model_path`` is a local folder or, if it does not exist locally, a hub
    repo id that is downloaded first. Tensors are packed into consecutive
    ``pytorch_model-<part>.bin`` files; a file is closed as soon as adding the
    next tensor would exceed ``split_size`` bytes. On any error ``tmp_path``
    is removed and the exception is re-raised.
    """
    if not os.path.exists(model_path):
        model_path = snapshot_download(repo_id=model_path)
    if not os.path.exists(tmp_path):
        os.makedirs(tmp_path)

    shard_paths = glob.glob(os.path.join(model_path, "pytorch_model-*.bin"))

    part = 0
    try:
        for shard_path in tqdm(shard_paths):
            state_dict = torch.load(shard_path)
            bucket = {}
            bucket_bytes = 0

            for name, param in state_dict.items():
                nbytes = param.numel() * param.element_size()

                if bucket_bytes + nbytes > split_size:
                    out_path = os.path.join(tmp_path, f"pytorch_model-{part}.bin")
                    torch.save(bucket, out_path)
                    bucket_bytes = 0
                    # Drop the reference and collect before starting a new bucket.
                    bucket = None
                    gc.collect()
                    bucket = {}
                    part += 1

                bucket[name] = param
                bucket_bytes += nbytes

            # Write whatever is left of this input file.
            out_path = os.path.join(tmp_path, f"pytorch_model-{part}.bin")
            torch.save(bucket, out_path)
            bucket = None
            gc.collect()
            bucket = {}
            part += 1
    except Exception as e:
        print(f"An error occurred during split_files: {e}")
        shutil.rmtree(tmp_path)
        raise
68
+
69
+
70
def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
    """Add delta weights to a base model shard by shard.

    Both checkpoints are first re-split into smaller files inside temporary
    directories; each base shard is then loaded, the matching delta tensors
    are added in place, and the result is saved to ``target_model_path``
    together with a ``pytorch_model.bin.index.json`` weight map, the delta's
    tokenizer and the delta's config. An existing ``target_model_path`` is
    deleted first.

    Raises:
        ValueError: if no checkpoint shard is found for the base or the delta.
        KeyError: if a base tensor is missing from every delta shard.
    """
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
    delta_config = AutoConfig.from_pretrained(delta_path)

    if os.path.exists(target_model_path):
        shutil.rmtree(target_model_path)
    os.makedirs(target_model_path)

    split_size = 4 * GB

    with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
        print(f"Split files for the base model to {tmp_base_path}")
        split_files(base_model_path, tmp_base_path, split_size)
        print(f"Split files for the delta weights to {tmp_delta_path}")
        split_files(delta_path, tmp_delta_path, split_size)

        base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin")
        base_files = glob.glob(base_pattern)
        delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin")
        delta_files = glob.glob(delta_pattern)
        # Fail with a clear message instead of an IndexError on delta_files[0]
        # (or a silently empty target model when there are no base shards).
        if not base_files or not delta_files:
            raise ValueError(
                "Cannot find any pytorch_model-*.bin weight files for "
                f"base ({base_model_path}) or delta ({delta_path})"
            )
        delta_state_dict = torch.load(delta_files[0])

        print("Applying the delta")
        weight_map = {}
        total_size = 0

        # Wrap the list (not the enumerate object) so tqdm knows the total.
        for i, base_file in enumerate(tqdm(base_files)):
            state_dict = torch.load(base_file)
            file_name = f"pytorch_model-{i}.bin"
            for name, param in state_dict.items():
                if name not in delta_state_dict:
                    # The tensor lives in another delta shard: scan them until
                    # it is found, keeping only one delta shard in memory.
                    for delta_file in delta_files:
                        delta_state_dict = torch.load(delta_file)
                        gc.collect()
                        if name in delta_state_dict:
                            break
                    else:
                        raise KeyError(
                            f"Weight {name!r} not found in any delta shard of {delta_path}"
                        )

                state_dict[name] += delta_state_dict[name]
                weight_map[name] = file_name
                total_size += param.numel() * param.element_size()
                gc.collect()
            torch.save(state_dict, os.path.join(target_model_path, file_name))

        with open(
            os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w"
        ) as f:
            json.dump(
                {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f
            )

    print(f"Saving the target model to {target_model_path}")
    delta_tokenizer.save_pretrained(target_model_path)
    delta_config.save_pretrained(target_model_path)
123
+
124
+
125
def apply_delta(base_model_path, target_model_path, delta_path):
    """Add the delta weights to the base model and save the result.

    Both models are loaded completely in float16. Every tensor of the base
    state dict is incremented in place by the delta tensor of the same name;
    the summed model and the delta's tokenizer are written to
    ``target_model_path``.
    """
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = AutoModelForCausalLM.from_pretrained(
        delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print("Applying the delta")
    # Build the delta state dict once; calling state_dict() inside the loop
    # re-created the whole mapping twice for every parameter.
    delta_state = delta.state_dict()
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta_state
        param.data += delta_state[name]

    print(f"Saving the target model to {target_model_path}")
    base.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)
145
+
146
+
147
if __name__ == "__main__":
    # CLI entry point: choose between the in-memory and the low-memory path.
    parser = argparse.ArgumentParser()
    for flag in ("--base-model-path", "--target-model-path", "--delta-path"):
        parser.add_argument(flag, type=str, required=True)
    parser.add_argument(
        "--low-cpu-mem",
        action="store_true",
        help="Lower the cpu memory usage. This will split large files and use "
        "disk as swap to reduce the memory usage below 10GB.",
    )
    args = parser.parse_args()

    run = apply_delta_low_cpu_mem if args.low_cpu_mem else apply_delta
    run(args.base_model_path, args.target_model_path, args.delta_path)
fastchat/model/apply_lora.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Apply the LoRA weights on top of a base model.
3
+
4
+ Usage:
5
+ python3 -m fastchat.model.apply_lora --base ~/model_weights/llama-7b --target ~/model_weights/baize-7b --lora project-baize/baize-lora-7B
6
+
7
+ Dependency:
8
+ pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b
9
+ """
10
+ import argparse
11
+
12
+ import torch
13
+ from peft import PeftModel
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM
15
+
16
+
17
def apply_lora(base_model_path, target_model_path, lora_path):
    """Merge a LoRA adapter into its base model and save the merged model.

    The base model is loaded in float16, wrapped with the adapter found at
    ``lora_path``, merged via ``merge_and_unload`` and written to
    ``target_model_path`` along with the base model's tokenizer.
    """
    print(f"Loading the base model from {base_model_path}")
    load_options = {"torch_dtype": torch.float16, "low_cpu_mem_usage": True}
    base = AutoModelForCausalLM.from_pretrained(base_model_path, **load_options)
    base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)

    print(f"Loading the LoRA adapter from {lora_path}")

    # No torch_dtype is passed to the adapter loader.
    lora_model = PeftModel.from_pretrained(base, lora_path)

    print("Applying the LoRA")
    merged = lora_model.merge_and_unload()

    print(f"Saving the target model to {target_model_path}")
    for artifact in (merged, base_tokenizer):
        artifact.save_pretrained(target_model_path)
38
+
39
+
40
if __name__ == "__main__":
    # CLI entry point: all three paths are mandatory.
    parser = argparse.ArgumentParser()
    for flag in ("--base-model-path", "--target-model-path", "--lora-path"):
        parser.add_argument(flag, type=str, required=True)

    args = parser.parse_args()

    apply_lora(args.base_model_path, args.target_model_path, args.lora_path)
fastchat/model/compression.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import gc
3
+ import glob
4
+ import os
5
+
6
+ from accelerate import init_empty_weights
7
+ from accelerate.utils import set_module_tensor_to_device
8
+ from huggingface_hub import snapshot_download
9
+ import torch
10
+ from torch import Tensor
11
+ from torch.nn import functional as F
12
+ import torch.nn as nn
13
+ from tqdm import tqdm
14
+ from transformers import (
15
+ AutoConfig,
16
+ AutoModelForCausalLM,
17
+ AutoTokenizer,
18
+ AutoModel,
19
+ AutoModelForSeq2SeqLM,
20
+ )
21
+
22
+
23
@dataclasses.dataclass
class CompressionConfig:
    """Group-wise quantization."""

    # Bit width of a quantized value; compress() asserts it is at most 8.
    num_bits: int
    # Number of elements along `group_dim` that share one scale.
    group_size: int
    # Tensor dimension that is cut into groups.
    group_dim: int
    # True: scale by the per-group max magnitude and store int8.
    # False: shift by the per-group min, scale by (max - min) and store uint8.
    symmetric: bool
    # When False, compress() and decompress() return their input unchanged.
    enabled: bool = True
32
+
33
+
34
+ default_compression_config = CompressionConfig(
35
+ num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
36
+ )
37
+
38
+
39
class CLinear(nn.Module):
    """Linear layer that stores its weight in compressed form."""

    def __init__(self, weight=None, bias=None, device=None):
        super().__init__()
        # A Tensor is moved to `device` and compressed; anything else (None or
        # an already packed value) is stored as given.
        if isinstance(weight, Tensor):
            weight = compress(weight.data.to(device), default_compression_config)
        self.weight = weight
        self.bias = bias

    def forward(self, input: Tensor) -> Tensor:
        # The weight is decompressed on every call; input and bias are cast
        # to the decompressed weight's dtype.
        weight = decompress(self.weight, default_compression_config)
        bias = None if self.bias is None else self.bias.to(weight.dtype)
        return F.linear(input.to(weight.dtype), weight, bias)
57
+
58
+
59
def compress_module(module, target_device):
    """Recursively replace every ``nn.Linear`` attribute by a ``CLinear``."""
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        # Exact type match: subclasses of nn.Linear are left untouched.
        if type(candidate) is not torch.nn.Linear:
            continue
        replacement = CLinear(candidate.weight, candidate.bias, target_device)
        setattr(module, attr_name, replacement)
    for _, submodule in module.named_children():
        compress_module(submodule, target_device)
70
+
71
+
72
def get_compressed_list(module, prefix=""):
    """Return the dotted ``<path>.weight`` names of all ``nn.Linear`` layers.

    Attributes of *module* itself come first (in ``dir()`` order), followed
    by the names found recursively in its child modules. ``prefix`` is
    prepended, dot-separated, to every name.
    """

    def qualify(name):
        return f"{prefix}.{name}" if prefix else name

    names = [
        f"{qualify(attr_name)}.weight"
        for attr_name in dir(module)
        # Exact type match: subclasses of nn.Linear are not listed.
        if type(getattr(module, attr_name)) is torch.nn.Linear
    ]
    for child_name, child in module.named_children():
        names.extend(get_compressed_list(child, qualify(child_name)))
    return names
86
+
87
+
88
def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
    """Recursively swap ``nn.Linear`` attributes for ``CLinear`` layers.

    The weight of each replacement is looked up in ``compressed_state_dict``
    under ``<prefix>.<attribute>.weight``; the bias is taken from the layer
    being replaced.
    """
    dotted = f"{prefix}." if prefix else ""
    for attr_name in dir(module):
        layer = getattr(module, attr_name)
        # Exact type match: subclasses of nn.Linear are left untouched.
        if type(layer) is not torch.nn.Linear:
            continue
        packed = compressed_state_dict[f"{dotted}{attr_name}.weight"]
        setattr(module, attr_name, CLinear(packed, layer.bias, target_device))
    for child_name, child in module.named_children():
        apply_compressed_weight(
            child, compressed_state_dict, target_device, f"{dotted}{child_name}"
        )
107
+
108
+
109
def load_compress_model(model_path, device, torch_dtype, use_fast, revision="main"):
    """Load a model with the weights of its ``nn.Linear`` layers compressed.

    The model skeleton is created without weights, then every
    ``pytorch_model*.bin`` file is loaded in turn: linear weights are
    compressed with ``default_compression_config``, all other tensors are
    moved to ``device``/``torch_dtype`` as they are.

    Args:
        model_path: local folder or Hugging Face repo id.
        device: target device of the tensors.
        torch_dtype: dtype the loaded tensors are cast to.
        use_fast: preferred ``use_fast`` setting for the tokenizer.
        revision: repo revision to load.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode on ``device``.

    Raises:
        ValueError: if no ``pytorch_model*.bin`` file can be found.
    """
    # partially load model
    # `use_fast=True`` is not supported for some models.
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=use_fast, revision=revision, trust_remote_code=True
        )
    except TypeError:
        # Retry with the opposite setting. This must be a logical `not`:
        # `~use_fast` is a bitwise NOT that yields -2 / -1 for True / False,
        # both truthy, so the flag was never actually switched off.
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=not use_fast,
            revision=revision,
            trust_remote_code=True,
        )
    with init_empty_weights():
        # `trust_remote_code` should be set as `True` for both AutoConfig and AutoModel
        config = AutoConfig.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
            revision=revision,
        )
        # some models are loaded by AutoModel but not AutoModelForCausalLM,
        # such as chatglm, chatglm2
        try:
            # google/flan-* models are based on an AutoModelForSeq2SeqLM.
            if "T5Config" in str(type(config)):
                model = AutoModelForSeq2SeqLM.from_config(
                    config, trust_remote_code=True
                )
            else:
                model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
        except NameError:
            # NOTE(review): only NameError triggers the AutoModel fallback;
            # confirm this is the exception unsupported configs really raise.
            model = AutoModel.from_config(config, trust_remote_code=True)
        # A set gives O(1) membership tests in the loading loops below.
        linear_weights = set(get_compressed_list(model))
    if os.path.exists(model_path):
        # `model_path` is a local folder
        base_pattern = os.path.join(model_path, "pytorch_model*.bin")
    else:
        # `model_path` is a cached Hugging Face repo
        # We don't necessarily need to download the model' repo again if there is a cache.
        # So check the default huggingface cache first.
        model_path_temp = os.path.join(
            os.path.expanduser("~"),
            ".cache/huggingface/hub",
            "models--" + model_path.replace("/", "--"),
            "snapshots/",
        )
        downloaded = False
        if os.path.exists(model_path_temp):
            snapshot_dirs = os.listdir(model_path_temp)
            # An empty snapshots folder simply means "not cached"
            # (indexing [-1] on it used to raise IndexError).
            if snapshot_dirs:
                model_path_temp = os.path.join(model_path_temp, snapshot_dirs[-1])
                base_pattern = os.path.join(model_path_temp, "pytorch_model*.bin")
                files = glob.glob(base_pattern)
                if len(files) > 0:
                    downloaded = True

        if downloaded:
            model_path = model_path_temp
        else:
            model_path = snapshot_download(model_path, revision=revision)
        base_pattern = os.path.join(model_path, "pytorch_model*.bin")

    files = glob.glob(base_pattern)
    if len(files) == 0:
        raise ValueError(
            f"Cannot find any model weight files. "
            f"Please check your (cached) weight path: {model_path}"
        )

    compressed_state_dict = {}
    for filename in tqdm(files):
        # The map_location callable keeps every storage where it was
        # deserialized instead of moving it to the device it was saved from.
        tmp_state_dict = torch.load(filename, map_location=lambda storage, loc: storage)
        for name in tmp_state_dict:
            if name in linear_weights:
                tensor = tmp_state_dict[name].to(device, dtype=torch_dtype)
                compressed_state_dict[name] = compress(
                    tensor, default_compression_config
                )
            else:
                compressed_state_dict[name] = tmp_state_dict[name].to(
                    device, dtype=torch_dtype
                )
            # Release the uncompressed tensor right away to limit peak memory.
            tmp_state_dict[name] = None
            tensor = None
            gc.collect()
            torch.cuda.empty_cache()
            if device == "xpu":
                torch.xpu.empty_cache()
            if device == "npu":
                torch.npu.empty_cache()

    for name in model.state_dict():
        if name not in linear_weights:
            set_module_tensor_to_device(
                model, name, device, value=compressed_state_dict[name]
            )
    apply_compressed_weight(model, compressed_state_dict, device)

    if torch_dtype == torch.float16:
        model.half()
    model.to(device)
    model.eval()

    return model, tokenizer
212
+
213
+
214
def compress(tensor, config):
    """Simulate group-wise quantization.

    The dimension ``config.group_dim`` is zero-padded to a multiple of
    ``config.group_size`` and cut into groups, each quantized with its own
    scale.

    Returns:
        ``tensor`` itself when ``config.enabled`` is false; otherwise
        ``(data, scale, original_shape)`` with int8 data in symmetric mode,
        or ``(data, mn, scale, original_shape)`` with uint8 data.
    """
    if not config.enabled:
        return tensor

    num_bits = config.num_bits
    group_size = config.group_size
    group_dim = config.group_dim
    assert num_bits <= 8

    original_shape = tensor.shape
    dim_len = original_shape[group_dim]
    head = original_shape[:group_dim]
    tail = original_shape[group_dim + 1 :]
    num_groups = (dim_len + group_size - 1) // group_size
    new_shape = head + (num_groups, group_size) + tail

    # Pad
    pad_len = (group_size - dim_len % group_size) % group_size
    if pad_len != 0:
        padding = torch.zeros(
            head + (pad_len,) + tail, dtype=tensor.dtype, device=tensor.device
        )
        tensor = torch.cat([tensor, padding], dim=group_dim)
    data = tensor.view(new_shape)

    # Quantize: statistics are reduced over the within-group dimension.
    reduce_dim = group_dim + 1
    if config.symmetric:
        B = 2 ** (num_bits - 1) - 1
        peak = torch.max(data.abs(), dim=reduce_dim, keepdim=True)[0]
        scale = B / peak
        data = data * scale
        data = data.clamp_(-B, B).round_().to(torch.int8)
        return data, scale, original_shape

    B = 2**num_bits - 1
    mn = torch.min(data, dim=reduce_dim, keepdim=True)[0]
    mx = torch.max(data, dim=reduce_dim, keepdim=True)[0]

    scale = B / (mx - mn)
    data = data - mn
    data.mul_(scale)

    data = data.clamp_(0, B).round_().to(torch.uint8)
    return data, mn, scale, original_shape
265
+
266
+
267
def decompress(packed_data, config):
    """Simulate group-wise dequantization.

    Args:
        packed_data: ``(data, scale, original_shape)`` in symmetric mode,
            otherwise ``(data, mn, scale, original_shape)``.
        config: the configuration the data was compressed with.

    Returns:
        ``packed_data`` unchanged when ``config.enabled`` is false, otherwise
        the dequantized tensor with shape ``original_shape``.
    """
    if not config.enabled:
        return packed_data

    group_size, group_dim, symmetric = (
        config.group_size,
        config.group_dim,
        config.symmetric,
    )

    # Dequantize
    if symmetric:
        data, scale, original_shape = packed_data
        data = data / scale
    else:
        data, mn, scale, original_shape = packed_data
        data = data / scale
        data.add_(mn)

    # Unpad
    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
    if pad_len:
        padded_original_shape = (
            original_shape[:group_dim]
            + (original_shape[group_dim] + pad_len,)
            + original_shape[group_dim + 1 :]
        )
        data = data.reshape(padded_original_shape)
        # Multi-dimensional indexing needs a tuple of slices; indexing with a
        # list of slices is deprecated and may be read as an index tensor.
        indices = tuple(slice(0, x) for x in original_shape)
        return data[indices].contiguous()
    else:
        return data.view(original_shape)
fastchat/model/convert_fp16.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 -m fastchat.model.convert_fp16 --in in-folder --out out-folder
4
+ """
5
+ import argparse
6
+
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import torch
9
+
10
+
11
def convert_fp16(in_checkpoint, out_checkpoint):
    """Reload a checkpoint in float16 and save it with its tokenizer.

    Both the model and the (slow) tokenizer read from ``in_checkpoint`` are
    written to ``out_checkpoint``.
    """
    tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
    load_options = {"torch_dtype": torch.float16, "low_cpu_mem_usage": True}
    half_model = AutoModelForCausalLM.from_pretrained(in_checkpoint, **load_options)
    for artifact in (half_model, tokenizer):
        artifact.save_pretrained(out_checkpoint)
18
+
19
+
20
if __name__ == "__main__":
    # CLI entry point. Both paths are mandatory: without `required=True` a
    # missing option silently became None and only failed later, inside
    # from_pretrained / save_pretrained, with an unrelated error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in-checkpoint", type=str, required=True, help="Path to the model"
    )
    parser.add_argument(
        "--out-checkpoint", type=str, required=True, help="Path to the output model"
    )
    args = parser.parse_args()

    convert_fp16(args.in_checkpoint, args.out_checkpoint)
fastchat/model/llama_condense_monkey_patch.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py
2
+
3
+ from functools import partial
4
+
5
+ import torch
6
+ import transformers
7
+ import transformers.models.llama.modeling_llama
8
+
9
+
10
class CondenseRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with position indices divided by ``ratio``.

    The cos/sin tables are precomputed for ``max_position_embeddings * ratio``
    positions and rebuilt on demand when a longer sequence is requested.
    """

    def __init__(
        self, dim, ratio, max_position_embeddings=2048, base=10000, device=None
    ):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.ratio = ratio
        max_position_embeddings *= ratio
        self.max_seq_len_cached = max_position_embeddings
        # print(f"Monkey Patching condense ratio {ratio}")
        t = (
            torch.arange(
                self.max_seq_len_cached,
                device=self.inv_freq.device,
                dtype=self.inv_freq.dtype,
            )
            / ratio
        )
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        dtype = torch.get_default_dtype()
        self.register_buffer(
            "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False
        )

    def forward(self, x, seq_len=None):
        """Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Both tables are cast to ``x.dtype``. When ``seq_len`` is omitted it is
        taken from the sequence dimension of ``x``.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The declared default used to crash in the comparison below
            # (`None > int`); derive the length from x instead.
            seq_len = x.shape[-2]
        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            t = (
                torch.arange(
                    self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
                )
                / self.ratio
            )
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            # Different from paper, but it uses a different permutation in order to obtain the same calculation
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            self.register_buffer(
                "cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False
            )
            self.register_buffer(
                "sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False
            )
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
66
+
67
+
68
def replace_llama_with_condense(ratio):
    """Replace ``LlamaRotaryEmbedding`` in transformers' llama module.

    After the call the name refers to ``CondenseRotaryEmbedding`` with
    ``ratio`` pre-bound.
    """
    condensed_factory = partial(CondenseRotaryEmbedding, ratio=ratio)
    llama_module = transformers.models.llama.modeling_llama
    llama_module.LlamaRotaryEmbedding = condensed_factory
fastchat/model/make_delta.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Make the delta weights by subtracting base weights.
3
+
4
+ Usage:
5
+ python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
6
+ """
7
+ import argparse
8
+
9
+ import torch
10
+ from tqdm import tqdm
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM
12
+
13
+
14
def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id=None):
    """Compute ``target - base`` weights and save them as a delta checkpoint.

    Both models are loaded in float16. The subtraction is done in place on
    the target model, which is then saved to ``delta_path`` together with the
    target's tokenizer.

    Args:
        base_model_path: path or repo id of the base model.
        target_model_path: path or repo id of the fine-tuned model.
        delta_path: output folder for the delta weights.
        hub_repo_id: when given, the delta is also pushed to this hub repo.
    """
    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print(f"Loading the target model from {target_model_path}")
    target = AutoModelForCausalLM.from_pretrained(
        target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)

    print("Calculating the delta")
    # Build the base state dict once instead of twice per parameter.
    base_state = base.state_dict()
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        assert name in base_state
        param.data -= base_state[name]

    print(f"Saving the delta to {delta_path}")
    # The repo id is an explicit parameter now; the function used to read the
    # module-level `args`, which only exists when the file runs as a script.
    if hub_repo_id:
        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
    else:
        kwargs = {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer.save_pretrained(delta_path, **kwargs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str)
    args = parser.parse_args()

    make_delta(
        args.base_model_path,
        args.target_model_path,
        args.delta_path,
        hub_repo_id=args.hub_repo_id,
    )
fastchat/model/model_adapter.py ADDED
@@ -0,0 +1,1970 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model adapter registration."""
2
+
3
+ import math
4
+ import os
5
+ import re
6
+ import sys
7
+ from typing import Dict, List, Optional
8
+ import warnings
9
+
10
+ if sys.version_info >= (3, 9):
11
+ from functools import cache
12
+ else:
13
+ from functools import lru_cache as cache
14
+
15
+ import accelerate
16
+ import psutil
17
+ import torch
18
+ from transformers import (
19
+ AutoConfig,
20
+ AutoModel,
21
+ AutoModelForCausalLM,
22
+ AutoModelForSeq2SeqLM,
23
+ AutoTokenizer,
24
+ LlamaTokenizer,
25
+ LlamaForCausalLM,
26
+ T5Tokenizer,
27
+ )
28
+
29
+ from fastchat.constants import CPU_ISA
30
+ from fastchat.conversation import Conversation, get_conv_template
31
+ from fastchat.model.compression import load_compress_model
32
+ from fastchat.model.llama_condense_monkey_patch import replace_llama_with_condense
33
+ from fastchat.model.model_chatglm import generate_stream_chatglm
34
+ from fastchat.model.model_codet5p import generate_stream_codet5p
35
+ from fastchat.model.model_falcon import generate_stream_falcon
36
+ from fastchat.model.model_exllama import generate_stream_exllama
37
+ from fastchat.model.model_xfastertransformer import generate_stream_xft
38
+ from fastchat.model.monkey_patch_non_inplace import (
39
+ replace_llama_attn_with_non_inplace_operations,
40
+ )
41
+ from fastchat.modules.awq import AWQConfig, load_awq_quantized
42
+ from fastchat.modules.exllama import ExllamaConfig, load_exllama_model
43
+ from fastchat.modules.xfastertransformer import load_xft_model, XftConfig
44
+ from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
45
+ from fastchat.utils import get_gpu_memory
46
+
47
# PEFT_SHARE_BASE_WEIGHTS toggles sharing of base weights between Peft models.
# Anything other than "true" (case-insensitive) treats every Peft model as
# separate.
_peft_share_flag = os.environ.get("PEFT_SHARE_BASE_WEIGHTS", "false")
peft_share_base_weights = _peft_share_flag.lower() == "true"

ANTHROPIC_MODEL_LIST = ("claude-1", "claude-2", "claude-instant-1")
58
+
59
+
60
class BaseModelAdapter:
    """The base and the default model adapter."""

    # Subclasses may flip this to request a fast tokenizer.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True for every path: this adapter is the catch-all default."""
        return True

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a causal LM and its tokenizer.

        Args:
            model_path: Local folder or Hugging Face repo ID.
            from_pretrained_kwargs: Extra keyword arguments forwarded to
                ``AutoModelForCausalLM.from_pretrained``. Its ``revision``
                entry (default ``"main"``) is also used for the tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        revision = from_pretrained_kwargs.get("revision", "main")
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                use_fast=self.use_fast_tokenizer,
                revision=revision,
                trust_remote_code=True,
            )
        except TypeError:
            # Retry with the slow tokenizer if the first attempt is rejected.
            tokenizer = AutoTokenizer.from_pretrained(
                model_path, use_fast=False, revision=revision, trust_remote_code=True
            )
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                use_flash_attention_2=True,
                **from_pretrained_kwargs,
            )
        except Exception as err:
            # Do not use a bare ``except``: KeyboardInterrupt/SystemExit must
            # propagate instead of silently triggering a second full load.
            warnings.warn(
                f"Loading {model_path} with flash attention 2 failed ({err!r}); "
                "retrying without it."
            )
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                use_flash_attention_2=False,
                **from_pretrained_kwargs,
            )
        return model, tokenizer

    def load_compress_model(self, model_path, device, torch_dtype, revision="main"):
        """Delegate to the module-level ``load_compress_model`` helper."""
        return load_compress_model(
            model_path,
            device,
            torch_dtype,
            use_fast=self.use_fast_tokenizer,
            revision=revision,
        )

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Look up a conversation template named after the lower-cased path.

        Paths containing ``megrez`` or ``minicpm`` (case-insensitive) are
        collapsed to those template names.
        """
        if 'megrez' in model_path.lower():
            model_path = 'megrez'
        elif 'minicpm' in model_path.lower():
            model_path = "minicpm"
        return get_conv_template(model_path.lower())
122
+
123
+
124
# Global registry of adapter instances; lookups walk it in insertion order.
# TODO (lmzheng): make it a priority queue.
model_adapters: List[BaseModelAdapter] = []


def register_model_adapter(cls):
    """Instantiate ``cls`` and add the instance to ``model_adapters``."""
    adapter_instance = cls()
    model_adapters.append(adapter_instance)
132
+
133
+
134
@cache
def get_model_adapter(model_path: str, model_name: str = None) -> BaseModelAdapter:
    """Return the first registered adapter that matches the model.

    When ``model_name`` is given it is used for matching instead of the path.
    Otherwise the basename of ``model_path`` is tried first (skipping the
    catch-all ``BaseModelAdapter``), then the full path.

    Raises:
        ValueError: If no registered adapter matches.
    """
    if model_name:
        short_key = full_key = model_name
    else:
        short_key = os.path.basename(os.path.normpath(model_path))
        full_key = model_path

    # First pass: basename only, and never settle for the generic adapter.
    for candidate in model_adapters:
        if candidate.match(short_key) and type(candidate) != BaseModelAdapter:
            print(f"Matching model adapter: {candidate}")
            return candidate

    # Second pass: full path, any adapter is acceptable.
    for candidate in model_adapters:
        if candidate.match(full_key):
            print(f"Using model adapter: {candidate}")
            return candidate

    raise ValueError(f"No valid model adapter for {full_key}")
152
+
153
+
154
def raise_warning_for_incompatible_cpu_offloading_configuration(
    device: str, load_8bit: bool, cpu_offloading: bool
):
    """Validate the CPU-offloading request and return the effective setting.

    Returns ``cpu_offloading`` unchanged when it is falsy or when every
    requirement holds (8-bit loading, a linux platform, a CUDA device).
    Otherwise a warning is emitted for the first unmet requirement and
    ``False`` is returned.
    """
    if not cpu_offloading:
        return cpu_offloading

    if not load_8bit:
        warnings.warn(
            "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
            "Use '--load-8bit' to enable 8-bit-quantization\n"
            "Continuing without cpu-offloading enabled\n"
        )
        return False

    if "linux" not in sys.platform:
        warnings.warn(
            "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
            "Continuing without cpu-offloading enabled\n"
        )
        return False

    if device != "cuda":
        warnings.warn(
            "CPU-offloading is only enabled when using CUDA-devices\n"
            "Continuing without cpu-offloading enabled\n"
        )
        return False

    return cpu_offloading
178
+
179
+
180
def load_model(
    model_path: str,
    device: str = "cuda",
    num_gpus: int = 1,
    max_gpu_memory: Optional[str] = None,
    dtype: Optional[torch.dtype] = None,
    load_8bit: bool = False,
    cpu_offloading: bool = False,
    gptq_config: Optional[GptqConfig] = None,
    awq_config: Optional[AWQConfig] = None,
    exllama_config: Optional[ExllamaConfig] = None,
    xft_config: Optional[XftConfig] = None,
    revision: str = "main",
    debug: bool = False,
    model_name: str = None,
):
    """Load a model from Hugging Face.

    Args:
        model_path: Local folder or Hugging Face repo ID.
        device: One of ``"cpu"``, ``"cuda"``, ``"mps"``, ``"xpu"``, ``"npu"``.
        num_gpus: Number of GPUs; any value other than 1 enables a device map.
        max_gpu_memory: Per-GPU memory cap. When omitted on multi-GPU CUDA,
            85% of each GPU's available memory is used.
        dtype: Overrides the device-specific default dtype when not None.
        load_8bit: Load through the adapter's compression path (single GPU).
        cpu_offloading: Offload 8-bit weights to CPU; silently disabled when
            the configuration is incompatible.
        gptq_config / awq_config / exllama_config / xft_config: Select the
            corresponding alternative loader when provided.
        revision: Hub revision passed on to ``from_pretrained``.
        debug: Print the loaded model.
        model_name: Optional name used instead of the path to pick an adapter.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``device`` is not one of the supported values.
    """
    # get model adapter
    adapter = get_model_adapter(model_path, model_name)

    # Stays None unless the Intel extension import below succeeds, so the
    # later CPU optimisation step cannot hit an unbound name.
    ipex = None

    # Handle device mapping
    cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
        device, load_8bit, cpu_offloading
    )
    if device == "cpu":
        kwargs = {"torch_dtype": torch.float16}
        if CPU_ISA in ["avx512_bf16", "amx"]:
            try:
                import intel_extension_for_pytorch as ipex

                kwargs = {"torch_dtype": torch.bfloat16}
            except ImportError:
                warnings.warn(
                    "Intel Extension for PyTorch is not installed, it can be installed to accelerate cpu inference"
                )
    elif device == "cuda":
        kwargs = {"torch_dtype": torch.bfloat16}
        if num_gpus != 1:
            kwargs["device_map"] = "auto"
            if max_gpu_memory is None:
                kwargs[
                    "device_map"
                ] = "sequential"  # This is important for not the same VRAM sizes
                available_gpu_memory = get_gpu_memory(num_gpus)
                kwargs["max_memory"] = {
                    i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
                    for i in range(num_gpus)
                }
            else:
                kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
    elif device == "mps":
        kwargs = {"torch_dtype": torch.float16}
        # Avoid bugs in mps backend by not using in-place operations.
        replace_llama_attn_with_non_inplace_operations()
    elif device == "xpu":
        kwargs = {"torch_dtype": torch.bfloat16}
        # Try to load ipex, while it looks unused, it links into torch for xpu support
        try:
            import intel_extension_for_pytorch as ipex
        except ImportError:
            warnings.warn(
                "Intel Extension for PyTorch is not installed, but is required for xpu inference."
            )
    elif device == "npu":
        kwargs = {"torch_dtype": torch.float16}
        # Try to load torch_npu; the import is attempted only for its side
        # effects, the name itself is not used afterwards.
        try:
            import torch_npu
        except ImportError:
            warnings.warn("Ascend Extension for PyTorch is not installed.")
    else:
        raise ValueError(f"Invalid device: {device}")

    if cpu_offloading:
        # raises an error on incompatible platforms
        from transformers import BitsAndBytesConfig

        if "max_memory" in kwargs:
            kwargs["max_memory"]["cpu"] = (
                str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib"
            )
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit_fp32_cpu_offload=cpu_offloading
        )
        kwargs["load_in_8bit"] = load_8bit
    elif load_8bit:
        if num_gpus != 1:
            # Falls through to the regular (uncompressed) loading path below.
            warnings.warn(
                "8-bit quantization is not supported for multi-gpu inference."
            )
        else:
            model, tokenizer = adapter.load_compress_model(
                model_path=model_path,
                device=device,
                torch_dtype=kwargs["torch_dtype"],
                revision=revision,
            )
            if debug:
                print(model)
            return model, tokenizer
    elif awq_config and awq_config.wbits < 16:
        assert (
            awq_config.wbits == 4
        ), "Currently we only support 4-bit inference for AWQ."
        model, tokenizer = load_awq_quantized(model_path, awq_config, device)
        if num_gpus != 1:
            # ``max_memory`` is only populated on the multi-GPU CUDA path;
            # fall back to None elsewhere instead of raising KeyError.
            device_map = accelerate.infer_auto_device_map(
                model,
                max_memory=kwargs.get("max_memory"),
                no_split_module_classes=[
                    "OPTDecoderLayer",
                    "LlamaDecoderLayer",
                    "BloomBlock",
                    "MPTBlock",
                    "DecoderLayer",
                ],
            )
            model = accelerate.dispatch_model(
                model, device_map=device_map, offload_buffers=True
            )
        else:
            model.to(device)
        return model, tokenizer
    elif gptq_config and gptq_config.wbits < 16:
        model, tokenizer = load_gptq_quantized(model_path, gptq_config)
        if num_gpus != 1:
            device_map = accelerate.infer_auto_device_map(
                model,
                max_memory=kwargs.get("max_memory"),
                no_split_module_classes=["LlamaDecoderLayer"],
            )
            model = accelerate.dispatch_model(
                model, device_map=device_map, offload_buffers=True
            )
        else:
            model.to(device)
        return model, tokenizer
    elif exllama_config:
        model, tokenizer = load_exllama_model(model_path, exllama_config)
        return model, tokenizer
    elif xft_config:
        model, tokenizer = load_xft_model(model_path, xft_config)
        return model, tokenizer
    kwargs["revision"] = revision

    if dtype is not None:  # Overwrite dtype if it is provided in the arguments.
        kwargs["torch_dtype"] = dtype

    # Load model
    model, tokenizer = adapter.load_model(model_path, kwargs)

    if (
        device == "cpu"
        and kwargs["torch_dtype"] is torch.bfloat16
        and CPU_ISA is not None
        and ipex is not None  # bfloat16 may have been forced via ``dtype``
    ):
        model = ipex.optimize(model, dtype=kwargs["torch_dtype"])

    if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
        "mps",
        "xpu",
        "npu",
    ):
        model.to(device)

    if device == "xpu":
        model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)

    if debug:
        print(model)

    return model, tokenizer
354
+
355
+
356
def get_conversation_template(model_path: str) -> Conversation:
    """Return the default conversation template of the adapter matching ``model_path``."""
    return get_model_adapter(model_path).get_default_conv_template(model_path)
360
+
361
+
362
def get_generate_stream_function(model: torch.nn.Module, model_path: str):
    """Pick the streaming generation function for ``model``.

    The choice is made from the lower-cased string form of the model's type.
    Specialised generators are tried in a fixed order; Peft models get a
    wrapper that activates the adapter named ``model_path`` first (only when
    base-weight sharing is enabled); everything else uses the generic
    ``generate_stream``.
    """
    from fastchat.serve.inference import generate_stream

    type_name = str(type(model)).lower()

    # (substring of the type name, generator) pairs, checked in this order.
    specialised = (
        ("chatglm", generate_stream_chatglm),
        ("rwforcausallm", generate_stream_falcon),
        ("codet5p", generate_stream_codet5p),
        ("exllama", generate_stream_exllama),
        ("xft", generate_stream_xft),
    )
    for marker, stream_fn in specialised:
        if marker in type_name:
            return stream_fn

    if not (peft_share_base_weights and "peft" in type_name):
        return generate_stream

    # Curried stream function: selects the adapter registered under
    # ``model_path`` before delegating, so the right weights are active.
    @torch.inference_mode()
    def generate_stream_peft(
        model,
        tokenizer,
        params: Dict,
        device: str,
        context_len: int,
        stream_interval: int = 2,
        judge_sent_end: bool = False,
    ):
        model.set_adapter(model_path)
        for chunk in generate_stream(
            model,
            tokenizer,
            params,
            device,
            context_len,
            stream_interval,
            judge_sent_end,
        ):
            yield chunk

    return generate_stream_peft
414
+
415
+
416
def add_model_args(parser):
    """Register the model-loading command-line options on ``parser``.

    Args:
        parser: An ``argparse.ArgumentParser`` (or argument group); it is
            mutated in place and nothing is returned.
    """
    parser.add_argument(
        "--model-path",
        type=str,
        default="lmsys/vicuna-7b-v1.5",
        help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default="main",
        help="Hugging Face Hub model revision identifier",
    )
    parser.add_argument(
        "--device",
        type=str,
        choices=["cpu", "cuda", "mps", "xpu", "npu"],
        default="cuda",
        help="The device type",
    )
    parser.add_argument(
        "--gpus",
        type=str,
        default=None,
        help="A single GPU like 1 or multiple GPUs like 0,2",
    )
    parser.add_argument("--num-gpus", type=int, default=1)
    parser.add_argument(
        "--max-gpu-memory",
        type=str,
        help="The maximum memory per GPU for storing model weights. Use a string like '13Gib'",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        choices=["float32", "float16", "bfloat16"],
        # Kept in sync with the per-device defaults chosen in load_model.
        help="Override the default dtype. If not set, it will use bfloat16 on CUDA and XPU, "
        "and float16 on CPU, MPS and NPU (bfloat16 on CPU when Intel Extension for PyTorch is usable).",
        default=None,
    )
    parser.add_argument(
        "--load-8bit", action="store_true", help="Use 8-bit quantization"
    )
    parser.add_argument(
        "--cpu-offloading",
        action="store_true",
        help="Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU",
    )
    parser.add_argument(
        "--gptq-ckpt",
        type=str,
        default=None,
        help="Used for GPTQ. The path to the local GPTQ checkpoint.",
    )
    parser.add_argument(
        "--gptq-wbits",
        type=int,
        default=16,
        choices=[2, 3, 4, 8, 16],
        help="Used for GPTQ. #bits to use for quantization",
    )
    parser.add_argument(
        "--gptq-groupsize",
        type=int,
        default=-1,
        help="Used for GPTQ. Groupsize to use for quantization; default uses full row.",
    )
    parser.add_argument(
        "--gptq-act-order",
        action="store_true",
        help="Used for GPTQ. Whether to apply the activation order GPTQ heuristic",
    )
    parser.add_argument(
        "--awq-ckpt",
        type=str,
        default=None,
        help="Used for AWQ. Load quantized model. The path to the local AWQ checkpoint.",
    )
    parser.add_argument(
        "--awq-wbits",
        type=int,
        default=16,
        choices=[4, 16],
        help="Used for AWQ. #bits to use for AWQ quantization",
    )
    parser.add_argument(
        "--awq-groupsize",
        type=int,
        default=-1,
        help="Used for AWQ. Groupsize to use for AWQ quantization; default uses full row.",
    )
    parser.add_argument(
        "--enable-exllama",
        action="store_true",
        help="Used for exllamav2. Enable exllamaV2 inference framework.",
    )
    parser.add_argument(
        "--exllama-max-seq-len",
        type=int,
        default=4096,
        help="Used for exllamav2. Max sequence length to use for exllamav2 framework; default 4096 sequence length.",
    )
    parser.add_argument(
        "--exllama-gpu-split",
        type=str,
        default=None,
        help="Used for exllamav2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7",
    )
    parser.add_argument(
        "--enable-xft",
        action="store_true",
        help="Used for xFasterTransformer. Enable xFasterTransformer inference framework.",
    )
    parser.add_argument(
        "--xft-max-seq-len",
        type=int,
        default=4096,
        help="Used for xFasterTransformer. Max sequence length to use for xFasterTransformer framework; default 4096 sequence length.",
    )
    parser.add_argument(
        "--xft-dtype",
        type=str,
        choices=["fp16", "bf16", "int8", "bf16_fp16", "bf16_int8"],
        help="Override the default dtype. If not set, it will use bfloat16 for first token and float16 next tokens on CPU.",
        default=None,
    )
541
+
542
+
543
def remove_parent_directory_name(model_path):
    """Return the last ``/``-separated component of ``model_path``.

    A single trailing slash is dropped first, so ``"a/b/"`` yields ``"b"``.
    An empty string is returned unchanged instead of raising ``IndexError``.
    """
    # ``endswith`` is safe on the empty string, unlike indexing with [-1].
    if model_path.endswith("/"):
        model_path = model_path[:-1]
    return model_path.split("/")[-1]
548
+
549
+
550
# Maps a base model path to the (model, tokenizer) pair first loaded for it.
peft_model_cache = {}


class PeftModelAdapter:
    """Loads any "peft" model and its base model."""

    def match(self, model_path: str):
        """Accept paths holding an ``adapter_config.json`` or with "peft" in the name."""
        config_file = os.path.join(model_path, "adapter_config.json")
        return os.path.exists(config_file) or "peft" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the base model, then apply the (peft) adapter weights.

        Raises:
            ValueError: If the configured base model path contains "peft".
        """
        from peft import PeftConfig, PeftModel

        config = PeftConfig.from_pretrained(model_path)
        base_model_path = config.base_model_name_or_path
        if "peft" in base_model_path:
            raise ValueError(
                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
            )

        # Proof of concept for peft adapters sharing base weights. Peft
        # rewrites the base model and keeps an internal map of adapter
        # layers, so:
        #   1. the first peft model loaded for a base model is cached;
        #   2. follow-on adapters are attached to that cached model;
        #   3. adapters are always named by ``model_path`` because that is
        #      what is available at inference time;
        #   4. get_generate_stream_function selects the adapter before
        #      inference, which *should* be safe when calls are serialised
        #      by the same semaphore.
        if peft_share_base_weights and base_model_path in peft_model_cache:
            model, tokenizer = peft_model_cache[base_model_path]
            model.load_adapter(model_path, adapter_name=model_path)
            return model, tokenizer

        # Both remaining cases need a freshly loaded base model.
        base_adapter = get_model_adapter(base_model_path)
        base_model, tokenizer = base_adapter.load_model(
            base_model_path, from_pretrained_kwargs
        )

        if not peft_share_base_weights:
            # Normal case: no sharing, default adapter name.
            model = PeftModel.from_pretrained(base_model, model_path)
            return model, tokenizer

        model = PeftModel.from_pretrained(
            base_model, model_path, adapter_name=model_path
        )
        peft_model_cache[base_model_path] = (model, tokenizer)
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the conversation template of the base model.

        Raises:
            ValueError: If the configured base model path contains "peft".
        """
        from peft import PeftConfig, PeftModel

        config = PeftConfig.from_pretrained(model_path)
        base_model_path = config.base_model_name_or_path
        if "peft" in base_model_path:
            raise ValueError(
                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
            )
        return get_model_adapter(base_model_path).get_default_conv_template(
            base_model_path
        )
623
+
624
+
625
+
626
class DeepseekChatAdapter(BaseModelAdapter):
    """The model adapter for deepseek-ai's chat models"""

    # Note: that this model will require tokenizer version >= 0.13.3 because the tokenizer class is LlamaTokenizerFast

    def match(self, model_path: str):
        """Match paths containing both "deepseek" and "chat" (case-insensitive)."""
        lowered = model_path.lower()
        return "deepseek" in lowered and "chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Always use the "deepseek" conversation template."""
        return get_conv_template("deepseek")

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model in bfloat16 with a sequential device map.

        Only ``max_memory`` is read from ``from_pretrained_kwargs``; the other
        entries are not forwarded.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map="sequential",
            torch_dtype=torch.bfloat16,
            # ``max_memory`` may be absent from the kwargs (e.g. single-GPU
            # loads), so fall back to None instead of raising KeyError.
            max_memory=from_pretrained_kwargs.get("max_memory"),
            attn_implementation="flash_attention_2",
        )

        # The keyword was previously misspelled as ``utrust_remote_code``.
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )

        return model, tokenizer
652
+
653
+
654
class VicunaAdapter(BaseModelAdapter):
    """Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "vicuna" (case-insensitive)."""
        return "vicuna" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model, then warn if the weights look like Vicuna-v0."""
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            use_flash_attention_2=True,
            **from_pretrained_kwargs,
        )
        self.raise_warning_for_old_weights(model)
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Use "one_shot" when the final path component contains "v0", else "vicuna_v1.1"."""
        is_v0 = "v0" in remove_parent_directory_name(model_path)
        return get_conv_template("one_shot" if is_v0 else "vicuna_v1.1")

    def raise_warning_for_old_weights(self, model):
        """Warn when a Llama model has a vocabulary larger than 32000 entries."""
        if not isinstance(model, LlamaForCausalLM):
            return
        if model.model.vocab_size <= 32000:
            return
        warnings.warn(
            "\nYou are probably using the old Vicuna-v0 model, "
            "which will generate unexpected results with the "
            "current fastchat.\nYou can try one of the following methods:\n"
            "1. Upgrade your weights to the new Vicuna-v1.3: https://github.com/lm-sys/FastChat#vicuna-weights.\n"
            "2. Use the old conversation template by `python3 -m fastchat.serve.cli --model-path /path/to/vicuna-v0 --conv-template one_shot`\n"
            "3. Downgrade fschat to fschat==0.1.10 (Not recommended).\n"
        )
691
+
692
+
693
class AiroborosAdapter(BaseModelAdapter):
    """The model adapter for jondurbin/airoboros-*"""

    def match(self, model_path: str):
        """Match paths mentioning airoboros or spicyboros (case-insensitive)."""
        return re.search(r"airoboros|spicyboros", model_path, re.I) is not None

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Choose the v3, v2 or v1 template from version markers in the path."""
        if "-3." in model_path or "-3p" in model_path:
            template_name = "airoboros_v3"
        elif "spicyboros" in model_path or re.search(r"-(2\.[2-9]+)", model_path):
            template_name = "airoboros_v2"
        else:
            template_name = "airoboros_v1"
        return get_conv_template(template_name)

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter unless the path mentions "mpt".

        Paths containing "mpt" are loaded with ``max_seq_len=8192`` and a
        fast tokenizer.
        """
        if "mpt" in model_path.lower():
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                max_seq_len=8192,
                **from_pretrained_kwargs,
            )
            tokenizer = AutoTokenizer.from_pretrained(
                model_path, trust_remote_code=True, use_fast=True
            )
            return model, tokenizer
        return super().load_model(model_path, from_pretrained_kwargs)
722
+
723
class Zhinao360Adapter(BaseModelAdapter):
    """Model adapter for paths containing "360zhinao"."""

    def match(self, model_path: str):
        """Match any path containing "360zhinao" (case-insensitive)."""
        return "360zhinao" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer, model and generation config with remote code trusted.

        ``from_pretrained_kwargs`` is not used.

        Returns:
            A ``(model, tokenizer, generation_config)`` triple.

        NOTE(review): the module-level ``load_model`` unpacks two values from
        ``adapter.load_model`` — confirm which callers expect this triple.
        """
        from transformers import GenerationConfig

        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True
        )
        generation_config = GenerationConfig.from_pretrained(
            model_path, trust_remote_code=True
        )
        return model, tokenizer, generation_config

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Always use the "360zhinao" conversation template."""
        return get_conv_template("360zhinao")
744
+
745
class LongChatAdapter(BaseModelAdapter):
    """Model adapter for LongChat models (e.g., lmsys/longchat-7b-16k)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "longchat" (case-insensitive)."""
        return "longchat" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Apply the condense monkey patch, then load tokenizer and model."""
        revision = from_pretrained_kwargs.get("revision", "main")

        # Apply monkey patch, TODO(Dacheng): Add flash attention support
        # The patch is parameterised by the ``rope_scaling["factor"]`` entry
        # of the model config and is applied before the model is created.
        model_config = AutoConfig.from_pretrained(model_path, revision=revision)
        scaling_factor = model_config.rope_scaling["factor"]
        replace_llama_with_condense(scaling_factor)

        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=self.use_fast_tokenizer, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Always use the "vicuna_v1.1" conversation template."""
        return get_conv_template("vicuna_v1.1")
772
+
773
+
774
class GoogleT5Adapter(BaseModelAdapter):
    """The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2"""

    def match(self, model_path: str):
        """Match paths containing "flan-", "fastchat-t5" or "codet5p" (case-insensitive)."""
        lowered = model_path.lower()
        for marker in ("flan-", "fastchat-t5", "codet5p"):
            if marker in lowered:
                return True
        return False

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a T5 tokenizer and a seq2seq model."""
        tokenizer = T5Tokenizer.from_pretrained(
            model_path, revision=from_pretrained_kwargs.get("revision", "main")
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer
793
+
794
+
795
class KoalaAdapter(BaseModelAdapter):
    """The model adapter for Koala"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "koala" (case-insensitive)."""
        lowered_path = model_path.lower()
        return "koala" in lowered_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Always use the "koala_v1" conversation template."""
        template = get_conv_template("koala_v1")
        return template
805
+
806
+
807
class AlpacaAdapter(BaseModelAdapter):
    """The model adapter for Alpaca"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "alpaca" (case-insensitive)."""
        lowered_path = model_path.lower()
        return "alpaca" in lowered_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Always use the "alpaca" conversation template."""
        template = get_conv_template("alpaca")
        return template
817
+
818
+
819
class ChatGLMAdapter(BaseModelAdapter):
    """Adapter for THUDM/chatglm-6b and THUDM/chatglm2-6b.

    Paths containing "chatglm3" are also handled: they get an extra
    tokenizer option and their own conversation template.
    """

    def match(self, model_path: str):
        """Return True when "chatglm" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "chatglm" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model via ``AutoModel`` and the tokenizer via ``AutoTokenizer``.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_options = {
            "trust_remote_code": True,
            "revision": from_pretrained_kwargs.get("revision", "main"),
        }
        # Only chatglm3 paths ask the tokenizer to encode special tokens.
        if "chatglm3" in model_path.lower():
            tokenizer_options["encode_special_tokens"] = True
        tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_options)

        model = AutoModel.from_pretrained(
            model_path, trust_remote_code=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick "chatglm2", "chatglm3" or the fallback "chatglm" template."""
        lowered = model_path.lower()
        # The template names coincide with the path markers; chatglm2 is
        # tested first, as before.
        for marker in ("chatglm2", "chatglm3"):
            if marker in lowered:
                return get_conv_template(marker)
        return get_conv_template("chatglm")
850
+
851
+
852
class CodeGeexAdapter(BaseModelAdapter):
    """Adapter for THUDM/codegeex-6b and THUDM/codegeex2-6b."""

    def match(self, model_path: str):
        """Return True when "codegeex" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "codegeex" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model, both with ``trust_remote_code=True``.

        The tokenizer uses the ``"revision"`` entry of
        ``from_pretrained_kwargs`` (default ``"main"``); the model receives
        the full kwargs dict.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "codegeex" conversation template regardless of path."""
        template = get_conv_template("codegeex")
        return template
870
+
871
+
872
class DollyV2Adapter(BaseModelAdapter):
    """Adapter for databricks/dolly-v2-12b."""

    def match(self, model_path: str):
        """Return True when "dolly-v2" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "dolly-v2" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM, then override the end-of-sequence id.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple whose eos/pad ids are aligned.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )

        # 50277 means "### End"
        tokenizer.eos_token_id = 50277
        # Mirror the tokenizer's special-token ids onto the model config.
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "dolly_v2" conversation template regardless of path."""
        template = get_conv_template("dolly_v2")
        return template
894
+
895
+
896
class OasstPythiaAdapter(BaseModelAdapter):
    """Adapter for OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5."""

    def match(self, model_path: str):
        """Return True when the lowercased path has both "oasst" and "pythia"."""
        lowered = model_path.lower()
        if "oasst" not in lowered:
            return False
        return "pythia" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the parent class, then copy eos/pad ids onto ``model.config``.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "oasst_pythia" conversation template regardless of path."""
        template = get_conv_template("oasst_pythia")
        return template
911
+
912
+
913
class OasstLLaMAAdapter(BaseModelAdapter):
    """Adapter for OpenAssistant/oasst-sft-7-llama-30b."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match the explicit HF checkpoint name, or any non-pythia "oasst" path."""
        lowered = model_path.lower()
        explicit_name = "openassistant-sft-7-llama-30b-hf" in lowered
        if explicit_name:
            return True
        if "oasst" not in lowered:
            return False
        return "pythia" not in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "oasst_llama" conversation template regardless of path."""
        template = get_conv_template("oasst_llama")
        return template
926
+
927
+
928
class OpenChat35Adapter(BaseModelAdapter):
    """Adapter for OpenChat 3.5 (e.g. openchat/openchat_3.5)."""

    def match(self, model_path: str):
        """Return True when the lowercased path has both "openchat" and "3.5"."""
        lowered = model_path.lower()
        if "openchat" not in lowered:
            return False
        return "3.5" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "openchat_3.5" conversation template regardless of path."""
        template = get_conv_template("openchat_3.5")
        return template
936
+
937
+
938
class PythiaAdapter(BaseModelAdapter):
    """Adapter for any EleutherAI/pythia model."""

    def match(self, model_path: str):
        """Return True when "pythia" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "pythia" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the parent class, then copy eos/pad ids onto ``model.config``.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer
949
+
950
+
951
class StableLMAdapter(BaseModelAdapter):
    """Adapter for StabilityAI/stablelm-tuned-alpha-7b."""

    def match(self, model_path: str):
        """Return True when "stablelm" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "stablelm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "stablelm" conversation template regardless of path."""
        template = get_conv_template("stablelm")
        return template
959
+
960
+
961
class MPTAdapter(BaseModelAdapter):
    """Adapter for the MPT series (mosaicml/mpt-7b-chat, mosaicml/mpt-30b-chat)."""

    def match(self, model_path: str):
        """Return True for paths containing "mpt" but not "airoboros" (case-insensitive)."""
        lowered = model_path.lower()
        if "mpt" not in lowered:
            return False
        return "airoboros" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM (with ``max_seq_len=8192``) and its tokenizer.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple with eos/pad ids copied from the
            tokenizer onto ``model.config``.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            max_seq_len=8192,
            **from_pretrained_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Select the template named after the matching MPT variant.

        Falls back to "zero_shot" (after printing a warning) when none of
        the known chat/instruct markers appears in the path.
        """
        lowered = model_path.lower()
        # Template names are identical to the path markers; order preserved.
        for variant in ("mpt-7b-chat", "mpt-30b-chat", "mpt-30b-instruct"):
            if variant in lowered:
                return get_conv_template(variant)

        print(
            "Warning: Loading base MPT model with `zero_shot` conversation configuration.  "
            "If this is not desired, inspect model configurations and names."
        )
        return get_conv_template("zero_shot")
998
+
999
+
1000
class BaizeAdapter(BaseModelAdapter):
    """Adapter for project-baize/baize-v2-7b."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "baize" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "baize" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "baize" conversation template regardless of path."""
        template = get_conv_template("baize")
        return template
1010
+
1011
+
1012
class RwkvAdapter(BaseModelAdapter):
    """Adapter for BlinkDL/RWKV-4-Raven."""

    def match(self, model_path: str):
        """Return True when "rwkv-4" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "rwkv-4" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Build an ``RwkvModel`` and pair it with the pythia-160m tokenizer.

        Only the ``"revision"`` entry of ``from_pretrained_kwargs`` is used
        (default ``"main"``), and only for the tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        # Imported here rather than at module level, as in the original.
        from fastchat.model.rwkv_model import RwkvModel

        model = RwkvModel(model_path)
        # The tokenizer always comes from "EleutherAI/pythia-160m", not from
        # model_path.
        tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/pythia-160m",
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "rwkv" conversation template regardless of path."""
        template = get_conv_template("rwkv")
        return template
1030
+
1031
+
1032
class OpenBuddyAdapter(BaseModelAdapter):
    """Adapter for OpenBuddy/openbuddy-7b-v1.1-bf16-enc."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "openbuddy" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "openbuddy" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "openbuddy" conversation template regardless of path."""
        template = get_conv_template("openbuddy")
        return template
1042
+
1043
+
1044
class PhoenixAdapter(BaseModelAdapter):
    """Adapter for FreedomIntelligence/phoenix-inst-chat-7b."""

    def match(self, model_path: str):
        """Return True when "phoenix" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "phoenix" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "phoenix" conversation template regardless of path."""
        template = get_conv_template("phoenix")
        return template
1052
+
1053
+
1054
class ReaLMAdapter(BaseModelAdapter):
    """Adapter for FreedomIntelligence/ReaLM-7b."""

    def match(self, model_path: str):
        """Return True when the path contains "ReaLM" (case-sensitive)."""
        # NOTE: unlike most adapters in this file, the comparison is
        # case-sensitive; kept as-is so matching behaviour does not change.
        return "ReaLM" in model_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the fast tokenizer and the causal LM for a ReaLM checkpoint.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; its ``"revision"`` entry (default ``"main"``) is now
                also applied to the tokenizer so both artifacts come from
                the same revision.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=True, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "ReaLM-7b-v1" conversation template regardless of path."""
        return get_conv_template("ReaLM-7b-v1")
1069
+
1070
+
1071
class ChatGPTAdapter(BaseModelAdapter):
    """Adapter for ChatGPT model names."""

    def match(self, model_path: str):
        """Return True only for an exact match with one of the listed names."""
        supported_names = (
            "gpt-3.5-turbo",
            "gpt-3.5-turbo-1106",
            "gpt-4",
            "gpt-4-turbo",
        )
        return model_path in supported_names

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Always raise: this adapter does not load a model.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "chatgpt" conversation template regardless of path."""
        template = get_conv_template("chatgpt")
        return template
1087
+
1088
+
1089
class AzureOpenAIAdapter(BaseModelAdapter):
    """Adapter for Azure OpenAI model names."""

    def match(self, model_path: str):
        """Return True only for an exact match with one of the two Azure names."""
        supported_names = ("azure-gpt-35-turbo", "azure-gpt-4")
        return model_path in supported_names

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Always raise: this adapter does not load a model.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "chatgpt" conversation template regardless of path."""
        template = get_conv_template("chatgpt")
        return template
1100
+
1101
+
1102
class ClaudeAdapter(BaseModelAdapter):
    """Adapter for Claude model names."""

    def match(self, model_path: str):
        """Return True when the name is a member of ``ANTHROPIC_MODEL_LIST``."""
        is_listed = model_path in ANTHROPIC_MODEL_LIST
        return is_listed

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Always raise: this adapter does not load a model.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "claude" conversation template regardless of path."""
        template = get_conv_template("claude")
        return template
1113
+
1114
+
1115
class BardAdapter(BaseModelAdapter):
    """Adapter for Bard."""

    def match(self, model_path: str):
        """Return True only when the name is exactly "bard"."""
        is_bard = model_path == "bard"
        return is_bard

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Always raise: this adapter does not load a model.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "bard" conversation template regardless of path."""
        template = get_conv_template("bard")
        return template
1126
+
1127
+
1128
class PaLM2Adapter(BaseModelAdapter):
    """Adapter for PaLM2."""

    def match(self, model_path: str):
        """Return True only when the name is exactly "palm-2"."""
        is_palm2 = model_path == "palm-2"
        return is_palm2

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Always raise: this adapter does not load a model.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "bard" conversation template (the one this adapter uses)."""
        template = get_conv_template("bard")
        return template
1139
+
1140
+
1141
class BiLLaAdapter(BaseModelAdapter):
    """Adapter for Neutralzz/BiLLa-7B-SFT."""

    def match(self, model_path: str):
        """Return True when "billa" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "billa" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "billa" conversation template regardless of path."""
        template = get_conv_template("billa")
        return template
1149
+
1150
+
1151
class RedPajamaINCITEAdapter(BaseModelAdapter):
    """Adapter for togethercomputer/RedPajama-INCITE-7B-Chat."""

    def match(self, model_path: str):
        """Return True when "redpajama-incite" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "redpajama-incite" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and the causal LM.

        The tokenizer uses the ``"revision"`` entry of
        ``from_pretrained_kwargs`` (default ``"main"``); the model receives
        the full kwargs dict.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "redpajama-incite" conversation template regardless of path."""
        template = get_conv_template("redpajama-incite")
        return template
1169
+
1170
+
1171
class H2OGPTAdapter(BaseModelAdapter):
    """Adapter for h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "h2ogpt" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "h2ogpt" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "h2ogpt" conversation template regardless of path."""
        template = get_conv_template("h2ogpt")
        return template
1181
+
1182
+
1183
class RobinAdapter(BaseModelAdapter):
    """Adapter for LMFlow/Full-Robin-7b-v2."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "robin" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "robin" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "Robin" conversation template regardless of path."""
        template = get_conv_template("Robin")
        return template
1193
+
1194
+
1195
class SnoozyAdapter(BaseModelAdapter):
    """Adapter for nomic-ai/gpt4all-13b-snoozy."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path has both "gpt4all" and "snoozy"."""
        lowered = model_path.lower()
        if "gpt4all" not in lowered:
            return False
        return "snoozy" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "snoozy" conversation template regardless of path."""
        template = get_conv_template("snoozy")
        return template
1206
+
1207
+
1208
class WizardLMAdapter(BaseModelAdapter):
    """Adapter for WizardLM/WizardLM-13B-V1.0."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "wizardlm" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "wizardlm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return "vicuna_v1.1" for 13b/30b/70b paths, otherwise "one_shot"."""
        lowered = model_path.lower()
        is_large_variant = any(
            size_tag in lowered for size_tag in ("13b", "30b", "70b")
        )
        if is_large_variant:
            return get_conv_template("vicuna_v1.1")
        # TODO: use the recommended template for 7B
        # (https://huggingface.co/WizardLM/WizardLM-13B-V1.0)
        return get_conv_template("one_shot")
1224
+
1225
+
1226
class ManticoreAdapter(BaseModelAdapter):
    """Adapter for openaccess-ai-collective/manticore-13b-chat-pyg."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "manticore" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "manticore" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "manticore" conversation template regardless of path."""
        template = get_conv_template("manticore")
        return template
1236
+
1237
+
1238
class GuanacoAdapter(BaseModelAdapter):
    """Adapter for timdettmers/guanaco-33b-merged."""

    # Passed to the tokenizer as `use_fast` in load_model below.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "guanaco" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "guanaco" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM, then take the eos id from the model.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        # Fix a bug in tokenizer config
        tokenizer.eos_token_id = model.config.eos_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "zero_shot" conversation template regardless of path."""
        template = get_conv_template("zero_shot")
        return template
1260
+
1261
+
1262
class ChangGPTAdapter(BaseModelAdapter):
    """Adapter for lcw99/polyglot-ko-12.8b-chang-instruct-chat."""

    def match(self, model_path: str):
        """Return True when the lowercased path has both "polyglot" and "chang"."""
        lowered = model_path.lower()
        if "polyglot" not in lowered:
            return False
        return "chang" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "polyglot_changgpt" conversation template regardless of path."""
        template = get_conv_template("polyglot_changgpt")
        return template
1271
+
1272
+
1273
class CamelAdapter(BaseModelAdapter):
    """Adapter for camel-ai/CAMEL-13B-Combined-Data."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "camel" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "camel" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "vicuna_v1.1" conversation template regardless of path."""
        template = get_conv_template("vicuna_v1.1")
        return template
1283
+
1284
+
1285
class TuluAdapter(BaseModelAdapter):
    """Adapter for allenai/tulu-30b."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "tulu" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "tulu" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "tulu" conversation template regardless of path."""
        template = get_conv_template("tulu")
        return template
1295
+
1296
+
1297
class FalconAdapter(BaseModelAdapter):
    """Adapter for tiiuae/falcon-40b (paths containing "chat" are excluded)."""

    def match(self, model_path: str):
        """Return True for paths containing "falcon" but not "chat" (case-insensitive)."""
        lowered = model_path.lower()
        if "falcon" not in lowered:
            return False
        return "chat" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM, then assign a pad token id.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        # Strongly suggest using bf16, which is recommended by the author of Falcon
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        # In Falcon tokenizer config and special config there is not any pad token
        # Setting `pad_token_id` to 9, which corresponds to special token '>>SUFFIX<<'
        tokenizer.pad_token_id = 9
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "falcon" conversation template regardless of path."""
        template = get_conv_template("falcon")
        return template
1320
+
1321
+
1322
class FalconChatAdapter(BaseModelAdapter):
    """Adapter for Falcon paths that also contain "chat"."""

    def match(self, model_path: str):
        """Return True when the lowercased path has both "falcon" and "chat"."""
        lowered = model_path.lower()
        if "falcon" not in lowered:
            return False
        return "chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "falcon-chat" conversation template regardless of path."""
        template = get_conv_template("falcon-chat")
        return template
1328
+
1329
+
1330
class TigerBotAdapter(BaseModelAdapter):
    """Adapter for TigerResearch/tigerbot-7b-sft."""

    def match(self, model_path: str):
        """Return True when "tigerbot" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "tigerbot" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM, both with ``trust_remote_code=True``.

        The tokenizer uses the ``"revision"`` entry of
        ``from_pretrained_kwargs`` (default ``"main"``); the model receives
        the full kwargs dict.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=tokenizer_revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "tigerbot" conversation template regardless of path."""
        template = get_conv_template("tigerbot")
        return template
1353
+
1354
+
1355
class BaichuanAdapter(BaseModelAdapter):
    """Adapter for Baichuan models (e.g., baichuan-inc/Baichuan-7B)."""

    def match(self, model_path: str):
        """Return True when "baichuan" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "baichuan" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM, both with ``trust_remote_code=True``.

        The tokenizer uses the ``"revision"`` entry of
        ``from_pretrained_kwargs`` (default ``"main"``); the model receives
        the full kwargs dict.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Choose a chat template for "chat" paths, otherwise "zero_shot".

        Chat paths containing "baichuan2" get "baichuan2-chat"; all other
        chat paths (for Baichuan-13B-Chat) get "baichuan-chat".
        """
        lowered = model_path.lower()
        if "chat" not in lowered:
            return get_conv_template("zero_shot")
        if "baichuan2" in lowered:
            return get_conv_template("baichuan2-chat")
        return get_conv_template("baichuan-chat")
1381
+
1382
+
1383
class XGenAdapter(BaseModelAdapter):
    """Adapter for Salesforce/xgen-7b."""

    def match(self, model_path: str):
        """Return True when "xgen" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "xgen" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM and tokenizer, then pin the model's eos id.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )
        # The eos id is hard-coded on the model config; the tokenizer is left
        # untouched.
        model.config.eos_token_id = 50256
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "xgen" conversation template regardless of path."""
        template = get_conv_template("xgen")
        return template
1405
+
1406
+
1407
class NousHermesAdapter(BaseModelAdapter):
    """Adapter for NousResearch/Nous-Hermes-13b."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "nous-hermes" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "nous-hermes" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template regardless of path."""
        template = get_conv_template("alpaca")
        return template
1417
+
1418
+
1419
class InternLMChatAdapter(BaseModelAdapter):
    """Adapter for internlm/internlm-chat-7b."""

    def match(self, model_path: str):
        """Return True when "internlm-chat" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "internlm-chat" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM in eval mode along with its tokenizer.

        Args:
            model_path: Checkpoint location; when it contains "8k"
                (case-insensitive) ``model.config.max_sequence_length`` is
                set to 8192.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        model = loaded_model.eval()

        is_8k_variant = "8k" in model_path.lower()
        if is_8k_variant:
            model.config.max_sequence_length = 8192

        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "internlm-chat" conversation template regardless of path."""
        template = get_conv_template("internlm-chat")
        return template
1443
+
1444
+
1445
class StarChatAdapter(BaseModelAdapter):
    """Adapter for HuggingFaceH4/starchat-beta."""

    def match(self, model_path: str):
        """Return True when "starchat" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "starchat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "starchat" conversation template regardless of path."""
        template = get_conv_template("starchat")
        return template
1453
+
1454
+
1455
class MistralAdapter(BaseModelAdapter):
    """Adapter for Mistral AI models."""

    def match(self, model_path: str):
        """Return True when "mistral" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "mistral" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the parent class, then copy eos/pad ids onto ``model.config``.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "mistral" conversation template regardless of path."""
        template = get_conv_template("mistral")
        return template
1469
+
1470
+
1471
class Llama2Adapter(BaseModelAdapter):
    """Adapter for Llama-2 (e.g., meta-llama/Llama-2-7b-hf)."""

    def match(self, model_path: str):
        """Return True when "llama-2" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "llama-2" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the parent class, then copy eos/pad ids onto ``model.config``.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama-2" conversation template regardless of path."""
        template = get_conv_template("llama-2")
        return template
1485
+
1486
+
1487
class CuteGPTAdapter(BaseModelAdapter):
    """Adapter for CuteGPT."""

    def match(self, model_path: str):
        """Return True when "cutegpt" occurs in the path, ignoring case."""
        return "cutegpt" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the Llama tokenizer and causal LM and set "<end>" as eos.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; its ``"revision"`` entry (default ``"main"``) is now
                also applied to the tokenizer so both artifacts come from
                the same revision.

        Returns:
            A ``(model, tokenizer)`` tuple. Both ``eos_token_id`` and
            ``pad_token_id`` on ``model.config`` are set to the id of the
            "<end>" token.
        """
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = LlamaTokenizer.from_pretrained(model_path, revision=revision)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<end>")
        model.config.eos_token_id = tokenizer.eos_token_id
        # Padding deliberately reuses the eos id rather than the tokenizer's
        # own pad id.
        model.config.pad_token_id = tokenizer.eos_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "cutegpt" conversation template regardless of path."""
        return get_conv_template("cutegpt")
1505
+
1506
+
1507
class OpenOrcaAdapter(BaseModelAdapter):
    """Adapter for Open-Orca models, which do not all share one prompt template.

    Known checkpoints (per the original author's notes):

    - ``Open-Orca/OpenOrcaxOpenChat-Preview2-13B`` uses the "OpenChat Llama2 V1"
      prompt template, see
      https://huggingface.co/Open-Orca/OpenOrcaxOpenChat-Preview2-13B#prompt-template
    - ``Open-Orca/Mistral-7B-OpenOrca`` uses OpenAI's Chat Markup Language
      (ChatML, https://github.com/openai/openai-python/blob/main/chatml.md),
      with <|im_start|> and <|im_end|> tokens added to support it, see
      https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template
    """

    # Passed to the tokenizer as `use_fast` in load_model below.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "openorca" occurs in the path, ignoring case."""
        # "mistral-7b-openorca" itself contains "openorca", so this single
        # test accepts exactly the same paths as checking both markers.
        lowered = model_path.lower()
        return "openorca" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and the causal LM in eval mode.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        model = loaded_model.eval()
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return "mistral-7b-openorca" for that variant, otherwise "open-orca"."""
        is_mistral_variant = "mistral-7b-openorca" in model_path.lower()
        template_name = "mistral-7b-openorca" if is_mistral_variant else "open-orca"
        return get_conv_template(template_name)
1541
+
1542
+
1543
class WizardCoderAdapter(BaseModelAdapter):
    """Adapter for WizardCoder (e.g., WizardLM/WizardCoder-Python-34B-V1.0)."""

    # Slow-tokenizer flag; presumably read by the inherited loader as
    # `use_fast` (confirm against the base class).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "wizardcoder" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "wizardcoder" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template regardless of path."""
        # Same as Alpaca, see :
        # https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/inference_wizardcoder.py#L60
        template = get_conv_template("alpaca")
        return template
1555
+
1556
+
1557
class QwenChatAdapter(BaseModelAdapter):
    """Adapter for Qwen/Qwen-7B-Chat.

    Running this model requires an additional flash-attention installation:

    ``` bash
    git clone https://github.com/Dao-AILab/flash-attention
    cd flash-attention && pip install .
    pip install csrc/layer_norm
    pip install csrc/rotary
    ```

    Since flash-attention 2.0 the following functions were renamed:

    - `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
    - `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
    - `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`

    You may therefore need to edit
    https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py#L69
    to read
    `from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func`.
    """

    def match(self, model_path: str):
        """Return True when "qwen" occurs in the path, ignoring case."""
        return "qwen" in model_path.lower()

    def float_set(self, config, option):
        """Enable exactly one precision flag on ``config``.

        All of ``config.bf16``, ``config.fp16`` and ``config.fp32`` are first
        set to False; the flag named by ``option`` is then set to True. For
        any other ``option`` a message is printed and all three flags stay
        False.

        Args:
            config: Object on which the three precision attributes are set.
            option: One of ``"bf16"``, ``"fp16"`` or ``"fp32"``.
        """
        config.bf16 = False
        config.fp16 = False
        config.fp32 = False

        if option == "bf16":
            config.bf16 = True
        elif option == "fp16":
            config.fp16 = True
        elif option == "fp32":
            config.fp32 = True
        else:
            print("Invalid option. Please choose one from 'bf16', 'fp16' and 'fp32'.")

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load config, generation config, model and tokenizer for Qwen.

        The ``"revision"`` entry of ``from_pretrained_kwargs`` (default
        ``"main"``) is applied to the config, the generation config and the
        tokenizer, so that all artifacts come from the same revision as the
        model weights.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader.

        Returns:
            A ``(model, tokenizer)`` tuple whose eos/bos/pad token ids agree.
        """
        from transformers.generation import GenerationConfig

        revision = from_pretrained_kwargs.get("revision", "main")
        config = AutoConfig.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=revision,
        )
        # NOTE: if you use the old version of model file, please remove the comments below
        # config.use_flash_attn = False
        self.float_set(config, "fp16")
        generation_config = GenerationConfig.from_pretrained(
            model_path, trust_remote_code=True, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            config=config,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        ).eval()
        # When the loaded config enables `use_dynamic_ntk`, raise the
        # advertised sequence length to 16384.
        if getattr(model.config, "use_dynamic_ntk", False):
            model.config.max_sequence_length = 16384
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=revision
        )
        # Special-token ids: eos/bos come from the model config, pad from the
        # generation config; they are then mirrored back onto model.config.
        tokenizer.eos_token_id = config.eos_token_id
        tokenizer.bos_token_id = config.bos_token_id
        tokenizer.pad_token_id = generation_config.pad_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.bos_token_id = tokenizer.bos_token_id
        model.config.pad_token_id = tokenizer.pad_token_id

        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "qwen-7b-chat" conversation template regardless of path."""
        return get_conv_template("qwen-7b-chat")
1629
+
1630
+
1631
class BGEAdapter(BaseModelAdapter):
    """Adapter for BGE (e.g., BAAI/bge-large-en-v1.5)."""

    # Not read by this class's own load_model; kept for parity with the
    # original (presumably consulted elsewhere — confirm).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "bge" occurs in the path, ignoring case."""
        lowered = model_path.lower()
        return "bge" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model via ``AutoModel`` plus its tokenizer.

        When both ``model.config.max_position_embeddings`` and
        ``tokenizer.model_max_length`` exist, the smaller of the two is
        stored as ``model.config.max_sequence_length``.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModel.from_pretrained(model_path, **from_pretrained_kwargs)
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )

        model_config = model.config
        has_both_limits = hasattr(
            model_config, "max_position_embeddings"
        ) and hasattr(tokenizer, "model_max_length")
        if has_both_limits:
            model_config.max_sequence_length = min(
                model_config.max_position_embeddings,
                tokenizer.model_max_length,
            )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "one_shot" conversation template regardless of path."""
        template = get_conv_template("one_shot")
        return template
1658
+
1659
+
1660
class E5Adapter(BaseModelAdapter):
    """Adapter for E5 (e.g., intfloat/e5-large-v2)."""

    # Not read by this class's own load_model; kept for parity with the
    # original (presumably consulted elsewhere — confirm).
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True for paths containing "e5-" but not "megrez" (case-insensitive)."""
        lowered = model_path.lower()
        if "e5-" not in lowered:
            return False
        return "megrez" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model via ``AutoModel`` plus its tokenizer.

        When both ``model.config.max_position_embeddings`` and
        ``tokenizer.model_max_length`` exist, the smaller of the two is
        stored as ``model.config.max_sequence_length``.

        Args:
            model_path: Checkpoint location.
            from_pretrained_kwargs: Keyword arguments forwarded to the model
                loader; ``"revision"`` (default ``"main"``) is reused for the
                tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModel.from_pretrained(model_path, **from_pretrained_kwargs)
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )

        model_config = model.config
        has_both_limits = hasattr(
            model_config, "max_position_embeddings"
        ) and hasattr(tokenizer, "model_max_length")
        if has_both_limits:
            model_config.max_sequence_length = min(
                model_config.max_position_embeddings,
                tokenizer.model_max_length,
            )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "one_shot" conversation template regardless of path."""
        template = get_conv_template("one_shot")
        return template
1687
+
1688
+
1689
class AquilaChatAdapter(BaseModelAdapter):
    """The model adapter for BAAI/Aquila

    Now supports:
    - BAAI/AquilaChat-7B
    - BAAI/AquilaChat2-7B
    - BAAI/AquilaChat2-34B
    """

    def match(self, model_path: str):
        """Return True when "aquila" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "aquila" in lowered_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM (switched to eval mode) and its tokenizer.

        Both are loaded with ``trust_remote_code=True``; the tokenizer uses the
        ``revision`` from ``from_pretrained_kwargs`` ("main" when absent).

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        ).eval()
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=tokenizer_revision
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Choose the Aquila template variant from substrings of the path."""
        lowered_path = model_path.lower()
        if "aquilachat2" not in lowered_path:
            return get_conv_template("aquila-chat")

        # See: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L347
        if "16k" in lowered_path:
            template_name = "aquila"
        elif "34b" in lowered_path:
            template_name = "aquila-legacy"
        else:
            template_name = "aquila-v1"
        return get_conv_template(template_name)
1727
+
1728
+
1729
class Lamma2ChineseAdapter(BaseModelAdapter):
    """Adapter for the FlagAlpha/LLama2-Chinese SFT checkpoints."""

    def match(self, model_path: str):
        """Return True when "llama2-chinese" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "llama2-chinese" in lowered_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer first, then the causal LM.

        Both are loaded with ``trust_remote_code=True``; the tokenizer uses the
        ``revision`` from ``from_pretrained_kwargs`` ("main" when absent).

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_options = {
            "trust_remote_code": True,
            "revision": from_pretrained_kwargs.get("revision", "main"),
        }
        tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_options)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama2-chinese" conversation template."""
        template_name = "llama2-chinese"
        return get_conv_template(template_name)
1752
+
1753
+
1754
class VigogneAdapter(BaseModelAdapter):
    """Adapter for vigogne checkpoints (e.g., bofenghuang/vigogne-2-7b-chat)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match paths containing "vigogne" or "vigostral", ignoring case."""
        found = re.search(r"vigogne|vigostral", model_path, re.I)
        return found is not None

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer first, then the causal LM in eval mode.

        The tokenizer honours ``self.use_fast_tokenizer`` and the ``revision``
        from ``from_pretrained_kwargs`` ("main" when absent).

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        model = model.eval()
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick the instruct template, or a chat template for "chat" paths."""
        lowered_path = model_path.lower()
        if "chat" not in lowered_path:
            template_name = "vigogne_instruct"
        elif "vigostral" in lowered_path:
            template_name = "vigogne_chat_v3"
        else:
            template_name = "vigogne_chat_v2"
        return get_conv_template(template_name)
1784
+
1785
+
1786
class OpenLLaMaOpenInstructAdapter(BaseModelAdapter):
    """Adapter for OpenLLaMa-Open-Instruct (e.g., VMware/open-llama-7b-open-instruct)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match paths containing both "open-llama" and "open-instruct"."""
        lowered_path = model_path.lower()
        if "open-llama" not in lowered_path:
            return False
        return "open-instruct" in lowered_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer first, then the causal LM in eval mode.

        The tokenizer honours ``self.use_fast_tokenizer`` and the ``revision``
        from ``from_pretrained_kwargs`` ("main" when absent).

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            trust_remote_code=True,
            revision=tokenizer_revision,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        model = model.eval()
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        template_name = "alpaca"
        return get_conv_template(template_name)
1814
+
1815
+
1816
class CodeLlamaAdapter(BaseModelAdapter):
    """Adapter for CodeLlama checkpoints (e.g., codellama/CodeLlama-34b-hf)."""

    def match(self, model_path: str):
        """Return True when "codellama" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "codellama" in lowered_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy EOS/PAD ids onto the model config.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        # Keep the model config's special-token ids in sync with the tokenizer.
        for token_attr in ("eos_token_id", "pad_token_id"):
            setattr(model.config, token_attr, getattr(tokenizer, token_attr))
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama-2" conversation template."""
        template_name = "llama-2"
        return get_conv_template(template_name)
1830
+
1831
+
1832
class PhindCodeLlamaAdapter(CodeLlamaAdapter):
    """Adapter for Phind-CodeLlama checkpoints (e.g., Phind/Phind-CodeLlama-34B-v2)."""

    def match(self, model_path: str):
        """Return True when "phind-codellama-" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "phind-codellama-" in lowered_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "phind" conversation template."""
        template_name = "phind"
        return get_conv_template(template_name)
1840
+
1841
+
1842
class Llama2ChangAdapter(Llama2Adapter):
    """Adapter for Llama2-ko-chang checkpoints (e.g., lcw99/llama2-ko-chang-instruct-chat)."""

    def match(self, model_path: str):
        """Return True when "llama2-ko-chang" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "llama2-ko-chang" in lowered_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "polyglot_changgpt" conversation template."""
        template_name = "polyglot_changgpt"
        return get_conv_template(template_name)
1850
+
1851
+
1852
class ZephyrAdapter(BaseModelAdapter):
    """Adapter for Zephyr checkpoints (e.g. HuggingFaceH4/zephyr-7b-alpha)."""

    def match(self, model_path: str):
        """Return True when "zephyr" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "zephyr" in lowered_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "zephyr" conversation template."""
        template_name = "zephyr"
        return get_conv_template(template_name)
1860
+
1861
+
1862
class XwinLMAdapter(BaseModelAdapter):
    """Adapter for the Xwin-LM V0.1 and V0.2 series (e.g., Xwin-LM/Xwin-LM-70B-V0.1)."""

    # use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "xwin-lm" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "xwin-lm" in lowered_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "vicuna_v1.1" conversation template."""
        template_name = "vicuna_v1.1"
        return get_conv_template(template_name)
1872
+
1873
+
1874
class LemurAdapter(BaseModelAdapter):
    """Adapter for OpenLemur/lemur-70b-chat-v1."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when "lemur-70b-chat" occurs in the path (case-insensitive)."""
        lowered_path = model_path.lower()
        return "lemur-70b-chat" in lowered_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "lemur-70b-chat" conversation template."""
        template_name = "lemur-70b-chat"
        return get_conv_template(template_name)
1884
+
1885
+
1886
class PygmalionAdapter(BaseModelAdapter):
    """Adapter for the Pygmalion/Metharme series (e.g., PygmalionAI/mythalion-13b)."""

    # use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match paths containing "pygmalion", "mythalion" or "metharme"."""
        lowered_path = model_path.lower()
        found = re.search(r"pygmalion|mythalion|metharme", lowered_path, re.I)
        return found is not None

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "metharme" conversation template."""
        template_name = "metharme"
        return get_conv_template(template_name)
1898
+
1899
+
1900
# Note: the registration order matters.
# The one registered earlier has a higher matching priority, so this tuple is
# listed from highest to lowest priority.
_ADAPTERS_BY_PRIORITY = (
    PeftModelAdapter,
    DeepseekChatAdapter,
    VicunaAdapter,
    AiroborosAdapter,
    LongChatAdapter,
    GoogleT5Adapter,
    KoalaAdapter,
    AlpacaAdapter,
    ChatGLMAdapter,
    CodeGeexAdapter,
    DollyV2Adapter,
    OasstPythiaAdapter,
    OasstLLaMAAdapter,
    OpenChat35Adapter,
    StableLMAdapter,
    BaizeAdapter,
    RwkvAdapter,
    OpenBuddyAdapter,
    PhoenixAdapter,
    BardAdapter,
    PaLM2Adapter,
    ChatGPTAdapter,
    AzureOpenAIAdapter,
    ClaudeAdapter,
    MPTAdapter,
    BiLLaAdapter,
    RedPajamaINCITEAdapter,
    H2OGPTAdapter,
    RobinAdapter,
    SnoozyAdapter,
    WizardLMAdapter,
    ManticoreAdapter,
    GuanacoAdapter,
    CamelAdapter,
    ChangGPTAdapter,
    TuluAdapter,
    FalconChatAdapter,
    FalconAdapter,
    TigerBotAdapter,
    BaichuanAdapter,
    XGenAdapter,
    NousHermesAdapter,
    PythiaAdapter,
    InternLMChatAdapter,
    StarChatAdapter,
    Llama2Adapter,
    CuteGPTAdapter,
    OpenOrcaAdapter,
    MistralAdapter,
    WizardCoderAdapter,
    QwenChatAdapter,
    AquilaChatAdapter,
    BGEAdapter,
    E5Adapter,
    Lamma2ChineseAdapter,
    VigogneAdapter,
    OpenLLaMaOpenInstructAdapter,
    ReaLMAdapter,
    PhindCodeLlamaAdapter,
    CodeLlamaAdapter,
    Llama2ChangAdapter,
    ZephyrAdapter,
    XwinLMAdapter,
    LemurAdapter,
    PygmalionAdapter,
    Zhinao360Adapter,
    # After all adapters, try the default base adapter.
    BaseModelAdapter,
)

for _adapter_cls in _ADAPTERS_BY_PRIORITY:
    register_model_adapter(_adapter_cls)
fastchat/model/model_chatglm.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference code for ChatGLM.
3
+ Adapted from https://huggingface.co/THUDM/chatglm-6b/blob/main/modeling_chatglm.py.
4
+ """
5
+ import re
6
+
7
+ import torch
8
+ from transformers.generation.logits_process import LogitsProcessor
9
+
10
+
11
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Logits processor that repairs score tensors containing NaN or Inf.

    When any score is non-finite, the whole tensor is zeroed in place and
    index 5 along the last axis is set to 5e4; otherwise the scores pass
    through untouched.
    """

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        all_finite = torch.isfinite(scores).all()
        if not all_finite:
            # Overwrite in place so the caller's tensor is the one repaired.
            scores.zero_()
            scores[..., 5] = 5e4
        return scores


# Shared instance handed to generation through the "logits_processor" kwarg.
invalid_score_processor = InvalidScoreLogitsProcessor()
22
+
23
+
24
def process_response(response):
    """Clean up a decoded ChatGLM response.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Replace the ``[[训练时间]]`` placeholder with ``2023年``.
      3. Convert half-width punctuation (``, ! : ; ?``) to its full-width
         counterpart whenever it directly follows or precedes a CJK
         character in the U+4E00..U+9FFF range.

    Args:
        response: The decoded model output.

    Returns:
        The cleaned-up string.
    """
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    # (half-width mark, full-width replacement). The replacements are written
    # as escapes so they cannot be silently normalised back to ASCII, which
    # would turn the substitutions below into no-ops.
    punkts = [
        (",", "\uff0c"),
        ("!", "\uff01"),
        (":", "\uff1a"),
        (";", "\uff1b"),
        ("?", "\uff1f"),
    ]
    for half_width, full_width in punkts:
        # re.escape keeps "?" literal instead of relying on an invalid "\?"
        # escape inside a normal string.
        mark = re.escape(half_width)
        response = re.sub(
            r"([\u4e00-\u9fff])%s" % mark, r"\1%s" % full_width, response
        )
        response = re.sub(
            r"%s([\u4e00-\u9fff])" % mark, r"%s\1" % full_width, response
        )
    return response
38
+
39
+
40
@torch.inference_mode()
def generate_stream_chatglm(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream a ChatGLM completion as a sequence of result dicts.

    Args:
        model: Object exposing ``device`` and ``stream_generate(**kwargs)``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        params: Request dict. Reads "prompt" (required) and the optional keys
            "temperature", "repetition_penalty", "top_p", "max_new_tokens"
            and "echo".
        device: Unused here; inputs are moved to ``model.device``.
        context_len: Unused; kept for signature compatibility.
        stream_interval: Unused; kept for signature compatibility.
        judge_sent_end: Unused; kept for signature compatibility.

    Yields:
        Dicts with "text", "usage" (prompt/completion/total token counts) and
        "finish_reason". Every intermediate chunk has ``finish_reason=None``;
        one final chunk repeats the last text with ``finish_reason="stop"``.
    """
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    echo = params.get("echo", True)

    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    input_echo_len = len(inputs["input_ids"][0])

    # Near-zero temperatures fall back to greedy decoding.
    do_sample = temperature > 1e-5
    gen_kwargs = {
        "max_length": max_new_tokens + input_echo_len,
        "do_sample": do_sample,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "logits_processor": [invalid_score_processor],
    }
    if do_sample:
        gen_kwargs["temperature"] = temperature

    # Defaults keep the final chunk well-formed (empty text, zero completion
    # tokens) even when the model streams nothing at all.
    total_len = input_echo_len
    response = ""
    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
        total_ids = total_ids.tolist()[0]
        total_len = len(total_ids)
        if echo:
            output_ids = total_ids
        else:
            # Drop the prompt tokens so only newly generated text is returned.
            output_ids = total_ids[input_echo_len:]
        response = tokenizer.decode(output_ids)
        response = process_response(response)

        yield {
            "text": response,
            "usage": {
                "prompt_tokens": input_echo_len,
                "completion_tokens": total_len - input_echo_len,
                "total_tokens": total_len,
            },
            "finish_reason": None,
        }

    # TODO: report "length" when ChatGLM stops because it reached max length.
    # Only the last stream result carries a finish_reason; it is always "stop".
    ret = {
        "text": response,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": total_len - input_echo_len,
            "total_tokens": total_len,
        },
        "finish_reason": "stop",
    }
    yield ret
fastchat/model/model_codet5p.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from threading import Thread
3
+ import torch
4
+ import transformers
5
+ from transformers import (
6
+ GenerationConfig,
7
+ StoppingCriteria,
8
+ StoppingCriteriaList,
9
+ TextIteratorStreamer,
10
+ )
11
+
12
+
13
@torch.inference_mode()
def generate_stream_codet5p(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream a CodeT5+ completion as a sequence of result dicts.

    ``model.generate`` runs in a background thread and its text is consumed
    through a ``TextIteratorStreamer``.

    Args:
        model: Object exposing ``generate(**kwargs)``.
        tokenizer: Tokenizer used for encoding and by the streamer.
        params: Request dict. Reads "prompt" (required) and the optional keys
            "temperature", "repetition_penalty", "top_p", "top_k",
            "max_new_tokens" and "stop_token_ids". The dict and its lists are
            not modified.
        device: Device the encoded inputs are moved to; also selects the
            extra cache clean-up for "xpu"/"npu".
        context_len: Unused; kept for signature compatibility.
        stream_interval: Yield an intermediate chunk every this many streamed
            pieces of text.
        judge_sent_end: Unused; kept for signature compatibility.

    Yields:
        Dicts with "text", "usage" and "finish_reason". Intermediate chunks
        have ``finish_reason=None``; the final chunk reports "length" when the
        streamed-chunk count reached ``max_new_tokens`` and "stop" otherwise.
        "completion_tokens" counts streamed chunks.
    """
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    top_k = int(params.get("top_k", 50))  # -1 means disable
    max_new_tokens = int(params.get("max_new_tokens", 1024))
    # Copy before appending EOS so the caller's list in `params` is untouched.
    stop_token_ids = list(params.get("stop_token_ids", None) or [])
    stop_token_ids.append(tokenizer.eos_token_id)

    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    streamer = TextIteratorStreamer(tokenizer, **decode_config)
    encoding = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = encoding.input_ids
    encoding["decoder_input_ids"] = encoding["input_ids"].clone()
    # input_ids is (batch, seq_len); the prompt length is the last dimension,
    # not len(input_ids), which would be the batch size.
    input_echo_len = input_ids.shape[-1]

    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=temperature >= 1e-5,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=10,
        top_p=top_p,
        top_k=top_k,
        eos_token_id=stop_token_ids,
    )

    class CodeBlockStopper(StoppingCriteria):
        """Stop once the last two generated token ids are [628, 198]."""

        def __call__(
            self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
        ) -> bool:
            # Code-completion is open-end generation.
            # We check \n\n to stop at end of a code block.
            # NOTE(review): 628/198 are presumably the newline token ids of the
            # CodeT5+ vocabulary - confirm against the tokenizer.
            return input_ids[0][-2:].tolist() == [628, 198]

    gen_kwargs = dict(
        **encoding,
        streamer=streamer,
        generation_config=generation_config,
        stopping_criteria=StoppingCriteriaList([CodeBlockStopper()]),
    )
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    i = 0
    output = ""
    for new_text in streamer:
        i += 1
        output += new_text
        if i % stream_interval == 0 or i == max_new_tokens - 1:
            yield {
                "text": output,
                "usage": {
                    "prompt_tokens": input_echo_len,
                    "completion_tokens": i,
                    "total_tokens": input_echo_len + i,
                },
                "finish_reason": None,
            }
        if i >= max_new_tokens:
            break

    if i >= max_new_tokens:
        finish_reason = "length"
    else:
        finish_reason = "stop"

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        },
        "finish_reason": finish_reason,
    }
    thread.join()

    # clean
    gc.collect()
    torch.cuda.empty_cache()
    if device == "xpu":
        torch.xpu.empty_cache()
    if device == "npu":
        torch.npu.empty_cache()
fastchat/model/model_exllama.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import sys
3
+ from typing import Dict
4
+
5
+ import torch
6
+
7
+
8
def generate_stream_exllama(
    model,
    tokenizer,
    params: Dict,
    device: str,
    context_len: int,
    stream_interval: int = 2,
    judge_sent_end: bool = False,
):
    """Stream a completion from an ExLlamaV2 model as result dicts.

    Args:
        model: Wrapper exposing ``model`` and ``cache`` attributes, which are
            passed to ``ExLlamaV2StreamingGenerator``.
        tokenizer: Tokenizer handed to the streaming generator.
        params: Request dict. Reads "prompt" (required) and the optional keys
            "temperature", "top_k", "top_p", "repetition_penalty",
            "max_new_tokens", "stop_token_ids" and "echo".
        device: Unused; kept for signature compatibility.
        context_len: Unused; kept for signature compatibility.
        stream_interval: Unused; a chunk is yielded per generator step.
        judge_sent_end: Unused; kept for signature compatibility.

    Yields:
        Dicts with "text", "usage" and "finish_reason". Intermediate chunks
        have ``finish_reason=None``. The final chunk reports "length" when the
        step budget ``max_new_tokens`` was exhausted and "stop" when the
        generator signalled the end of the stream.

    Note:
        Exits the process with status -1 when exllamav2 cannot be imported.
    """
    try:
        from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler
    except ImportError as e:
        print(f"Error: Failed to load Exllamav2. {e}")
        sys.exit(-1)

    prompt = params["prompt"]

    generator = ExLlamaV2StreamingGenerator(model.model, model.cache, tokenizer)
    settings = ExLlamaV2Sampler.Settings()

    settings.temperature = float(params.get("temperature", 0.85))
    settings.top_k = int(params.get("top_k", 50))
    settings.top_p = float(params.get("top_p", 0.8))
    settings.token_repetition_penalty = float(params.get("repetition_penalty", 1.15))
    # The tokenizer's EOS id is disallowed for sampling; termination is driven
    # by the stop conditions below or by the step budget.
    settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])

    max_new_tokens = int(params.get("max_new_tokens", 256))

    generator.set_stop_conditions(params.get("stop_token_ids", None) or [])
    echo = bool(params.get("echo", True))

    input_ids = generator.tokenizer.encode(prompt)
    prompt_tokens = input_ids.shape[-1]
    generator.begin_stream(input_ids, settings)

    generated_tokens = 0
    output = prompt if echo else ""
    while True:
        chunk, eos, _ = generator.stream()
        output += chunk
        generated_tokens += 1
        # ">=" (not "==") so a non-positive budget still terminates.
        if generated_tokens >= max_new_tokens:
            finish_reason = "length"
            break
        elif eos:
            # The generator hit a stop condition: a normal stop, not a
            # truncation by length.
            finish_reason = "stop"
            break
        yield {
            "text": output,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": generated_tokens,
                "total_tokens": prompt_tokens + generated_tokens,
            },
            "finish_reason": None,
        }

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": generated_tokens,
            "total_tokens": prompt_tokens + generated_tokens,
        },
        "finish_reason": finish_reason,
    }
    gc.collect()
fastchat/model/model_falcon.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from threading import Thread
3
+ from typing import Iterable
4
+
5
+ import torch
6
+ import transformers
7
+ from transformers import TextIteratorStreamer, GenerationConfig
8
+
9
+ from fastchat.utils import is_partial_stop
10
+
11
+
12
@torch.inference_mode()
def generate_stream_falcon(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream a Falcon completion as a sequence of result dicts.

    ``model.generate`` runs in a background thread and its text is consumed
    through a ``TextIteratorStreamer`` (prompt skipped).

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Tokenizer used for encoding and by the streamer.
        params: Request dict. Reads "prompt" (required) and the optional keys
            "temperature", "repetition_penalty", "top_p", "top_k",
            "max_new_tokens", "stop" (a string or an iterable of strings),
            "echo" and "stop_token_ids". The dict and its lists are not
            modified.
        device: Only used to select extra cache clean-up for "xpu"/"npu".
        context_len: Context window; the prompt is left-truncated to
            ``context_len - max_new_tokens - 8`` tokens.
        stream_interval: Check stop strings and yield every this many
            streamed pieces of text.
        judge_sent_end: Unused; kept for signature compatibility.

    Yields:
        Dicts with "text", "usage" and "finish_reason". Intermediate chunks
        have ``finish_reason=None``; the final chunk carries "length", "stop",
        or ``None`` when the text ends in a partial stop sequence.

    Raises:
        ValueError: If "stop" is neither a string nor an iterable.
    """
    prompt = params["prompt"]
    len_prompt = len(prompt)
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    top_k = int(params.get("top_k", 50))  # -1 means disable
    max_new_tokens = int(params.get("max_new_tokens", 256))
    stop_str = params.get("stop", None)
    echo = bool(params.get("echo", True))
    # Copy before appending EOS so the caller's list in `params` is untouched.
    stop_token_ids = list(params.get("stop_token_ids", None) or [])
    stop_token_ids.append(tokenizer.eos_token_id)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    max_src_len = context_len - max_new_tokens - 8

    # The tensors are (batch, seq_len): truncate from the left along the
    # sequence axis. Slicing the first axis would only touch the batch.
    # A non-positive budget leaves the prompt untouched instead of producing
    # a nonsensical slice.
    if max_src_len > 0:
        input_ids = input_ids[:, -max_src_len:]
        attention_mask = attention_mask[:, -max_src_len:]
    input_echo_len = input_ids.shape[-1]

    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)

    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=temperature >= 1e-5,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=10,
        top_p=top_p,
        top_k=top_k,
        eos_token_id=stop_token_ids,
    )

    generation_kwargs = dict(
        inputs=input_ids,
        attention_mask=attention_mask,
        streamer=streamer,
        generation_config=generation_config,
    )

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    if echo:
        # means keep the prompt
        output = prompt
        # never look for stop strings inside the echoed prompt
        rfind_start = len_prompt
    else:
        output = ""
        rfind_start = 0

    # Defaults so the final chunk is well-formed even if nothing is streamed.
    i = 0
    partially_stopped = False
    for i, new_text in enumerate(streamer):
        output += new_text
        if i % stream_interval != 0:
            continue

        # NOTE(review): when a stop string is found the text is cut, but the
        # loop keeps consuming the streamer, so later chunks are appended
        # after the cut - confirm whether generation should end here.
        partially_stopped = False
        if stop_str:
            if isinstance(stop_str, str):
                pos = output.rfind(stop_str, rfind_start)
                if pos != -1:
                    output = output[:pos]
                else:
                    partially_stopped = is_partial_stop(output, stop_str)
            elif isinstance(stop_str, Iterable):
                for each_stop in stop_str:
                    pos = output.rfind(each_stop, rfind_start)
                    if pos != -1:
                        output = output[:pos]
                        break
                    else:
                        partially_stopped = is_partial_stop(output, each_stop)
                        if partially_stopped:
                            break
            else:
                raise ValueError("Invalid stop field type.")

        # prevent yielding partial stop sequence
        if not partially_stopped:
            yield {
                "text": output,
                "usage": {
                    "prompt_tokens": input_echo_len,
                    "completion_tokens": i,
                    "total_tokens": input_echo_len + i,
                },
                "finish_reason": None,
            }
    output = output.strip()

    # finish stream event, which contains finish reason
    if i == max_new_tokens - 1:
        finish_reason = "length"
    elif partially_stopped:
        finish_reason = None
    else:
        finish_reason = "stop"

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        },
        "finish_reason": finish_reason,
    }

    # clean
    gc.collect()
    torch.cuda.empty_cache()
    if device == "xpu":
        torch.xpu.empty_cache()
    if device == "npu":
        torch.npu.empty_cache()
fastchat/model/model_registry.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Additional information of the models."""
2
+ from collections import namedtuple
3
+ from typing import List
4
+
5
+
6
# Display metadata for one model: short name, reference link, one-line blurb.
ModelInfo = namedtuple("ModelInfo", "simple_name link description")


# Maps each registered full model name to its ModelInfo record.
model_info = {}


def register_model_info(
    full_names: List[str], simple_name: str, link: str, description: str
):
    """Register one shared ``ModelInfo`` record under every name in ``full_names``.

    A name that is already registered is overwritten.
    """
    record = ModelInfo(simple_name=simple_name, link=link, description=description)
    model_info.update((alias, record) for alias in full_names)


def get_model_info(name: str) -> ModelInfo:
    """Return the registered info for ``name``, or a placeholder record.

    The placeholder uses ``name`` as the simple name, an empty link, and a
    description asking for the model to be registered.
    """
    try:
        return model_info[name]
    except KeyError:
        # To fix this, please use `register_model_info` to register your model
        return ModelInfo(
            name, "", "Register the description at fastchat/model/model_registry.py"
        )
29
+
30
+
31
# Built-in model descriptions. Each call registers one record under all of the
# listed full names.
register_model_info(
    full_names=["gpt-3.5-turbo"],
    simple_name="GPT-3.5",
    link="https://openai.com/blog/chatgpt",
    description="GPT-3.5 by OpenAI",
)
register_model_info(
    full_names=["gpt-3.5-turbo-1106"],
    simple_name="GPT-3.5-Turbo-1106",
    link="https://platform.openai.com/docs/models/gpt-3-5",
    description="GPT-3.5-Turbo-1106 by OpenAI",
)
register_model_info(
    full_names=["gpt-4"],
    simple_name="GPT-4",
    link="https://openai.com/research/gpt-4",
    description="ChatGPT-4 by OpenAI",
)
register_model_info(
    full_names=["gpt-4-turbo"],
    simple_name="GPT-4-Turbo",
    link="https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
    description="GPT-4-Turbo by OpenAI",
)
register_model_info(
    full_names=["claude-2"],
    simple_name="Claude",
    link="https://www.anthropic.com/index/claude-2",
    description="Claude 2 by Anthropic",
)
register_model_info(
    full_names=["claude-1"],
    simple_name="Claude",
    link="https://www.anthropic.com/index/introducing-claude",
    description="Claude by Anthropic",
)
register_model_info(
    full_names=["claude-instant-1"],
    simple_name="Claude Instant",
    link="https://www.anthropic.com/index/introducing-claude",
    description="Claude Instant by Anthropic",
)
register_model_info(
    full_names=["palm-2"],
    simple_name="PaLM 2 Chat",
    link="https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023",
    description="PaLM 2 for Chat (chat-bison@001) by Google",
)
register_model_info(
    full_names=[
        "vicuna-33b",
        "vicuna-33b-v1.3",
        "vicuna-13b",
        "vicuna-13b-v1.3",
        "vicuna-7b",
        "vicuna-7b-v1.3",
    ],
    simple_name="Vicuna",
    link="https://lmsys.org/blog/2023-03-30-vicuna/",
    description="a chat assistant fine-tuned on user-shared conversations by LMSYS",
)
register_model_info(
    full_names=[
        "llama-2-70b-chat",
        "llama-2-34b-chat",
        "llama-2-13b-chat",
        "llama-2-7b-chat",
    ],
    simple_name="Llama 2",
    link="https://ai.meta.com/llama/",
    description="open foundation and fine-tuned chat models by Meta",
)
register_model_info(
    full_names=["mistral-7b-instruct"],
    simple_name="Mistral",
    link="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1",
    description="a large language model by Mistral AI team",
)
register_model_info(
    full_names=["zephyr-7b-beta", "zephyr-7b-alpha"],
    simple_name="Zephyr",
    link="https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha",
    description="a chatbot fine-tuned from Mistral by Hugging Face",
)
register_model_info(
    full_names=["qwen-14b-chat"],
    simple_name="Qwen",
    link="https://huggingface.co/Qwen/Qwen-14B-Chat",
    description="a large language model by Alibaba Cloud",
)
register_model_info(
    full_names=[
        "codellama-34b-instruct",
        "codellama-13b-instruct",
        "codellama-7b-instruct",
    ],
    simple_name="Code Llama",
    link="https://ai.meta.com/blog/code-llama-large-language-model-coding/",
    description="open foundation models for code by Meta",
)
register_model_info(
    full_names=["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"],
    simple_name="WizardLM",
    link="https://github.com/nlpxucan/WizardLM",
    description="an instruction-following LLM using evol-instruct by Microsoft",
)
register_model_info(
    full_names=["wizardcoder-15b-v1.0"],
    simple_name="WizardLM",
    link="https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder",
    description="Empowering Code Large Language Models with Evol-Instruct",
)
register_model_info(
    full_names=["mpt-7b-chat", "mpt-30b-chat"],
    simple_name="MPT-Chat",
    link="https://www.mosaicml.com/blog/mpt-30b",
    description="a chatbot fine-tuned from MPT by MosaicML",
)
register_model_info(
    full_names=["guanaco-33b", "guanaco-65b"],
    simple_name="Guanaco",
    link="https://github.com/artidoro/qlora",
    description="a model fine-tuned with QLoRA by UW",
)
register_model_info(
    full_names=["gpt4all-13b-snoozy"],
    simple_name="GPT4All-Snoozy",
    link="https://github.com/nomic-ai/gpt4all",
    description="a finetuned LLaMA model on assistant style data by Nomic AI",
)
register_model_info(
    full_names=["koala-13b"],
    simple_name="Koala",
    link="https://bair.berkeley.edu/blog/2023/04/03/koala",
    description="a dialogue model for academic research by BAIR",
)
register_model_info(
    full_names=["RWKV-4-Raven-14B"],
    simple_name="RWKV-4-Raven",
    link="https://huggingface.co/BlinkDL/rwkv-4-raven",
    description="an RNN with transformer-level LLM performance",
)
register_model_info(
    full_names=["chatglm-6b", "chatglm2-6b"],
    simple_name="ChatGLM",
    link="https://chatglm.cn/blog",
    description="an open bilingual dialogue language model by Tsinghua University",
)
register_model_info(
    full_names=["alpaca-13b"],
    simple_name="Alpaca",
    link="https://crfm.stanford.edu/2023/03/13/alpaca.html",
    description="a model fine-tuned from LLaMA on instruction-following demonstrations by Stanford",
)
register_model_info(
    full_names=["oasst-pythia-12b"],
    simple_name="OpenAssistant (oasst)",
    link="https://open-assistant.io",
    description="an Open Assistant for everyone by LAION",
)
register_model_info(
    full_names=["oasst-sft-7-llama-30b"],
    simple_name="OpenAssistant (oasst)",
    link="https://open-assistant.io",
    description="an Open Assistant for everyone by LAION",
)
register_model_info(
    full_names=["openchat-3.5"],
    simple_name="OpenChat 3.5",
    link="https://github.com/imoneoi/openchat",
    description="OpenChat 3.5 is a versatile, open-source language model fine-tuned using C-RLFT",
)
register_model_info(
    full_names=["llama-7b", "llama-13b"],
    simple_name="LLaMA",
    link="https://arxiv.org/abs/2302.13971",
    description="open and efficient foundation language models by Meta",
)
register_model_info(
    full_names=["open-llama-7b-v2-open-instruct", "open-llama-7b-open-instruct"],
    simple_name="Open LLaMa (Open Instruct)",
    link="https://medium.com/vmware-data-ml-blog/starter-llm-for-the-enterprise-instruction-tuning-openllama-7b-d05fc3bbaccc",
    description="Open LLaMa fine-tuned on instruction-following data by VMware",
)
register_model_info(
    full_names=["dolly-v2-12b"],
    simple_name="Dolly",
    link="https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm",
    description="an instruction-tuned open large language model by Databricks",
)
register_model_info(
    full_names=["stablelm-tuned-alpha-7b"],
    simple_name="StableLM",
    link="https://github.com/stability-AI/stableLM",
    description="Stability AI language models",
)
register_model_info(
    full_names=["codet5p-6b"],
    simple_name="CodeT5p-6b",
    link="https://huggingface.co/Salesforce/codet5p-6b",
    description="Code completion model released by Salesforce",
)
register_model_info(
    full_names=["fastchat-t5-3b", "fastchat-t5-3b-v1.0"],
    simple_name="FastChat-T5",
    link="https://huggingface.co/lmsys/fastchat-t5-3b-v1.0",
    description="a chat assistant fine-tuned from FLAN-T5 by LMSYS",
)
register_model_info(
    full_names=["phoenix-inst-chat-7b"],
    simple_name="Phoenix-7B",
    link="https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b",
    description="a multilingual chat assistant fine-tuned from Bloomz to democratize ChatGPT across languages by CUHK(SZ)",
)
register_model_info(
    full_names=["realm-7b-v1"],
    simple_name="ReaLM",
    link="https://github.com/FreedomIntelligence/ReaLM",
    description="A chatbot fine-tuned from LLaMA2 with data generated via iterative calls to UserGPT and ChatGPT by CUHK(SZ) and SRIBD.",
)
register_model_info(
    full_names=["billa-7b-sft"],
    simple_name="BiLLa-7B-SFT",
    link="https://huggingface.co/Neutralzz/BiLLa-7B-SFT",
    description="an instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher",
)
register_model_info(
    full_names=["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"],
    simple_name="h2oGPT-GM-7b",
    link="https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
    description="an instruction-tuned OpenLLaMA with enhanced conversational ability by H2O.ai",
)
251
# Baize v2 (7B and 13B identifiers share one entry).
# The description previously misspelled the technique name as
# "Self-Disillation"; SDF stands for "Self-Distillation with Feedback".
register_model_info(
    ["baize-v2-7b", "baize-v2-13b"],
    "Baize v2",
    "https://github.com/project-baize/baize-chatbot#v2",
    "A chatbot fine-tuned from LLaMA with ChatGPT self-chat data and Self-Distillation with Feedback (SDF) by UCSD and SYSU.",
)
257
# Model metadata registrations (continued). Each call passes a list of model
# identifiers followed by three strings (display name, link, description).
register_model_info(
    [
        "airoboros-l2-7b-2.1",
        "airoboros-l2-13b-2.1",
        "airoboros-c34b-2.1",
        "airoboros-l2-70b-2.1",
    ],
    "airoboros",
    "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1",
    "an instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4",
)
register_model_info(
    [
        "spicyboros-7b-2.2",
        "spicyboros-13b-2.2",
        "spicyboros-70b-2.2",
    ],
    "spicyboros",
    "https://huggingface.co/jondurbin/spicyboros-70b-2.2",
    "de-aligned versions of the airoboros models",
)
# NOTE(review): the description says "LLaMA-7b" but the identifier list also
# contains 13b and 33b variants -- confirm the wording is intended.
register_model_info(
    ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"],
    "Robin-v2",
    "https://huggingface.co/OptimalScale/robin-7b-v2-delta",
    "A chatbot fine-tuned from LLaMA-7b, achieving competitive performance on chitchat, commonsense reasoning and instruction-following tasks, by OptimalScale, HKUST.",
)
register_model_info(
    ["manticore-13b-chat"],
    "Manticore 13B Chat",
    "https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg",
    "A chatbot fine-tuned from LlaMa across several CoT and chat datasets.",
)
register_model_info(
    ["redpajama-incite-7b-chat"],
    "RedPajama-INCITE-7B-Chat",
    "https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat",
    "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together",
)
# Base, instruct and chat variants are all registered under one "Falcon" entry.
register_model_info(
    [
        "falcon-7b",
        "falcon-7b-instruct",
        "falcon-40b",
        "falcon-40b-instruct",
        "falcon-180b",
        "falcon-180b-chat",
    ],
    "Falcon",
    "https://huggingface.co/tiiuae/falcon-180B",
    "TII's flagship series of large language models",
)
register_model_info(
    ["tigerbot-7b-sft"],
    "Tigerbot",
    "https://huggingface.co/TigerResearch/tigerbot-7b-sft",
    "TigerBot is a large-scale language model (LLM) with multiple languages and tasks.",
)
register_model_info(
    ["internlm-chat-7b", "internlm-chat-7b-8k"],
    "InternLM",
    "https://huggingface.co/internlm/internlm-chat-7b",
    "InternLM is a multi-language large-scale language model (LLM), developed by SHLAB.",
)
register_model_info(
    ["Qwen-7B-Chat"],
    "Qwen",
    "https://huggingface.co/Qwen/Qwen-7B-Chat",
    "Qwen is a multi-language large-scale language model (LLM), developed by Damo Academy.",
)
register_model_info(
    ["Llama2-Chinese-13b-Chat", "LLama2-Chinese-13B"],
    "Llama2-Chinese",
    "https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat",
    "Llama2-Chinese is a multi-language large-scale language model (LLM), developed by FlagAlpha.",
)
register_model_info(
    ["Vigogne-2-7B-Instruct", "Vigogne-2-13B-Instruct"],
    "Vigogne-Instruct",
    "https://huggingface.co/bofenghuang/vigogne-2-7b-instruct",
    "Vigogne-Instruct is a French large language model (LLM) optimized for instruction-following, developed by Bofeng Huang",
)
register_model_info(
    ["Vigogne-2-7B-Chat", "Vigogne-2-13B-Chat"],
    "Vigogne-Chat",
    "https://huggingface.co/bofenghuang/vigogne-2-7b-chat",
    "Vigogne-Chat is a French large language model (LLM) optimized for instruction-following and multi-turn dialogues, developed by Bofeng Huang",
)
# The link argument is an empty string for this entry.
# NOTE(review): confirm consumers of the registry handle an empty link.
register_model_info(
    ["deluxe-chat-v1", "deluxe-chat-v1.1"],
    "DeluxeChat",
    "",
    "Deluxe Chat",
)
register_model_info(
    [
        "Xwin-LM-7B-V0.1",
        "Xwin-LM-13B-V0.1",
        "Xwin-LM-70B-V0.1",
        "Xwin-LM-7B-V0.2",
        "Xwin-LM-13B-V0.2",
    ],
    "Xwin-LM",
    "https://github.com/Xwin-LM/Xwin-LM",
    "Chat models developed by Xwin-LM team",
)
363
+
364
# NOTE(review): the description string below ends with a trailing space;
# left untouched here because it is runtime text -- confirm whether it
# should be trimmed.
register_model_info(
    ["lemur-70b-chat"],
    "Lemur-Chat",
    "https://huggingface.co/OpenLemur/lemur-70b-chat-v1",
    "an openly accessible language model optimized for both natural language and coding capabilities ",
)

# Unlike the neighbouring entries, this description embeds Markdown-style
# [text](url) links.
register_model_info(
    ["Mistral-7B-OpenOrca"],
    "Open-Orca",
    "https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca",
    "A fine-tune of [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) using [OpenOrca dataset](https://huggingface.co/datasets/Open-Orca/OpenOrca)",
)

register_model_info(
    [
        "AquilaChat-7B",
        "AquilaChat2-7B",
        "AquilaChat2-34B",
    ],
    "Aquila-Chat",
    "https://huggingface.co/BAAI/AquilaChat2-34B",
    "Chat models developed by BAAI team",
)
fastchat/model/model_xfastertransformer.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from threading import Thread
3
+
4
+ import torch
5
+ from transformers import TextIteratorStreamer
6
+
7
+
8
@torch.inference_mode()
def generate_stream_xft(
    model,
    tokenizer,
    params,
    device,
    context_len=8192,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream text generated for ``params["prompt"]`` by an xFasterTransformer model.

    Generation runs in a background thread via ``model.model.generate`` and
    its output is consumed through a ``TextIteratorStreamer``.

    Args:
        model: Wrapper exposing ``model.model.generate`` and a ``model.config``
            with ``padding``, ``beam_width``, ``num_return_sequences``,
            ``early_stopping``, ``eos_token_id`` and ``pad_token_id``.
        tokenizer: Callable tokenizer; invoked with ``return_tensors="pt"``
            and its ``input_ids`` are used as the generation input.
        params: Request dict. Reads ``"prompt"`` (required),
            ``"repetition_penalty"`` (default 1.0), ``"max_new_tokens"``
            (default 4096) and ``"echo"`` (default True).
        device: Not referenced by this function.
        context_len: Not referenced by this function.
        stream_interval: Not referenced by this function.
        judge_sent_end: Not referenced by this function.
            (The four unused parameters are presumably kept so the signature
            matches other stream generators -- confirm against callers.)

    Yields:
        dict: ``{"text", "usage", "finish_reason"}``. One dict per streamed
        chunk with ``finish_reason=None`` and the accumulated text, then a
        final dict with the stripped text and ``finish_reason`` set to
        ``"length"`` or ``"stop"``.
    """
    prompt = params["prompt"]
    repetition_penalty = float(params.get("repetition_penalty", 1.0))

    # "temperature" and "top_p" from params are not forwarded to generate()
    # yet; they are reserved for future use.

    max_new_tokens = int(params.get("max_new_tokens", 4096))
    echo = params.get("echo", True)

    inputs = tokenizer(
        prompt, return_tensors="pt", padding=model.config.padding
    ).input_ids
    input_echo_len = len(inputs[0])
    # generate() takes a total-length budget, so add the prompt length.
    max_len = max_new_tokens + input_echo_len

    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)
    generation_kwargs = {
        "input_ids": inputs,
        "streamer": streamer,
        "max_length": max_len,
        "num_beams": model.config.beam_width,
        # NOTE(review): the request's repetition_penalty is passed as
        # length_penalty -- confirm this mapping is intentional.
        "length_penalty": repetition_penalty,
        "num_return_sequences": model.config.num_return_sequences,
        "early_stopping": model.config.early_stopping,
        "eos_token_id": model.config.eos_token_id,
        "pad_token_id": model.config.pad_token_id,
    }

    # Run generation in a worker thread so chunks can be yielded as they arrive.
    thread = Thread(target=model.model.generate, kwargs=generation_kwargs)
    thread.start()
    try:
        # With echo enabled the returned text starts with the prompt.
        output = prompt if echo else ""

        # NOTE(review): ``i`` is the zero-based index of the last chunk
        # received, so the usage figures report "chunks received minus one",
        # while the "length" check below treats ``i + 1`` as the count.
        # Kept unchanged because callers may rely on the reported numbers --
        # confirm against how the backend drives the streamer.
        i = 0
        for i, new_text in enumerate(streamer):
            output += new_text
            yield {
                "text": output,
                "usage": {
                    "prompt_tokens": input_echo_len,
                    "completion_tokens": i,
                    "total_tokens": input_echo_len + i,
                },
                "finish_reason": None,
            }

        # The streamer is exhausted, so generation has signalled its end;
        # reap the worker before reporting the final result.
        thread.join()

        output = output.strip()
        finish_reason = "length" if i == max_new_tokens - 1 else "stop"
        yield {
            "text": output,
            "usage": {
                "prompt_tokens": input_echo_len,
                "completion_tokens": i,
                "total_tokens": input_echo_len + i,
            },
            "finish_reason": finish_reason,
        }
    finally:
        # Runs even when the consumer closes the generator early or an error
        # propagates; previously the collection was skipped in those cases.
        gc.collect()