jburmeister gargamit committed on
Commit
dfce255
0 Parent(s):

Duplicate from microsoft/Phi-3.5-MoE-instruct

Browse files

Co-authored-by: Amit Garg <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
LICENSE ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Microsoft.
2
+ Copyright (c) Microsoft Corporation.
3
+
4
+ MIT License
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
NOTICE.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NOTICES AND INFORMATION
2
+ Do Not Translate or Localize
3
+
4
+ This software incorporates material from third parties.
5
+
6
+ **Component.** https://github.com/Dao-AILab/flash-attention
7
+
8
+ **Open Source License/Copyright Notice.**
9
+
10
+ BSD 3-Clause License
11
+
12
+ Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
13
+ All rights reserved.
14
+
15
+ Redistribution and use in source and binary forms, with or without
16
+ modification, are permitted provided that the following conditions are met:
17
+
18
+ * Redistributions of source code must retain the above copyright notice, this
19
+ list of conditions and the following disclaimer.
20
+
21
+ * Redistributions in binary form must reproduce the above copyright notice,
22
+ this list of conditions and the following disclaimer in the documentation
23
+ and/or other materials provided with the distribution.
24
+
25
+ * Neither the name of the copyright holder nor the names of its
26
+ contributors may be used to endorse or promote products derived from
27
+ this software without specific prior written permission.
28
+
29
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
32
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
33
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
35
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
36
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
37
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ license_link: https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/resolve/main/LICENSE
4
+ language:
5
+ - multilingual
6
+ pipeline_tag: text-generation
7
+ tags:
8
+ - nlp
9
+ - code
10
+ widget:
11
+ - messages:
12
+ - role: user
13
+ content: Can you provide ways to eat combinations of bananas and dragonfruits?
14
+ library_name: transformers
15
+ ---
16
+
17
+ ## Model Summary
18
+
19
+ Phi-3.5-MoE is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available documents - with a focus on very high-quality, reasoning dense data. The model supports multilingual use and comes with a 128K context length (in tokens). The model underwent a rigorous enhancement process, incorporating supervised fine-tuning, proximal policy optimization, and direct preference optimization to ensure precise instruction adherence and robust safety measures.
20
+
21
+ 🏡 [Phi-3 Portal](https://azure.microsoft.com/en-us/products/phi-3) <br>
22
+ 📰 [Phi-3 Microsoft Blog](https://aka.ms/phi3.5-techblog) <br>
23
+ 📖 [Phi-3 Technical Report](https://arxiv.org/abs/2404.14219) <br>
24
+ 👩‍🍳 [Phi-3 Cookbook](https://github.com/microsoft/Phi-3CookBook) <br>
25
+ 🖥️ [Try It](https://aka.ms/try-phi3.5moe) <br>
26
+
27
+ **Phi-3.5**: [[mini-instruct]](https://huggingface.co/microsoft/Phi-3.5-mini-instruct); [[MoE-instruct]](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct) ; [[vision-instruct]](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)
28
+
29
+ ## Intended Uses
30
+
31
+ ### Primary Use Cases
32
+
33
+ The model is intended for commercial and research use in multiple languages. The model provides uses for general purpose AI systems and applications which require:
34
+
35
+ 1) Memory/compute constrained environments
36
+ 2) Latency bound scenarios
37
+ 3) Strong reasoning (especially code, math and logic)
38
+
39
+ Our model is designed to accelerate research on language and multimodal models, for use as a building block for generative AI powered features.
40
+
41
+ ### Use Case Considerations
42
+
43
+ Our models are not specifically designed or evaluated for all downstream purposes. Developers should consider common limitations of language models as they select use cases, and evaluate and mitigate for accuracy, safety, and fairness before using within a specific downstream use case, particularly for high risk scenarios. Developers should be aware of and adhere to applicable laws or regulations (including privacy, trade compliance laws, etc.) that are relevant to their use case.
44
+
45
+ ***Nothing contained in this Model Card should be interpreted as or deemed a restriction or modification to the license the model is released under.***
46
+
47
+ ## Usage
48
+
49
+ ### Requirements
50
+ Phi-3.5-MoE-instruct will be integrated in the official version of `transformers`. Until the official version is released through `pip`, ensure that you are doing the following:
51
+ * When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function.
52
+
53
+ The current `transformers` version can be verified with: `pip list | grep transformers`.
54
+
55
+ Examples of required packages:
56
+ ```
57
+ flash_attn==2.5.8
58
+ torch==2.3.1
59
+ accelerate==0.31.0
60
+ transformers==4.43.0
61
+ ```
62
+
63
+ Phi-3.5-MoE-instruct is also available in [Azure AI Studio](https://aka.ms/try-phi3.5moe)
64
+
65
+ ### Tokenizer
66
+
67
+ Phi-3.5-MoE-Instruct supports a vocabulary size of up to `32064` tokens. The [tokenizer files](https://huggingface.co/microsoft/Phi-3.5-moe-instruct/blob/main/added_tokens.json) already provide placeholder tokens that can be used for downstream fine-tuning, but they can also be extended up to the model's vocabulary size.
68
+
69
+ ### Input Formats
70
+ Given the nature of the training data, the Phi-3.5-MoE-instruct model is best suited for prompts using the chat format as follows:
71
+
72
+ ```
73
+ <|system|>
74
+ You are a helpful assistant.<|end|>
75
+ <|user|>
76
+ How to explain Internet for a medieval knight?<|end|>
77
+ <|assistant|>
78
+ ```
79
+
80
+ ### Loading the model locally
81
+ After obtaining the Phi-3.5-MoE-instruct model checkpoints, users can use this sample code for inference.
82
+
83
+ ```python
84
+ import torch
85
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
86
+
87
+ torch.random.manual_seed(0)
88
+
89
+ model = AutoModelForCausalLM.from_pretrained(
90
+ "microsoft/Phi-3.5-MoE-instruct",
91
+ device_map="cuda",
92
+ torch_dtype="auto",
93
+ trust_remote_code=True,
94
+ )
95
+
96
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
97
+
98
+ messages = [
99
+ {"role": "system", "content": "You are a helpful AI assistant."},
100
+ {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
101
+ {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
102
+ {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
103
+ ]
104
+
105
+ pipe = pipeline(
106
+ "text-generation",
107
+ model=model,
108
+ tokenizer=tokenizer,
109
+ )
110
+
111
+ generation_args = {
112
+ "max_new_tokens": 500,
113
+ "return_full_text": False,
114
+ "temperature": 0.0,
115
+ "do_sample": False,
116
+ }
117
+
118
+ output = pipe(messages, **generation_args)
119
+ print(output[0]['generated_text'])
120
+ ```
121
+
122
+ ## Benchmarks
123
+
124
+ To understand the capabilities, we compare Phi-3.5-MoE with a set of models over a variety of benchmarks using our internal benchmark platform. The table below gives a high-level overview of the model quality on representative benchmarks:
125
+
126
+ | Category | Benchmark | Phi-3.5-MoE-instruct | Mistral-Nemo-12B-instruct-2407 | Llama-3.1-8B-instruct | Gemma-2-9b-It | Gemini-1.5-Flash | GPT-4o-mini-2024-07-18 (Chat) |
127
+ |--|--|--|--|--|--|--|--|
128
+ | Popular aggregated benchmark | Arena Hard | 37.9 | 39.4 | 25.7 | 42.0 | 55.2 | 75.0 |
129
+ | | BigBench Hard CoT (0-shot) | 79.1 | 60.2 | 63.4 | 63.5 | 66.7 | 80.4 |
130
+ | | MMLU (5-shot) | 78.9 | 67.2 | 68.1 | 71.3 | 78.7 | 77.2 |
131
+ | | MMLU-Pro (0-shot, CoT) | 54.3 | 40.7 | 44.0 | 50.1 | 57.2 | 62.8 |
132
+ | Reasoning | ARC Challenge (10-shot) | 91.0 | 84.8 | 83.1 | 89.8 | 92.8 | 93.5 |
133
+ | | BoolQ (2-shot) | 84.6 | 82.5 | 82.8 | 85.7 | 85.8 | 88.7 |
134
+ | | GPQA (0-shot, CoT) | 36.8 | 28.6 | 26.3 | 29.2 | 37.5 | 41.1 |
135
+ | | HellaSwag (5-shot) | 83.8 | 76.7 | 73.5 | 80.9 | 67.5 | 87.1 |
136
+ | | OpenBookQA (10-shot) | 89.6 | 84.4 | 84.8 | 89.6 | 89.0 | 90.0 |
137
+ | | PIQA (5-shot) | 88.6 | 83.5 | 81.2 | 83.7 | 87.5 | 88.7 |
138
+ | | Social IQA (5-shot) | 78.0 | 75.3 | 71.8 | 74.7 | 77.8 | 82.9 |
139
+ | | TruthfulQA (MC2) (10-shot) | 77.5 | 68.1 | 69.2 | 76.6 | 76.6 | 78.2 |
140
+ | | WinoGrande (5-shot) | 81.3 | 70.4 | 64.7 | 74.0 | 74.7 | 76.9 |
141
+ | Multilingual | Multilingual MMLU (5-shot) | 69.9 | 58.9 | 56.2 | 63.8 | 77.2 | 72.9 |
142
+ | | MGSM (0-shot CoT) | 58.7 | 63.3 | 56.7 | 75.1 | 75.8 | 81.7 |
143
+ | Math | GSM8K (8-shot, CoT) | 88.7 | 84.2 | 82.4 | 84.9 | 82.4 | 91.3 |
144
+ | | MATH (0-shot, CoT) | 59.5 | 31.2 | 47.6 | 50.9 | 38.0 | 70.2 |
145
+ | Long context | Qasper | 40.0 | 30.7 | 37.2 | 13.9 | 43.5 | 39.8 |
146
+ | | SQuALITY | 24.1 | 25.8 | 26.2 | 0.0 | 23.5 | 23.8 |
147
+ | Code Generation | HumanEval (0-shot) | 70.7 | 63.4 | 66.5 | 61.0 | 74.4 | 86.6 |
148
+ | | MBPP (3-shot) | 80.8 | 68.1 | 69.4 | 69.3 | 77.5 | 84.1 |
149
+ | **Average** | | **69.2** | **61.3** | **61.0** | **63.3** | **68.5** | **74.9** |
150
+
151
+ We take a closer look at different categories across 80 public benchmark datasets in the table below:
152
+ | Category | Phi-3.5-MoE-instruct | Mistral-Nemo-12B-instruct-2407 | Llama-3.1-8B-instruct | Gemma-2-9b-It | Gemini-1.5-Flash | GPT-4o-mini-2024-07-18 (Chat) |
153
+ |--|--|--|--|--|--|--|
154
+ | Popular aggregated benchmark | 62.6 | 51.9 | 50.3 | 56.7 | 64.5 | 73.9 |
155
+ | Reasoning | 78.7 | 72.2 | 70.5 | 75.4 | 77.7 | 80.0 |
156
+ | Language understanding | 71.8 | 67.0 | 62.9 | 72.8 | 66.6 | 76.8 |
157
+ | Robustness | 75.6 | 65.2 | 59.8 | 64.7 | 68.9 | 77.5 |
158
+ | Long context | 25.5 | 24.5 | 25.5 | 0.0 | 27.0 | 25.4 |
159
+ | Math | 74.1 | 57.7 | 65.0 | 67.9 | 60.2 | 80.8 |
160
+ | Code generation | 68.3 | 56.9 | 65.8 | 58.3 | 66.8 | 69.9 |
161
+ | Multilingual | 65.8 | 55.3 | 47.5 | 59.6 | 64.3 | 76.6 |
162
+
163
+ Overall, Phi-3.5-MoE with only **6.6B active parameters** achieves a similar level of language understanding and math as much larger models. Moreover, the model outperforms bigger models in reasoning capability and is behind only GPT-4o-mini. However, it is still fundamentally limited by its size for certain tasks. The model simply does not have the capacity to store too much factual knowledge; therefore, users may experience factual incorrectness. However, we believe such weakness can be resolved by augmenting Phi-3.5 with a search engine, particularly when using the model under RAG settings.
164
+
165
+ ### Multilingual
166
+
167
+ The table below highlights the multilingual capability of Phi-3.5-MoE on the multilingual MMLU, MEGA, and multilingual MMLU-Pro datasets. Overall, we observed that even with just 6.6B active parameters, the model is very competitive on multilingual tasks in comparison to other models with a much larger number of active parameters.
168
+
169
+ | Category | Phi-3.5-MoE-instruct | Mistral-Nemo-12B-instruct-2407 | Llama-3.1-8B-instruct | Gemma-2-9b-It | Gemini-1.5-Flash | GPT-4o-mini-2024-07-18 (Chat) |
170
+ |--|--|--|--|--|--|--|
171
+ | Multilingual MMLU | 69.9 | 58.9 | 56.2 | 63.8 | 77.2 | 72.9 |
172
+ | Multilingual MMLU-Pro | 45.3 | 34.0 | 21.4 | 43.0 | 57.9 | 53.2 |
173
+ | MGSM | 58.7 | 63.3 | 56.7 | 75.1 | 75.8 | 81.7 |
174
+ | MEGA MLQA | 65.3 | 61.2 | 45.2 | 54.4 | 61.6 | 70.0 |
175
+ | MEGA TyDi QA | 67.1 | 63.7 | 54.5 | 65.6 | 63.6 | 81.8 |
176
+ | MEGA UDPOS | 60.4 | 58.2 | 54.1 | 56.6 | 62.4 | 66.0 |
177
+ | MEGA XCOPA | 76.6 | 10.8 | 21.1 | 31.2 | 95.0 | 90.3 |
178
+ | MEGA XStoryCloze | 82.8 | 92.3 | 71.0 | 87.0 | 20.7 | 96.6 |
179
+ | **Average** | **65.8** | **55.3** | **47.5** | **59.6** | **64.3** | **76.6** |
180
+
181
+ ### Long Context
182
+
183
+ Phi-3.5-MoE supports 128K context length, therefore the model is capable of several long context tasks including long document/meeting summarization, long document QA, and multilingual context retrieval. We see that Phi-3.5 is clearly better than the Gemma-2 family, which only supports 8K context length. Phi-3.5-MoE-instruct is very competitive with other much larger open-weight models such as Llama-3.1-8B-instruct, and Mistral-Nemo-12B-instruct-2407.
184
+
185
+ | Benchmark | Phi-3.5-MoE-instruct | Mistral-Nemo-12B-instruct-2407 | Llama-3.1-8B-instruct | Gemini-1.5-Flash | GPT-4o-mini-2024-07-18 (Chat) |
186
+ |--|--|--|--|--|--|
187
+ | GovReport | 26.4 | 25.6 | 25.1 | 27.8 | 24.8 |
188
+ | QMSum | 19.9 | 22.1 | 21.6 | 24.0 | 21.7 |
189
+ | Qasper | 40.0 | 30.7 | 37.2 | 43.5 | 39.8 |
190
+ | SQuALITY | 24.1 | 25.8 | 26.2 | 23.5 | 23.8 |
191
+ | SummScreenFD | 16.9 | 18.2 | 17.6 | 16.3 | 17.0 |
192
+ | **Average** | **25.5** | **24.5** | **25.5** | **27.0** | **25.4** |
193
+
194
+ RULER: a retrieval-based benchmark for long context understanding
195
+ | Model | 4K | 8K | 16K | 32K | 64K | 128K | Average |
196
+ |--|--|--|--|--|--|--|--|
197
+ | Phi-3.5-MoE-instruct | 94.8 | 93 | 93.2 | 91.6 | 85.7 | 64.2 | **87.1** |
198
+ | Llama-3.1-8B-instruct | 95.5 | 93.8 | 91.6 | 87.4 | 84.7 | 77.0 | **88.3** |
199
+ | Mistral-Nemo-12B-instruct-2407 | 87.8 | 87.2 | 87.7 | 69.0 | 46.8 | 19.0 | **66.2** |
200
+
201
+ RepoQA: a benchmark for long context code understanding
202
+ | Model | Python | C++ | Rust | Java | TypeScript | Average |
203
+ |--|--|--|--|--|--|--|
204
+ | Phi-3.5-MoE-instruct | 89 | 74 | 81 | 88 | 95 | **85** |
205
+ | Llama-3.1-8B-instruct | 80 | 65 | 73 | 76 | 63 | **71** |
206
+ | Mistral-7B-instruct-v0.3 | 61 | 57 | 51 | 61 | 80 | **62** |
207
+
208
+ ## Training
209
+
210
+ ### Model
211
+
212
+ **Architecture:** Phi-3.5-MoE has 16x3.8B parameters with **6.6B active parameters** when using 2 experts. The model is a mixture-of-expert decoder-only Transformer model using the tokenizer with vocabulary size of 32,064.<br>
213
+ **Inputs:** Text. It is best suited for prompts using chat format.<br>
214
+ **Context length:** 128K tokens<br>
215
+ **GPUs:** 512 H100-80G<br>
216
+ **Training time:** 23 days<br>
217
+ **Training data:** 4.9T tokens<br>
218
+ **Outputs:** Generated text in response to the input<br>
219
+ **Dates:** Trained between April and August 2024<br>
220
+ **Status:** This is a static model trained on an offline dataset with cutoff date October 2023 for publicly available data. Future versions of the tuned models may be released as we improve models.<br>
221
+ **Supported languages:** Arabic, Chinese, Czech, Danish, Dutch, English, Finnish, French, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian<br>
222
+ **Release date:** August 2024<br>
223
+
224
+ ### Training Datasets
225
+ Our training data includes a wide variety of sources, totaling 4.9 trillion tokens (including 10% multilingual), and is a combination of
226
+ 1) publicly available documents filtered rigorously for quality, selected high-quality educational data, and code;
227
+ 2) newly created synthetic, “textbook-like” data for the purpose of teaching math, coding, common sense reasoning, general knowledge of the world (science, daily activities, theory of mind, etc.);
228
+ 3) high quality chat format supervised data covering various topics to reflect human preferences on different aspects such as instruct-following, truthfulness, honesty and helpfulness.
229
+
230
+ We are focusing on the quality of data that could potentially improve the reasoning ability for the model, and we filter the publicly available documents to contain the correct level of knowledge. As an example, the result of a game in premier league in a particular day might be good training data for frontier models, but we need to remove such information to leave more model capacity for reasoning for the small size models. More details about data can be found in the [Phi-3 Technical Report](https://arxiv.org/pdf/2404.14219).
231
+
232
+ ## Responsible AI Considerations
233
+
234
+ Like other language models, the Phi family of models can potentially behave in ways that are unfair, unreliable, or offensive. Some of the limiting behaviors to be aware of include:
235
+ * Quality of Service: The Phi models are trained primarily on English text and some additional multilingual text. Languages other than English will experience worse performance as well as performance disparities across non-English. English language varieties with less representation in the training data might experience worse performance than standard American English.
236
+ * Multilingual performance and safety gaps: We believe it is important to make language models more widely available across different languages, but the Phi 3 models still exhibit challenges common across multilingual releases. As with any deployment of LLMs, developers will be better positioned to test for performance or safety gaps for their linguistic and cultural context and customize the model with additional fine-tuning and appropriate safeguards.
237
+ * Representation of Harms & Perpetuation of Stereotypes: These models can over- or under-represent groups of people, erase representation of some groups, or reinforce demeaning or negative stereotypes. Despite safety post-training, these limitations may still be present due to differing levels of representation of different groups, cultural contexts, or prevalence of examples of negative stereotypes in training data that reflect real-world patterns and societal biases.
238
+ * Inappropriate or Offensive Content: These models may produce other types of inappropriate or offensive content, which may make it inappropriate to deploy for sensitive contexts without additional mitigations that are specific to the use case.
239
+ * Information Reliability: Language models can generate nonsensical content or fabricate content that might sound reasonable but is inaccurate or outdated.
240
+ * Limited Scope for Code: The majority of Phi-3 training data is based in Python and uses common packages such as "typing, math, random, collections, datetime, itertools". If the model generates Python scripts that utilize other packages or scripts in other languages, we strongly recommend users manually verify all API uses.
241
+ * Long Conversation: Phi-3 models, like other models, can in some cases generate responses that are repetitive, unhelpful, or inconsistent in very long chat sessions in both English and non-English languages. Developers are encouraged to place appropriate mitigations, like limiting conversation turns, to account for the possible conversational drift.
242
+
243
+ Developers should apply responsible AI best practices, including mapping, measuring, and mitigating risks associated with their specific use case and cultural, linguistic context. Phi-3 family of models are general purpose models. As developers plan to deploy these models for specific use cases, they are encouraged to fine-tune the models for their use case and leverage the models as part of broader AI systems with language-specific safeguards in place. Important areas for consideration include:
244
+ * Allocation: Models may not be suitable for scenarios that could have consequential impact on legal status or the allocation of resources or life opportunities (ex: housing, employment, credit, etc.) without further assessments and additional debiasing techniques.
245
+ * High-Risk Scenarios: Developers should assess the suitability of using models in high-risk scenarios where unfair, unreliable or offensive outputs might be extremely costly or lead to harm. This includes providing advice in sensitive or expert domains where accuracy and reliability are critical (ex: legal or health advice). Additional safeguards should be implemented at the application level according to the deployment context.
246
+ * Misinformation: Models may produce inaccurate information. Developers should follow transparency best practices and inform end-users they are interacting with an AI system. At the application level, developers can build feedback mechanisms and pipelines to ground responses in use-case specific, contextual information, a technique known as Retrieval Augmented Generation (RAG).
247
+ * Generation of Harmful Content: Developers should assess outputs for their context and use available safety classifiers or custom solutions appropriate for their use case.
248
+ * Misuse: Other forms of misuse such as fraud, spam, or malware production may be possible, and developers should ensure that their applications do not violate applicable laws and regulations.
249
+
250
+ ## Safety Evaluation and Red-Teaming
251
+
252
+ We leveraged various evaluation techniques including red teaming, adversarial conversation simulations, and multilingual safety evaluation benchmark datasets to
253
+ evaluate Phi-3.5 models' propensity to produce undesirable outputs across multiple languages and risk categories.
254
+ Several approaches were used to compensate for the limitations of one approach alone. Findings across the various evaluation methods indicate that safety
255
+ post-training that was done as detailed in the [Phi-3 Safety Post-Training paper](https://arxiv.org/pdf/2407.13833) had a positive impact across multiple languages and risk categories as observed by
256
+ refusal rates (refusal to output undesirable outputs) and robustness to jailbreak techniques. Note, however, while comprehensive red team evaluations were conducted
257
+ across all models in the prior release of Phi models, red teaming was largely focused on Phi-3.5 MOE across multiple languages and risk categories for this release as
258
+ it is the largest and most capable model of the three models. Details on prior red team evaluations across Phi models can be found in the [Phi-3 Safety Post-Training paper](https://arxiv.org/pdf/2407.13833).
259
+ For this release, insights from red teaming indicate that the models may refuse to generate undesirable outputs in English, even when the request for undesirable output
260
+ is in another language. Models may also be more susceptible to longer multi-turn jailbreak techniques across both English and non-English languages. These findings
261
+ highlight the need for industry-wide investment in the development of high-quality safety evaluation datasets across multiple languages, including low resource languages,
262
+ and risk areas that account for cultural nuances where those languages are spoken.
263
+
264
+ ## Software
265
+ * [PyTorch](https://github.com/pytorch/pytorch)
266
+ * [Transformers](https://github.com/huggingface/transformers)
267
+ * [Flash-Attention](https://github.com/Dao-AILab/flash-attention)
268
+
269
+ ## Hardware
270
+ Note that by default, the Phi-3.5-MoE-instruct model uses flash attention, which requires certain types of GPU hardware to run. We have tested on the following GPU types:
271
+ * NVIDIA A100
272
+ * NVIDIA A6000
273
+ * NVIDIA H100
274
+
275
+ ## License
276
+ The model is licensed under the [MIT license](./LICENSE).
277
+
278
+ ## Trademarks
279
+ This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft’s Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party’s policies.
SECURITY.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.**
12
+
13
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14
+
15
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16
+
17
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18
+
19
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
+
21
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22
+ * Full paths of source file(s) related to the manifestation of the issue
23
+ * The location of the affected source code (tag/branch/commit or direct URL)
24
+ * Any special configuration required to reproduce the issue
25
+ * Step-by-step instructions to reproduce the issue
26
+ * Proof-of-concept or exploit code (if possible)
27
+ * Impact of the issue, including how an attacker might exploit the issue
28
+
29
+ This information will help us triage your report more quickly.
30
+
31
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32
+
33
+ ## Preferred Languages
34
+
35
+ We prefer all communications to be in English.
36
+
37
+ ## Policy
38
+
39
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40
+
41
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
added_tokens.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 32000,
3
+ "<|assistant|>": 32001,
4
+ "<|placeholder1|>": 32002,
5
+ "<|placeholder2|>": 32003,
6
+ "<|placeholder3|>": 32004,
7
+ "<|placeholder4|>": 32005,
8
+ "<|system|>": 32006,
9
+ "<|end|>": 32007,
10
+ "<|placeholder5|>": 32008,
11
+ "<|placeholder6|>": 32009,
12
+ "<|user|>": 32010
13
+ }
config.json ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Phi-3.5-MoE-instruct",
3
+ "architectures": [
4
+ "PhiMoEForCausalLM"
5
+ ],
6
+ "attention_bias": true,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_phimoe.PhiMoEConfig",
10
+ "AutoModelForCausalLM": "modeling_phimoe.PhiMoEForCausalLM"
11
+ },
12
+ "bos_token_id": 1,
13
+ "eos_token_id": 32000,
14
+ "hidden_act": "silu",
15
+ "hidden_dropout": 0.0,
16
+ "hidden_size": 4096,
17
+ "initializer_range": 0.02,
18
+ "input_jitter_noise": 0.01,
19
+ "intermediate_size": 6400,
20
+ "lm_head_bias": true,
21
+ "max_position_embeddings": 131072,
22
+ "model_type": "phimoe",
23
+ "num_attention_heads": 32,
24
+ "num_experts_per_tok": 2,
25
+ "num_hidden_layers": 32,
26
+ "num_key_value_heads": 8,
27
+ "num_local_experts": 16,
28
+ "original_max_position_embeddings": 4096,
29
+ "output_router_logits": false,
30
+ "rms_norm_eps": 1e-05,
31
+ "rope_scaling": {
32
+ "long_factor": [
33
+ 1.0199999809265137,
34
+ 1.0299999713897705,
35
+ 1.0399999618530273,
36
+ 1.0499999523162842,
37
+ 1.0499999523162842,
38
+ 1.0499999523162842,
39
+ 1.059999942779541,
40
+ 1.059999942779541,
41
+ 1.059999942779541,
42
+ 1.059999942779541,
43
+ 1.059999942779541,
44
+ 1.059999942779541,
45
+ 1.0999999046325684,
46
+ 1.1799999475479126,
47
+ 1.1799999475479126,
48
+ 1.3700000047683716,
49
+ 1.4899998903274536,
50
+ 2.109999895095825,
51
+ 2.8899998664855957,
52
+ 3.9499998092651367,
53
+ 4.299999713897705,
54
+ 6.429999828338623,
55
+ 8.09000015258789,
56
+ 10.690000534057617,
57
+ 12.050000190734863,
58
+ 18.229999542236328,
59
+ 18.84000015258789,
60
+ 19.899999618530273,
61
+ 21.420000076293945,
62
+ 26.200000762939453,
63
+ 34.28000259399414,
64
+ 34.590003967285156,
65
+ 38.730003356933594,
66
+ 40.22000503540039,
67
+ 42.54000473022461,
68
+ 44.000003814697266,
69
+ 47.590003967285156,
70
+ 54.750003814697266,
71
+ 56.19000244140625,
72
+ 57.44000244140625,
73
+ 57.4900016784668,
74
+ 61.20000076293945,
75
+ 61.540000915527344,
76
+ 61.75,
77
+ 61.779998779296875,
78
+ 62.06999969482422,
79
+ 63.11000061035156,
80
+ 63.43000030517578,
81
+ 63.560001373291016,
82
+ 63.71000289916992,
83
+ 63.92000198364258,
84
+ 63.94000244140625,
85
+ 63.94000244140625,
86
+ 63.96000289916992,
87
+ 63.980003356933594,
88
+ 64.0300064086914,
89
+ 64.0300064086914,
90
+ 64.0300064086914,
91
+ 64.04000854492188,
92
+ 64.10000610351562,
93
+ 64.19000244140625,
94
+ 64.20999908447266,
95
+ 64.75,
96
+ 64.95999908447266
97
+ ],
98
+ "long_mscale": 1.243163121016122,
99
+ "original_max_position_embeddings": 4096,
100
+ "short_factor": [
101
+ 1.0,
102
+ 1.0399999618530273,
103
+ 1.0399999618530273,
104
+ 1.0399999618530273,
105
+ 1.0499999523162842,
106
+ 1.0499999523162842,
107
+ 1.0499999523162842,
108
+ 1.0499999523162842,
109
+ 1.0499999523162842,
110
+ 1.0499999523162842,
111
+ 1.0499999523162842,
112
+ 1.0499999523162842,
113
+ 1.0499999523162842,
114
+ 1.0499999523162842,
115
+ 1.059999942779541,
116
+ 1.059999942779541,
117
+ 1.0699999332427979,
118
+ 1.0699999332427979,
119
+ 1.0699999332427979,
120
+ 1.0699999332427979,
121
+ 1.1399999856948853,
122
+ 1.159999966621399,
123
+ 1.159999966621399,
124
+ 1.159999966621399,
125
+ 1.159999966621399,
126
+ 1.1799999475479126,
127
+ 1.1999999284744263,
128
+ 1.3199999332427979,
129
+ 1.3399999141693115,
130
+ 1.3499999046325684,
131
+ 1.3999998569488525,
132
+ 1.4799998998641968,
133
+ 1.4999998807907104,
134
+ 1.589999794960022,
135
+ 1.6499998569488525,
136
+ 1.71999990940094,
137
+ 1.8999998569488525,
138
+ 1.9099998474121094,
139
+ 1.9099998474121094,
140
+ 1.9899998903274536,
141
+ 1.9999998807907104,
142
+ 1.9999998807907104,
143
+ 2.009999990463257,
144
+ 2.009999990463257,
145
+ 2.009999990463257,
146
+ 2.009999990463257,
147
+ 2.009999990463257,
148
+ 2.009999990463257,
149
+ 2.009999990463257,
150
+ 2.009999990463257,
151
+ 2.009999990463257,
152
+ 2.009999990463257,
153
+ 2.009999990463257,
154
+ 2.009999990463257,
155
+ 2.009999990463257,
156
+ 2.009999990463257,
157
+ 2.009999990463257,
158
+ 2.009999990463257,
159
+ 2.009999990463257,
160
+ 2.0999999046325684,
161
+ 2.319999933242798,
162
+ 2.419999837875366,
163
+ 2.5899999141693115,
164
+ 2.7899999618530273
165
+ ],
166
+ "short_mscale": 1.243163121016122,
167
+ "type": "longrope"
168
+ },
169
+ "rope_theta": 10000.0,
170
+ "router_aux_loss_coef": 0.0,
171
+ "router_jitter_noise": 0.01,
172
+ "sliding_window": 131072,
173
+ "tie_word_embeddings": false,
174
+ "torch_dtype": "bfloat16",
175
+ "transformers_version": "4.43.3",
176
+ "use_cache": true,
177
+ "vocab_size": 32064
178
+ }
configuration_phimoe.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ PyTorch Phi-MoE model."""
17
+
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
+
23
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)


# Maps a pretrained checkpoint identifier to the URL of its hosted `config.json`.
PHIMOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/Phi-3.5-MoE-instruct": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/resolve/main/config.json",
}
29
+
30
class PhiMoEConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PhiMoEModel`]. It is used to instantiate a Phi-MoE
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the
    [microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Note that the defaults below are the defaults of this class; the released checkpoint's `config.json` overrides
    several of them (for example `rope_theta=10000.0`, `router_aux_loss_coef=0.0`, `sliding_window=131072`,
    `eos_token_id=32000`, `attention_bias=True`, `lm_head_bias=True` and `input_jitter_noise=0.01`).


    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the PhiMoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`PhiMoEModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6400):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed explicitly, it falls back to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain exactly the following six keys: `type`, `short_factor`, `long_factor`, `short_mscale`,
            `long_mscale` and `original_max_position_embeddings`. The `type` must be `longrope`, the `short_mscale`
            and `long_mscale` must be numbers, the `short_factor` and `long_factor` must be lists of numbers with the
            same length as half of the attention head size and the `original_max_position_embeddings` must be an
            integer.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. Defaults to `None`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 16):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.01):
            Amount of noise to add to the router.
        input_jitter_noise (`float`, *optional*, defaults to 0.0):
            Stored as `config.input_jitter_noise` for the modeling code (presumably the amount of jitter noise
            applied to the expert inputs; confirm against the modeling file).
        attention_bias (`bool`, *optional*, defaults to `False`):
            Stored as `config.attention_bias` for the modeling code (presumably toggles bias terms in the attention
            projections; confirm against the modeling file).
        lm_head_bias (`bool`, *optional*, defaults to `False`):
            Stored as `config.lm_head_bias` for the modeling code (presumably toggles the bias of the language
            modeling head; confirm against the modeling file).

    ```python
    >>> from transformers import PhiMoEModel, PhiMoEConfig

    >>> # Initializing a Phi-3 style configuration
    >>> configuration = PhiMoEConfig.from_pretrained("microsoft/Phi-3.5-MoE-instruct")

    >>> # Initializing a model from the configuration
    >>> model = PhiMoEModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "phimoe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32064,
        hidden_size=4096,
        intermediate_size=6400,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        rope_scaling=None,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=16,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        router_jitter_noise=0.01,
        input_jitter_noise=0.0,
        attention_bias=False,
        lm_head_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.attention_bias = attention_bias
        self.lm_head_bias = lm_head_bias
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise
        self.input_jitter_noise = input_jitter_noise

        # Validation reads hidden_size / num_attention_heads, so it must run after they are set.
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: if `rope_scaling` is not `None` and is not a six-entry dictionary with a `longrope` type,
                numeric `short_factor`/`long_factor` lists of length `head_dim // 2`, numeric
                `short_mscale`/`long_mscale` values and an integer `original_max_position_embeddings`.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 6:
            raise ValueError(
                "`rope_scaling` must be a dictionary with six fields, `type`, `short_factor`, `long_factor`, "
                f"`short_mscale`, `long_mscale` and `original_max_position_embeddings`, got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
        rope_scaling_short_mscale = self.rope_scaling.get("short_mscale", None)
        rope_scaling_long_mscale = self.rope_scaling.get("long_mscale", None)
        original_max_position_embeddings = self.rope_scaling.get("original_max_position_embeddings", None)
        # One rescale factor per rotary frequency, i.e. half of the per-head dimension.
        expected_factor_count = self.hidden_size // self.num_attention_heads // 2
        if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}")
        if not (
            isinstance(rope_scaling_short_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
            )
        if not len(rope_scaling_short_factor) == expected_factor_count:
            raise ValueError(
                f"`rope_scaling`'s short_factor field must have length {expected_factor_count}, got {len(rope_scaling_short_factor)}"
            )
        if not (
            isinstance(rope_scaling_long_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
            )
        if not len(rope_scaling_long_factor) == expected_factor_count:
            raise ValueError(
                f"`rope_scaling`'s long_factor field must have length {expected_factor_count}, got {len(rope_scaling_long_factor)}"
            )
        if not isinstance(rope_scaling_short_mscale, (int, float)):
            raise ValueError(
                f"`rope_scaling`'s short_mscale field must be a number, got {rope_scaling_short_mscale}"
            )
        if not isinstance(rope_scaling_long_mscale, (int, float)):
            raise ValueError(
                f"`rope_scaling`'s long_mscale field must be a number, got {rope_scaling_long_mscale}"
            )
        if not isinstance(original_max_position_embeddings, int):
            raise ValueError(
                f"`rope_scaling`'s original_max_position_embeddings field must be an integer, got {original_max_position_embeddings}"
            )
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 32000,
6
+ 32001,
7
+ 32007
8
+ ],
9
+ "transformers_version": "4.43.3",
10
+ "pad_token_id": 32000
11
+ }
model-00001-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a2a38dddac2bec339a66038c1ce134c6b2d95dd9ae7a5c413e3de58bd9db05f
3
+ size 4992095880
model-00002-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afd870833e845abeafe3b113681ead38aa7793f6254a4ec0cf5ffc3ee228980d
3
+ size 4991605352
model-00003-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:975c103d00185c0a9a18428b07fe5ac43accb425688505eef5f0c3e6fe5feb32
3
+ size 4991605352
model-00004-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a66a9092c3db38fcfbab5f95ecaf44e6bd00d65ba51073696284db1685bd9d5e
3
+ size 4991605352
model-00005-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba2adfeab2a7ab223827bb2d897414dc2824c67702890b6f9eb993ac6ecd2d20
3
+ size 4991605360
model-00006-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:108066de41d72313733781039fa3f92d13770b53e509761c0a96f28f60eff04f
3
+ size 4991605448
model-00007-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e46cf6d73bf6aed908ac77b69907326d5dd53f73d44efa21e987658d6b910c5
3
+ size 4991605480
model-00008-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07e31965b1d580fbc731cfcf6f06ad165018705d322ee16c85db096d2dee9e0a
3
+ size 4991605480
model-00009-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae70cc917acd6f28e23bffef7c8f784e2ea2106a1cd954cc308fecd55a63db1
3
+ size 4991605480
model-00010-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d782af17c36e83d981aa7a07cb0e0063b8956855f66158bf44410774e0161c0
3
+ size 4991605480
model-00011-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:512fbacfba567133a119fc8754194651e492c9cfe04cac8adb64fb630cb38c57
3
+ size 4993558592
model-00012-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fa1d225920b80645b4018cc8fe87299f0ef5107e3352805827ddf490487d82
3
+ size 4958009392
model-00013-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1db195a922ebfde60f1e94194808ddccd73633571462b9afc00365d74bc74834
3
+ size 4991605472
model-00014-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06d87c9b7aabd0ba7d564c208322ed681eb03a0fa1133f2c91771371060adf5e
3
+ size 4991605472
model-00015-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b5d6e7ea18615f1aa6c8bd49dc96a3d3514bfbd16fcc68973806e08b293c28c
3
+ size 4991605472
model-00016-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc74d530dbfa2ba36335d8268705b4c7459eddb43891f084241c52edcb0d324c
3
+ size 4991605472
model-00017-of-00017.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1d4d7793e752a874b923bf988734eec919fe8f4317d4a124f30564c85ded637
3
+ size 3912021632
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_phimoe.py ADDED
@@ -0,0 +1,1800 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ PyTorch PhiMoE model."""
17
+ import inspect
18
+ import math
19
+ import warnings
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.nn.functional as F
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache, DynamicCache
30
+ from transformers.modeling_attn_mask_utils import (
31
+ _prepare_4d_causal_attention_mask,
32
+ _prepare_4d_causal_attention_mask_for_sdpa,
33
+ )
34
+ from transformers.modeling_outputs import (
35
+ MoeCausalLMOutputWithPast,
36
+ MoeModelOutputWithPast,
37
+ SequenceClassifierOutputWithPast,
38
+ )
39
+ from transformers.modeling_utils import PreTrainedModel
40
+ from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
41
+ from transformers.utils import (
42
+ add_start_docstrings,
43
+ add_start_docstrings_to_model_forward,
44
+ is_flash_attn_2_available,
45
+ is_flash_attn_greater_or_equal_2_10,
46
+ logging,
47
+ replace_return_docstrings,
48
+ )
49
+ from transformers.utils.import_utils import is_torch_fx_available
50
+ from .configuration_phimoe import PhiMoEConfig
51
+
52
+ from einops import rearrange
53
+ from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
54
+
55
+
56
# Import the FlashAttention-2 entry points only when the package is reported as available.
# NOTE(review): the nesting below is reconstructed from a whitespace-stripped view; the
# `_flash_supports_window_size` assignment is kept inside the guard because it references
# `flash_attn_func`, which is only bound by the guarded import — confirm against the raw file.
if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    # True when the installed `flash_attn_func` signature has a `window_size` parameter.
    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)

# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# Name of the configuration class, kept as a string for docstring decorators.
_CONFIG_FOR_DOC = "PhiMoEConfig"
74
+
75
+
76
def load_balancing_loss_func(
    gate_logits: Optional[Tuple[torch.Tensor, ...]],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits (`Tuple[torch.Tensor]`, *optional*):
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts]. Anything that is not a non-empty tuple
            (including `None` or a bare tensor) yields a loss of `0`.
        num_experts (`int`, *optional*):
            Number of experts. When `None`, it is inferred from the last dimension of the gate logits.
        top_k (`int`, *optional*, defaults to 2):
            Number of experts selected per token when building the routing mask.
        attention_mask (`torch.Tensor`, None):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss as a scalar tensor, or the integer `0` when no usable gate logits are given.
    """
    # `not gate_logits` also covers the empty tuple, which would otherwise fail on `gate_logits[0]`.
    if gate_logits is None or not isinstance(gate_logits, tuple) or not gate_logits:
        return 0

    # Layers may live on different devices; gather everything on the first layer's device.
    compute_device = gate_logits[0].device
    concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    if num_experts is None:
        # The declared default would otherwise break `one_hot` and the final scaling.
        num_experts = concatenated_gate_logits.shape[-1]

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        # Recover how many layers were concatenated along dim 0.
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts
150
+
151
+
152
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
153
+ def _get_unpad_data(attention_mask):
154
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
155
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
156
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
157
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
158
+ return (
159
+ indices,
160
+ cu_seqlens,
161
+ max_seqlen_in_batch,
162
+ )
163
+
164
+
165
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->PhiMoE
166
+ ##https://dl.acm.org/doi/pdf/10.5555/3454287.3455397 The following is the implementation of layernorm
167
+
168
+
169
class PhiMoERotaryEmbedding(nn.Module):
    """
    Rotary position embedding that caches cosine/sine tables.

    The tables are built for `max_position_embeddings` positions at construction time and are
    rebuilt whenever `forward` is asked for more positions than are currently cached.

    Args:
        dim: Size of the rotary dimension; `dim // 2` inverse frequencies are generated.
        max_position_embeddings: Number of positions to pre-compute.
        base: Base of the geometric progression used for the inverse frequencies.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        # Non-persistent: recomputed from the config instead of being stored in checkpoints.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the `cos_cached` / `sin_cached` buffers for `seq_len` positions."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """
        Return the `(cos, sin)` tables for the first `seq_len` positions, cast to `x.dtype`.

        Args:
            x: Tensor of shape [bs, num_attention_heads, seq_len, head_size]; used for dtype/device
                and, when `seq_len` is omitted, for the sequence length.
            seq_len: Number of positions to return. Defaults to `x.shape[-2]`.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The declared default previously failed on `None > int`; fall back to x's sequence axis.
            seq_len = x.shape[-2]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
203
+
204
+
205
class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
    """Rotary embedding whose per-frequency rescale factors depend on the sequence length.

    The ``rope_scaling`` mapping on ``config`` supplies a "short" and a "long" set of
    rescale factors and magnitude scales; ``forward`` picks one set by comparing the
    requested length against ``original_max_position_embeddings``.
    """

    def __init__(self, dim, config):
        super().__init__()
        scaling = config.rope_scaling

        self.dim = dim
        self.max_position_embeddings = config.max_position_embeddings
        self.base = config.rope_theta
        self.short_factor = scaling["short_factor"]
        self.long_factor = scaling["long_factor"]
        self.short_mscale = scaling["short_mscale"]
        self.long_mscale = scaling["long_mscale"]
        self.original_max_position_embeddings = scaling["original_max_position_embeddings"]

    def forward(self, x, seq_len=None):
        """Compute ``(cos, sin)`` tables of shape ``(seq_len, dim)`` in ``x.dtype``.

        Args:
            x: Reference tensor; supplies device, dtype and (if ``seq_len`` is None)
                the sequence length via ``x.shape[-2]``.
            seq_len: Number of positions to generate.
        """
        if seq_len is None:
            seq_len = x.shape[-2]

        # Long sequences use the "long" factor set, everything else the "short" one.
        is_long = seq_len > self.original_max_position_embeddings
        chosen_factors, mscale = (
            (self.long_factor, self.long_mscale) if is_long else (self.short_factor, self.short_mscale)
        )
        rescale_factors = torch.tensor(chosen_factors, dtype=torch.float32, device=x.device)
        assert rescale_factors.shape == (self.dim // 2, ), \
            f"misaligned shape for LongRoPE rescale factors: {rescale_factors.shape}"

        exponents = torch.arange(0, self.dim, 2).float().to(x.device) / self.dim
        inv_freq = 1.0 / (rescale_factors * (self.base ** exponents))

        positions = torch.arange(seq_len, device=x.device, dtype=torch.float32)
        angles = torch.outer(positions, inv_freq)

        # Duplicate the angles so the table spans the full rotary dimension.
        table = torch.cat((angles, angles), dim=-1)
        cos_table = (table.cos() * mscale).to(x.dtype)
        sin_table = (table.sin() * mscale).to(x.dtype)
        return cos_table, sin_table
238
+
239
+
240
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
241
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    split_at = x.shape[-1] // 2
    front = x[..., :split_at]
    back = x[..., split_at:]
    return torch.cat((-back, front), dim=-1)
246
+
247
+
248
+
249
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Rotate the query and key tensors with the rotary position embedding tables.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): Cosine table, indexed by position.
        sin (`torch.Tensor`): Sine table, indexed by position.
        position_ids (`torch.Tensor`):
            Positions used to index `cos` and `sin`. Offset positions can be passed here,
            for example when decoding with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into `cos[position_ids]` / `sin[position_ids]` so they broadcast
            against `q` and `k`. With tables gathered to [batch_size, seq_len, head_dim], use 1 when
            q/k are [batch_size, heads, seq_len, head_dim] and 2 when they are
            [batch_size, seq_len, heads, head_dim].

    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
    """
    cos_at_positions = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_at_positions = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same formula for queries and keys: x * cos + rotate_half(x) * sin.
        return (states * cos_at_positions) + (rotate_half(states) * sin_at_positions)

    return _rotate(q), _rotate(k)
275
+
276
+
277
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
278
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Duplicate every key/value head `n_rep` times along the head axis, matching
    torch.repeat_interleave(x, dim=1, repeats=n_rep). Input is (batch, num_key_value_heads, seqlen, head_dim);
    output is (batch, num_key_value_heads * n_rep, seqlen, head_dim). With n_rep == 1 the input is returned as is.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states

    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the heads.
    with_repeat_axis = hidden_states[:, :, None, :, :]
    broadcast = with_repeat_axis.expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return broadcast.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
288
+
289
+
290
+
291
class PhiMoEAttention(nn.Module):
    """
    Eager multi-head attention with rotary position embeddings and grouped key/value heads.

    Queries use `num_attention_heads` heads while keys/values use `num_key_value_heads` heads that are
    repeated to match. Scores are computed with explicit matmul + softmax, so attention weights can be returned.
    """

    def __init__(self, config: PhiMoEConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        # Projection widths: full head count for queries/outputs, reduced head count for keys/values.
        query_width = self.num_heads * self.head_dim
        kv_width = self.num_key_value_heads * self.head_dim
        use_bias = self.config.attention_bias
        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=use_bias)
        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=use_bias)
        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=use_bias)
        self.o_proj = nn.Linear(query_width, self.hidden_size, bias=use_bias)

        # Plain RoPE when no scaling is configured; otherwise only "longrope" is accepted.
        if getattr(config, 'rope_scaling', None) is None:
            self.rotary_emb = PhiMoERotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            if scaling_type != "longrope":
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
            self.rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View `tensor` as (bsz, seq_len, num_heads, head_dim) and move the head axis in front of the sequence axis."""
        per_head = tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run eager attention over `hidden_states`.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is None unless
            `output_attentions` is True.

        Raises:
            ValueError: if a cache is supplied without `layer_idx`, or if the attention scores,
                the attention mask, or the attention output do not have the expected shape.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        def _split_heads(projected, n_heads):
            # (bsz, q_len, n_heads * head_dim) -> (bsz, n_heads, q_len, head_dim)
            return projected.view(bsz, q_len, n_heads, self.head_dim).transpose(1, 2)

        query_states = _split_heads(self.q_proj(hidden_states), self.num_heads)
        key_states = _split_heads(self.k_proj(hidden_states), self.num_key_value_heads)
        value_states = _split_heads(self.v_proj(hidden_states), self.num_key_value_heads)

        has_cache = past_key_value is not None
        kv_seq_len = key_states.shape[-2]
        if has_cache:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if has_cache:
            rope_cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, rope_cache_kwargs
            )

        # Expand the key/value heads so they line up one-to-one with the query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        expected_scores_size = (bsz, self.num_heads, q_len, kv_seq_len)
        if attn_weights.size() != expected_scores_size:
            raise ValueError(
                f"Attention weights should be of size {expected_scores_size}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            expected_mask_size = (bsz, 1, q_len, kv_seq_len)
            if attention_mask.size() != expected_mask_size:
                raise ValueError(
                    f"Attention mask should be of size {expected_mask_size}, but is {attention_mask.size()}"
                )

            # The mask is added to the raw scores before the softmax.
            attn_weights = attn_weights + attention_mask

        # Softmax is evaluated in fp32, then cast back to the query dtype.
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        expected_output_size = (bsz, self.num_heads, q_len, self.head_dim)
        if attn_output.size() != expected_output_size:
            raise ValueError(
                f"`attn_output` should be of size {expected_output_size}, but is"
                f" {attn_output.size()}"
            )

        # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, hidden_size).
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        returned_weights = attn_weights if output_attentions else None
        return attn_output, returned_weights, past_key_value
428
+
429
+
430
+
431
class PhiMoEFlashAttention2(PhiMoEAttention):
    """
    PhiMoE flash attention module. This module inherits from `PhiMoEAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ):
        """Run attention through the flash-attn kernels.

        Returns:
            `(attn_output, None, past_key_value)`. The second element is always None: this path
            never materialises attention weights, regardless of `output_attentions`.

        Raises:
            ValueError: if a cache is supplied without `layer_idx`, or if the sliced past keys do not
                have `sliding_window - 1` positions.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        # Because the input can be padded, the absolute sequence length depends on the max position id.
        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item() + 1)
        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        use_sliding_windows = (
            _flash_supports_window_size
            and getattr(self.config, "sliding_window", None) is not None
            and kv_seq_len > self.config.sliding_window
        )

        if not _flash_supports_window_size:
            logger.warning_once(
                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
                " make sure to upgrade flash-attn library."
            )

        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
            if (
                getattr(self.config, "sliding_window", None) is not None
                and kv_seq_len > self.config.sliding_window
                and cache_has_contents
            ):
                slicing_tokens = 1 - self.config.sliding_window

                past_key = past_key_value[self.layer_idx][0]
                past_value = past_key_value[self.layer_idx][1]

                # NOTE(review): the sliced tensors below are only used for the shape check; they are
                # not written back into `past_key_value` in this block, so the cache itself is not
                # trimmed here. Confirm whether that is intended before relying on it.
                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.config.sliding_window - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the layout passed to Flash Attention: (batch, seq_len, heads, head_dim)
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            use_sliding_windows=use_sliding_windows,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        # Attention weights are never computed on this path. Binding the name unconditionally
        # avoids the UnboundLocalError the previous code raised when output_attentions=True.
        attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
        use_sliding_windows=False,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            query_length (`int`):
                Length of the query sequence; used to pick the unpadding strategy and to re-pad the output.
            dropout (`float`):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
            use_sliding_windows (`bool`, *optional*):
                Whether to activate sliding window attention.
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Keyword arguments shared by both kernels; `window_size` is only passed when sliding
        # windows are active, exactly as the separate call sites did before.
        kernel_kwargs = {"softmax_scale": softmax_scale, "causal": causal}
        if use_sliding_windows:
            kernel_kwargs["window_size"] = (self.config.sliding_window, 0)

        # No mask: the dense kernel can be used directly.
        if attention_mask is None:
            return flash_attn_func(
                query_states,
                key_states,
                value_states,
                dropout,
                **kernel_kwargs,
            )

        # A mask was given: unpad, run the variable-length kernel, then pad back.
        batch_size = query_states.shape[0]
        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
            query_states, key_states, value_states, attention_mask, query_length
        )

        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            **kernel_kwargs,
        )

        return pad_input(attn_output_unpad, indices_q, batch_size, query_length)

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """Strip padded positions from q/k/v and return the index data needed by the varlen kernel.

        Returns:
            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`.
        """
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape

        # On the first iteration we need to properly re-create the padding mask
        # by slicing it on the proper place
        if kv_seq_len != attention_mask.shape[-1]:
            attention_mask_num_tokens = attention_mask.shape[-1]
            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]

        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)

        if query_length == kv_seq_len:
            # Queries and keys cover the same positions, so the key indices can be reused.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # One query token per batch row: each row is its own length-1 sequence.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
723
+
724
+
725
+
726
class PhiMoESdpaAttention(PhiMoEAttention):
    """
    PhiMoE attention backed by `torch.nn.functional.scaled_dot_product_attention`. The parameters are those of
    `PhiMoEAttention`; only the forward pass differs, and it delegates to the parent implementation when
    attention weights are requested.
    """

    # Adapted from PhiMoEAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention with SDPA.

        Returns:
            `(attn_output, None, past_key_value)` on the SDPA path, or whatever the parent
            `forward` returns when `output_attentions` is True.

        Raises:
            ValueError: if `attention_mask` is given and is not of size (bsz, 1, q_len, kv_seq_len).
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "PhiMoEModel is using PhiMoESdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        def _split_heads(projected, n_heads):
            # (bsz, q_len, n_heads * head_dim) -> (bsz, n_heads, q_len, head_dim)
            return projected.view(bsz, q_len, n_heads, self.head_dim).transpose(1, 2)

        query_states = _split_heads(self.q_proj(hidden_states), self.num_heads)
        key_states = _split_heads(self.k_proj(hidden_states), self.num_key_value_heads)
        value_states = _split_heads(self.v_proj(hidden_states), self.num_key_value_heads)

        has_cache = past_key_value is not None
        kv_seq_len = key_states.shape[-2]
        if has_cache:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if has_cache:
            rope_cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, rope_cache_kwargs
            )

        # Expand the key/value heads so they line up with the query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        mask_given = attention_mask is not None
        if mask_given:
            expected_mask_size = (bsz, 1, q_len, kv_seq_len)
            if attention_mask.size() != expected_mask_size:
                raise ValueError(
                    f"Attention mask should be of size {expected_mask_size}, but is {attention_mask.size()}"
                )

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and mask_given:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        dropout_p = self.attention_dropout if self.training else 0.0
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        causal_without_mask = self.is_causal and attention_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=dropout_p,
            is_causal=causal_without_mask,
        )

        # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, hidden_size).
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
811
+
812
+
813
# Lookup table from attention backend key to the attention module class defined above.
PHIMOE_ATTENTION_CLASSES = dict(
    eager=PhiMoEAttention,
    flash_attention_2=PhiMoEFlashAttention2,
    sdpa=PhiMoESdpaAttention,
)
818
+
819
+
820
class PhiMoEBlockSparseTop2MLP(nn.Module):
    """Gated feed-forward block: `w2(act_fn(w1(x)) * w3(x))`, with three bias-free linear layers."""

    def __init__(self, config: PhiMoEConfig):
        super().__init__()
        hidden_dim = config.hidden_size
        ffn_dim = config.intermediate_size
        self.ffn_dim = ffn_dim
        self.hidden_dim = hidden_dim

        # w1 / w3 project up to the intermediate size, w2 projects back down.
        self.w1 = nn.Linear(hidden_dim, ffn_dim, bias=False)
        self.w2 = nn.Linear(ffn_dim, hidden_dim, bias=False)
        self.w3 = nn.Linear(hidden_dim, ffn_dim, bias=False)

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        """Apply the gated projection to `hidden_states` and return the result of `w2`."""
        activated = self.act_fn(self.w1(hidden_states))
        gated = activated * self.w3(hidden_states)
        return self.w2(gated)
836
+
837
+
838
class PhiMoEBLockSparseTop2MLP(PhiMoEBlockSparseTop2MLP):
    """Deprecated spelling of `PhiMoEBlockSparseTop2MLP`; logs a one-time warning, then behaves like the parent."""

    def __init__(self, *args, **kwargs):
        deprecation_notice = (
            "PhiMoEBLockSparseTop2MLP is deprecated by PhiMoEBlockSparseTop2MLP and will be removed in v4.40."
        )
        logger.warning_once(deprecation_notice)
        super().__init__(*args, **kwargs)
844
+
845
+
846
class mp(torch.autograd.Function):
    """Custom autograd op: forward scales `multiplier` by `mask_for_one`; backward sends a gradient to `scores` only."""

    @staticmethod
    def forward(
        ctx,
        scores: torch.Tensor,
        multiplier: torch.Tensor,
        selected_experts: torch.Tensor,
        masked_gates: torch.Tensor,
        mask_for_one: torch.Tensor,
    ):
        """Return `multiplier * mask_for_one`, stashing the tensors that `backward` reads."""
        ctx.save_for_backward(multiplier, selected_experts, masked_gates)
        scaled = multiplier * mask_for_one
        return scaled

    @staticmethod
    def backward(
        ctx,
        grad_at_output: torch.Tensor,
    ):
        """Return the gradient for `scores`; the other four forward inputs receive None."""
        multiplier, selected_experts, masked_gates = ctx.saved_tensors

        weighted_grad = grad_at_output * multiplier

        # Every position gets -masked_gates * grad; the selected position additionally gets +grad.
        grad_for_scores = masked_gates * weighted_grad.mul(-1)
        grad_for_scores.scatter_add_(
            dim=-1,
            index=selected_experts,
            src=weighted_grad,
        )

        # One entry per forward input: scores, multiplier, selected_experts, masked_gates, mask_for_one.
        return grad_for_scores, None, None, None, None
882
+
883
def _sparsemixer_pick_expert(scores, candidates, jitter_eps, training):
    """Run one SparseMixer routing pass: pick one expert per row and return its gate.

    Args:
        scores: Raw router logits; the last dimension indexes experts.
        candidates: Logits the expert is picked from. This is ``scores`` itself
            for the first pass, and ``scores`` with the already-picked expert
            set to ``-inf`` for the second pass.
        jitter_eps: Relative tolerance; experts whose logit falls more than
            ``2 * jitter_eps`` (relative) below the best candidate are masked out.
        training: When true, the expert is sampled (Gumbel-max) and the gate goes
            through ``mp`` for gradient estimation; otherwise the arg-max expert
            is used and the plain softmax gate is returned.

    Returns:
        ``(multiplier, selected)``, each with a trailing dimension of size 1.
    """
    with torch.no_grad():
        # compute mask for sparsity
        top_value, top_index = candidates.max(dim=-1, keepdim=True)
        factor = scores.abs().clamp(min=top_value)
        below_threshold = ((top_value - scores) / factor) > (2 * jitter_eps)

    # apply mask
    gates = candidates.masked_fill(below_threshold, float('-inf'))
    if training:
        # gumbel sampling, more robust than the multinomial method
        gumbel_noise = (
            torch.empty_like(gates, memory_format=torch.legacy_contiguous_format).exponential_().log()
        )
        selected = (gates - gumbel_noise).max(dim=-1)[1].unsqueeze(-1)
    else:
        selected = top_index

    # compute scores for gradients
    gates = torch.softmax(gates, dim=-1)
    multiplier = gates.gather(dim=-1, index=selected)

    if training:
        # compute midpoint mask
        max_scores, max_ind = gates.max(dim=-1, keepdim=True)
        mask_for_one = torch.logical_or(
            selected == max_ind,
            torch.rand_like(max_scores) > 0.75,  # Heun's third-order method: f(x) - f(0) = .25 f'(x) + .75 f'(x/3.)
        )
        # 1 -> 1.0 & 0 -> 1./3: lambda x: (x + 0.5) / 1.5
        mask_for_one = torch.add(0.3333, mask_for_one, alpha=0.6667).type_as(gates)

        multiplier = mp.apply(
            scores,
            multiplier,
            selected,
            gates,
            mask_for_one,
        )

    return multiplier, selected


def sparsemixer(scores, top_k, jitter_eps, training):
    """Select two experts per row of ``scores`` and compute their routing weights.

    The same selection pass runs twice: first over all experts, then again with
    the first pick masked out (set to ``-inf``).

    Args:
        scores: Router logits; the last dimension indexes experts.
        top_k: Must be 2; any other value trips the assertion.
        jitter_eps: Sparsity tolerance forwarded to each selection pass.
        training: Enables sampling and the ``mp`` gradient estimator.

    Returns:
        ``(multiplier, selected_experts)``: the two per-pass results
        concatenated along the last dimension (first pick in column 0).
    """
    assert top_k == 2

    ################ first expert ################
    multiplier, selected_experts = _sparsemixer_pick_expert(scores, scores, jitter_eps, training)

    ################ second expert ################
    # masked out first expert
    masked_scores = torch.scatter(
        scores,
        -1,
        selected_experts,
        float('-inf'),
    )
    multiplier_top2, selected_experts_top2 = _sparsemixer_pick_expert(
        scores, masked_scores, jitter_eps, training
    )

    multiplier = torch.concat((multiplier, multiplier_top2), dim=-1)
    selected_experts = torch.concat((selected_experts, selected_experts_top2), dim=-1)

    return (
        multiplier,
        selected_experts,
    )
983
+
984
# Module-level counter of constructed PhiMoESparseMoeBlock instances; each block
# stores its ordinal in `self.iter` (used only as a debugging label).
iterations = 0


class PhiMoESparseMoeBlock(nn.Module):
    """
    Sparse mixture-of-experts feed-forward block.

    A linear gate produces one logit per expert for every token, `sparsemixer`
    picks two experts per token together with their routing weights, and the
    outputs of the picked experts are accumulated per token. Every token is
    processed by its selected experts (no tokens are dropped); experts that
    receive no tokens are skipped.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.ffn_dim = config.intermediate_size
        self.num_experts = config.num_local_experts
        self.top_k = config.num_experts_per_tok
        global iterations
        iterations += 1
        self.iter = iterations
        # gating
        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)

        self.experts = nn.ModuleList([PhiMoEBlockSparseTop2MLP(config) for _ in range(self.num_experts)])

        # Jitter parameters
        self.router_jitter_noise = config.router_jitter_noise
        self.input_jitter_noise = config.input_jitter_noise

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Route every token through its two selected experts.

        Args:
            hidden_states: Tensor of shape `(batch_size, sequence_length, hidden_dim)`.

        Returns:
            `(final_hidden_states, router_logits)` where `final_hidden_states` has the
            input's shape and `router_logits` has shape
            `(batch_size * sequence_length, num_experts)`.
        """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        if self.training and self.input_jitter_noise > 0:
            # Multiplicative input jitter. Done out of place so the caller's tensor
            # is never modified.
            jitter = torch.empty_like(hidden_states).uniform_(
                1.0 - self.input_jitter_noise, 1.0 + self.input_jitter_noise
            )
            hidden_states = hidden_states * jitter
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        # `sparsemixer` asserts top_k == 2, so the value is passed literally rather
        # than taken from `self.top_k`.
        routing_weights, selected_experts = sparsemixer(
            router_logits,
            top_k=2,
            jitter_eps=self.router_jitter_noise,
            training=self.training,
        )

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask of shape
        # (n_experts, 2, n_tokens); it is used to look up which tokens each expert serves.
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            # idx: which of the two routing slots chose this expert; top_x: token row.
            idx, top_x = torch.where(expert_mask[expert_idx])

            if top_x.shape[0] == 0:
                continue

            # Gather the tokens routed to this expert and scale the expert output by
            # the routing weight of the slot (top-1 or top-2) that selected it.
            current_state = hidden_states[top_x]
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # Accumulate into the per-token output rows.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits
1064
+
1065
+
1066
class PhiMoEDecoderLayer(nn.Module):
    """One decoder layer: LayerNorm -> self-attention -> residual, then
    LayerNorm -> sparse MoE block -> residual."""

    def __init__(self, config: PhiMoEConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # Attention implementation is chosen by `config._attn_implementation`.
        self.self_attn = PHIMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        self.block_sparse_moe = PhiMoESparseMoeBlock(config)
        # Both norms are `nn.LayerNorm`; their epsilon is read from `config.rms_norm_eps`.
        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).

        Returns:
            A tuple starting with the layer output, followed (in this order, each only when
            requested) by the attention weights, the present key/value cache and the router logits.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states, router_logits = self.block_sparse_moe(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs
1142
+
1143
+
1144
# Shared class-level docstring preamble; passed to `add_start_docstrings(...)` on the
# model classes in this file.
PHIMOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`PhiMoEConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
1159
+
1160
+
1161
@add_start_docstrings(
    "The bare PhiMoE Model outputting raw hidden-states without any specific head on top.",
    PHIMOE_START_DOCSTRING,
)
class PhiMoEPreTrainedModel(PreTrainedModel):
    # Configuration class and checkpoint prefix.
    config_class = PhiMoEConfig
    base_model_prefix = "model"

    # Capability flags.
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    # Device-placement hints.
    _no_split_modules = ["PhiMoEDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Intentionally a no-op: no custom initialisation is applied to `module`."""
        # The previous normal-initialisation scheme is kept below for reference:
        # std = self.config.initializer_range
        # if isinstance(module, nn.Linear):
        #     module.weight.data.normal_(mean=0.0, std=std)
        #     if module.bias is not None:
        #         module.bias.data.zero_()
        # elif isinstance(module, nn.Embedding):
        #     module.weight.data.normal_(mean=0.0, std=std)
        #     if module.padding_idx is not None:
        #         module.weight.data[module.padding_idx].zero_()
1187
+
1188
+
1189
# Argument documentation attached to the `forward` methods in this file through
# `add_start_docstrings_to_model_forward(...)`.
# NOTE(review): the `attention_mask` entry mentions `decoder_input_ids`,
# `modeling_opt._prepare_decoder_attention_mask` and head masking, and the
# `past_key_values` entry mentions cross-attention tensors. These look carried over
# from another model's documentation -- confirm before relying on them.
PHIMOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
1254
+
1255
+
1256
@add_start_docstrings(
    "The bare PhiMoE Model outputting raw hidden-states without any specific head on top.",
    PHIMOE_START_DOCSTRING,
)
class PhiMoEModel(PhiMoEPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiMoEDecoderLayer`]

    Args:
        config: PhiMoEConfig
    """

    def __init__(self, config: PhiMoEConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [PhiMoEDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.embed_tokens = value

    # Ignore copy
    @add_start_docstrings_to_model_forward(PHIMOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        # Flags left as None fall back to the values stored in the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds (exactly one of them must be given)
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache:
            # Anything that is not a `Cache` instance is converted to a `DynamicCache` here
            # and converted back to the legacy format before returning.
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
            # Any masked position in the last column means the batch is right-padded.
            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
            if is_padding_right:
                raise ValueError(
                    "You are attempting to perform batched generation with padding_side='right'"
                    " this may lead to unexpected behaviour for Flash Attention version of PhiMoE. Make sure to "
                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                )

        if self._attn_implementation == "flash_attention_2":
            # 2d mask is passed through the layers
            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
        elif self._attn_implementation == "sdpa" and not output_attentions:
            # output_attentions=True can not be supported when using SDPA, and we fall back on
            # the manual implementation that requires a 4D causal mask in all cases.
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
                sliding_window=self.config.sliding_window,
            )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the attention weights when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits:
                # Router logits are always the last element of the layer output.
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if not return_dict:
            # Tuple output keeps only the entries that were actually produced.
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )
1453
+
1454
+
1455
class PhiMoEForCausalLM(PhiMoEPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = PhiMoEModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=self.config.lm_head_bias)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder model."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder model."""
        return self.model

    @add_start_docstrings_to_model_forward(PHIMOE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # Ignore copy
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PhiMoEForCausalLM

        >>> model = PhiMoEForCausalLM.from_pretrained("microsoft/Phi-3.5-moe-instruct")
        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-moe-instruct")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        # Flags left as None fall back to the values stored in the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        aux_loss = None
        if output_router_logits:
            # In tuple mode the router logits are the last element of the decoder output.
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        output_router_logits=False,
        **kwargs,
    ):
        """Build the keyword arguments for one generation step.

        Trims `input_ids` (and, when needed, `attention_mask` / `position_ids`) to the
        tokens not yet covered by `past_key_values` and returns the dict passed to `forward`.
        """
        # When the first time input length reached long and short factor switching point, enforce re-compute cache
        # It will cause downside of slower at this single token position, however, better than current failure.
        if past_key_values and self.config.rope_scaling and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1:
            past_length = past_key_values.seen_tokens if isinstance(past_key_values, Cache) else past_key_values[0][0].shape[2]
            if past_length <= self.config.original_max_position_embeddings:
                past_key_values = None

        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "output_router_logits": output_router_logits,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder every cached tensor along dim 0 according to `beam_idx`."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
1677
+
1678
+
1679
+ @add_start_docstrings(
1680
+ """
1681
+ The PhiMoE Model transformer with a sequence classification head on top (linear layer).
1682
+
1683
+ [`PhiMoEForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1684
+ (e.g. GPT-2) do.
1685
+
1686
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1687
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1688
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1689
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1690
+ each row of the batch).
1691
+ """,
1692
+ PHIMOE_START_DOCSTRING,
1693
+ )
1694
+ # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->PhiMoE, LLAMA->PHIMOE
1695
+ class PhiMoEForSequenceClassification(PhiMoEPreTrainedModel):
1696
+ def __init__(self, config):
1697
+ super().__init__(config)
1698
+ self.num_labels = config.num_labels
1699
+ self.model = PhiMoEModel(config)
1700
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1701
+
1702
+ # Initialize weights and apply final processing
1703
+ self.post_init()
1704
+
1705
+ def get_input_embeddings(self):
1706
+ return self.model.embed_tokens
1707
+
1708
+ def set_input_embeddings(self, value):
1709
+ self.model.embed_tokens = value
1710
+
1711
+ @add_start_docstrings_to_model_forward(PHIMOE_INPUTS_DOCSTRING)
1712
+ def forward(
1713
+ self,
1714
+ input_ids: torch.LongTensor = None,
1715
+ attention_mask: Optional[torch.Tensor] = None,
1716
+ position_ids: Optional[torch.LongTensor] = None,
1717
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1718
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1719
+ labels: Optional[torch.LongTensor] = None,
1720
+ use_cache: Optional[bool] = None,
1721
+ output_attentions: Optional[bool] = None,
1722
+ output_hidden_states: Optional[bool] = None,
1723
+ return_dict: Optional[bool] = None,
1724
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1725
+ r"""
1726
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1727
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1728
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
1729
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1730
+ """
1731
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1732
+
1733
+ transformer_outputs = self.model(
1734
+ input_ids,
1735
+ attention_mask=attention_mask,
1736
+ position_ids=position_ids,
1737
+ past_key_values=past_key_values,
1738
+ inputs_embeds=inputs_embeds,
1739
+ use_cache=use_cache,
1740
+ output_attentions=output_attentions,
1741
+ output_hidden_states=output_hidden_states,
1742
+ return_dict=return_dict,
1743
+ )
1744
+ hidden_states = transformer_outputs[0]
1745
+ logits = self.score(hidden_states)
1746
+
1747
+ if input_ids is not None:
1748
+ batch_size = input_ids.shape[0]
1749
+ else:
1750
+ batch_size = inputs_embeds.shape[0]
1751
+
1752
+ if self.config.pad_token_id is None and batch_size != 1:
1753
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1754
+ if self.config.pad_token_id is None:
1755
+ sequence_lengths = -1
1756
+ else:
1757
+ if input_ids is not None:
1758
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1759
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1760
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1761
+ sequence_lengths = sequence_lengths.to(logits.device)
1762
+ else:
1763
+ sequence_lengths = -1
1764
+
1765
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1766
+
1767
+ loss = None
1768
+ if labels is not None:
1769
+ labels = labels.to(logits.device)
1770
+ if self.config.problem_type is None:
1771
+ if self.num_labels == 1:
1772
+ self.config.problem_type = "regression"
1773
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1774
+ self.config.problem_type = "single_label_classification"
1775
+ else:
1776
+ self.config.problem_type = "multi_label_classification"
1777
+
1778
+ if self.config.problem_type == "regression":
1779
+ loss_fct = MSELoss()
1780
+ if self.num_labels == 1:
1781
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1782
+ else:
1783
+ loss = loss_fct(pooled_logits, labels)
1784
+ elif self.config.problem_type == "single_label_classification":
1785
+ loss_fct = CrossEntropyLoss()
1786
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1787
+ elif self.config.problem_type == "multi_label_classification":
1788
+ loss_fct = BCEWithLogitsLoss()
1789
+ loss = loss_fct(pooled_logits, labels)
1790
+ if not return_dict:
1791
+ output = (pooled_logits,) + transformer_outputs[1:]
1792
+ return ((loss,) + output) if loss is not None else output
1793
+
1794
+ return SequenceClassifierOutputWithPast(
1795
+ loss=loss,
1796
+ logits=pooled_logits,
1797
+ past_key_values=transformer_outputs.past_key_values,
1798
+ hidden_states=transformer_outputs.hidden_states,
1799
+ attentions=transformer_outputs.attentions,
1800
+ )
sample_finetune.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import logging
3
+
4
+ import deepspeed
5
+ import datasets
6
+ from datasets import load_dataset
7
+ from peft import LoraConfig
8
+ import torch
9
+ import transformers
10
+ from trl import SFTTrainer
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
12
+
13
+ """
14
+ A simple example on using SFTTrainer and Accelerate to finetune Phi-3 models. For
15
+ a more advanced example, please follow HF alignment-handbook/scripts/run_sft.py.
16
+ This example has utilized DeepSpeed ZeRO3 offload to reduce the memory usage. The
17
+ script can be run on A100 or later generation GPUs. Here are some suggestions on
18
+ further reducing memory consumption:
19
+ - reduce batch size
20
+ - decrease lora dimension
21
+ - restrict lora target modules
22
+ Please follow these steps to run the script:
23
+ 1. Install dependencies:
24
+ conda install -c conda-forge accelerate
25
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
26
+ pip3 install -i https://pypi.org/simple/ bitsandbytes
27
+ pip3 install peft trl transformers datasets
28
+ pip3 install deepspeed
29
+ 2. Setup accelerate and deepspeed config based on the machine used:
30
+ accelerate config
31
+ Here is a sample config for deepspeed zero3:
32
+ compute_environment: LOCAL_MACHINE
33
+ debug: false
34
+ deepspeed_config:
35
+ gradient_accumulation_steps: 1
36
+ offload_optimizer_device: none
37
+ offload_param_device: none
38
+ zero3_init_flag: true
39
+ zero3_save_16bit_model: true
40
+ zero_stage: 3
41
+ distributed_type: DEEPSPEED
42
+ downcast_bf16: 'no'
43
+ enable_cpu_affinity: false
44
+ machine_rank: 0
45
+ main_training_function: main
46
+ mixed_precision: bf16
47
+ num_machines: 1
48
+ num_processes: 2
49
+ rdzv_backend: static
50
+ same_network: true
51
+ tpu_env: []
52
+ tpu_use_cluster: false
53
+ tpu_use_sudo: false
54
+ use_cpu: false
55
+ 3. Check accelerate config:
56
+ accelerate env
57
+ 4. Run the code, and make sure to use accelerate launch together with
58
+ at least 2 A100 80GB GPUs:
59
+
60
+ accelerate launch sample_finetune.py
61
+ """
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
+
66
+ ###################
67
+ # Hyper-parameters
68
+ ###################
69
+ training_config = {
70
+ "bf16": True,
71
+ "do_eval": False,
72
+ "learning_rate": 5.0e-06,
73
+ "log_level": "info",
74
+ "logging_steps": 20,
75
+ "logging_strategy": "steps",
76
+ "lr_scheduler_type": "cosine",
77
+ "num_train_epochs": 1,
78
+ "max_steps": -1,
79
+ "output_dir": "./checkpoint_dir",
80
+ "overwrite_output_dir": True,
81
+ "per_device_eval_batch_size": 4,
82
+ "per_device_train_batch_size": 4,
83
+ "remove_unused_columns": True,
84
+ "save_steps": 100,
85
+ "save_total_limit": 1,
86
+ "seed": 0,
87
+ "gradient_checkpointing": True,
88
+ "gradient_checkpointing_kwargs":{"use_reentrant": False},
89
+ "gradient_accumulation_steps": 1,
90
+ "warmup_ratio": 0.2,
91
+ }
92
+
93
+ peft_config = {
94
+ "r": 16,
95
+ "lora_alpha": 32,
96
+ "lora_dropout": 0.05,
97
+ "bias": "none",
98
+ "task_type": "CAUSAL_LM",
99
+ "target_modules": "all-linear",
100
+ "modules_to_save": None,
101
+ }
102
+ train_conf = TrainingArguments(**training_config)
103
+ peft_conf = LoraConfig(**peft_config)
104
+
105
+
106
+ ###############
107
+ # Setup logging
108
+ ###############
109
+ logging.basicConfig(
110
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
111
+ datefmt="%Y-%m-%d %H:%M:%S",
112
+ handlers=[logging.StreamHandler(sys.stdout)],
113
+ )
114
+ log_level = train_conf.get_process_log_level()
115
+ logger.setLevel(log_level)
116
+ datasets.utils.logging.set_verbosity(log_level)
117
+ transformers.utils.logging.set_verbosity(log_level)
118
+ transformers.utils.logging.enable_default_handler()
119
+ transformers.utils.logging.enable_explicit_format()
120
+
121
+ # Log on each process a small summary
122
+ logger.warning(
123
+ f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
124
+ + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
125
+ )
126
+ logger.info(f"Training/evaluation parameters {train_conf}")
127
+ logger.info(f"PEFT parameters {peft_conf}")
128
+
129
+
130
+ ################
131
+ # Model Loading
132
+ ################
133
+ checkpoint_path = "microsoft/Phi-3.5-MoE-instruct"
134
+ model_kwargs = dict(
135
+ use_cache=False,
136
+ trust_remote_code=True,
137
+ attn_implementation="flash_attention_2", # loading the model with flash-attention support
138
+ torch_dtype=torch.bfloat16,
139
+ device_map=None
140
+ )
141
+ model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
142
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
143
+ tokenizer.model_max_length = 2048
144
+ tokenizer.pad_token = tokenizer.unk_token # use unk rather than eos token to prevent endless generation
145
+ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
146
+ tokenizer.padding_side = 'right'
147
+
148
+ for m in model.modules():
149
+ # https://github.com/microsoft/DeepSpeed/pull/4966
150
+ if "PhiMoESparseMoeBlock" in m.__class__.__name__:
151
+ deepspeed.utils.set_z3_leaf_modules(model, [m.__class__])
152
+ logger.info(f"Setting zero3 leaf for model on class with name: {m.__class__.__name__}")
153
+ break
154
+
155
+
156
+ ##################
157
+ # Data Processing
158
+ ##################
159
+ def apply_chat_template(
160
+ example,
161
+ tokenizer,
162
+ ):
163
+ messages = example["messages"]
164
+ example["text"] = tokenizer.apply_chat_template(
165
+ messages, tokenize=False, add_generation_prompt=False)
166
+ return example
167
+
168
+ raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
169
+ train_dataset = raw_dataset["train_sft"]
170
+ test_dataset = raw_dataset["test_sft"]
171
+ column_names = list(train_dataset.features)
172
+
173
+ processed_train_dataset = train_dataset.map(
174
+ apply_chat_template,
175
+ fn_kwargs={"tokenizer": tokenizer},
176
+ num_proc=10,
177
+ remove_columns=column_names,
178
+ desc="Applying chat template to train_sft",
179
+ )
180
+
181
+ processed_test_dataset = test_dataset.map(
182
+ apply_chat_template,
183
+ fn_kwargs={"tokenizer": tokenizer},
184
+ num_proc=10,
185
+ remove_columns=column_names,
186
+ desc="Applying chat template to test_sft",
187
+ )
188
+
189
+
190
+ ###########
191
+ # Training
192
+ ###########
193
+ trainer = SFTTrainer(
194
+ model=model,
195
+ args=train_conf,
196
+ peft_config=peft_conf,
197
+ train_dataset=processed_train_dataset,
198
+ eval_dataset=processed_test_dataset,
199
+ max_seq_length=2048,
200
+ dataset_text_field="text",
201
+ tokenizer=tokenizer,
202
+ packing=True
203
+ )
204
+ train_result = trainer.train()
205
+ metrics = train_result.metrics
206
+ trainer.log_metrics("train", metrics)
207
+ trainer.save_metrics("train", metrics)
208
+ trainer.save_state()
209
+
210
+
211
+ #############
212
+ # Evaluation
213
+ #############
214
+ tokenizer.padding_side = 'left'
215
+ metrics = trainer.evaluate()
216
+ metrics["eval_samples"] = len(processed_test_dataset)
217
+ trainer.log_metrics("eval", metrics)
218
+ trainer.save_metrics("eval", metrics)
219
+
220
+
221
+ #############
222
+ # Save model
223
+ #############
224
+ trainer.save_model(train_conf.output_dir)
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": true,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32000": {
30
+ "content": "<|endoftext|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<|assistant|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": true,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<|placeholder1|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": true,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "32003": {
54
+ "content": "<|placeholder2|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": true,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "32004": {
62
+ "content": "<|placeholder3|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": true,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "32005": {
70
+ "content": "<|placeholder4|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": true,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "32006": {
78
+ "content": "<|system|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": true,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "32007": {
86
+ "content": "<|end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": true,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "32008": {
94
+ "content": "<|placeholder5|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": true,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "32009": {
102
+ "content": "<|placeholder6|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": true,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "32010": {
110
+ "content": "<|user|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": true,
114
+ "single_word": false,
115
+ "special": true
116
+ }
117
+ },
118
+ "bos_token": "<s>",
119
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
120
+ "clean_up_tokenization_spaces": false,
121
+ "eos_token": "<|endoftext|>",
122
+ "legacy": false,
123
+ "model_max_length": 131072,
124
+ "pad_token": "<|endoftext|>",
125
+ "padding_side": "left",
126
+ "sp_model_kwargs": {},
127
+ "tokenizer_class": "LlamaTokenizer",
128
+ "unk_token": "<unk>",
129
+ "use_default_system_prompt": false
130
+ }