lukasmoeller committed
Commit 9c0f1cf
Parent(s): 7154cc3
Upload 15 files
Browse files
- README.md +228 -0
- adapt_tokenizer.py +41 -0
- attention.py +276 -0
- blocks.py +41 -0
- config.json +1 -2
- hf_prefixlm_converter.py +415 -0
- meta_init_context.py +94 -0
- modeling_mpt.py +290 -0
- norm.py +56 -0
- param_init_fns.py +181 -0
- special_tokens_map.json +5 -0
- tokenizer.json +0 -0
- tokenizer_config.json +9 -0
README.md
ADDED
@@ -0,0 +1,228 @@
---
license: apache-2.0
tags:
- Composer
- MosaicML
- llm-foundry
- StreamingDatasets
datasets:
- mc4
- c4
- togethercomputer/RedPajama-Data-1T
- bigcode/the-stack
- allenai/s2orc
inference: false
---

# MPT-7B

MPT-7B is a decoder-style transformer pretrained from scratch on 1T tokens of English text and code.
This model was trained by [MosaicML](https://www.mosaicml.com).

MPT-7B is part of the family of MosaicPretrainedTransformer (MPT) models, which use a modified transformer architecture optimized for efficient training and inference.

These architectural changes include performance-optimized layer implementations and the elimination of context length limits by replacing
positional embeddings with Attention with Linear Biases ([ALiBi](https://arxiv.org/abs/2108.12409)).
Thanks to these modifications, MPT models can be trained with high throughput efficiency and stable convergence.
MPT models can also be served efficiently with both standard HuggingFace pipelines and NVIDIA's [FasterTransformer](https://github.com/NVIDIA/FasterTransformer).

This model uses the MosaicML LLM codebase, which can be found in the [llm-foundry repository](https://github.com/mosaicml/llm-foundry). It was trained by MosaicML's NLP team on the [MosaicML platform](https://www.mosaicml.com/training) for LLM pretraining, finetuning, and inference.

### How is this model different?

MPT-7B is

* **Licensed for the possibility of commercial use** (unlike [LLaMA](https://arxiv.org/abs/2302.13971)).
* **Trained on a large amount of data** (1T tokens like [LLaMA](https://arxiv.org/abs/2302.13971) vs. 300B for [Pythia](https://github.com/EleutherAI/pythia), 300B for [OpenLLaMA](https://github.com/openlm-research/open_llama), and 800B for [StableLM](https://github.com/Stability-AI/StableLM)).
* **Prepared to handle extremely long inputs** thanks to [ALiBi](https://arxiv.org/abs/2108.12409) (we finetuned [MPT-7B-StoryWriter-65k+](https://huggingface.co/mosaicml/mpt-7b-storywriter) on up to 65k inputs and can handle up to 84k vs. 2k-4k for other open source models).
* **Capable of fast training and inference** (via [FlashAttention](https://arxiv.org/pdf/2205.14135.pdf) and [FasterTransformer](https://github.com/NVIDIA/FasterTransformer)).
* **Equipped with highly efficient open-source training code** via the [llm-foundry repository](https://github.com/mosaicml/llm-foundry).

### Models finetuned off MPT-7B:

The following models are finetuned on MPT-7B:

* [MPT-7B-StoryWriter-65k+](https://huggingface.co/mosaicml/mpt-7b-storywriter): a model designed to read and write fictional stories with super long context lengths.
Built by finetuning MPT-7B with a context length of 65k tokens on a filtered fiction subset of the [books3 dataset](https://huggingface.co/datasets/the_pile_books3).
At inference time, thanks to [ALiBi](https://arxiv.org/abs/2108.12409), MPT-7B-StoryWriter-65k+ can extrapolate even beyond 65k tokens.
We demonstrate generations as long as 80k tokens on a single A100-80GB GPU in our [blogpost](https://www.mosaicml.com/blog/mpt-7b).
  * License: Apache 2.0

* [MPT-7B-Instruct](https://huggingface.co/mosaicml/mpt-7b-instruct): a model for short-form instruction following.
Built by finetuning MPT-7B on a [dataset](https://huggingface.co/datasets/mosaicml/dolly_hhrlhf) we also release, derived from the [Databricks Dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k) and the [Anthropic Helpful and Harmless (HH-RLHF)](https://huggingface.co/datasets/Anthropic/hh-rlhf) datasets.
  * License: _CC-By-SA-3.0_
  * [Demo on Hugging Face Spaces](https://huggingface.co/spaces/mosaicml/mpt-7b-instruct)

* [MPT-7B-Chat](https://huggingface.co/mosaicml/mpt-7b-chat): a chatbot-like model for dialogue generation.
Built by finetuning MPT-7B on the [ShareGPT-Vicuna](https://huggingface.co/datasets/jeffwan/sharegpt_vicuna), [HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3),
[Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca), [HH-RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf), and [Evol-Instruct](https://huggingface.co/datasets/victor123/evol_instruct_70k) datasets.
  * License: _CC-By-NC-SA-4.0_
  * [Demo on Hugging Face Spaces](https://huggingface.co/spaces/mosaicml/mpt-7b-chat)

## Model Date

May 5, 2023

## Model License

Apache-2.0

## Documentation

* [Blog post: Introducing MPT-7B: A New Standard for Open-Source, Commercially Usable LLMs](https://www.mosaicml.com/blog/mpt-7b)
* [Codebase (mosaicml/llm-foundry repo)](https://github.com/mosaicml/llm-foundry/)
* Questions: Feel free to contact us via the [MosaicML Community Slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-1btms90mc-GipE2ufuPkKY0QBrmF3LSA)!


## How to Use

This model is best used with the MosaicML [llm-foundry repository](https://github.com/mosaicml/llm-foundry) for training and finetuning.

```python
import transformers
model = transformers.AutoModelForCausalLM.from_pretrained(
  'mosaicml/mpt-7b',
  trust_remote_code=True
)
```
Note: This model requires that `trust_remote_code=True` be passed to the `from_pretrained` method.
This is because we use a custom `MPT` model architecture that is not yet part of the Hugging Face `transformers` package.
`MPT` includes options for many training efficiency features such as [FlashAttention](https://arxiv.org/pdf/2205.14135.pdf), [ALiBi](https://arxiv.org/abs/2108.12409), [QK LayerNorm](https://arxiv.org/abs/2010.04245), and more.

To use the optimized [triton implementation](https://github.com/openai/triton) of FlashAttention, you can load the model with `attn_impl='triton'` and move the model to `bfloat16`:
```python
import torch

config = transformers.AutoConfig.from_pretrained(
  'mosaicml/mpt-7b',
  trust_remote_code=True
)
config.attn_config['attn_impl'] = 'triton'

model = transformers.AutoModelForCausalLM.from_pretrained(
  'mosaicml/mpt-7b',
  config=config,
  torch_dtype=torch.bfloat16,
  trust_remote_code=True
)
model.to(device='cuda:0')
```

Although the model was trained with a sequence length of 2048, ALiBi enables users to increase the maximum sequence length during finetuning and/or inference. For example:

```python
config = transformers.AutoConfig.from_pretrained(
  'mosaicml/mpt-7b',
  trust_remote_code=True
)
config.update({"max_seq_len": 4096})
model = transformers.AutoModelForCausalLM.from_pretrained(
  'mosaicml/mpt-7b',
  config=config,
  trust_remote_code=True
)
```

This model was trained with the [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) tokenizer.

```python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
```

## Model Description

The architecture is a modification of a standard decoder-only transformer.

The model has been modified from a standard transformer in the following ways:
* It uses [FlashAttention](https://arxiv.org/pdf/2205.14135.pdf)
* It uses [ALiBi (Attention with Linear Biases)](https://arxiv.org/abs/2108.12409) and does not use positional embeddings
* It does not use biases


| Hyperparameter | Value |
|----------------|-------|
| n_parameters | 6.7B |
| n_layers | 32 |
| n_heads | 32 |
| d_model | 4096 |
| vocab size | 50432 |
| sequence length | 2048 |
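
As a rough cross-check on the table, the 6.7B figure can be approximated from `n_layers`, `d_model`, and the vocabulary size alone. The sketch below assumes no biases, tied input/output embeddings, an MLP expansion ratio of 4, and ignores LayerNorm parameters:

```python
# Back-of-the-envelope parameter count for MPT-7B.
d_model, n_layers, vocab = 4096, 32, 50432
attn = 3 * d_model * d_model + d_model * d_model   # Wqkv + out_proj
mlp = 2 * 4 * d_model * d_model                    # up_proj + down_proj (expansion ratio 4)
per_block = attn + mlp                             # 12 * d_model**2
total = n_layers * per_block + vocab * d_model     # blocks + token embedding
print(f"{total / 1e9:.2f}B parameters")            # ~6.65B, reported as 6.7B
```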


## Training Data

### Streaming Datasets

Data was formatted using the MosaicML [StreamingDataset](https://github.com/mosaicml/streaming) library to host our data in object storage and efficiently stream it to our compute cluster during training.
StreamingDataset obviates the need to download the whole dataset before starting training, and allows instant resumption of training from any point in the dataset.
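
As an illustrative sketch of that workflow (not the exact training configuration; the bucket and cache paths below are hypothetical), a converted dataset can be streamed straight into a PyTorch `DataLoader`:

```python
from torch.utils.data import DataLoader
from streaming import StreamingDataset

# Hypothetical paths: `remote` points at the object-store copy, `local` is a cache directory.
dataset = StreamingDataset(remote='s3://my-bucket/my-copy-c4/train',
                           local='/tmp/cache-c4',
                           shuffle=True)
loader = DataLoader(dataset, batch_size=8)
```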

### Data Mix

The model was trained for 1T tokens (with batch size 1760 and sequence length 2048). It was trained on the following data mix:

| Data Source | Number of Tokens in Source | Proportion | Effective Number of Tokens | Epochs |
|-------------|----------------------------|------------|----------------------------|--------|
| mC4 3.1.0 - English | 417.99 B | 0.33 | 330 B | 0.14 |
| C4 - English - SemDedup 80% | 100.42 B | 0.299 | 299 B | 2.98 |
| RedPajama - CommonCrawl | 878.45 B | 0.1 | 100 B | 0.11 |
| The Stack - Selected Languages | 463.78 B | 0.1 | 100 B | 0.22 |
| RedPajama - Wikipedia - En | 4.87 B | 0.04 | 40 B | 8.21 |
| The Stack - Markdown | 107.07 B | 0.035 | 35 B | 0.33 |
| S2ORC | 48.85 B | 0.033 | 33 B | 0.68 |
| RedPajama - Books | 26.02 B | 0.03 | 30 B | 1.15 |
| RedPajama - arXiv | 28.10 B | 0.019 | 19 B | 0.68 |
| RedPajama - StackExchange | 20.54 B | 0.014 | 14 B | 0.68 |

Samples for each batch were selected from one of the datasets with the probability specified above.
The examples were shuffled within each dataset, and each example was constructed from as many sequences from that dataset as were necessary to fill the 2048 sequence length.
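
A toy sketch of this sampling-and-packing scheme is shown below (illustrative only, not the llm-foundry implementation; the dataset keys and the `draw_sequence` helper are hypothetical):

```python
import random

# Mixing proportions from the table above.
mix = {'mc4_en': 0.33, 'c4_en_semdedup': 0.299, 'rpj_cc': 0.1, 'stack_selected': 0.1,
       'rpj_wiki': 0.04, 'stack_md': 0.035, 's2orc': 0.033, 'rpj_books': 0.03,
       'rpj_arxiv': 0.019, 'rpj_stackexchange': 0.014}
names, probs = zip(*mix.items())

def sample_packed_example(max_seq_len=2048):
    # Pick a source with the specified probability, then concatenate shuffled
    # sequences from that source until the context window is full.
    source = random.choices(names, weights=probs, k=1)[0]
    tokens = []
    while len(tokens) < max_seq_len:
        tokens.extend(draw_sequence(source))  # hypothetical helper returning token ids
    return tokens[:max_seq_len]
```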

The data was tokenized using the [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) tokenizer. This BPE tokenizer has a number of desirable characteristics,
most of which are relevant for tokenizing code:
(1) It was trained on a diverse mix of data that includes code (The Pile);
(2) It applies consistent space delimitation, unlike the GPT2 tokenizer, which tokenizes inconsistently depending on the presence of prefix spaces;
(3) It contains tokens for repeated space characters, which allows superior compression of text with large amounts of repeated space characters.

The model vocabulary size of 50432 was set to be a multiple of 128 (as in [MEGATRON-LM](https://arxiv.org/abs/1909.08053)), which increased model flop utilization (MFU) by up to four percentage points.
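
The rounding itself is simple; a minimal helper is sketched below (the exact unpadded vocabulary size is not quoted here, so the snippet only checks the multiple-of-128 property):

```python
def pad_to_multiple(n: int, multiple: int = 128) -> int:
    # Round up to the next multiple of `multiple` (as in MEGATRON-LM vocab padding),
    # which keeps the embedding/output GEMM shapes friendly to tensor cores.
    return ((n + multiple - 1) // multiple) * multiple

assert 50432 % 128 == 0  # MPT-7B's vocabulary size is already a multiple of 128
```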

### Training Configuration

This model was trained on 440 A100-40GB GPUs for about 9.5 days using the [MosaicML Platform](https://www.mosaicml.com/platform).
The model was trained with sharded data parallelism using [FSDP](https://pytorch.org/docs/stable/fsdp.html) and the [LION](https://arxiv.org/abs/2302.06675) optimizer.
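
For reference, the LION update rule from the cited paper (sign of an interpolated momentum, with decoupled weight decay) can be sketched in a few lines of PyTorch; this is a minimal illustration, not the optimizer implementation used for training:

```python
import torch

@torch.no_grad()
def lion_step(param, grad, exp_avg, lr=1e-4, beta1=0.9, beta2=0.99, weight_decay=0.0):
    # Decoupled weight decay, then a sign-of-interpolated-momentum update.
    param.mul_(1 - lr * weight_decay)
    update = exp_avg.mul(beta1).add(grad, alpha=1 - beta1).sign_()
    param.add_(update, alpha=-lr)
    # The momentum buffer is updated with a different interpolation factor (beta2).
    exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2)
```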

## Limitations and Biases

_The following language is modified from [EleutherAI's GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b)_

MPT-7B (Base) is **not** intended for deployment without finetuning.
It should not be used for human-facing interactions without further guardrails and user consent.

MPT-7B can produce factually incorrect output, and should not be relied on to produce factually accurate information.
MPT-7B was trained on various public datasets.
While great efforts have been taken to clean the pretraining data, it is possible that this model could generate lewd, biased or otherwise offensive outputs.


## MosaicML Platform

If you're interested in [training](https://www.mosaicml.com/training) and [deploying](https://www.mosaicml.com/inference) your own MPT or LLMs on the MosaicML Platform, [sign up here](https://forms.mosaicml.com/demo?utm_source=huggingface&utm_medium=referral&utm_campaign=mpt-7b).

## Disclaimer

The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.

## Citation

Please cite this model using the following format:

```
@online{MosaicML2023Introducing,
    author  = {MosaicML NLP Team},
    title   = {Introducing MPT-7B: A New Standard for Open-Source, Commercially Usable LLMs},
    year    = {2023},
    url     = {www.mosaicml.com/blog/mpt-7b},
    note    = {Accessed: 2023-03-28}, % change this date
    urldate = {2023-03-28} % change this date
}
```
adapt_tokenizer.py
ADDED
@@ -0,0 +1,41 @@
from typing import Union
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
NUM_SENTINEL_TOKENS: int = 100

def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
    """Adds sentinel tokens and padding token (if missing).

    Expands the tokenizer vocabulary to include sentinel tokens
    used in mixture-of-denoiser tasks as well as a padding token.

    All added tokens are added as special tokens. No tokens are
    added if sentinel tokens and padding token already exist.
    """
    sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
    tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
    if tokenizer.pad_token is None:
        tokenizer.add_tokens('<pad>', special_tokens=True)
        tokenizer.pad_token = '<pad>'
        assert tokenizer.pad_token_id is not None
    sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)])
    _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
    tokenizer.sentinel_token_ids = _sentinel_token_ids

class AutoTokenizerForMOD(AutoTokenizer):
    """AutoTokenizer + Adaptation for MOD.

    A simple wrapper around AutoTokenizer to make instantiating
    an MOD-adapted tokenizer a bit easier.

    MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
    a padding token, and a property to get the token ids of the
    sentinel tokens.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """See `AutoTokenizer.from_pretrained` docstring."""
        tokenizer = super().from_pretrained(*args, **kwargs)
        adapt_tokenizer_for_denoising(tokenizer)
        return tokenizer
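
A minimal usage sketch for the tokenizer adapter above (assuming the file is importable as a local module named `adapt_tokenizer`; the printed values depend on the base tokenizer):

```python
from adapt_tokenizer import AutoTokenizerForMOD

# Loads the base tokenizer, then adds the 100 sentinel tokens and a <pad> token if missing.
tokenizer = AutoTokenizerForMOD.from_pretrained('EleutherAI/gpt-neox-20b')
print(tokenizer.pad_token)                 # '<pad>' if the base tokenizer had none
print(len(tokenizer.sentinel_token_ids))   # 100
```
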
attention.py
ADDED
@@ -0,0 +1,276 @@
"""Attention layers."""
import math
import warnings
from typing import Optional
import torch
import torch.nn as nn
from einops import rearrange
from torch import nn
from .norm import LPLayerNorm

def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
    if original_is_causal and num_query_tokens != num_key_tokens:
        if num_query_tokens != 1:
            raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
        else:
            return False
    return original_is_causal

def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
    k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads)
    v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads)
    min_val = torch.finfo(q.dtype).min
    (b, _, s_q, d) = q.shape
    s_k = k.size(-1)
    if softmax_scale is None:
        softmax_scale = 1 / math.sqrt(d)
    attn_weight = q.matmul(k) * softmax_scale
    if attn_bias is not None:
        if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
            raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
        attn_weight = attn_weight + attn_bias
    if key_padding_mask is not None:
        if attn_bias is not None:
            warnings.warn('Propagating key_padding_mask to the attention module and applying it within the attention module can cause unnecessary computation/memory usage. Consider integrating into attn_bias once and passing that to each attention module instead.')
        attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
    if is_causal:
        s = max(s_q, s_k)
        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
        causal_mask = causal_mask.tril()
        causal_mask = causal_mask.to(torch.bool)
        causal_mask = ~causal_mask
        causal_mask = causal_mask[-s_q:, -s_k:]
        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
    attn_weight = torch.softmax(attn_weight, dim=-1)
    if dropout_p:
        attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
    out = attn_weight.matmul(v)
    out = rearrange(out, 'b h s d -> b s (h d)')
    if needs_weights:
        return (out, attn_weight)
    return (out, None)

def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
    for tensor in tensors:
        if tensor.dtype not in valid_dtypes:
            raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.')
        if not tensor.is_cuda:
            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')

def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    try:
        from flash_attn import bert_padding, flash_attn_interface
    except:
        raise RuntimeError('Please install flash-attn==1.0.3.post0')
    check_valid_inputs(query, key, value)
    if attn_bias is not None:
        raise NotImplementedError(f'attn_bias not implemented for flash attn.')
    (batch_size, seqlen) = query.shape[:2]
    if key_padding_mask is None:
        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
    query_padding_mask = key_padding_mask[:, -query.size(1):]
    (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
    (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
    (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
    if multiquery:
        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
        value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
    dropout_p = dropout_p if training else 0.0
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
    return (output, None)

def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    try:
        from flash_attn import flash_attn_triton
    except:
        raise RuntimeError('Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202')
    check_valid_inputs(query, key, value)
    if dropout_p:
        raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
    if needs_weights:
        raise NotImplementedError(f'attn_impl: triton cannot return attn weights.')
    if key_padding_mask is not None:
        warnings.warn('Propagating key_padding_mask to the attention module and applying it within the attention module can cause unnecessary computation/memory usage. Consider integrating into attn_bias once and passing that to each attention module instead.')
        (b_size, s_k) = key_padding_mask.shape[:2]
        if attn_bias is None:
            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
    key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    if multiquery:
        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
    output = attn_output.view(*attn_output.shape[:2], -1)
    return (output, None)

class MultiheadAttention(nn.Module):
    """Multi-head self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        self.softmax_scale = softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
        self.attn_dropout_p = attn_pdrop
        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
        fuse_splits = (d_model, 2 * d_model)
        self.Wqkv._fused = (0, fuse_splits)
        if self.qk_ln:
            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
            self.q_ln = layernorm_class(self.d_model, device=device)
            self.k_ln = layernorm_class(self.d_model, device=device)
        if self.attn_impl == 'flash':
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` it uses more memory. When training larger models this can trigger alloc retries which hurts performance. If encountered, we recommend using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
        elif self.attn_impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available():
                warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or `prefix_lm` we recommend using `attn_impl: flash` otherwise we recommend using `attn_impl: triton`.')
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        (query, key, value) = qkv.chunk(3, dim=2)
        key_padding_mask = attention_mask
        if self.qk_ln:
            dtype = query.dtype
            query = self.q_ln(query).to(dtype)
            key = self.k_ln(key).to(dtype)
        if past_key_value is not None:
            if len(past_key_value) != 0:
                key = torch.cat([past_key_value[0], key], dim=1)
                value = torch.cat([past_key_value[1], value], dim=1)
            past_key_value = (key, value)
        if attn_bias is not None:
            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
        (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights)
        return (self.out_proj(context), attn_weights, past_key_value)

class MultiQueryAttention(nn.Module):
    """Multi-Query self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.softmax_scale = softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.head_dim)
        self.attn_dropout_p = attn_pdrop
        self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
        fuse_splits = (d_model, d_model + self.head_dim)
        self.Wqkv._fused = (0, fuse_splits)
        if self.qk_ln:
            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
            self.q_ln = layernorm_class(d_model, device=device)
            self.k_ln = layernorm_class(self.head_dim, device=device)
        if self.attn_impl == 'flash':
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` it uses more memory. When training larger models this can trigger alloc retries which hurts performance. If encountered, we recommend using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
        elif self.attn_impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available():
                warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or `prefix_lm` we recommend using `attn_impl: flash` otherwise we recommend using `attn_impl: triton`.')
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
        key_padding_mask = attention_mask
        if self.qk_ln:
            dtype = query.dtype
            query = self.q_ln(query).to(dtype)
            key = self.k_ln(key).to(dtype)
        if past_key_value is not None:
            if len(past_key_value) != 0:
                key = torch.cat([past_key_value[0], key], dim=1)
                value = torch.cat([past_key_value[1], value], dim=1)
            past_key_value = (key, value)
        if attn_bias is not None:
            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
        (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
        return (self.out_proj(context), attn_weights, past_key_value)

def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
    if attn_impl == 'flash':
        return None
    elif attn_impl in ['torch', 'triton']:
        if alibi:
            if (prefix_lm or not causal) or use_sequence_id:
                return (1, n_heads, seq_len, seq_len)
            return (1, n_heads, 1, seq_len)
        elif prefix_lm or use_sequence_id:
            return (1, 1, seq_len, seq_len)
        return None
    else:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
    if attn_impl == 'flash':
        return None
    elif attn_impl in ['torch', 'triton']:
        if alibi:
            (device, dtype) = (attn_bias.device, attn_bias.dtype)
            attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
        return attn_bias
    else:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

def gen_slopes(n_heads, alibi_bias_max=8, device=None):
    _n_heads = 2 ** math.ceil(math.log2(n_heads))
    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
    m = m.mul(alibi_bias_max / _n_heads)
    slopes = 1.0 / torch.pow(2, m)
    if _n_heads != n_heads:
        slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
    return slopes.view(1, n_heads, 1, 1)

def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
    if full:
        alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
        alibi_bias = alibi_bias.abs().mul(-1)
    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
    alibi_bias = alibi_bias * slopes
    return alibi_bias.to(dtype=dtype)
ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention}
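
A small shape-level sketch of how the ALiBi helpers above compose (illustrative; assumes the functions are in scope, e.g. imported from this module):

```python
import torch

n_heads, seq_len = 32, 2048
# Allocate the bias buffer with the shape attn_bias_shape() reports for this setting,
# then fill it with the ALiBi slopes; the result broadcasts over (batch, heads, q, k).
shape = attn_bias_shape('torch', n_heads, seq_len, alibi=True, prefix_lm=False, causal=True, use_sequence_id=False)
attn_bias = torch.zeros(shape)   # (1, 32, 1, 2048)
attn_bias = build_attn_bias('torch', attn_bias, n_heads, seq_len, causal=True, alibi=True)
print(attn_bias.shape)           # torch.Size([1, 32, 1, 2048])
```
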
blocks.py
ADDED
@@ -0,0 +1,41 @@
"""GPT Blocks used for the GPT Model."""
from typing import Dict, Optional, Tuple
import torch
import torch.nn as nn
from .attention import ATTN_CLASS_REGISTRY
from .norm import NORM_CLASS_REGISTRY

class MPTMLP(nn.Module):

    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
        super().__init__()
        self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
        self.act = nn.GELU(approximate='none')
        self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
        self.down_proj._is_residual = True

    def forward(self, x):
        return self.down_proj(self.act(self.up_proj(x)))

class MPTBlock(nn.Module):

    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
        del kwargs
        super().__init__()
        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
        self.norm_1 = norm_class(d_model, device=device)
        self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, device=device)
        self.norm_2 = norm_class(d_model, device=device)
        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        a = self.norm_1(x)
        (b, _, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
        x = x + self.resid_attn_dropout(b)
        m = self.norm_2(x)
        n = self.ffn(m)
        x = x + self.resid_ffn_dropout(n)
        return (x, past_key_value)
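
An illustrative CPU-only forward pass through a single `MPTBlock` as defined above (uses the pure-torch attention path and small dimensions; assumes `norm.py` registers a plain `'layernorm'` entry):

```python
import torch

block = MPTBlock(d_model=64, n_heads=4, expansion_ratio=4,
                 attn_config={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0,
                              'attn_impl': 'torch', 'qk_ln': False, 'clip_qkv': None,
                              'softmax_scale': None, 'prefix_lm': False,
                              'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8},
                 norm_type='layernorm')
x = torch.randn(2, 16, 64)        # (batch, seq_len, d_model)
y, past_key_value = block(x)      # pre-norm attention + MLP with residual connections
print(y.shape)                    # torch.Size([2, 16, 64])
```
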
config.json
CHANGED
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "mosaicml/mpt-7b",
   "architectures": [
     "MPTForCausalLM"
   ],
@@ -45,7 +44,7 @@
   "norm_type": "low_precision_layernorm",
   "resid_pdrop": 0,
   "tokenizer_name": "EleutherAI/gpt-neox-20b",
-  "torch_dtype": "
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.28.1",
   "use_cache": false,
   "verbose": 0,
hf_prefixlm_converter.py
ADDED
@@ -0,0 +1,415 @@
"""Converts Huggingface Causal LM to Prefix LM.

Conversion does lightweight surgery on a HuggingFace
Causal LM to convert it to a Prefix LM.

Prefix LMs accept a `bidirectional_mask` input in `forward`
and treat the input prompt as the prefix in `generate`.
"""
import math
import warnings
from types import MethodType
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from transformers.models.bloom.modeling_bloom import BaseModelOutputWithPastAndCrossAttentions, BloomForCausalLM, BloomModel, CausalLMOutputWithCrossAttentions, CrossEntropyLoss
from transformers.models.bloom.modeling_bloom import _expand_mask as _expand_mask_bloom
from transformers.models.bloom.modeling_bloom import _make_causal_mask as _make_causal_mask_bloom
from transformers.models.bloom.modeling_bloom import logging
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
from transformers.models.opt.modeling_opt import OPTForCausalLM
from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_opt
from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt
logger = logging.get_logger(__name__)
_SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)
CAUSAL_GPT_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM]

def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
    """Converts a GPT-style Causal LM to a Prefix LM.

    Supported HuggingFace model classes:
        - `GPT2LMHeadModel`
        - `GPTNeoForCausalLM`
        - `GPTNeoXForCausalLM`
        - `GPTJForCausalLM`

    See `convert_hf_causal_lm_to_prefix_lm` for more details.
    """
    if hasattr(model, '_prefix_lm_converted'):
        return model
    assert isinstance(model, _SUPPORTED_GPT_MODELS)
    assert model.config.add_cross_attention == False, 'Only supports GPT-style decoder-only models'

    def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
        """Helper that gets a list of the model's attention modules.

        Each module has a `bias` buffer used for causal masking. The Prefix LM
        conversion adds logic to dynamically manipulate these biases to support
        Prefix LM attention masking.
        """
        attn_modules = []
        if isinstance(model, GPTNeoXForCausalLM):
            blocks = model.gpt_neox.layers
        else:
            blocks = model.transformer.h
        for block in blocks:
            if isinstance(model, GPTNeoForCausalLM):
                if block.attn.attention_type != 'global':
                    continue
                attn_module = block.attn.attention
            elif isinstance(model, GPTNeoXForCausalLM):
                attn_module = block.attention
            else:
                attn_module = block.attn
            attn_modules.append(attn_module)
        return attn_modules
    setattr(model, '_original_forward', getattr(model, 'forward'))
    setattr(model, '_original_generate', getattr(model, 'generate'))

    def forward(self: CAUSAL_GPT_TYPES, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]]=None, attention_mask: Optional[torch.FloatTensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, token_type_ids: Optional[torch.LongTensor]=None, position_ids: Optional[torch.LongTensor]=None, head_mask: Optional[torch.FloatTensor]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
        """Wraps original forward to enable PrefixLM attention."""

        def call_og_forward():
            if isinstance(self, GPTNeoXForCausalLM):
                return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
            else:
                return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
        if bidirectional_mask is None:
            return call_og_forward()
        assert isinstance(bidirectional_mask, torch.Tensor)
        attn_modules = _get_attn_modules(model)
        (b, s) = bidirectional_mask.shape
        max_length = attn_modules[0].bias.shape[-1]
        if s > max_length:
            raise ValueError(f'bidirectional_mask sequence length (={s}) exceeds the max length allowed by the model ({max_length}).')
        assert s <= max_length
        if s < max_length:
            pad = torch.zeros((int(b), int(max_length - s)), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device)
            bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1)
        bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1)
        for attn_module in attn_modules:
            attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional)
        output = call_og_forward()
        for attn_module in attn_modules:
            attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
        return output

    def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]):
        """Wraps original generate to enable PrefixLM attention."""
        attn_modules = _get_attn_modules(model)
        for attn_module in attn_modules:
            attn_module.bias.data[:] = 1
        output = self._original_generate(*args, **kwargs)
        for attn_module in attn_modules:
            attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
        return output
    setattr(model, 'forward', MethodType(forward, model))
    setattr(model, 'generate', MethodType(generate, model))
    setattr(model, '_prefix_lm_converted', True)
    return model

def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM:
    """Converts a BLOOM Causal LM to a Prefix LM.

    Supported HuggingFace model classes:
        - `BloomForCausalLM`

    See `convert_hf_causal_lm_to_prefix_lm` for more details.
    """
    if hasattr(model, '_prefix_lm_converted'):
        return model
    assert isinstance(model, BloomForCausalLM)
    assert model.config.add_cross_attention == False, 'Only supports BLOOM decoder-only models'

    def _prepare_attn_mask(self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[int, int], past_key_values_length: int) -> torch.BoolTensor:
        combined_attention_mask = None
        device = attention_mask.device
        (_, src_length) = input_shape
        if src_length > 1:
            combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length)
            if bidirectional_mask is not None:
                assert attention_mask.shape == bidirectional_mask.shape
                expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length)
                combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask)
        expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length)
        combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
        return combined_attention_mask

    def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
        num_heads = self.config.n_head
        closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
        base = torch.tensor(2 ** (-2 ** (-(math.log2(closest_power_of_2) - 3))), device=device, dtype=torch.float32)
        powers = torch.arange(1, 1 + closest_power_of_2, device=device, dtype=torch.int32)
        slopes = torch.pow(base, powers)
        if closest_power_of_2 != num_heads:
            extra_base = torch.tensor(2 ** (-2 ** (-(math.log2(2 * closest_power_of_2) - 3))), device=device, dtype=torch.float32)
            num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
            extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32)
            slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
        qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1)
        ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1)
        diffs = qa - ka + key_length - query_length
        diffs = -diffs.abs()
        alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length)
        alibi = alibi.expand(batch_size, -1, -1, -1).reshape(-1, query_length, key_length)
        return alibi.to(dtype)
    KeyValueT = Tuple[torch.Tensor, torch.Tensor]

    def forward(self: BloomModel, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.LongTensor]=None, inputs_embeds: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        if deprecated_arguments.pop('position_ids', False) is not False:
            warnings.warn('`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore passing `position_ids`.', FutureWarning)
        if len(deprecated_arguments) > 0:
            raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
        elif input_ids is not None:
            (batch_size, seq_length) = input_ids.shape
        elif inputs_embeds is not None:
            (batch_size, seq_length, _) = inputs_embeds.shape
        else:
            raise ValueError('You have to specify either input_ids or inputs_embeds')
        if past_key_values is None:
            past_key_values = tuple([None] * len(self.h))
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        hidden_states = self.word_embeddings_layernorm(inputs_embeds)
        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values[0] is not None:
            tmp = past_key_values[0][0]
            past_key_values_length = tmp.shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)
        alibi = self._build_alibi_tensor(batch_size=batch_size, query_length=seq_length, key_length=seq_length_with_past, dtype=hidden_states.dtype, device=hidden_states.device)
        causal_mask = self._prepare_attn_mask(attention_mask, bidirectional_mask, input_shape=(batch_size, seq_length), past_key_values_length=past_key_values_length)
        for (i, (block, layer_past)) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                hst = (hidden_states,)
                all_hidden_states = all_hidden_states + hst
            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning('`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...')
                    use_cache = False

                def create_custom_forward(module):

                    def custom_forward(*inputs):
                        return module(*inputs, use_cache=use_cache, output_attentions=output_attentions)
                    return custom_forward
                outputs = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden_states, alibi, causal_mask, head_mask[i])
            else:
                outputs = block(hidden_states, layer_past=layer_past, attention_mask=causal_mask, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, alibi=alibi)
            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)
            if output_attentions:
                oa = (outputs[2 if use_cache else 1],)
                all_self_attentions = all_self_attentions + oa
        hidden_states = self.ln_f(hidden_states)
        if output_hidden_states:
            hst = (hidden_states,)
            all_hidden_states = all_hidden_states + hst
        if not return_dict:
            return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None))
        return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions)
    setattr(model.transformer, '_prepare_attn_mask', MethodType(_prepare_attn_mask, model.transformer))
    setattr(model.transformer, '_build_alibi_tensor', MethodType(_build_alibi_tensor, model.transformer))
    setattr(model.transformer, 'forward', MethodType(forward, model.transformer))
    KeyValueT = Tuple[torch.Tensor, torch.Tensor]

    def forward(self: BloomForCausalLM, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.Tensor]=None, inputs_embeds: Optional[torch.Tensor]=None, labels: Optional[torch.Tensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        """Replacement forward method for BloomCausalLM."""
        if deprecated_arguments.pop('position_ids', False) is not False:
            warnings.warn('`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore passing `position_ids`.', FutureWarning)
        if len(deprecated_arguments) > 0:
            raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        transformer_outputs = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask, bidirectional_mask=bidirectional_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)
        loss = None
        if labels is not None:
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            (batch_size, seq_length, vocab_size) = shift_logits.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length))
        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return (loss,) + output if loss is not None else output
        return CausalLMOutputWithCrossAttentions(loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions)

    def prepare_inputs_for_generation(self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, **kwargs) -> dict:
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            bidirectional_mask = None
            if past[0][0].shape[0] == input_ids.shape[0]:
                past = self._convert_to_bloom_cache(past)
        else:
            bidirectional_mask = torch.ones_like(input_ids)
        return {'input_ids': input_ids, 'past_key_values': past, 'use_cache': True, 'attention_mask': attention_mask, 'bidirectional_mask': bidirectional_mask}
    setattr(model, 'forward', MethodType(forward, model))
    setattr(model, 'prepare_inputs_for_generation', MethodType(prepare_inputs_for_generation, model))
    setattr(model, '_prefix_lm_converted', True)
    return model

def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM:
    """Converts an OPT Causal LM to a Prefix LM.

    Supported HuggingFace model classes:
        - `OPTForCausalLM`

    See `convert_hf_causal_lm_to_prefix_lm` for more details.
    """
    if hasattr(model, '_prefix_lm_converted'):
        return model
    assert isinstance(model, OPTForCausalLM)
    assert model.config.add_cross_attention == False, 'Only supports OPT decoder-only models'
    setattr(model, '_original_forward', getattr(model, 'forward'))
    setattr(model, '_original_generate', getattr(model, 'generate'))
    model.model.decoder.bidirectional_mask = None

    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        combined_attention_mask = None
        if input_shape[-1] > 1:
            if self.bidirectional_mask == 'g':
                (bsz, src_length) = input_shape
                combined_attention_mask = torch.zeros((bsz, 1, src_length, src_length + past_key_values_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
            else:
                combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to(inputs_embeds.device)
|
293 |
+
if self.bidirectional_mask is not None:
|
294 |
+
assert attention_mask.shape == self.bidirectional_mask.shape
|
295 |
+
expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
|
296 |
+
combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask)
|
297 |
+
if attention_mask is not None:
|
298 |
+
expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
|
299 |
+
combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
|
300 |
+
return combined_attention_mask
|
301 |
+
setattr(model.model.decoder, '_prepare_decoder_attention_mask', MethodType(_prepare_decoder_attention_mask, model.model.decoder))
|
302 |
+
|
303 |
+
def forward(self: OPTForCausalLM, input_ids: Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.ByteTensor]=None, head_mask: Optional[torch.Tensor]=None, past_key_values: Optional[List[torch.FloatTensor]]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
|
304 |
+
|
305 |
+
def call_og_forward():
|
306 |
+
return self._original_forward(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
|
307 |
+
if bidirectional_mask is None:
|
308 |
+
return call_og_forward()
|
309 |
+
self.model.decoder.bidirectional_mask = bidirectional_mask
|
310 |
+
try:
|
311 |
+
outputs = call_og_forward()
|
312 |
+
except:
|
313 |
+
self.model.decoder.bidirectional_mask = None
|
314 |
+
raise
|
315 |
+
self.model.decoder.bidirectional_mask = None
|
316 |
+
return outputs
|
317 |
+
|
318 |
+
def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]):
|
319 |
+
"""Wraps original generate to enable PrefixLM-style attention."""
|
320 |
+
self.model.decoder.bidirectional_mask = 'g'
|
321 |
+
try:
|
322 |
+
output = self._original_generate(*args, **kwargs)
|
323 |
+
except:
|
324 |
+
self.model.decoder.bidirectional_mask = None
|
325 |
+
raise
|
326 |
+
self.model.decoder.bidirectional_mask = None
|
327 |
+
return output
|
328 |
+
setattr(model, 'forward', MethodType(forward, model))
|
329 |
+
setattr(model, 'generate', MethodType(generate, model))
|
330 |
+
setattr(model, '_prefix_lm_converted', True)
|
331 |
+
return model
|
332 |
+
_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM)
CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM]

def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
    """Converts a HuggingFace Causal LM to a Prefix LM.

    Supported HuggingFace model classes:
        - `GPT2LMHeadModel`
        - `GPTNeoForCausalLM`
        - `GPTNeoXForCausalLM`
        - `GPTJForCausalLM`
        - `BloomForCausalLM`
        - `OPTForCausalLM`

    Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
    `generate` method and/or select underlying methods depending on the model class.

    These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".

    Notes on training:
        To actually train the converted model as a Prefix LM, training batches will need to indicate
        the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.

        **This is not a standard input and requires custom layers either within or after your dataloader.**

        In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`
        such that `batch['labels'][batch['bidirectional_mask'] == 1] == -100`.
        That is, the prefix portion of the sequence should not generate any loss. Loss should only be
        generated by the target portion of the sequence.

    Notes on `GPTNeoForCausalLM`:
        To simplify the implementation, "global" and "local" attention layers are handled differently.
        For "global" layers, we handle conversion as described above. For "local" layers, which use a
        causal attention mask within a restricted local window, we do not alter the masking.

    Notes on `forward` method conversion:
        After conversion, the `forward` method will handle a new input, `bidirectional_mask`,
        which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions
        belonging to the prefix (prefix tokens can attend to one another bidirectionally), and
        0 indicates token positions belonging to the target.

        The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing
        causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset
        the causal masks before returning the result.

    Notes on `generate` method conversion:
        After conversion, the `generate` method will have the same signature but will internally
        convert all causal masks to be purely bidirectional, call the original `generate` method, and
        (where appropriate) reset the causal masks before returning the result.

        This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token
        "prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates
        each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one
        another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and
        previously-generated tokens (also as expected in a Prefix LM).

    To preserve the API, the original methods are renamed to `_original_forward` and
    `_original_generate`, and replaced with new `forward` and `generate` methods that wrap
    them, respectively, although implementation details vary by model class.
    """
    if isinstance(model, _SUPPORTED_GPT_MODELS):
        return _convert_gpt_causal_lm_to_prefix_lm(model)
    elif isinstance(model, BloomForCausalLM):
        return _convert_bloom_causal_lm_to_prefix_lm(model)
    elif isinstance(model, OPTForCausalLM):
        return _convert_opt_causal_lm_to_prefix_lm(model)
    else:
        raise TypeError(f'Cannot convert model to Prefix LM. ' + f'Model does not belong to set of supported HF models:' + f'\n{_SUPPORTED_HF_MODELS}')

def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
    """Attempts to add bidirectional_mask to batch if missing.

    Raises:
        KeyError if bidirectional_mask is missing and can't be inferred
    """
    if 'bidirectional_mask' not in batch:
        if batch.get('mode', None) == 'icl_task':
            batch['bidirectional_mask'] = batch['attention_mask'].clone()
            for (i, continuation_indices) in enumerate(batch['continuation_indices']):
                batch['bidirectional_mask'][i, continuation_indices] = 0
        elif 'labels' in batch and 'attention_mask' in batch:
            batch['bidirectional_mask'] = torch.logical_and(torch.eq(batch['attention_mask'], 1), torch.eq(batch['labels'], -100)).type_as(batch['attention_mask'])
        else:
            raise KeyError('No bidirectional_mask in batch and not sure how to construct one.')
|
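For orientation, here is a minimal usage sketch of the two public helpers above. The checkpoint id (`bigscience/bloom-560m`), prompt, and prefix length are illustrative assumptions, not part of this file; the `bidirectional_mask` argument and the `labels == -100` convention come from the docstrings above.

```python
import torch
from transformers import AutoTokenizer, BloomForCausalLM

# Convert a stock causal LM into a Prefix LM; the patched forward accepts `bidirectional_mask`.
model = convert_hf_causal_lm_to_prefix_lm(BloomForCausalLM.from_pretrained('bigscience/bloom-560m'))
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m')

enc = tokenizer('Translate to French: cheese =>', return_tensors='pt')
labels = enc['input_ids'].clone()
labels[:, :5] = -100  # treat the first five tokens as the prefix, which should produce no loss

batch = {'input_ids': enc['input_ids'], 'attention_mask': enc['attention_mask'], 'labels': labels}
add_bidirectional_mask_if_missing(batch)  # infers the mask from attention_mask and the -100 labels

out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
            bidirectional_mask=batch['bidirectional_mask'], labels=batch['labels'])
print(out.loss)
```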
meta_init_context.py
ADDED
@@ -0,0 +1,94 @@
from contextlib import contextmanager
import torch
import torch.nn as nn

@contextmanager
def init_empty_weights(include_buffers: bool=False):
    """Meta initialization context manager.

    A context manager under which models are initialized with all parameters
    on the meta device, therefore creating an empty model. Useful when just
    initializing the model would blow the available RAM.

    Args:
        include_buffers (`bool`, *optional*, defaults to `False`): Whether or
            not to also put all buffers on the meta device while initializing.

    Example:
    ```python
    import torch.nn as nn

    # Initialize a model with 100 billion parameters in no time and without using any RAM.
    with init_empty_weights():
        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
    ```

    <Tip warning={true}>

    Any model created under this context manager has no weights. As such you can't do something like
    `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].

    </Tip>
    """
    with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f:
        yield f

@contextmanager
def init_on_device(device: torch.device, include_buffers: bool=False):
    """Device initialization context manager.

    A context manager under which models are initialized with all parameters
    on the specified device.

    Args:
        device (`torch.device`): Device to initialize all parameters on.
        include_buffers (`bool`, *optional*, defaults to `False`): Whether or
            not to also put all buffers on the meta device while initializing.

    Example:
    ```python
    import torch.nn as nn

    with init_on_device(device=torch.device("cuda")):
        tst = nn.Linear(100, 100)  # on `cuda` device
    ```
    """
    old_register_parameter = nn.Module.register_parameter
    if include_buffers:
        old_register_buffer = nn.Module.register_buffer

    def register_empty_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is not None:
            param_cls = type(module._parameters[name])
            kwargs = module._parameters[name].__dict__
            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)

    def register_empty_buffer(module, name, buffer):
        old_register_buffer(module, name, buffer)
        if buffer is not None:
            module._buffers[name] = module._buffers[name].to(device)
    if include_buffers:
        tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']}
    else:
        tensor_constructors_to_patch = {}

    def patch_tensor_constructor(fn):

        def wrapper(*args, **kwargs):
            kwargs['device'] = device
            return fn(*args, **kwargs)
        return wrapper
    try:
        nn.Module.register_parameter = register_empty_parameter
        if include_buffers:
            nn.Module.register_buffer = register_empty_buffer
        for torch_function_name in tensor_constructors_to_patch.keys():
            setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
        yield
    finally:
        nn.Module.register_parameter = old_register_parameter
        if include_buffers:
            nn.Module.register_buffer = old_register_buffer
        for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
            setattr(torch, torch_function_name, old_torch_function)
|
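A short illustrative sketch of how these context managers behave (nothing below beyond the two helpers is defined in this file):

```python
import torch
import torch.nn as nn

# Parameters are created on the meta device, so no real memory is allocated.
with init_empty_weights():
    skeleton = nn.Linear(4096, 4096)
print(skeleton.weight.device)  # meta

# Parameters are created directly on the requested device instead.
with init_on_device(torch.device('cpu')):
    layer = nn.Linear(16, 16)
print(layer.weight.device)  # cpu
```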
modeling_mpt.py
ADDED
@@ -0,0 +1,290 @@
1 |
+
"""A simple, flexible implementation of a GPT model.
|
2 |
+
|
3 |
+
Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
|
4 |
+
"""
|
5 |
+
import math
|
6 |
+
import warnings
|
7 |
+
from typing import List, Optional, Tuple, Union
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
|
12 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
13 |
+
from .attention import attn_bias_shape, build_attn_bias
|
14 |
+
from .blocks import MPTBlock
|
15 |
+
from .norm import NORM_CLASS_REGISTRY
|
16 |
+
from .configuration_mpt import MPTConfig
|
17 |
+
from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
|
18 |
+
from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
|
19 |
+
from .meta_init_context import init_empty_weights
|
20 |
+
from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
|
21 |
+
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
|
22 |
+
|
23 |
+
class MPTPreTrainedModel(PreTrainedModel):
|
24 |
+
config_class = MPTConfig
|
25 |
+
base_model_prefix = 'model'
|
26 |
+
|
27 |
+
class MPTModel(MPTPreTrainedModel):
|
28 |
+
|
29 |
+
def __init__(self, config: MPTConfig):
|
30 |
+
config._validate_config()
|
31 |
+
super().__init__(config)
|
32 |
+
self.attn_impl = config.attn_config['attn_impl']
|
33 |
+
self.prefix_lm = config.attn_config['prefix_lm']
|
34 |
+
self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
|
35 |
+
self.alibi = config.attn_config['alibi']
|
36 |
+
self.alibi_bias_max = config.attn_config['alibi_bias_max']
|
37 |
+
if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
|
38 |
+
norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
|
39 |
+
raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
|
40 |
+
norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
|
41 |
+
self.embedding_fraction = config.embedding_fraction
|
42 |
+
self.wte = nn.Embedding(config.vocab_size, config.d_model, device=config.init_device)
|
43 |
+
if not self.alibi:
|
44 |
+
self.wpe = nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
|
45 |
+
self.emb_drop = nn.Dropout(config.emb_pdrop)
|
46 |
+
self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
|
47 |
+
self.norm_f = norm_class(config.d_model, device=config.init_device)
|
48 |
+
if config.init_device != 'meta':
|
49 |
+
self.apply(self.param_init_fn)
|
50 |
+
self.is_causal = not self.prefix_lm
|
51 |
+
self._attn_bias_initialized = False
|
52 |
+
self.attn_bias = None
|
53 |
+
self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
|
54 |
+
if config.no_bias:
|
55 |
+
for module in self.modules():
|
56 |
+
if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
|
57 |
+
if config.verbose:
|
58 |
+
warnings.warn(f'Removing bias ({module.bias}) from {module}.')
|
59 |
+
module.register_parameter('bias', None)
|
60 |
+
if config.verbose and config.verbose > 2:
|
61 |
+
print(self)
|
62 |
+
if 'verbose' not in self.config.init_config:
|
63 |
+
self.config.init_config['verbose'] = self.config.verbose
|
64 |
+
if self.config.init_config['verbose'] > 1:
|
65 |
+
init_fn_name = self.config.init_config['name']
|
66 |
+
warnings.warn(f'Using {init_fn_name} initialization.')
|
67 |
+
|
68 |
+
def get_input_embeddings(self):
|
69 |
+
return self.wte
|
70 |
+
|
71 |
+
def set_input_embeddings(self, value):
|
72 |
+
self.wte = value
|
73 |
+
|
74 |
+
@torch.no_grad()
|
75 |
+
def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None):
|
76 |
+
if not self._attn_bias_initialized:
|
77 |
+
if self.attn_bias_shape:
|
78 |
+
self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
|
79 |
+
self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
|
80 |
+
self._attn_bias_initialized = True
|
81 |
+
if self.attn_impl == 'flash':
|
82 |
+
return (self.attn_bias, attention_mask)
|
83 |
+
if self.attn_bias is not None:
|
84 |
+
self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
|
85 |
+
attn_bias = self.attn_bias
|
86 |
+
if self.prefix_lm:
|
87 |
+
assert isinstance(attn_bias, torch.Tensor)
|
88 |
+
assert isinstance(prefix_mask, torch.Tensor)
|
89 |
+
attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
|
90 |
+
if self.attn_uses_sequence_id and sequence_id is not None:
|
91 |
+
assert isinstance(attn_bias, torch.Tensor)
|
92 |
+
attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
|
93 |
+
if attention_mask is not None:
|
94 |
+
s_k = attention_mask.shape[-1]
|
95 |
+
if attn_bias is None:
|
96 |
+
attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
|
97 |
+
else:
|
98 |
+
attn_bias = attn_bias[:, :, :, -s_k:]
|
99 |
+
if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
|
100 |
+
raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
|
101 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
102 |
+
attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
|
103 |
+
return (attn_bias, None)
|
104 |
+
|
105 |
+
def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
|
106 |
+
(s_k, s_q) = attn_bias.shape[-2:]
|
107 |
+
if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
|
108 |
+
raise ValueError('attn_bias does not match the expected shape. ' + f'The last two dimensions should both be {self.config.max_seq_len} ' + f'but are {s_k} and {s_q}.')
|
109 |
+
seq_len = prefix_mask.shape[-1]
|
110 |
+
if seq_len > self.config.max_seq_len:
|
111 |
+
raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
|
112 |
+
attn_bias = attn_bias[..., :seq_len, :seq_len]
|
113 |
+
causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
|
114 |
+
prefix = prefix_mask.view(-1, 1, 1, seq_len)
|
115 |
+
cannot_attend = ~torch.logical_or(causal, prefix.bool())
|
116 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
117 |
+
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
|
118 |
+
return attn_bias
|
119 |
+
|
120 |
+
def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
|
121 |
+
seq_len = sequence_id.shape[-1]
|
122 |
+
if seq_len > self.config.max_seq_len:
|
123 |
+
raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
|
124 |
+
attn_bias = attn_bias[..., :seq_len, :seq_len]
|
125 |
+
cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
|
126 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
127 |
+
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
|
128 |
+
return attn_bias
|
129 |
+
|
130 |
+
def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
|
131 |
+
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
132 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
133 |
+
if attention_mask is not None:
|
134 |
+
attention_mask = attention_mask.bool()
|
135 |
+
if prefix_mask is not None:
|
136 |
+
prefix_mask = prefix_mask.bool()
|
137 |
+
if not return_dict:
|
138 |
+
raise NotImplementedError('return_dict False is not implemented yet for MPT')
|
139 |
+
if output_attentions:
|
140 |
+
raise NotImplementedError('output_attentions is not implemented yet for MPT')
|
141 |
+
if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
|
142 |
+
raise NotImplementedError('MPT does not support training with left padding.')
|
143 |
+
if self.prefix_lm and prefix_mask is None:
|
144 |
+
raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
|
145 |
+
if self.training:
|
146 |
+
if self.attn_uses_sequence_id and sequence_id is None:
|
147 |
+
raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
|
148 |
+
elif self.attn_uses_sequence_id is False and sequence_id is not None:
|
149 |
+
warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
|
150 |
+
S = input_ids.size(1)
|
151 |
+
assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
|
152 |
+
tok_emb = self.wte(input_ids)
|
153 |
+
if self.alibi:
|
154 |
+
x = tok_emb
|
155 |
+
else:
|
156 |
+
past_position = 0
|
157 |
+
if past_key_values is not None:
|
158 |
+
if len(past_key_values) != self.config.n_layers:
|
159 |
+
raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
|
160 |
+
past_position = past_key_values[0][0].size(1)
|
161 |
+
if S + past_position > self.config.max_seq_len:
|
162 |
+
raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S}, this model only supports total sequence length <= {self.config.max_seq_len}.')
|
163 |
+
pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
|
164 |
+
if attention_mask is not None:
|
165 |
+
pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
|
166 |
+
pos_emb = self.wpe(pos)
|
167 |
+
x = tok_emb + pos_emb
|
168 |
+
if self.embedding_fraction == 1:
|
169 |
+
x = self.emb_drop(x)
|
170 |
+
else:
|
171 |
+
x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
|
172 |
+
assert isinstance(self.emb_drop, nn.Module)
|
173 |
+
x = self.emb_drop(x_shrunk)
|
174 |
+
(attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
|
175 |
+
if use_cache and past_key_values is None:
|
176 |
+
past_key_values = [() for _ in range(self.config.n_layers)]
|
177 |
+
all_hidden_states = () if output_hidden_states else None
|
178 |
+
for (b_idx, block) in enumerate(self.blocks):
|
179 |
+
if output_hidden_states:
|
180 |
+
assert all_hidden_states is not None
|
181 |
+
all_hidden_states = all_hidden_states + (x,)
|
182 |
+
past_key_value = past_key_values[b_idx] if past_key_values is not None else None
|
183 |
+
(x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
|
184 |
+
if past_key_values is not None:
|
185 |
+
past_key_values[b_idx] = past_key_value
|
186 |
+
x = self.norm_f(x)
|
187 |
+
return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)
|
188 |
+
|
189 |
+
def param_init_fn(self, module):
|
190 |
+
init_fn_name = self.config.init_config['name']
|
191 |
+
MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
|
192 |
+
|
193 |
+
def fsdp_wrap_fn(self, module):
|
194 |
+
return isinstance(module, MPTBlock)
|
195 |
+
|
196 |
+
def activation_checkpointing_fn(self, module):
|
197 |
+
return isinstance(module, MPTBlock)
|
198 |
+
|
199 |
+
class MPTForCausalLM(MPTPreTrainedModel):
|
200 |
+
|
201 |
+
def __init__(self, config: MPTConfig):
|
202 |
+
super().__init__(config)
|
203 |
+
if not config.tie_word_embeddings:
|
204 |
+
raise ValueError('MPTForCausalLM only supports tied word embeddings')
|
205 |
+
self.transformer = MPTModel(config)
|
206 |
+
self.logit_scale = None
|
207 |
+
if config.logit_scale is not None:
|
208 |
+
logit_scale = config.logit_scale
|
209 |
+
if isinstance(logit_scale, str):
|
210 |
+
if logit_scale == 'inv_sqrt_d_model':
|
211 |
+
logit_scale = 1 / math.sqrt(config.d_model)
|
212 |
+
else:
|
213 |
+
raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
|
214 |
+
self.logit_scale = logit_scale
|
215 |
+
|
216 |
+
def get_input_embeddings(self):
|
217 |
+
return self.transformer.wte
|
218 |
+
|
219 |
+
def set_input_embeddings(self, value):
|
220 |
+
self.transformer.wte = value
|
221 |
+
|
222 |
+
def get_output_embeddings(self):
|
223 |
+
return self.transformer.wte
|
224 |
+
|
225 |
+
def set_output_embeddings(self, new_embeddings):
|
226 |
+
self.transformer.wte = new_embeddings
|
227 |
+
|
228 |
+
def set_decoder(self, decoder):
|
229 |
+
self.transformer = decoder
|
230 |
+
|
231 |
+
def get_decoder(self):
|
232 |
+
return self.transformer
|
233 |
+
|
234 |
+
def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
|
235 |
+
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
236 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
237 |
+
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
|
238 |
+
logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
|
239 |
+
if self.logit_scale is not None:
|
240 |
+
if self.logit_scale == 0:
|
241 |
+
warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
|
242 |
+
logits *= self.logit_scale
|
243 |
+
loss = None
|
244 |
+
if labels is not None:
|
245 |
+
labels = torch.roll(labels, shifts=-1)
|
246 |
+
labels[:, -1] = -100
|
247 |
+
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
|
248 |
+
return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
|
249 |
+
|
250 |
+
def param_init_fn(self, module):
|
251 |
+
init_fn_name = self.config.init_config['name']
|
252 |
+
MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
|
253 |
+
|
254 |
+
def fsdp_wrap_fn(self, module):
|
255 |
+
return isinstance(module, MPTBlock)
|
256 |
+
|
257 |
+
def activation_checkpointing_fn(self, module):
|
258 |
+
return isinstance(module, MPTBlock)
|
259 |
+
|
260 |
+
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
|
261 |
+
if inputs_embeds is not None:
|
262 |
+
raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
|
263 |
+
attention_mask = kwargs['attention_mask'].bool()
|
264 |
+
if attention_mask[:, -1].sum() != attention_mask.shape[0]:
|
265 |
+
raise NotImplementedError('MPT does not support generation with right padding.')
|
266 |
+
if self.transformer.attn_uses_sequence_id and self.training:
|
267 |
+
sequence_id = torch.zeros_like(input_ids[:1])
|
268 |
+
else:
|
269 |
+
sequence_id = None
|
270 |
+
if past_key_values is not None:
|
271 |
+
input_ids = input_ids[:, -1].unsqueeze(-1)
|
272 |
+
if self.transformer.prefix_lm:
|
273 |
+
prefix_mask = torch.ones_like(attention_mask)
|
274 |
+
if kwargs.get('use_cache') == False:
|
275 |
+
raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
|
276 |
+
else:
|
277 |
+
prefix_mask = None
|
278 |
+
return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)}
|
279 |
+
|
280 |
+
@staticmethod
|
281 |
+
def _reorder_cache(past_key_values, beam_idx):
|
282 |
+
"""Used by HuggingFace generate when using beam search with kv-caching.
|
283 |
+
|
284 |
+
See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
|
285 |
+
for an example in transformers.
|
286 |
+
"""
|
287 |
+
reordered_past = []
|
288 |
+
for layer_past in past_key_values:
|
289 |
+
reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))]
|
290 |
+
return reordered_past
|
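A hedged end-to-end sketch of loading this model class through `transformers` follows. The repo id, tokenizer pairing, and prompt are assumptions for illustration, not taken from this file; the generation path does go through `MPTForCausalLM.prepare_inputs_for_generation` above.

```python
import torch
import transformers

model = transformers.AutoModelForCausalLM.from_pretrained(
    'mosaicml/mpt-7b', trust_remote_code=True, torch_dtype=torch.bfloat16)
tokenizer = transformers.AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

inputs = tokenizer('MosaicML is', return_tensors='pt')
with torch.no_grad():
    # generate() supplies attention_mask to prepare_inputs_for_generation; note that the
    # model rejects right-padded batches, so pad prompts on the left when batching.
    out = model.generate(**inputs, max_new_tokens=32, use_cache=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```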
norm.py
ADDED
@@ -0,0 +1,56 @@
import torch

def _cast_if_autocast_enabled(tensor):
    if torch.is_autocast_enabled():
        if tensor.device.type == 'cuda':
            dtype = torch.get_autocast_gpu_dtype()
        elif tensor.device.type == 'cpu':
            dtype = torch.get_autocast_cpu_dtype()
        else:
            raise NotImplementedError()
        return tensor.to(dtype=dtype)
    return tensor

class LPLayerNorm(torch.nn.LayerNorm):

    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
        super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)

    def forward(self, x):
        module_device = x.device
        downcast_x = _cast_if_autocast_enabled(x)
        downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
        downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
        with torch.autocast(enabled=False, device_type=module_device.type):
            return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)

def rms_norm(x, weight=None, eps=1e-05):
    # Scale by the reciprocal RMS (multiply by rsqrt); dividing by rsqrt would rescale in the wrong direction.
    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    if weight is not None:
        return output * weight
    return output

class RMSNorm(torch.nn.Module):

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        super().__init__()
        self.eps = eps
        if weight:
            self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))
        else:
            self.register_parameter('weight', None)

    def forward(self, x):
        return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)

class LPRMSNorm(RMSNorm):

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)

    def forward(self, x):
        downcast_x = _cast_if_autocast_enabled(x)
        downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
        with torch.autocast(enabled=False, device_type=x.device.type):
            return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
|
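For context, a small sketch of how the registry is consumed: `MPTModel.__init__` looks the configured `norm_type` up in `NORM_CLASS_REGISTRY` and instantiates the class it finds. The shapes below are arbitrary examples.

```python
import torch

norm_class = NORM_CLASS_REGISTRY['rmsnorm']   # selected via config.norm_type in modeling_mpt.py
norm = norm_class(512)
y = norm(torch.randn(2, 8, 512))              # same shape out, normalized over the last dimension
print(y.shape)
```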
param_init_fns.py
ADDED
@@ -0,0 +1,181 @@
1 |
+
import math
|
2 |
+
import warnings
|
3 |
+
from collections.abc import Sequence
|
4 |
+
from functools import partial
|
5 |
+
from typing import Optional, Tuple, Union
|
6 |
+
import torch
|
7 |
+
from torch import nn
|
8 |
+
from .norm import NORM_CLASS_REGISTRY
|
9 |
+
|
10 |
+
def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
|
11 |
+
del kwargs
|
12 |
+
if verbose > 1:
|
13 |
+
warnings.warn(f"Initializing network using module's reset_parameters attribute")
|
14 |
+
if hasattr(module, 'reset_parameters'):
|
15 |
+
module.reset_parameters()
|
16 |
+
|
17 |
+
def fused_init_helper_(module: nn.Module, init_fn_):
|
18 |
+
_fused = getattr(module, '_fused', None)
|
19 |
+
if _fused is None:
|
20 |
+
raise RuntimeError(f'Internal logic error')
|
21 |
+
(dim, splits) = _fused
|
22 |
+
splits = (0, *splits, module.weight.size(dim))
|
23 |
+
for (s, e) in zip(splits[:-1], splits[1:]):
|
24 |
+
slice_indices = [slice(None)] * module.weight.ndim
|
25 |
+
slice_indices[dim] = slice(s, e)
|
26 |
+
init_fn_(module.weight[slice_indices])
|
27 |
+
|
28 |
+
def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
|
29 |
+
del kwargs
|
30 |
+
if verbose > 1:
|
31 |
+
warnings.warn(f'If model has bias parameters they are initialized to 0.')
|
32 |
+
init_div_is_residual = init_div_is_residual
|
33 |
+
if init_div_is_residual is False:
|
34 |
+
div_is_residual = 1.0
|
35 |
+
elif init_div_is_residual is True:
|
36 |
+
div_is_residual = math.sqrt(2 * n_layers)
|
37 |
+
elif isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int):
|
38 |
+
div_is_residual = init_div_is_residual
|
39 |
+
elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
|
40 |
+
div_is_residual = float(init_div_is_residual)
|
41 |
+
else:
|
42 |
+
div_is_residual = 1.0
|
43 |
+
raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
|
44 |
+
if init_div_is_residual is not False:
|
45 |
+
if verbose > 1:
|
46 |
+
warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
|
47 |
+
if isinstance(module, nn.Linear):
|
48 |
+
if hasattr(module, '_fused'):
|
49 |
+
fused_init_helper_(module, init_fn_)
|
50 |
+
else:
|
51 |
+
init_fn_(module.weight)
|
52 |
+
if module.bias is not None:
|
53 |
+
torch.nn.init.zeros_(module.bias)
|
54 |
+
if init_div_is_residual is not False and getattr(module, '_is_residual', False):
|
55 |
+
with torch.no_grad():
|
56 |
+
module.weight.div_(div_is_residual)
|
57 |
+
elif isinstance(module, nn.Embedding):
|
58 |
+
if emb_init_std is not None:
|
59 |
+
std = emb_init_std
|
60 |
+
if std == 0:
|
61 |
+
warnings.warn(f'Embedding layer initialized to 0.')
|
62 |
+
emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
|
63 |
+
if verbose > 1:
|
64 |
+
warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
|
65 |
+
elif emb_init_uniform_lim is not None:
|
66 |
+
lim = emb_init_uniform_lim
|
67 |
+
if isinstance(lim, Sequence):
|
68 |
+
if len(lim) > 2:
|
69 |
+
raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
|
70 |
+
if lim[0] == lim[1]:
|
71 |
+
warnings.warn(f'Embedding layer initialized to {lim[0]}.')
|
72 |
+
else:
|
73 |
+
if lim == 0:
|
74 |
+
warnings.warn(f'Embedding layer initialized to 0.')
|
75 |
+
lim = [-lim, lim]
|
76 |
+
(a, b) = lim
|
77 |
+
emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
|
78 |
+
if verbose > 1:
|
79 |
+
warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
|
80 |
+
else:
|
81 |
+
emb_init_fn_ = init_fn_
|
82 |
+
emb_init_fn_(module.weight)
|
83 |
+
elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
|
84 |
+
if verbose > 1:
|
85 |
+
warnings.warn(f'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.')
|
86 |
+
if hasattr(module, 'weight') and module.weight is not None:
|
87 |
+
torch.nn.init.ones_(module.weight)
|
88 |
+
if hasattr(module, 'bias') and module.bias is not None:
|
89 |
+
torch.nn.init.zeros_(module.bias)
|
90 |
+
elif isinstance(module, nn.MultiheadAttention):
|
91 |
+
if module._qkv_same_embed_dim:
|
92 |
+
assert module.in_proj_weight is not None
|
93 |
+
assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
|
94 |
+
assert d_model is not None
|
95 |
+
_d = d_model
|
96 |
+
splits = (0, _d, 2 * _d, 3 * _d)
|
97 |
+
for (s, e) in zip(splits[:-1], splits[1:]):
|
98 |
+
init_fn_(module.in_proj_weight[s:e])
|
99 |
+
else:
|
100 |
+
assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
|
101 |
+
assert module.in_proj_weight is None
|
102 |
+
init_fn_(module.q_proj_weight)
|
103 |
+
init_fn_(module.k_proj_weight)
|
104 |
+
init_fn_(module.v_proj_weight)
|
105 |
+
if module.in_proj_bias is not None:
|
106 |
+
torch.nn.init.zeros_(module.in_proj_bias)
|
107 |
+
if module.bias_k is not None:
|
108 |
+
torch.nn.init.zeros_(module.bias_k)
|
109 |
+
if module.bias_v is not None:
|
110 |
+
torch.nn.init.zeros_(module.bias_v)
|
111 |
+
init_fn_(module.out_proj.weight)
|
112 |
+
if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
|
113 |
+
with torch.no_grad():
|
114 |
+
module.out_proj.weight.div_(div_is_residual)
|
115 |
+
if module.out_proj.bias is not None:
|
116 |
+
torch.nn.init.zeros_(module.out_proj.bias)
|
117 |
+
else:
|
118 |
+
for _ in module.parameters(recurse=False):
|
119 |
+
raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
|
120 |
+
|
121 |
+
def _normal_init_(std, mean=0.0):
|
122 |
+
return partial(torch.nn.init.normal_, mean=mean, std=std)
|
123 |
+
|
124 |
+
def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
|
125 |
+
del kwargs
|
126 |
+
init_fn_ = _normal_init_(std=std)
|
127 |
+
if verbose > 1:
|
128 |
+
warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
|
129 |
+
generic_param_init_fn_(module=module, init_fn_=init_fn_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
130 |
+
|
131 |
+
def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
|
132 |
+
del kwargs
|
133 |
+
if init_std is None:
|
134 |
+
raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
|
135 |
+
_normal_param_init_fn_(module=module, std=init_std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
136 |
+
|
137 |
+
def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
|
138 |
+
del kwargs
|
139 |
+
std = math.sqrt(2 / (5 * d_model))
|
140 |
+
_normal_param_init_fn_(module=module, std=std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
141 |
+
|
142 |
+
def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
|
143 |
+
"""From section 2.3.1 of GPT-NeoX-20B:
|
144 |
+
|
145 |
+
An Open-Source Autoregressive Language Model — Black et al. (2022)
|
146 |
+
see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
|
147 |
+
and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
|
148 |
+
"""
|
149 |
+
del kwargs
|
150 |
+
residual_div = n_layers / math.sqrt(10)
|
151 |
+
if verbose > 1:
|
152 |
+
warnings.warn(f'setting init_div_is_residual to {residual_div}')
|
153 |
+
small_param_init_fn_(module=module, d_model=d_model, n_layers=n_layers, init_div_is_residual=residual_div, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
154 |
+
|
155 |
+
def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
|
156 |
+
del kwargs
|
157 |
+
if verbose > 1:
|
158 |
+
warnings.warn(f'Using nn.init.kaiming_uniform_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
|
159 |
+
kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
|
160 |
+
generic_param_init_fn_(module=module, init_fn_=kaiming_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
161 |
+
|
162 |
+
def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
|
163 |
+
del kwargs
|
164 |
+
if verbose > 1:
|
165 |
+
warnings.warn(f'Using nn.init.kaiming_normal_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
|
166 |
+
kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
|
167 |
+
generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
168 |
+
|
169 |
+
def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
|
170 |
+
del kwargs
|
171 |
+
xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
|
172 |
+
if verbose > 1:
|
173 |
+
warnings.warn(f'Using torch.nn.init.xavier_uniform_ init fn with parameters: ' + f'gain={init_gain}')
|
174 |
+
generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
175 |
+
|
176 |
+
def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
|
177 |
+
xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
|
178 |
+
if verbose > 1:
|
179 |
+
warnings.warn(f'Using torch.nn.init.xavier_normal_ init fn with parameters: ' + f'gain={init_gain}')
|
180 |
+
generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
|
181 |
+
MODEL_INIT_REGISTRY = {'default_': torch_default_param_init_fn_, 'baseline_': baseline_param_init_fn_, 'kaiming_uniform_': kaiming_uniform_param_init_fn_, 'kaiming_normal_': kaiming_normal_param_init_fn_, 'neox_init_': neox_param_init_fn_, 'small_init_': small_param_init_fn_, 'xavier_uniform_': xavier_uniform_param_init_fn_, 'xavier_normal_': xavier_normal_param_init_fn_}
|
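A minimal sketch of dispatching through the registry, mirroring how `MPTModel.param_init_fn` uses it (the layer sizes and config values below are invented for illustration):

```python
import torch.nn as nn

init_fn = MODEL_INIT_REGISTRY['kaiming_normal_']
layer = nn.Linear(256, 256)
init_fn(module=layer, n_layers=12, d_model=256)  # re-initializes the weight, zeroes the bias
```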
special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "add_prefix_space": false,
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|endoftext|>",
  "model_max_length": 2048,
  "tokenizer_class": "GPTNeoXTokenizer",
  "unk_token": "<|endoftext|>"
}
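Loading sketch for the tokenizer files above (the repo id is an assumption; any copy of these files behaves the same way):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b')
print(tokenizer('MPT-7B uses the GPT-NeoX BPE vocabulary.')['input_ids'][:8])
print(tokenizer.model_max_length)  # 2048, from tokenizer_config.json above
```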