from transformers import GPT2Config
class GPTOptimConfig(GPT2Config):
    """Configuration for the optimized GPT model; extends GPT2Config with the
    hyperparameters used here."""

    model_type = "gpt_optimized"

    def __init__(
        self,
        block_size: int = 1024,   # max sequence length
        vocab_size: int = 50257,  # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
        n_layer: int = 16,        # number of layers
        n_head: int = 16,         # number of attention heads
        n_embd: int = 1024,       # embedding dimension
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
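

# A minimal usage sketch (an assumption, not from the original source): build the
# config, register it with AutoConfig so the custom model_type "gpt_optimized"
# round-trips through save_pretrained / from_pretrained. The directory name
# "gpt_optimized_config" is hypothetical.
if __name__ == "__main__":
    from transformers import AutoConfig

    AutoConfig.register("gpt_optimized", GPTOptimConfig)

    config = GPTOptimConfig(n_layer=12, n_head=12, n_embd=768)  # GPT-2-small-sized variant
    config.save_pretrained("gpt_optimized_config")              # writes config.json
    reloaded = AutoConfig.from_pretrained("gpt_optimized_config")
    assert isinstance(reloaded, GPTOptimConfig)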