inwaves committed
Commit 2896dec
1 Parent(s): 4e1467d

Fleshing out model, config

Files changed (2)
  1. model.py +7 -5
  2. utils.py +8 -4
model.py CHANGED
@@ -22,11 +22,13 @@ class TransformerBlock(nn.Module):
         super().__init__()
         self.config = config
 
-        # Embed,
-        self.embed = nn.Embedding(num_embeddings, config.d_model)
-        # One MLP, one attention
-        # one layernorm, one dropout (?)
-        # Unembed
+        self.embed = nn.Embedding(config.num_embeddings, config.d_model)
+        self.linear = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            SoLU(),
+        )
+        self.layer_norm = nn.LayerNorm(normalized_shape)
+        self.unembed = nn.Embedding(config.num_embeddings, config.d_model)
 
     def forward(self, x: t.Tensor) -> t.Tensor:
         pass
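
The new block references a SoLU() activation and a normalized_shape value that this diff never defines, so the file will not run as committed. As a non-authoritative sketch, assuming the standard softmax-linear-unit formulation SoLU(x) = x * softmax(x) and that the layer norm is sized from the config, the missing pieces could look like this:

import torch as t
import torch.nn as nn

class SoLU(nn.Module):
    # Softmax Linear Unit: multiplies the input elementwise by its softmax,
    # taken over the last (hidden) dimension.
    def forward(self, x: t.Tensor) -> t.Tensor:
        return x * t.softmax(x, dim=-1)

# Hypothetical wiring for the layer norm inside TransformerBlock.__init__,
# replacing the undefined `normalized_shape` and reusing the config's ln_eps:
# self.layer_norm = nn.LayerNorm(config.d_model, eps=config.ln_eps)

Unembedding via a second nn.Embedding is also unusual; a Linear projection from d_model back to the vocabulary (or weight tying with self.embed) is the more common choice.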
utils.py CHANGED
@@ -1,6 +1,10 @@
 @dataclass
 class OsSoluConfig:
-    d_model: int = 512
-    vocab_size: int = 65536        # Unsure about this.
-    learning_rate: float = 1e-3
-    num_embeddings: int = 1024     # Unsure about this.
+    d_model: int = 512             # Hidden size of the model.
+    vocab_size: int = 65536        # Vocabulary size of the input sequence. Unsure about this.
+    learning_rate: float = 1e-3    # Learning rate for the optimiser.
+    num_embeddings: int = 1024     # Number of embeddings. Unsure about this.
+    num_blocks: int = 1            # Number of transformer blocks.
+    dropout: float = 0.1           # Probability of dropout.
+    ln_eps: float = 1e-3           # Layer norm epsilon.
+    num_heads: int = 4             # Number of attention heads in each attention layer.
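
Nothing in the diff adds the dataclass import, so utils.py presumably also needs from dataclasses import dataclass at the top. A minimal usage sketch of the config exactly as defined above (the override values are illustrative only):

from dataclasses import dataclass

@dataclass
class OsSoluConfig:
    d_model: int = 512             # Hidden size of the model.
    vocab_size: int = 65536        # Vocabulary size of the input sequence.
    learning_rate: float = 1e-3    # Learning rate for the optimiser.
    num_embeddings: int = 1024     # Number of embeddings.
    num_blocks: int = 1            # Number of transformer blocks.
    dropout: float = 0.1           # Probability of dropout.
    ln_eps: float = 1e-3           # Layer norm epsilon.
    num_heads: int = 4             # Number of attention heads in each attention layer.

config = OsSoluConfig()                          # All defaults.
small = OsSoluConfig(d_model=256, num_heads=2)   # Override selected fields.
assert small.d_model % small.num_heads == 0      # Heads must divide the hidden size evenly.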