Refactor config class, add argparser
.gitignore
ADDED
@@ -0,0 +1 @@
+*__pycache__/
main.py
CHANGED
@@ -2,22 +2,43 @@ import torch as t
 import torch.nn as nn
 import torch.functional as F
 import torch.optim as optim
+import argparse
+from utils import OsSoluConfig
+from model import OsSoluModel
+from typing import Tuple
 
-
-def parse_args():
+def parse_arguments() -> argparse.Namespace:
     # TODO: command-line args for hparams
-
+    parser = argparse.ArgumentParser(description="Parse command-line arguments for this model.")
+    parser.add_argument("--d_model", type=int, default=512, help="Hidden size of the model.")
+    parser.add_argument("--vocab_size", type=int, default=65536, help="Vocabulary size of the input sequence.")
+    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate for the optimiser.")
+    parser.add_argument("--num_embeddings", type=int, default=1024, help="Number of embeddings.")
+    parser.add_argument("--num_blocks", type=int, default=1, help="Number of transformer blocks.")
+    parser.add_argument("--dropout", type=float, default=0.1, help="Probability of dropout.")
+    parser.add_argument("--ln_eps", type=float, default=1e-3, help="Layer norm epsilon.")
+    parser.add_argument("--num_heads", type=int, default=4, help="Number of attention heads in each attention layer.")
+    parser.add_argument("--self_attention_type", type=str, default="unidirectional", help="What type of attention to use: rotary or unidirectional.")
+    parser.add_argument("--max_positional_embeddings", type=int, default=1024, help="Maximum number of positional embeddings.")
+    args = parser.parse_args()
+    return args
 
-def train():
+def train(config: OsSoluConfig, model: OsSoluModel) -> OsSoluModel:
     # TODO: training loop
-
+
+    return model
 
 def eval():
     pass
 
-def setup():
-    # TODO: wandb logging
-
+def setup() -> Tuple[OsSoluConfig, OsSoluModel]:
+    # TODO: wandb logging
+    args = parse_arguments()
+    config = OsSoluConfig(args)
+    model = OsSoluModel(config)
+    return config, model
 
 if __name__=="__main__":
-
+    config, model = setup()
+    trained_model = train(config, model)
+    eval()
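A quick aside on the pattern `parse_arguments` uses: every hyperparameter becomes a typed flag with a default, so running the script with no flags still yields a fully populated namespace. The sketch below is not the project's parser, only a minimal illustration with two of the flags above; note that `parse_args` also accepts an explicit argument list, which makes the parser easy to exercise without a real command line.

# Minimal sketch of the argparse pattern used in parse_arguments (illustrative, not the repo's code).
import argparse

parser = argparse.ArgumentParser(description="Toy parser mirroring two of the flags above.")
parser.add_argument("--d_model", type=int, default=512, help="Hidden size of the model.")
parser.add_argument("--num_blocks", type=int, default=1, help="Number of transformer blocks.")

args = parser.parse_args(["--num_blocks", "4"])   # simulates `python main.py --num_blocks 4`
print(args.d_model, args.num_blocks)              # 512 4  (d_model falls back to its default)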
model.py
CHANGED
@@ -15,7 +15,8 @@ class OsSoluModel(nn.Module):
         self.config = config
         self.embed_positions = nn.Embedding(config.max_positional_embeddings, config.d_model)
         self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
-        self.
+        self.dropout = nn.Dropout(config.dropout)
+        self.transformer_blocks = nn.ModuleList([GPT2Block(config) for _ in range(config.num_blocks)])
         self.final_ln = nn.LayerNorm(normalized_shape, config.ln_eps)
         self.unembed = nn
 
@@ -23,23 +24,36 @@ class OsSoluModel(nn.Module):
         positional_embeddings = self.embed_positions(t.arange(x.size(1)))
         token_embeddings = self.embed_tokens(x)
         embeddings = positional_embeddings + token_embeddings
+        out = self.dropout(embeddings)
+        out = self.transformer_blocks(out)
 
+class SoLU(nn.Module):
+    def __init__(self):
+        pass
+
+    def forward(self, x: t.Tensor) -> t.Tensor:
+        return x * x.softmax(dim=-1)
 
-class
+class GPT2Block(nn.Module):
     def __init__(self, config: OsSoluConfig) -> None:
         super().__init__()
         self.config = config
 
+        self.layer_norm1 = nn.LayerNorm(normalized_shape, config.ln_eps)
         self.attention = UnidirectionalAttention(config) if config.self_attention_type == "unidirectional" else RotaryAttention(config)
-        self.
-        nn.
+        self.MLP = nn.Sequential(
+            nn.LayerNorm(normalized_shape, config.ln_eps),
+            nn.Linear(config.d_model, 4*config.d_model),
             SoLU(),
+            nn.Linear(4*config.d_model, config.d_model),
+            nn.Dropout(config.dropout)
         )
-        self.layer_norm = nn.LayerNorm(normalized_shape, config.ln_eps)
-        self.unembed = nn.Embedding(config.num_embeddings, config.d_model)
 
     def forward(self, x: t.Tensor) -> t.Tensor:
-
+        x = x + self.attention(self.layer_norm1(x))
+        x = x + self.MLP(x)
+        return x
+
 
 
 class UnidirectionalAttention(nn.Module):
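For reference, the SoLU (Softmax Linear Unit) added above is simply the elementwise product of a tensor with its own softmax over the last dimension. A tiny self-contained sketch (standalone function, not the repo's module) with a numeric sanity check:

# Standalone sketch of the SoLU activation; values shown are approximate.
import torch as t

def solu(x: t.Tensor) -> t.Tensor:
    # SoLU(x) = x * softmax(x), taken over the last dimension
    return x * x.softmax(dim=-1)

x = t.tensor([[1.0, 2.0, 3.0]])
print(solu(x))                                          # ~tensor([[0.0900, 0.4895, 1.9957]])
print(t.allclose(solu(x), x * t.softmax(x, dim=-1)))    # True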
@@ -96,4 +110,5 @@ class RotaryAttention(nn.Module):
         self.config = config
 
     def forward(self, x: t.Tensor) -> t.Tensor:
+        # TODO: implement rotary self-attention
         pass
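The `GPT2Block` added above is a pre-norm residual block: a layer norm feeds the attention sublayer, and the MLP stack (layer norm, up-projection, SoLU, down-projection, dropout) is added back onto the residual stream. The sketch below shows that wiring with stock PyTorch modules only; it is an assumption-laden stand-in, not the repo's block: `normalized_shape` is taken to mean `config.d_model`, `nn.MultiheadAttention` stands in for the repo's attention classes, and `nn.GELU` stands in for `SoLU`.

# Illustrative pre-norm residual block (stand-in modules, not the repo's GPT2Block).
import torch as t
import torch.nn as nn

class PreNormBlockSketch(nn.Module):
    def __init__(self, d_model: int = 512, num_heads: int = 4, dropout: float = 0.1, ln_eps: float = 1e-3):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(d_model, eps=ln_eps)
        # Stand-in for UnidirectionalAttention / RotaryAttention.
        self.attention = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
        self.MLP = nn.Sequential(
            nn.LayerNorm(d_model, eps=ln_eps),
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),                        # stand-in for SoLU()
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        normed = self.layer_norm1(x)
        x = x + self.attention(normed, normed, normed, need_weights=False)[0]
        x = x + self.MLP(x)                   # MLP carries its own pre-norm inside the Sequential
        return x

x = t.randn(2, 16, 512)                       # (batch, seq, d_model)
print(PreNormBlockSketch()(x).shape)          # torch.Size([2, 16, 512])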
utils.py
CHANGED
@@ -1,12 +1,27 @@
-
+import argparse
+
 class OsSoluConfig:
-    d_model: int
-    vocab_size: int
-    learning_rate: float
-    num_embeddings: int
-    num_blocks: int
-    dropout: float
-    ln_eps: float
-    num_heads: int
-    self_attention_type: str
-    max_positional_embeddings: int
+    d_model: int                    # Hidden size of the model.
+    vocab_size: int                 # Vocabulary size of the input sequence. Unsure about this.
+    learning_rate: float            # Learning rate for the optimiser.
+    num_embeddings: int             # Number of embeddings. Unsure about this.
+    num_blocks: int                 # Number of transformer blocks.
+    dropout: float                  # Probability of dropout.
+    ln_eps: float                   # Layer norm epsilon.
+    num_heads: int                  # Number of attention heads in each attention layer.
+    self_attention_type: str        # What type of attention to use: rotary or unidirectional.
+    max_positional_embeddings: int  # Maximum number of positional embeddings.
+
+    def __init__(self, args: argparse.Namespace) -> None:
+        """Initialise this config class with values provided by a command-line argument parser.
+        Values are never None here, as we provide suitable defaults in the parser call."""
+        self.d_model = args.d_model
+        self.vocab_size = args.vocab_size
+        self.learning_rate = args.learning_rate
+        self.num_embeddings = args.num_embeddings
+        self.num_blocks = args.num_blocks
+        self.dropout = args.dropout
+        self.ln_eps = args.ln_eps
+        self.num_heads = args.num_heads
+        self.self_attention_type = args.self_attention_type
+        self.max_positional_embeddings = args.max_positional_embeddings
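Since `OsSoluConfig.__init__` only copies attributes off an `argparse.Namespace`, a config can also be built without touching the command line, for example in a notebook or unit test. A hedged usage sketch (it assumes `utils.py` is importable and reuses the parser defaults from `main.py`):

# Usage sketch: build the config from a hand-made Namespace instead of the CLI.
from argparse import Namespace
from utils import OsSoluConfig

args = Namespace(
    d_model=512, vocab_size=65536, learning_rate=1e-3, num_embeddings=1024,
    num_blocks=1, dropout=0.1, ln_eps=1e-3, num_heads=4,
    self_attention_type="unidirectional", max_positional_embeddings=1024,
)
config = OsSoluConfig(args)
print(config.d_model, config.self_attention_type)   # 512 unidirectional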