inwaves committed
Commit 1bcfe48
1 Parent(s): d97c361

First successful run, added checkpoints

Files changed (3)
  1. main.py +39 -22
  2. model.py +11 -1
  3. utils.py +26 -3
main.py CHANGED
@@ -11,12 +11,15 @@ from typing import Tuple
 from torch.utils.data.dataloader import DataLoader
 from datasets import load_dataset
 from transformers import AutoTokenizer
-from utils import OsSoluConfig, tokenise
+from utils import OsSoluConfig, tokenise, loss_fn, count_parameters
 from model import OsSoluModel
 
 WANDB_PROJECT_NAME = "os_solu"
 DEVICE = "cuda" if t.cuda.is_available() else "cpu"
 
+# TODO: Add support for distributed training.
+# TODO: Use only book data from dataset.
+
 def parse_arguments() -> dict:
     """Parses command-line arguments for this model run. Arguments of type string have allowed values,
     which are enforced. Default parameter values are provided such that fields in the config are never None.
@@ -29,7 +32,8 @@ def parse_arguments() -> dict:
         dict: a dictionary containing the command-line arguments parsed by this function.
     """
     parser = argparse.ArgumentParser(description="Parse command-line arguments for this model.")
-    parser.add_argument("--batch_size", type=int, default=256, help="Batch size used in training.")
+    parser.add_argument("--batch_size", type=int, default=40, help="Batch size used in training.")
+    parser.add_argument("--checkpoint_every_n_tokens", type=int, default=50_000, help="Save a checkpoint of the model every n tokens processed.")
     parser.add_argument("--d_model", type=int, default=512, help="Hidden size of the model.")
     parser.add_argument("--dropout", type=float, default=0.1, help="Probability of dropout.")
     parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate for the optimiser.")
@@ -38,7 +42,7 @@ def parse_arguments() -> dict:
     parser.add_argument("--nonlinearity", type=str, default="solu", help="Nonlinearity to use inside MLP block: must be relu or solu.")
     parser.add_argument("--num_blocks", type=int, default=1, help="Number of transformer blocks.")
     parser.add_argument("--num_embeddings", type=int, default=1024, help="Number of embeddings.")
-    parser.add_argument("--num_epochs", type=int, default=5, help="Number of epochs to run for.")
+    parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs to run for.")
     parser.add_argument("--num_heads", type=int, default=4, help="Number of attention heads in each attention layer.")
     parser.add_argument("--optimiser_type", type=str, default="adam", help="Optimiser type.")
     parser.add_argument("--self_attention_type", type=str, default="unidirectional", help="What type of attention to use: rotary or unidirectional.")
@@ -69,8 +73,7 @@ def train(config: OsSoluConfig, model: OsSoluModel, train_dataloader: DataLoader
     Returns:
         OsSoluModel: The trained model.
     """
-    train_loss_fn = t.nn.CrossEntropyLoss()
-    wandb.watch(model, criterion=train_loss_fn, log="all", log_freq=10, log_graph=True)
+    wandb.watch(model, criterion=loss_fn, log="all", log_freq=10, log_graph=True)
 
     # Initialise optimiser.
     opt = optim.Adam if config.optimiser_type.lower() == "adam" else optim.SGD
@@ -82,18 +85,32 @@ def train(config: OsSoluConfig, model: OsSoluModel, train_dataloader: DataLoader
     for epoch in range(config.num_epochs):
         for i, batch in enumerate(tqdm(train_data_iterator
         )):
-            data = batch["text"]
-            data = data.to(DEVICE)
+            start_time = time.time()
+            batch = batch["text"]
+            batch = batch.to(DEVICE)
 
-            predictions = model(data)
-            accuracy = (predictions.argmax(dim=-1) == target).sum() / len(data)
+            logits = model(batch)
             optimiser.zero_grad()
-            # loss = train_loss_fn(data, predictions)
+            loss = loss_fn(logits, batch)
             loss.backward()
             optimiser.step()
 
-            wandb.log(dict(train_loss=loss, train_accuracy=accuracy, elapsed=time.time() - start_time), step=examples_seen)
-            examples_seen += len(data)
+            wandb.log(dict(train_loss=loss, elapsed=time.time() - start_time), step=examples_seen)
+            examples_seen += len(batch)
+
+            # Save a checkpoint of the model.
+            if examples_seen % config.checkpoint_every_n_tokens == 0:
+                # Save the model's state on disk, then upload to wandb.
+                filename = f"{wandb.run.dir}/os_solu_model_ckpt_step_{examples_seen}.pt"
+                t.save({
+                    "step": examples_seen,
+                    "model_state_dict": model.state_dict(),
+                    "optimiser_state_dict": optimiser.state_dict(),
+                    "loss": loss.item()
+                }, filename)
+                wandb.save(filename)
+                print(f"Checkpointing model at {examples_seen} tokens seen.")
+
 
     return model
 
@@ -112,15 +129,14 @@ def eval(model: OsSoluModel, test_dataloader: DataLoader) -> None:
     model.eval()
     with t.inference_mode():
         test_data_iterator = iter(test_dataloader)
-        for i, (data, target) in enumerate(tqdm(test_data_iterator)):
-            data = batch["text"]
-            data = data.to(DEVICE)
-
-            predictions = model(data)
-            num_correct += (predictions.argmax(dim=-1) == target).sum().item()
-            total_loss += test_loss_fn(target, predictions).item()
-            examples_seen += len(data)
-            wandb.log(dict(test_loss=total_loss, test_accuracy=num_correct / examples_seen, elapsed=time.time() - start_time), step=examples_seen)
+        for i, batch in enumerate(tqdm(test_data_iterator)):
+            batch = batch["text"]
+            batch = batch.to(DEVICE)
+
+            logits = model(batch)
+            total_loss += loss_fn(logits, batch).item()
+            examples_seen += len(batch)
+            wandb.log(dict(test_loss=total_loss, elapsed=time.time() - start_time), step=examples_seen)
 
     # Save the model's state on disk, then upload to wandb.
     filename = f"{wandb.run.dir}/model_state_dict.pt"
@@ -135,9 +151,10 @@ def setup() -> Tuple[OsSoluConfig, OsSoluModel]:
         Tuple[OsSoluConfig, OsSoluModel, datasets.iterable_dataset.IterableDataset, datasets.iterable_dataset.IterableDataset]: A tuple containing a config, a model, a training dataset and a test dataset.
     """
     args = parse_arguments()
-    wandb.init(project=WANDB_PROJECT_NAME, config=args)
     config = OsSoluConfig(args)
     model = OsSoluModel(config).to(DEVICE)
+    args["num_parameters"] = count_parameters(model)
+    wandb.init(project=WANDB_PROJECT_NAME, config=args)
 
     start_data_time = time.time()
     # Load and prep data.
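For orientation, here is a minimal sketch of how one of the checkpoints written inside train could be loaded to resume a run. The load_checkpoint helper and the example path are illustrative additions, not part of this commit; they only assume the keys saved above ("step", "model_state_dict", "optimiser_state_dict", "loss"), the existing OsSoluModel/OsSoluConfig classes, and the Adam branch of the optimiser setup.

import torch as t
import torch.optim as optim

from model import OsSoluModel
from utils import OsSoluConfig


def load_checkpoint(path: str, config: OsSoluConfig, device: str = "cpu"):
    """Rebuild the model and optimiser from a checkpoint dict saved by train()."""
    checkpoint = t.load(path, map_location=device)

    model = OsSoluModel(config).to(device)
    model.load_state_dict(checkpoint["model_state_dict"])

    # The optimiser must be constructed before its state can be restored.
    optimiser = optim.Adam(model.parameters(), lr=config.learning_rate)
    optimiser.load_state_dict(checkpoint["optimiser_state_dict"])

    return model, optimiser, checkpoint["step"], checkpoint["loss"]


# Hypothetical usage, following the filename scheme used in train():
# model, optimiser, examples_seen, last_loss = load_checkpoint(
#     "wandb/run-xyz/files/os_solu_model_ckpt_step_50000.pt", config, device="cuda")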
model.py CHANGED
@@ -7,7 +7,8 @@ from fancy_einsum import einsum
 from einops import rearrange, repeat, reduce
 from utils import OsSoluConfig
 
-
+# TODO: Add hooks to the model.
+# TODO: Add support for mixing dense and sparse attention.
 
 class OsSoluModel(nn.Module):
     """An open-source implementation of a SoLU-based transformer. This is a GPT-style architecture model
@@ -128,4 +129,13 @@ class RotaryAttention(nn.Module):
 
     def forward(self, x: t.Tensor) -> t.Tensor:
         # TODO: implement rotary self-attention
+        pass
+
+class LayerNorm(nn.Module):
+    def __init__(self, config: OsSoluConfig) -> None:
+        super().__init__()
+        self.config = config
+
+    def forward(self, x: t.Tensor) -> t.Tensor:
+        # TODO: implement layernorm with hooks on normalisation only.
         pass
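The new LayerNorm class is still a stub (its forward pass is a TODO). As a point of reference only, a plain layer normalisation without the planned hooks could look like the sketch below; SimpleLayerNorm and its d_model argument are assumptions for illustration (d_model would presumably come from the config), not what this commit implements.

import torch as t
import torch.nn as nn


class SimpleLayerNorm(nn.Module):
    """Plain layer normalisation over the last dimension, without hooks."""

    def __init__(self, d_model: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(t.ones(d_model))
        self.bias = nn.Parameter(t.zeros(d_model))

    def forward(self, x: t.Tensor) -> t.Tensor:
        # Normalise each position's activations to zero mean and unit variance...
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x_norm = (x - mean) / t.sqrt(var + self.eps)
        # ...then apply the learned elementwise affine transform.
        return x_norm * self.scale + self.bias

Keeping the normalisation step separate from the affine transform, as above, is what would make it possible to hook the normalised activations on their own, which is what the TODO in this commit asks for.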
utils.py CHANGED
@@ -1,10 +1,14 @@
 import numpy as np
+import torch as t
+import torch.nn.functional as F
 from einops import rearrange
 
+# TODO: Add functionality to load this from a config file as an alternative to command-line args.
 class OsSoluConfig:
     """A class to hold hyperparameters for the model itself and for the training process."""
 
     batch_size: int  # Training data batch size.
+    checkpoint_every_n_tokens: int  # Save a checkpoint of the model every n tokens processed.
     d_model: int  # Hidden size of the model.
     dropout: float  # Probability of dropout.
     learning_rate: float  # Learning rate for the optimiser.
@@ -23,6 +27,7 @@ class OsSoluConfig:
         """Initialise this config class with values provided by a command-line argument parser.
         Values are never None here, as we provide suitable defaults in the parser call."""
         self.batch_size = args["batch_size"]
+        self.checkpoint_every_n_tokens = args["checkpoint_every_n_tokens"]
         self.d_model = args["d_model"]
         self.dropout = args["dropout"]
         self.learning_rate = args["learning_rate"]
@@ -38,7 +43,7 @@ class OsSoluConfig:
         self.vocab_size = args["vocab_size"]
 
 def tokenise(batch, tokeniser, num_gpus: int = 1, context_length: int = 1024):
-    """Tokenise a batch of text data. This implementation is idiosyncratic to the Pile dataset, but can be easily modified to work with e.g. C4.
+    """Tokenise a batch of text data. This implementation is idiosyncratic to the Pile dataset, but can be easily modified to work with e.g. C4. Code from Neel.
 
     Args:
         batch (dict): The batch of text, as a dict with a 'text' field.
@@ -70,7 +75,25 @@ def tokenise(batch, tokeniser, num_gpus: int = 1, context_length: int = 1024):
 
     tokenised_text = np.concatenate([prefix, all_tokens], axis=1)
     assert tokenised_text.shape == (current_batch_size, context_length)
-    print(f"{current_batch_size=}, {context_length=}")
     return {"text": tokenised_text}
 
-
+def loss_fn(logits, batch):
+    """Loss function to train an autoregressive model. It compares the token logits predicted by the model with the actual next token. Code from Neel.
+
+    Args:
+        logits (t.Tensor): A tensor containing logits, has shape (batch_size, sequence_length, vocab_size).
+        batch (t.Tensor): A tensor containing token IDs, has shape (batch_size, sequence_length).
+
+    Returns:
+        loss (t.Tensor): A tensor containing the loss value.
+    """
+
+    # Log-softmax to get log-probabilities.
+    log_probs = F.log_softmax(logits[:, :-1], dim=-1)
+
+    # Match up the probabilities of the actual words.
+    pred_log_probs = t.gather(log_probs, -1, batch[:, 1:, None])[..., 0]
+    return -pred_log_probs.mean()
+
+def count_parameters(model):
+    return sum(parameter.numel() for parameter in model.parameters() if parameter.requires_grad)
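To make the gather-based loss_fn concrete, the toy check below (not part of the commit; the shapes are arbitrary and chosen only for illustration) runs it on random logits and token IDs and compares it against the equivalent cross-entropy on shifted next-token targets.

import torch as t
import torch.nn.functional as F

from utils import loss_fn

batch_size, seq_len, vocab_size = 2, 10, 50
logits = t.randn(batch_size, seq_len, vocab_size)
tokens = t.randint(0, vocab_size, (batch_size, seq_len))

# loss_fn scores the prediction at position i against the token at position i + 1,
# so the last position's logits and the first token are never used.
loss = loss_fn(logits, tokens)

# Equivalent formulation: cross-entropy between shifted logits and shifted targets.
reference = F.cross_entropy(
    logits[:, :-1].reshape(-1, vocab_size),
    tokens[:, 1:].reshape(-1),
)
assert t.allclose(loss, reference, atol=1e-5)
print(f"next-token loss on random logits: {loss.item():.4f}")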