Spaces:

descript
/

vampnet

Sleeping

App Files Files Community

new unlooper ui

by hugggof - opened Aug 15, 2023

base: refs/heads/main

←

from: refs/pr/9

Discussion Files changed

+36

-383

Files changed (10) hide show

.gitignore +0 -4
app.py +13 -76
conf/lora/lora.yml +2 -2
conf/vampnet.yml +1 -1
requirements.txt +0 -1
scripts/exp/train.py +7 -7
scripts/utils/{data/augment.py → augment.py} +1 -1
scripts/utils/gtzan_embeddings.py +0 -263
scripts/utils/{data/maestro-reorg.py → maestro-reorg.py} +0 -0
vampnet/modules/transformer.py +12 -28

.gitignore CHANGED Viewed

@@ -182,7 +182,3 @@ models.zip
 .git-old
 conf/generated/*
 runs*/
-gtzan.zip
-.gtzan_emb_cache

 .git-old
 conf/generated/*
 runs*/

app.py CHANGED Viewed

@@ -1,9 +1,6 @@
 # huggingface space exclusive
 import os
-# print("installing pyharp")
-# os.system('pip install "pyharp@git+https://github.com/audacitorch/pyharp.git"')
-# print("installing madmom")
 os.system('pip install cython')
 os.system('pip install madmom')
@@ -24,7 +21,8 @@ import gradio as gr
 from vampnet.interface import Interface
 from vampnet import mask as pmask
-from pyharp import ModelCard, build_endpoint
@@ -56,6 +54,13 @@ def load_interface():
 interface = load_interface()
 OUT_DIR = Path("gradio-outputs")
 OUT_DIR.mkdir(exist_ok=True, parents=True)
@@ -179,7 +184,7 @@ def _vamp(data, return_mask=False):
             mask_temperature=data[masktemp]*10,
             sampling_temperature=data[sampletemp],
             mask=mask,
-            sampling_steps=data[num_steps] // 2,
             sample_cutoff=data[sample_cutoff],
             seed=_seed,
         )
@@ -245,46 +250,6 @@ def save_vamp(data):
     return f"saved! your save code is {out_dir.stem}", zip_path
-def harp_vamp(_input_audio, _beat_mask_width, _sampletemp):
-    out_dir = OUT_DIR / str(uuid.uuid4())
-    out_dir.mkdir()
-    sig = at.AudioSignal(_input_audio)
-    sig = interface.preprocess(sig)
-    z = interface.encode(sig)
-    # build the mask
-    mask = pmask.linear_random(z, 1.0)
-    if _beat_mask_width > 0:
-        beat_mask = interface.make_beat_mask(
-            sig,
-            after_beat_s=(_beat_mask_width/1000),
-        )
-        mask = pmask.mask_and(mask, beat_mask)
-    # save the mask as a txt file
-    zv, mask_z = interface.coarse_vamp(
-        z,
-        mask=mask,
-        sampling_temperature=_sampletemp,
-        return_mask=True,
-        gen_fn=interface.coarse.generate,
-    )
-    zv = interface.coarse_to_fine(
-        zv,
-        sampling_temperature=_sampletemp,
-        mask=mask,
-    )
-    sig = interface.to_signal(zv).cpu()
-    print("done")
-    sig.write(out_dir / "output.wav")
-    return sig.path_to_file
 with gr.Blocks() as demo:
@@ -408,7 +373,7 @@ with gr.Blocks() as demo:
                     minimum=0,
                     maximum=128,
                     step=1,
-                    value=3,
                 )
@@ -421,7 +386,7 @@ with gr.Blocks() as demo:
                 )
                 beat_mask_width = gr.Slider(
-                    label="beat prompt (ms)",
                     minimum=0,
                     maximum=200,
                     value=0,
@@ -521,7 +486,7 @@ with gr.Blocks() as demo:
                     label="top p (0.0 = off)",
                     minimum=0.0,
                     maximum=1.0,
-                    value=0.9
                 )
                 typical_filtering = gr.Checkbox(
                     label="typical filtering ",
@@ -581,14 +546,6 @@ with gr.Blocks() as demo:
         # mask settings
         with gr.Column():
-            # lora_choice = gr.Dropdown(
-            #     label="lora choice",
-            #     choices=list(loras.keys()),
-            #     value=LORA_NONE,
-            #     visible=False
-            # )
             vamp_button = gr.Button("generate (vamp)!!!")
             output_audio = gr.Audio(
                 label="output audio",
@@ -663,24 +620,4 @@ with gr.Blocks() as demo:
         outputs=[thank_you, download_file]
     )
-    # harp stuff
-    harp_inputs = [
-        input_audio,
-        beat_mask_width,
-        sampletemp,
-    ]
-    build_endpoint(
-        inputs=harp_inputs,
-        output=output_audio,
-        process_fn=harp_vamp,
-        card=ModelCard(
-            name="vampnet",
-            description="Generate variations on music input, based on small prompts around the beat. NOTE: vampnet's has a maximum context length of 10 seconds. Please split all audio clips into 10 second chunks, or processing will result in an error. ",
-            author="Hugo Flores García",
-            tags=["music", "generative"]
-        ),
-        visible=False
-    )
 demo.launch()

 # huggingface space exclusive
 import os
 os.system('pip install cython')
 os.system('pip install madmom')
 from vampnet.interface import Interface
 from vampnet import mask as pmask
+# Interface = argbind.bind(Interface)
+# AudioLoader = argbind.bind(at.data.datasets.AudioLoader)
 interface = load_interface()
+# dataset = at.data.datasets.AudioDataset(
+#     loader,
+#     sample_rate=interface.codec.sample_rate,
+#     duration=interface.coarse.chunk_size_s,
+#     n_examples=5000,
+#     without_replacement=True,
+# )
 OUT_DIR = Path("gradio-outputs")
 OUT_DIR.mkdir(exist_ok=True, parents=True)
             mask_temperature=data[masktemp]*10,
             sampling_temperature=data[sampletemp],
             mask=mask,
+            sampling_steps=data[num_steps],
             sample_cutoff=data[sample_cutoff],
             seed=_seed,
         )
     return f"saved! your save code is {out_dir.stem}", zip_path
 with gr.Blocks() as demo:
                     minimum=0,
                     maximum=128,
                     step=1,
+                    value=5,
                 )
                 )
                 beat_mask_width = gr.Slider(
+                    label="beat mask width (in milliseconds)",
                     minimum=0,
                     maximum=200,
                     value=0,
                     label="top p (0.0 = off)",
                     minimum=0.0,
                     maximum=1.0,
+                    value=0.0
                 )
                 typical_filtering = gr.Checkbox(
                     label="typical filtering ",
         # mask settings
         with gr.Column():
             vamp_button = gr.Button("generate (vamp)!!!")
             output_audio = gr.Audio(
                 label="output audio",
         outputs=[thank_you, download_file]
     )
 demo.launch()

conf/lora/lora.yml CHANGED Viewed

@@ -9,9 +9,9 @@ val/AudioDataset.n_examples: 500
 NoamScheduler.warmup: 500
-batch_size: 6
 num_workers: 7
-save_iters: [10000, 20000, 30000, 40000, 50000, 100000]
 sample_freq: 1000
 val_freq: 500

 NoamScheduler.warmup: 500
+batch_size: 7
 num_workers: 7
+save_iters: [10000, 20000, 30000, 40000, 50000]
 sample_freq: 1000
 val_freq: 500

conf/vampnet.yml CHANGED Viewed

@@ -32,7 +32,7 @@ VampNet.n_heads: 20
 VampNet.flash_attn: false
 VampNet.dropout: 0.1
-AudioLoader.relative_path: ""
 AudioDataset.loudness_cutoff: -30.0
 AudioDataset.without_replacement: true
 AudioLoader.shuffle: true

 VampNet.flash_attn: false
 VampNet.dropout: 0.1
+AudioLoader.relative_path: /data/
 AudioDataset.loudness_cutoff: -30.0
 AudioDataset.without_replacement: true
 AudioLoader.shuffle: true

requirements.txt CHANGED Viewed

@@ -6,5 +6,4 @@ loralib
 wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat
 lac @ git+https://github.com/hugofloresgarcia/lac.git
 descript-audiotools @ git+https://github.com/descriptinc/audiotools.git@0.7.2
--e git+https://github.com/audacitorch/pyharp.git#egg=pyharp
 torch_pitch_shift

 wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat
 lac @ git+https://github.com/hugofloresgarcia/lac.git
 descript-audiotools @ git+https://github.com/descriptinc/audiotools.git@0.7.2
 torch_pitch_shift

scripts/exp/train.py CHANGED Viewed

@@ -224,7 +224,7 @@ def train_loop(state: State, batch: dict, accel: Accelerator):
         dtype = torch.bfloat16 if accel.amp else None
         with accel.autocast(dtype=dtype):
-            z_hat = state.model(z_mask_latent)
         target = codebook_flatten(
             z[:, vn.n_conditioning_codebooks :, :],
@@ -289,7 +289,7 @@ def val_loop(state: State, batch: dict, accel: Accelerator):
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
-    z_hat = state.model(z_mask_latent)
     target = codebook_flatten(
         z[:, vn.n_conditioning_codebooks :, :],
@@ -408,19 +408,19 @@ def save_imputation(state, z, val_idx, writer):
     for i in range(len(val_idx)):
         imputed_noisy[i].cpu().write_audio_to_tb(
-            f"inpainted_prompt/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed[i].cpu().write_audio_to_tb(
-            f"inpainted_middle/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed_true[i].cpu().write_audio_to_tb(
-            f"reconstructed/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
@@ -450,7 +450,7 @@ def save_samples(state: State, val_idx: int, writer: SummaryWriter):
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
-    z_hat = state.model(z_mask_latent)
     z_pred = torch.softmax(z_hat, dim=1).argmax(dim=1)
     z_pred = codebook_unflatten(z_pred, n_c=vn.n_predict_codebooks)
@@ -469,7 +469,7 @@ def save_samples(state: State, val_idx: int, writer: SummaryWriter):
         }
         for k, v in audio_dict.items():
             v.cpu().write_audio_to_tb(
-                f"onestep/_{i}.r={r[i]:0.2f}/{k}",
                 writer,
                 step=state.tracker.step,
                 plot_fn=None,

         dtype = torch.bfloat16 if accel.amp else None
         with accel.autocast(dtype=dtype):
+            z_hat = state.model(z_mask_latent, r)
         target = codebook_flatten(
             z[:, vn.n_conditioning_codebooks :, :],
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
+    z_hat = state.model(z_mask_latent, r)
     target = codebook_flatten(
         z[:, vn.n_conditioning_codebooks :, :],
     for i in range(len(val_idx)):
         imputed_noisy[i].cpu().write_audio_to_tb(
+            f"imputed_noisy/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed[i].cpu().write_audio_to_tb(
+            f"imputed/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed_true[i].cpu().write_audio_to_tb(
+            f"imputed_true/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
+    z_hat = state.model(z_mask_latent, r)
     z_pred = torch.softmax(z_hat, dim=1).argmax(dim=1)
     z_pred = codebook_unflatten(z_pred, n_c=vn.n_predict_codebooks)
         }
         for k, v in audio_dict.items():
             v.cpu().write_audio_to_tb(
+                f"samples/_{i}.r={r[i]:0.2f}/{k}",
                 writer,
                 step=state.tracker.step,
                 plot_fn=None,

scripts/utils/{data/augment.py → augment.py} RENAMED Viewed

@@ -64,4 +64,4 @@ if __name__ == "__main__":
     args = argbind.parse_args()
     with argbind.scope(args):
-        augment()

     args = argbind.parse_args()
     with argbind.scope(args):
+        augment()

scripts/utils/gtzan_embeddings.py DELETED Viewed

@@ -1,263 +0,0 @@
-"""
-TODO: train a linear probe
-usage:
-   python gtzan_embeddings.py --args.load conf/interface.yml --Interface.device cuda --path_to_gtzan /path/to/gtzan/genres_original  --output_dir /path/to/output
-"""
-from pathlib import Path
-from typing import List
-import audiotools as at
-from audiotools import AudioSignal
-import argbind
-import torch
-import numpy as np
-import zipfile
-import json
-from vampnet.interface import Interface
-import tqdm
-# bind the Interface to argbind
-Interface = argbind.bind(Interface)
-DEBUG = False
-def smart_plotly_export(fig, save_path):
-    img_format = save_path.split('.')[-1]
-    if img_format == 'html':
-        fig.write_html(save_path)
-    elif img_format == 'bytes':
-        return fig.to_image(format='png')
-    #TODO: come back and make this prettier
-    elif img_format == 'numpy':
-        import io
-        from PIL import Image
-        def plotly_fig2array(fig):
-            #convert Plotly fig to  an array
-            fig_bytes = fig.to_image(format="png", width=1200, height=700)
-            buf = io.BytesIO(fig_bytes)
-            img = Image.open(buf)
-            return np.asarray(img)
-        return plotly_fig2array(fig)
-    elif img_format == 'jpeg' or 'png' or 'webp':
-        fig.write_image(save_path)
-    else:
-        raise ValueError("invalid image format")
-def dim_reduce(emb, labels, save_path, n_components=3, method='tsne', title=''):
-    """
-    dimensionality reduction for visualization!
-    saves an html plotly figure to save_path
-    parameters:
-        emb (np.ndarray): the samples to be reduces with shape (samples, features)
-        labels (list): list of labels for embedding
-        save_path (str): path where u wanna save ur figure
-        method (str): umap, tsne, or pca
-        title (str): title for ur figure
-    returns:
-        proj (np.ndarray): projection vector with shape (samples, dimensions)
-    """
-    import pandas as pd
-    import plotly.express as px
-    if method == 'umap':
-        reducer = umap.UMAP(n_components=n_components)
-    elif method == 'tsne':
-        from sklearn.manifold import TSNE
-        reducer = TSNE(n_components=n_components)
-    elif method == 'pca':
-        from sklearn.decomposition import PCA
-        reducer = PCA(n_components=n_components)
-    else:
-        raise ValueError
-    proj = reducer.fit_transform(emb)
-    if n_components == 2:
-        df = pd.DataFrame(dict(
-            x=proj[:, 0],
-            y=proj[:, 1],
-            instrument=labels
-        ))
-        fig = px.scatter(df, x='x', y='y', color='instrument',
-                        title=title+f"_{method}")
-    elif n_components == 3:
-        df = pd.DataFrame(dict(
-            x=proj[:, 0],
-            y=proj[:, 1],
-            z=proj[:, 2],
-            instrument=labels
-        ))
-        fig = px.scatter_3d(df, x='x', y='y', z='z',
-                        color='instrument',
-                        title=title)
-    else:
-        raise ValueError("cant plot more than 3 components")
-    fig.update_traces(marker=dict(size=6,
-                                  line=dict(width=1,
-                                            color='DarkSlateGrey')),
-                      selector=dict(mode='markers'))
-    return smart_plotly_export(fig, save_path)
-# per JukeMIR, we want the emebddings from the middle layer?
-def vampnet_embed(sig: AudioSignal, interface: Interface, layer=10):
-    with torch.inference_mode():
-        # preprocess the signal
-        sig = interface.preprocess(sig)
-        # get the coarse vampnet model
-        vampnet = interface.coarse
-        # get the tokens
-        z = interface.encode(sig)[:, :vampnet.n_codebooks, :]
-        z_latents = vampnet.embedding.from_codes(z, interface.codec)
-        # do a forward pass through the model, get the embeddings
-        _z, embeddings = vampnet(z_latents, return_activations=True)
-        # print(f"got embeddings with shape {embeddings.shape}")
-        # [layer, batch, time, n_dims]
-        # [20, 1, 600ish, 768]
-        # squeeze batch dim (1 bc layer should be dim 0)
-        assert embeddings.shape[1] == 1, f"expected batch dim to be 1, got {embeddings.shape[0]}"
-        embeddings = embeddings.squeeze(1)
-        num_layers = embeddings.shape[0]
-        assert layer < num_layers, f"layer {layer} is out of bounds for model with {num_layers} layers"
-        # do meanpooling over the time dimension
-        embeddings = embeddings.mean(dim=-2)
-        # [20, 768]
-        # return the embeddings
-        return embeddings
-from dataclasses import dataclass, fields
-@dataclass
-class Embedding:
-    genre: str
-    filename: str
-    embedding: np.ndarray
-    def save(self, path):
-        """Save the Embedding object to a given path as a zip file."""
-        with zipfile.ZipFile(path, 'w') as archive:
-            # Save numpy array
-            with archive.open('embedding.npy', 'w') as f:
-                np.save(f, self.embedding)
-            # Save non-numpy data as json
-            non_numpy_data = {f.name: getattr(self, f.name) for f in fields(self) if f.name != 'embedding'}
-            with archive.open('data.json', 'w') as f:
-                f.write(json.dumps(non_numpy_data).encode('utf-8'))
-    @classmethod
-    def load(cls, path):
-        """Load the Embedding object from a given zip path."""
-        with zipfile.ZipFile(path, 'r') as archive:
-            # Load numpy array
-            with archive.open('embedding.npy') as f:
-                embedding = np.load(f)
-            # Load non-numpy data from json
-            with archive.open('data.json') as f:
-                data = json.loads(f.read().decode('utf-8'))
-        return cls(embedding=embedding, **data)
-@argbind.bind(without_prefix=True)
-def main(
-    path_to_gtzan: str = None,
-    cache_dir: str = "./.gtzan_emb_cache",
-    output_dir: str = "./gtzan_vampnet_embeddings",
-    layers: List[int] = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
-):
-    path_to_gtzan = Path(path_to_gtzan)
-    assert path_to_gtzan.exists(), f"{path_to_gtzan} does not exist"
-    cache_dir = Path(cache_dir)
-    output_dir = Path(output_dir)
-    output_dir.mkdir(exist_ok=True, parents=True)
-    # load our interface
-    # argbind will automatically load the default config,
-    interface = Interface()
-    # gtzan should have a folder for each genre, so let's get the list of genres
-    genres = [Path(x).name for x in path_to_gtzan.iterdir() if x.is_dir()]
-    print(f"Found {len(genres)} genres")
-    print(f"genres: {genres}")
-    # collect audio files, genres, and embeddings
-    data = []
-    for genre in genres:
-        audio_files = list(at.util.find_audio(path_to_gtzan / genre))
-        print(f"Found {len(audio_files)} audio files for genre {genre}")
-        for audio_file in tqdm.tqdm(audio_files, desc=f"embedding genre {genre}"):
-            # check if we have a cached embedding for this file
-            cached_path = (cache_dir / f"{genre}_{audio_file.stem}.emb")
-            if cached_path.exists():
-                # if so, load it
-                if DEBUG:
-                    print(f"loading cached embedding for {cached_path.stem}")
-                embedding = Embedding.load(cached_path)
-                data.append(embedding)
-            else:
-                try:
-                    sig = AudioSignal(audio_file)
-                except Exception as e:
-                    print(f"failed to load {audio_file.name} with error {e}")
-                    print(f"skipping {audio_file.name}")
-                    continue
-                # gets the embedding
-                emb = vampnet_embed(sig, interface).cpu().numpy()
-                # create an embedding we can save/load
-                embedding = Embedding(
-                    genre=genre,
-                    filename=audio_file.name,
-                    embedding=emb
-                )
-                # cache the embeddings
-                cached_path.parent.mkdir(exist_ok=True, parents=True)
-                embedding.save(cached_path)
-    # now, let's do a dim reduction on the embeddings
-    # and visualize them.
-    # collect a list of embeddings and labels
-    embeddings = [d.embedding for d in data]
-    labels = [d.genre for d in data]
-    # convert the embeddings to a numpy array
-    embeddings = np.stack(embeddings)
-    # do dimensionality reduction for each layer we're given
-    for layer in tqdm.tqdm(layers, desc="dim reduction"):
-        dim_reduce(
-            embeddings[:, layer, :], labels,
-            save_path=str(output_dir / f'vampnet-gtzan-layer={layer}.html'),
-            n_components=2, method='tsne',
-            title=f'vampnet-gtzan-layer={layer}'
-        )
-if __name__ == "__main__":
-    args = argbind.parse_args()
-    with argbind.scope(args):
-        main()

scripts/utils/{data/maestro-reorg.py → maestro-reorg.py} RENAMED Viewed

File without changes

vampnet/modules/transformer.py CHANGED Viewed

@@ -410,9 +410,7 @@ class TransformerStack(nn.Module):
     def subsequent_mask(self, size):
         return torch.ones(1, size, size).tril().bool()
-    def forward(self, x, x_mask, cond=None, src=None, src_mask=None,
-                return_activations: bool = False
-        ):
         """Computes a full transformer stack
         Parameters
         ----------
@@ -439,8 +437,6 @@ class TransformerStack(nn.Module):
         encoder_decoder_position_bias = None
         # Compute transformer layers
-        if return_activations:
-            activations = []
         for layer in self.layers:
             x, position_bias, encoder_decoder_position_bias = layer(
                 x=x,
@@ -451,15 +447,8 @@ class TransformerStack(nn.Module):
                 position_bias=position_bias,
                 encoder_decoder_position_bias=encoder_decoder_position_bias,
             )
-            if return_activations:
-                activations.append(x.detach())
-        out = self.norm(x) if self.norm is not None else x
-        if return_activations:
-            return out, torch.stack(activations)
-        else:
-            return out
 class VampNet(at.ml.BaseModel):
@@ -467,7 +456,7 @@ class VampNet(at.ml.BaseModel):
         self,
         n_heads: int = 20,
         n_layers: int = 16,
-        r_cond_dim: int = 0,
         n_codebooks: int = 9,
         n_conditioning_codebooks: int = 0,
         latent_dim: int = 8,
@@ -478,7 +467,6 @@ class VampNet(at.ml.BaseModel):
         dropout: float = 0.1
     ):
         super().__init__()
-        assert r_cond_dim == 0, f"r_cond_dim must be 0 (not supported), but got {r_cond_dim}"
         self.n_heads = n_heads
         self.n_layers = n_layers
         self.r_cond_dim = r_cond_dim
@@ -525,25 +513,21 @@ class VampNet(at.ml.BaseModel):
             ),
         )
-    def forward(self, x, return_activations: bool = False):
         x = self.embedding(x)
         x_mask = torch.ones_like(x, dtype=torch.bool)[:, :1, :].squeeze(1)
-        x = rearrange(x, "b d n -> b n d")
-        out = self.transformer(x=x, x_mask=x_mask, return_activations=return_activations)
-        if return_activations:
-            out, activations = out
         out = rearrange(out, "b n d -> b d n")
-        out = self.classifier(out, None) # no cond here!
         out = rearrange(out, "b (p c) t -> b p (t c)", c=self.n_predict_codebooks)
-        if return_activations:
-            return out, activations
-        else:
-            return out
     def r_embed(self, r, max_positions=10000):
         if self.r_cond_dim > 0:
@@ -605,7 +589,7 @@ class VampNet(at.ml.BaseModel):
         top_p=None,
         return_signal=True,
         seed: int = None,
-        sample_cutoff: float = 1.0,
     ):
         if seed is not None:
             at.util.seed(seed)
@@ -676,7 +660,7 @@ class VampNet(at.ml.BaseModel):
             # infer from latents
             # NOTE: this collapses the codebook dimension into the sequence dimension
-            logits = self.forward(latents) # b, prob, seq
             logits = logits.permute(0, 2, 1)  # b, seq, prob
             b = logits.shape[0]
@@ -937,7 +921,7 @@ if __name__ == "__main__":
         z_mask_latent = torch.rand(
             batch_size, model.latent_dim * model.n_codebooks, seq_len
         ).to(device)
-        z_hat = model(z_mask_latent)
         pred = z_hat.argmax(dim=1)
         pred = model.embedding.unflatten(pred, n_codebooks=model.n_predict_codebooks)

     def subsequent_mask(self, size):
         return torch.ones(1, size, size).tril().bool()
+    def forward(self, x, x_mask, cond=None, src=None, src_mask=None):
         """Computes a full transformer stack
         Parameters
         ----------
         encoder_decoder_position_bias = None
         # Compute transformer layers
         for layer in self.layers:
             x, position_bias, encoder_decoder_position_bias = layer(
                 x=x,
                 position_bias=position_bias,
                 encoder_decoder_position_bias=encoder_decoder_position_bias,
             )
+        return self.norm(x) if self.norm is not None else x
 class VampNet(at.ml.BaseModel):
         self,
         n_heads: int = 20,
         n_layers: int = 16,
+        r_cond_dim: int = 64,
         n_codebooks: int = 9,
         n_conditioning_codebooks: int = 0,
         latent_dim: int = 8,
         dropout: float = 0.1
     ):
         super().__init__()
         self.n_heads = n_heads
         self.n_layers = n_layers
         self.r_cond_dim = r_cond_dim
             ),
         )
+    def forward(self, x, cond):
         x = self.embedding(x)
         x_mask = torch.ones_like(x, dtype=torch.bool)[:, :1, :].squeeze(1)
+        cond = self.r_embed(cond)
+        x = rearrange(x, "b d n -> b n d")
+        out = self.transformer(x=x, x_mask=x_mask, cond=cond)
         out = rearrange(out, "b n d -> b d n")
+        out = self.classifier(out, cond)
         out = rearrange(out, "b (p c) t -> b p (t c)", c=self.n_predict_codebooks)
+        return out
     def r_embed(self, r, max_positions=10000):
         if self.r_cond_dim > 0:
         top_p=None,
         return_signal=True,
         seed: int = None,
+        sample_cutoff: float = 0.5,
     ):
         if seed is not None:
             at.util.seed(seed)
             # infer from latents
             # NOTE: this collapses the codebook dimension into the sequence dimension
+            logits = self.forward(latents, r) # b, prob, seq
             logits = logits.permute(0, 2, 1)  # b, seq, prob
             b = logits.shape[0]
         z_mask_latent = torch.rand(
             batch_size, model.latent_dim * model.n_codebooks, seq_len
         ).to(device)
+        z_hat = model(z_mask_latent, r)
         pred = z_hat.argmax(dim=1)
         pred = model.embedding.unflatten(pred, n_codebooks=model.n_predict_codebooks)