# NOTE: removed scraper artifacts (file-size / commit-hash / line-number dump) that are not valid Python.
from pathlib import Path
import time
import os
from contextlib import contextmanager
import random
import numpy as np
import audiotools as at
from audiotools import AudioSignal
import argbind
import shutil
import torch
import yaml
from vampnet.interface import Interface, signal_concat
from vampnet import mask as pmask
from ttutil import log
# TODO: incorporate discord bot (if mem allows)
# in a separate thread, send audio samples for listening
# and send back the results
# as well as the params for sampling
# also a command that lets you clear the current signal
# if you want to start over
# Run inference on the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Repo root; conf/ and models/ paths below are resolved relative to this directory.
VAMPNET_DIR = Path(".").resolve()
@contextmanager
def chdir(path):
    """Run the managed body with *path* as the current working directory.

    The previous working directory is restored on exit, even when the
    body raises.
    """
    saved_cwd = os.getcwd()
    try:
        os.chdir(path)
        yield
    finally:
        os.chdir(saved_cwd)
def load_interface(model_choice="default") -> Interface:
    """Construct a vampnet :class:`Interface` for *model_choice*.

    Scans ``conf/generated/*/interface.yml`` for extra model choices; a conf
    is only offered when all three of its checkpoint files (coarse, c2f,
    codec) exist on disk. The full mapping is attached to the returned
    interface as ``interface.model_choices`` so callers can hot-swap models
    later via :func:`load_model`.

    Args:
        model_choice: key into the discovered model choices (default: "default").

    Returns:
        An Interface loaded with the chosen checkpoints, moved to `device`.
    """
    with chdir(VAMPNET_DIR):
        # Baseline choice; generated confs discovered below are added on top.
        MODEL_CHOICES = {
            "default": {
                "Interface.coarse_ckpt": "models/vampnet/coarse.pth",
                "Interface.coarse2fine_ckpt": "models/vampnet/c2f.pth",
                "Interface.codec_ckpt": "models/vampnet/codec.pth",
            }
        }
        generated_confs = Path("conf/generated")
        for conf_file in generated_confs.glob("*/interface.yml"):
            with open(conf_file) as f:
                _conf = yaml.safe_load(f)

            # Skip this conf unless the coarse, c2f, AND codec ckpts all exist;
            # otherwise don't add the model choice.
            if not (
                Path(_conf["Interface.coarse_ckpt"]).exists()
                and Path(_conf["Interface.coarse2fine_ckpt"]).exists()
                and Path(_conf["Interface.codec_ckpt"]).exists()
            ):
                continue

            MODEL_CHOICES[conf_file.parent.name] = _conf

        interface = Interface(
            device=device,
            coarse_ckpt=MODEL_CHOICES[model_choice]["Interface.coarse_ckpt"],
            coarse2fine_ckpt=MODEL_CHOICES[model_choice]["Interface.coarse2fine_ckpt"],
            codec_ckpt=MODEL_CHOICES[model_choice]["Interface.codec_ckpt"],
        )

        interface.model_choices = MODEL_CHOICES
        # Reuse the module-level `device` constant instead of re-running the
        # cuda-availability check a second time.
        interface.to(device)
        return interface
def load_model(interface: Interface, model_choice: str):
    """Hot-swap the interface's coarse and coarse2fine checkpoints to *model_choice*."""
    choice = interface.model_choices[model_choice]
    interface.reload(
        choice["Interface.coarse_ckpt"],
        choice["Interface.coarse2fine_ckpt"],
    )
def ez_variation(
    interface,
    sig: AudioSignal,
    seed: "int | None" = None,
    model_choice: "str | None" = None,
):
    """Generate one masked-resynthesis variation of *sig* via ``interface.vamp``.

    Args:
        interface: a vampnet Interface (see ``load_interface``).
        sig: input signal to vary.
        seed: RNG seed; when None a random 32-bit seed is drawn.
        model_choice: optional key into ``interface.model_choices``; when
            given, the model weights are reloaded before sampling.

    Returns:
        The vamped (resynthesized) AudioSignal.
    """
    t0 = time.time()

    if seed is None:
        seed = int(torch.randint(0, 2**32, (1,)).item())
    # presumably seeds python/numpy/torch RNGs so the draws below are
    # reproducible — confirm against audiotools.util.seed
    at.util.seed(seed)

    # reload the model if necessary
    if model_choice is not None:
        load_model(interface, model_choice)

    # SAMPLING MASK PARAMS, hard code for now, we'll prob want a more preset-ey thing for the actual thin
    # we probably honestly just want to oscillate between the same 4 presets
    # in a predictable order such that they have a predictable outcome
    # NOTE: the single-element choices below always yield the same value but
    # still consume one RNG draw each — kept as-is to preserve RNG state.
    periodic_p = random.choice([3])
    n_mask_codebooks = 3
    sampletemp = random.choice([1.0,])
    # both options are 0.0, so dropout is effectively disabled
    dropout = random.choice([0.0, 0.0])
    top_p = None  # NOTE: top p may be the culprit behind the collapse into single pitches.

    # parameters for the build_mask function
    build_mask_kwargs = dict(
        rand_mask_intensity=1.0,
        prefix_s=0.0,
        suffix_s=0.0,
        periodic_prompt=int(periodic_p),
        periodic_prompt2=int(periodic_p),
        periodic_prompt_width=1,
        _dropout=dropout,
        upper_codebook_mask=int(n_mask_codebooks),
        upper_codebook_mask_2=int(n_mask_codebooks),
    )

    # parameters for the vamp function
    vamp_kwargs = dict(
        temperature=sampletemp,
        typical_filtering=True,
        typical_mass=0.15,
        typical_min_tokens=64,
        top_p=top_p,
        seed=seed,
        sample_cutoff=1.0,
    )

    # NOTE(review): despite the old "save the mask as a txt file" comment, no
    # mask file is written here. Presumably this processes the signal in
    # 10-second chunks — confirm in Interface.set_chunk_size.
    interface.set_chunk_size(10.0)
    sig, mask, codes = interface.vamp(
        sig,
        batch_size=1,
        feedback_steps=1,
        time_stretch_factor=1,
        build_mask_kwargs=build_mask_kwargs,
        vamp_kwargs=vamp_kwargs,
        return_mask=True,
    )

    log(f"vamp took {time.time() - t0} seconds")
    return sig
def main():
    """Script entry point: repeatedly vamp an example clip, feeding each
    output back in as the next input, writing every iteration to ttout/."""
    import tqdm

    interface = load_interface()

    sig = AudioSignal.excerpt("assets/example.wav", duration=7.0)
    sig = interface.preprocess(sig)

    # Make sure the output directory exists before writing into it.
    os.makedirs("ttout", exist_ok=True)
    sig.write('ttout/in.wav')

    # Removed dead locals (insig, fdbk_every, fdbk) — they were never read.
    for i in tqdm.tqdm(range(1000)):
        # Each iteration's output becomes the next iteration's input.
        sig = ez_variation(interface, sig, model_choice="orchestral")
        sig.write(f'ttout/out{i}.wav')

if __name__ == "__main__":
    main()