Spaces:

descript
/

vampnet

Running on T4

App Files Files Community

Hugo Flores Garcia committed on Apr 14, 2023

Commit

93b48cb

•

1 Parent(s): 128981d

more tweaks

Browse files

Files changed (5) hide show

demo.py +22 -16
scripts/exp/eval.py +17 -12
scripts/utils/vamp_folder.py +116 -22
vampnet/interface.py +0 -2
vampnet/modules/base.py +10 -3

demo.py CHANGED Viewed

@@ -210,25 +210,30 @@ with gr.Blocks() as demo:
             """)
             gr.Markdown("## Input Audio")
-        with gr.Column():
-            gr.Markdown("""
-            ## Mask Hints
-            - most of the original audio will be masked and replaced with audio generated by vampnet
-            - mask hints are used to guide vampnet to generate audio that sounds like the original
-            - the more hints you give, the more the generated audio will sound like the original
-            """)
         with gr.Column():
             gr.Markdown("""
             ### Tips
             - use the beat hint button so the output audio has the same beat structure as the input audio
-            - if you want the generated audio to sound like the original, but with a different beat structure:
-                - uncheck the beat hint button
-                - decrease the periodic unmasking to anywhere from 2 to 8
             - if you want a more "random" generation:
-                - uncheck the beat hint button (or reduce the beat unmask duration)
-                - increase the periodic unmasking to 16 or more
                 - increase the temperatures!
             """)
@@ -243,7 +248,8 @@ with gr.Blocks() as demo:
             num_vamps = gr.Number(
                 label="number of vamps. more vamps = longer generated audio",
                 value=1,
-                precision=0
             )
             manual_audio_upload = gr.File(
@@ -286,7 +292,7 @@ with gr.Blocks() as demo:
                 minimum=0,
                 maximum=64,
                 step=1,
-                value=19,
             )
@@ -326,8 +332,8 @@ with gr.Blocks() as demo:
                 )
             use_beats = gr.Checkbox(
-                label="use beat hints",
-                value=True
             )
             snap_to_beats = gr.Checkbox(

             """)
             gr.Markdown("## Input Audio")
         with gr.Column():
             gr.Markdown("""
             ### Tips
             - use the beat hint button so the output audio has the same beat structure as the input audio
+            - if you want more beat structure:
+                - enable beat hints
             - if you want a more "random" generation:
+                - increase the periodic unmasking to 12 or more
                 - increase the temperatures!
+                - uncheck the beat hint button (or reduce the beat unmask duration)
+            - if you want the generated audio to sound like the original, but with a different beat structure:
+                - uncheck the beat hint button
+                - decrease the periodic unmasking to anywhere from 2 to 20
+                - slightly decrease the random intensity, to like .95
+            """)
+        with gr.Column():
+            gr.Markdown("""
+            ## Mask Hints
+            - most of the original audio will be masked and replaced with audio generated by vampnet
+            - mask hints are used to guide vampnet to generate audio that sounds like the original
+            - the more hints you give, the more the generated audio will sound like the original
             """)
             num_vamps = gr.Number(
                 label="number of vamps. more vamps = longer generated audio",
                 value=1,
+                precision=0,
+                visible=False
             )
             manual_audio_upload = gr.File(
                 minimum=0,
                 maximum=64,
                 step=1,
+                value=9,
             )
                 )
             use_beats = gr.Checkbox(
+                label="use beat hints (helps the output stick to the beat structure of the input)",
+                value=False
             )
             snap_to_beats = gr.Checkbox(

scripts/exp/eval.py CHANGED Viewed

@@ -5,6 +5,7 @@ from functools import partial
 from frechet_audio_distance import FrechetAudioDistance
 import pandas
 import argbind
 from tqdm import tqdm
 import audiotools
@@ -21,15 +22,16 @@ def eval(
     assert exp_dir.exists(), f"exp_dir {exp_dir} does not exist"
     # set up our metrics
-    sisdr_loss = audiotools.metrics.distance.SISDRLoss()
-    stft_loss = audiotools.metrics.spectral.MultiScaleSTFTLoss()
     mel_loss = audiotools.metrics.spectral.MelSpectrogramLoss()
     frechet = FrechetAudioDistance(
         use_pca=False,
         use_activation=False,
-        verbose=True
     )
-    visqol = partial(audiotools.metrics.quality.visqol, mode="audio")
     # figure out what conditions we have
     conditions = [d.name for d in exp_dir.iterdir() if d.is_dir()]
@@ -44,7 +46,7 @@ def eval(
     baseline_files = sorted(list(baseline_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem))
     metrics = []
-    for condition in conditions:
         cond_dir = exp_dir / condition
         cond_files = sorted(list(cond_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem))
@@ -68,14 +70,17 @@ def eval(
             cond_sig.resample(baseline_sig.sample_rate)
             cond_sig.truncate_samples(baseline_sig.length)
-            # compute the metrics
-            # try:
-            #     vsq = visqol(baseline_sig, cond_sig)
-            # except:
-            #     vsq = 0.0
             return {
-                "sisdr": -sisdr_loss(baseline_sig, cond_sig).item(),
-                "stft": stft_loss(baseline_sig, cond_sig).item(),
                 "mel": mel_loss(baseline_sig, cond_sig).item(),
                 "frechet": frechet_score,
                 # "visqol": vsq,

 from frechet_audio_distance import FrechetAudioDistance
 import pandas
 import argbind
+import torch
 from tqdm import tqdm
 import audiotools
     assert exp_dir.exists(), f"exp_dir {exp_dir} does not exist"
     # set up our metrics
+    # sisdr_loss = audiotools.metrics.distance.SISDRLoss()
+    # stft_loss = audiotools.metrics.spectral.MultiScaleSTFTLoss()
     mel_loss = audiotools.metrics.spectral.MelSpectrogramLoss()
     frechet = FrechetAudioDistance(
         use_pca=False,
         use_activation=False,
+        verbose=True,
+        audio_load_worker=4,
     )
+    frechet.model.to("cuda" if torch.cuda.is_available() else "cpu")
     # figure out what conditions we have
     conditions = [d.name for d in exp_dir.iterdir() if d.is_dir()]
     baseline_files = sorted(list(baseline_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem))
     metrics = []
+    for condition in tqdm(conditions):
         cond_dir = exp_dir / condition
         cond_files = sorted(list(cond_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem))
             cond_sig.resample(baseline_sig.sample_rate)
             cond_sig.truncate_samples(baseline_sig.length)
+            # if our condition is inpainting, we need to trim the conditioning off
+            if "inpaint" in condition:
+                ctx_amt = float(condition.split("_")[-1])
+                ctx_samples = int(ctx_amt * baseline_sig.sample_rate)
+                print(f"found inpainting condition. trimming off {ctx_samples} samples from {cond_file} and {baseline_file}")
+                cond_sig.trim(ctx_samples, ctx_samples)
+                baseline_sig.trim(ctx_samples, ctx_samples)
             return {
+                # "sisdr": -sisdr_loss(baseline_sig, cond_sig).item(),
+                # "stft": stft_loss(baseline_sig, cond_sig).item(),
                 "mel": mel_loss(baseline_sig, cond_sig).item(),
                 "frechet": frechet_score,
                 # "visqol": vsq,

scripts/utils/vamp_folder.py CHANGED Viewed

@@ -6,7 +6,7 @@ import subprocess
 import argbind
 from tqdm import tqdm
-import argbind
 from vampnet.interface import Interface
 import audiotools as at
@@ -48,7 +48,6 @@ def coarse2fine_argmax(sig, interface):
     )
     return interface.to_signal(z)
 class CoarseCond:
     def __init__(self, num_codebooks, downsample_factor):
@@ -59,13 +58,12 @@ class CoarseCond:
         n_conditioning_codebooks = interface.coarse.n_codebooks - self.num_codebooks
         zv = interface.coarse_vamp_v2(sig,
             n_conditioning_codebooks=n_conditioning_codebooks,
-            downsample_factor=self.downsample_factor
         )
         zv = interface.coarse_to_fine(zv)
         return interface.to_signal(zv)
 def opus(sig, interface, bitrate=128):
     sig = interface.preprocess(sig)
@@ -97,8 +95,78 @@ def opus(sig, interface, bitrate=128):
         )
     return sig
-COARSE_SAMPLE_CONDS ={
     "baseline": baseline,
     "reconstructed": reconstructed,
     "coarse2fine": coarse2fine,
@@ -119,23 +187,55 @@ COARSE_SAMPLE_CONDS ={
 }
-OPUS_JAZZPOP_SAMPLE_CONDS = {
     f"opus_{bitrate}": lambda sig, interface: opus(sig, interface, bitrate=bitrate)
     for bitrate in [5620, 1875, 1250, 625]
 }
-OPUS_SPOTDL_SAMPLE_CONDS = {
     f"opus_{bitrate}": lambda sig, interface: opus(sig, interface, bitrate=bitrate)
     for bitrate in [8036, 2296, 1148, 574]
 }
-C2F_SAMPLE_CONDS = {
     "baseline": baseline,
     "reconstructed": reconstructed,
     "coarse2fine": coarse2fine,
     "coarse2fine_argmax": coarse2fine_argmax,
 }
 @argbind.bind(without_prefix=True)
 def main(
         sources=[
@@ -162,14 +262,8 @@ def main(
         without_replacement=True,
     )
-    if exp_type == "opus-jazzpop":
-        SAMPLE_CONDS = OPUS_JAZZPOP_SAMPLE_CONDS
-    elif exp_type == "opus-spotdl":
-        SAMPLE_CONDS = OPUS_SPOTDL_SAMPLE_CONDS
-    elif exp_type == "coarse":
-        SAMPLE_CONDS = COARSE_SAMPLE_CONDS
-    elif exp_type == "c2f":
-        SAMPLE_CONDS = C2F_SAMPLE_CONDS
     else:
         raise ValueError(f"Unknown exp_type {exp_type}")
@@ -178,12 +272,12 @@ def main(
     random.shuffle(indices)
     for i in tqdm(indices):
         # if all our files are already there, skip
-        # done = []
-        # for name in SAMPLE_CONDS:
-        #     o_dir = Path(output_dir) / name
-        #     done.append((o_dir / f"{i}.wav").exists())
-        # if all(done):
-        #     continue
         sig = dataset[i]["signal"]
         results = {

 import argbind
 from tqdm import tqdm
+import torch
 from vampnet.interface import Interface
 import audiotools as at
     )
     return interface.to_signal(z)
 class CoarseCond:
     def __init__(self, num_codebooks, downsample_factor):
         n_conditioning_codebooks = interface.coarse.n_codebooks - self.num_codebooks
         zv = interface.coarse_vamp_v2(sig,
             n_conditioning_codebooks=n_conditioning_codebooks,
+            downsample_factor=self.downsample_factor,
         )
         zv = interface.coarse_to_fine(zv)
         return interface.to_signal(zv)
 def opus(sig, interface, bitrate=128):
     sig = interface.preprocess(sig)
         )
     return sig
+def token_noise(ratio=1.0):
+    """Build a condition that corrupts the encoded tokens with random noise.
+
+    `ratio` is mapped through `interface.coarse.invgamma` to obtain the value
+    `r` that is handed to `interface.coarse.add_noise`.
+
+    Returns a callable taking `(sig, interface)` that encodes `sig`, applies
+    the noise with `noise_mode="random"`, and decodes the result back to a
+    signal via `interface.to_signal`.
+    """
+    def wrapper(sig, interface):
+        z = interface.encode(sig)
+        r = interface.coarse.invgamma(ratio).to(interface.device)
+        print(f'adding noise with ratio {ratio}')
+        # add_noise returns a pair; only the noised tokens are needed here,
+        # so the second element is discarded.
+        z, _ = interface.coarse.add_noise(
+            z,
+            r,
+            noise_mode="random"
+        )
+        return interface.to_signal(z)
+    return wrapper
+def mask_ratio_1_step(ratio=1.0):
+    """Build a condition that runs a single argmax sampling step.
+
+    The returned callable takes `(sig, interface)`. It converts `ratio` with
+    `interface.coarse.invgamma`, passes `1 - r` as the `intensity` of
+    `interface.coarse_vamp_v2`, and decodes the output with
+    `interface.to_signal`.
+    """
+    def wrapper(sig, interface):
+        r = interface.coarse.invgamma(ratio).to(interface.device)
+        tokens = interface.coarse_vamp_v2(
+            sig,
+            sample='argmax',
+            sampling_steps=1,
+            intensity=1-r
+        )
+        return interface.to_signal(tokens)
+    return wrapper
+def num_sampling_steps(num_steps=1):
+    """Build a condition that vamps with a fixed number of sampling steps.
+
+    The returned callable takes `(sig, interface)`, calls
+    `interface.coarse_vamp_v2` with `downsample_factor=16` and
+    `sampling_steps=num_steps`, then runs `coarse_to_fine` and `to_signal`.
+    """
+    def wrapper(sig, interface):
+        coarse_tokens = interface.coarse_vamp_v2(
+            sig,
+            downsample_factor=16,
+            sampling_steps=num_steps,
+        )
+        fine_tokens = interface.coarse_to_fine(coarse_tokens)
+        return interface.to_signal(fine_tokens)
+    return wrapper
+def beat_mask(ctx_time):
+    """Build a condition that vamps using a beat-derived external mask.
+
+    `ctx_time` is forwarded as `after_beat_s` to `interface.make_beat_mask`
+    (with `before_beat_s=0.0` and `invert=True`). The resulting mask is passed
+    to `interface.coarse_vamp_v2` as `ext_mask`, and the output goes through
+    `coarse_to_fine` and `to_signal`.
+
+    Returns a callable taking `(sig, interface)`.
+    """
+    def wrapper(sig, interface):
+        # Named `mask` (not `beat_mask`) so the local does not shadow the
+        # enclosing factory function.
+        mask = interface.make_beat_mask(
+            sig,
+            before_beat_s=0.0,
+            after_beat_s=ctx_time,
+            invert=True
+        )
+        zv = interface.coarse_vamp_v2(
+            sig,
+            ext_mask=mask,
+        )
+        zv = interface.coarse_to_fine(zv)
+        return interface.to_signal(zv)
+    return wrapper
+def inpaint(ctx_time):
+    """Build a condition that vamps with context kept on both sides.
+
+    `ctx_time` is passed as both `prefix_dur_s` and `suffix_dur_s` to
+    `interface.coarse_vamp_v2`; the output then goes through `coarse_to_fine`
+    and `to_signal`. The returned callable takes `(sig, interface)`.
+    """
+    def wrapper(sig, interface):
+        coarse_tokens = interface.coarse_vamp_v2(
+            sig,
+            prefix_dur_s=ctx_time,
+            suffix_dur_s=ctx_time,
+        )
+        fine_tokens = interface.coarse_to_fine(coarse_tokens)
+        return interface.to_signal(fine_tokens)
+    return wrapper
+EXP_REGISTRY = {}
+EXP_REGISTRY["gen-compression"] = {
     "baseline": baseline,
     "reconstructed": reconstructed,
     "coarse2fine": coarse2fine,
 }
+# NOTE: `bitrate` is bound as a lambda default argument in the three opus
+# registries below. A plain closure over the comprehension variable binds
+# late, so every condition in a registry would run at the last bitrate in
+# its list instead of the one named by its key.
+EXP_REGISTRY["opus-jazzpop"] = {
+    f"opus_{bitrate}": lambda sig, interface, bitrate=bitrate: opus(sig, interface, bitrate=bitrate)
     for bitrate in [5620, 1875, 1250, 625]
 }
+EXP_REGISTRY["opus-spotdl"] = {
+    f"opus_{bitrate}": lambda sig, interface, bitrate=bitrate: opus(sig, interface, bitrate=bitrate)
     for bitrate in [8036, 2296, 1148, 574]
 }
+EXP_REGISTRY["opus-baseline"]  = {
+    f"opus_{bitrate}": lambda sig, interface, bitrate=bitrate: opus(sig, interface, bitrate=bitrate)
+    for bitrate in [8000, 12000, 16000]
+}
+EXP_REGISTRY["c2f"]  = {
     "baseline": baseline,
     "reconstructed": reconstructed,
     "coarse2fine": coarse2fine,
     "coarse2fine_argmax": coarse2fine_argmax,
 }
+EXP_REGISTRY["token-noise"] = {
+    f"token_noise_{r}": token_noise(r)  for r in [0.25, 0.5, 0.75, 1.0]
+}
+EXP_REGISTRY["mask-ratio"] = {
+    "codec": reconstructed,
+    **{f"mask_ratio_{r}": mask_ratio_1_step(r)  for r in [0.25, 0.5, 0.75, 0.9]}
+}
+EXP_REGISTRY["sampling-steps"] = {
+    "codec": reconstructed,
+    **{f"steps_{n}": num_sampling_steps(n)  for n in [1, 4, 12, 24, 36, 64, 72, 128]},
+}
+EXP_REGISTRY["baseline"] = {
+    "baseline": baseline,
+    "codec": reconstructed,
+}
+EXP_REGISTRY["musical-sampling"] = {
+    "baseline": baseline,
+    "codec": reconstructed,
+    **{f"downsample_{x}x": CoarseCond(4, downsample_factor=x) for x in [16, 32]},
+    **{f"beat_mask_{t}": beat_mask(t) for t in [0.075]},
+    **{f"inpaint_{t}": inpaint(t) for t in [0.5, 1.0,]}, # multiply these by 2 (they go left and right)
+}
 @argbind.bind(without_prefix=True)
 def main(
         sources=[
         without_replacement=True,
     )
+    if exp_type in EXP_REGISTRY:
+        SAMPLE_CONDS = EXP_REGISTRY[exp_type]
     else:
         raise ValueError(f"Unknown exp_type {exp_type}")
     random.shuffle(indices)
     for i in tqdm(indices):
         # if all our files are already there, skip
+        done = []
+        for name in SAMPLE_CONDS:
+            o_dir = Path(output_dir) / name
+            done.append((o_dir / f"{i}.wav").exists())
+        if all(done):
+            continue
         sig = dataset[i]["signal"]
         results = {

vampnet/interface.py CHANGED Viewed

@@ -183,10 +183,8 @@ class Interface:
                 num_steps = mask[_slice[0]:_slice[1]].shape[0]
                 _m = torch.ones(num_steps, device=self.device)
                 _m = torch.nn.functional.dropout(_m, p=dropout)
-                print(_m)
                 mask[_slice[0]:_slice[1]] = _m
-                print(mask)
         if mask_downbeats:
             for downbeat_idx in downbeats_z:

                 num_steps = mask[_slice[0]:_slice[1]].shape[0]
                 _m = torch.ones(num_steps, device=self.device)
                 _m = torch.nn.functional.dropout(_m, p=dropout)
                 mask[_slice[0]:_slice[1]] = _m
         if mask_downbeats:
             for downbeat_idx in downbeats_z:

vampnet/modules/base.py CHANGED Viewed

@@ -42,6 +42,7 @@ class VampBase(at.ml.BaseModel):
         n_suffix: Optional[torch.Tensor] = None,
         downsample_factor: Optional[int] = None,
         n_conditioning_codebooks: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
@@ -89,13 +90,14 @@ class VampBase(at.ml.BaseModel):
         if random_x is None:
             random_x = torch.randint_like(x, 0, self.vocab_size)
-        if self.noise_mode == "mask":
             random_x = torch.full_like(x, self.mask_token)
-        elif self.noise_mode == "random":
             if random_x is None:
                 random_x = torch.randint_like(x, 0, self.vocab_size)
         else:
-            raise ValueError(f"invalid noise mode {self.noise_mode}")
         # add the external mask if we were given one
         if ext_mask is not None:
@@ -132,6 +134,11 @@ class VampBase(at.ml.BaseModel):
     def gamma(self, r):
         return (r * torch.pi / 2).cos()
     def r_embed(self, r, max_positions=10000):
         """ """
         assert hasattr(self, "r_cond_dim"), "must set r_cond_dim before calling r_embed"

         n_suffix: Optional[torch.Tensor] = None,
         downsample_factor: Optional[int] = None,
         n_conditioning_codebooks: Optional[int] = None,
+        noise_mode: str = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
         if random_x is None:
             random_x = torch.randint_like(x, 0, self.vocab_size)
+        noise_mode = noise_mode if noise_mode is not None else self.noise_mode
+        if noise_mode == "mask":
             random_x = torch.full_like(x, self.mask_token)
+        elif noise_mode == "random":
             if random_x is None:
                 random_x = torch.randint_like(x, 0, self.vocab_size)
         else:
+            raise ValueError(f"invalid noise mode {noise_mode}")
         # add the external mask if we were given one
         if ext_mask is not None:
     def gamma(self, r):
         return (r * torch.pi / 2).cos()
+    def invgamma(self, y):
+        """Invert `gamma`: recover `r` such that `cos(r * pi / 2) == y`.
+
+        A non-tensor `y` is first converted to a tensor and given a leading
+        dimension of size 1; a tensor `y` is used as-is.
+        """
+        y_t = y if torch.is_tensor(y) else torch.tensor(y).unsqueeze(0)
+        return y_t.acos() * 2 / torch.pi
     def r_embed(self, r, max_positions=10000):
         """ """
         assert hasattr(self, "r_cond_dim"), "must set r_cond_dim before calling r_embed"