Merge branch 'main' of https://github.com/descriptinc/lyrebird-vampnet into main
Files changed:
- requirements.txt (+1, -0)
- vampnet/beats.py (+249, -0)
- vampnet/interface.py (+99, -7)
- vampnet/modules/base.py (+6, -5)
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ pytorch-ignite
 rich
 audiotools @ git+https://github.com/descriptinc/lyrebird-audiotools.git@hf/backup-info
 lac @ git+https://github.com/descriptinc/lyrebird-audio-codec.git@hf/vampnet-temp
+wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git
 tqdm
 tensorboard
 google-cloud-logging==2.2.0
vampnet/beats.py
ADDED
@@ -0,0 +1,249 @@
+import json
+import logging
+import warnings
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import librosa
+import numpy as np
+from audiotools import AudioSignal
+
+
+logging.basicConfig(level=logging.INFO)
+
+###################
+# beat sync utils #
+###################
+
+AGGREGATOR_REGISTRY = {
+    "mean": np.mean,
+    "median": np.median,
+    "max": np.max,
+    "min": np.min,
+}
+
+
+def list_aggregators() -> list:
+    return list(AGGREGATOR_REGISTRY.keys())
+
+
+@dataclass
+class TimeSegment:
+    start: float
+    end: float
+
+    @property
+    def duration(self):
+        return self.end - self.start
+
+    def __str__(self) -> str:
+        return f"{self.start} - {self.end}"
+
+    def find_overlapping_segment(
+        self, segments: List["TimeSegment"]
+    ) -> Union["TimeSegment", None]:
+        """Find the first segment that overlaps with this segment, or None if no segment overlaps"""
+        for s in segments:
+            if s.start <= self.start and s.end >= self.end:
+                return s
+        return None
+
+
+def mkdir(path: Union[Path, str]) -> Path:
+    p = Path(path)
+    p.mkdir(parents=True, exist_ok=True)
+    return p
+
+
+
+###################
+# beat data #
+###################
+@dataclass
+class BeatSegment(TimeSegment):
+    downbeat: bool = False  # if there's a downbeat on the start_time
+
+
+class Beats:
+    def __init__(self, beat_times, downbeat_times):
+        if isinstance(beat_times, np.ndarray):
+            beat_times = beat_times.tolist()
+        if isinstance(downbeat_times, np.ndarray):
+            downbeat_times = downbeat_times.tolist()
+        self._beat_times = beat_times
+        self._downbeat_times = downbeat_times
+        self._use_downbeats = False
+
+    def use_downbeats(self, use_downbeats: bool = True):
+        """use downbeats instead of beats when calling beat_times"""
+        self._use_downbeats = use_downbeats
+
+    def beat_segments(self, signal: AudioSignal) -> List[BeatSegment]:
+        """
+        segments a song into time segments corresponding to beats.
+        the first segment starts at 0 and ends at the first beat time.
+        the last segment starts at the last beat time and ends at the end of the song.
+        """
+        beat_times = self._beat_times.copy()
+        downbeat_times = self._downbeat_times
+        beat_times.insert(0, 0)
+        beat_times.append(signal.signal_duration)
+
+        downbeat_ids = np.intersect1d(beat_times, downbeat_times, return_indices=True)[
+            1
+        ]
+        is_downbeat = [
+            True if i in downbeat_ids else False for i in range(len(beat_times))
+        ]
+        segments = [
+            BeatSegment(start_time, end_time, downbeat)
+            for start_time, end_time, downbeat in zip(
+                beat_times[:-1], beat_times[1:], is_downbeat
+            )
+        ]
+        return segments
+
+    def get_beats(self) -> np.ndarray:
+        """returns an array of beat times, in seconds
+        if downbeats is True, returns an array of downbeat times, in seconds
+        """
+        return np.array(
+            self._downbeat_times if self._use_downbeats else self._beat_times
+        )
+
+    @property
+    def beat_times(self) -> np.ndarray:
+        """return beat times"""
+        return np.array(self._beat_times)
+
+    @property
+    def downbeat_times(self) -> np.ndarray:
+        """return downbeat times"""
+        return np.array(self._downbeat_times)
+
+    def beat_times_to_feature_frames(
+        self, signal: AudioSignal, features: np.ndarray
+    ) -> np.ndarray:
+        """convert beat times to frames, given an array of time-varying features"""
+        beat_times = self.get_beats()
+        beat_frames = (
+            beat_times * signal.sample_rate / signal.signal_length * features.shape[-1]
+        ).astype(np.int64)
+        return beat_frames
+
+    def sync_features(
+        self, feature_frames: np.ndarray, features: np.ndarray, aggregate="median"
+    ) -> np.ndarray:
+        """sync features to beats"""
+        if aggregate not in AGGREGATOR_REGISTRY:
+            raise ValueError(f"unknown aggregation method {aggregate}")
+
+        return librosa.util.sync(
+            features, feature_frames, aggregate=AGGREGATOR_REGISTRY[aggregate]
+        )
+
+    def to_json(self) -> dict:
+        """return beats and downbeats as json"""
+        return {
+            "beats": self._beat_times,
+            "downbeats": self._downbeat_times,
+            "use_downbeats": self._use_downbeats,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        """load beats and downbeats from json"""
+        inst = cls(data["beats"], data["downbeats"])
+        inst.use_downbeats(data["use_downbeats"])
+        return inst
+
+    def save(self, output_dir: Path):
+        """save beats and downbeats to json"""
+        mkdir(output_dir)
+        with open(output_dir / "beats.json", "w") as f:
+            json.dump(self.to_json(), f)
+
+    @classmethod
+    def load(cls, input_dir: Path):
+        """load beats and downbeats from json"""
+        beats_file = Path(input_dir) / "beats.json"
+        with open(beats_file, "r") as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+
+###################
+# beat tracking #
+###################
+
+
+class BeatTracker:
+    def extract_beats(self, signal: AudioSignal) -> Tuple[np.ndarray, np.ndarray]:
+        """extract beats from an audio signal"""
+        raise NotImplementedError
+
+    def __call__(self, signal: AudioSignal) -> Beats:
+        """extract beats from an audio signal
+        NOTE: if the first beat (and/or downbeat) is detected within the first 100ms of the audio,
+        it is discarded. This is to avoid empty bins with no beat synced features in the first beat.
+        Args:
+            signal (AudioSignal): signal to beat track
+        Returns:
+            Tuple[np.ndarray, np.ndarray]: beats and downbeats
+        """
+        beats, downbeats = self.extract_beats(signal)
+        return Beats(beats, downbeats)
+
+
+class WaveBeat(BeatTracker):
+    def __init__(self, ckpt_path: str = "checkpoints/wavebeat", device: str = "cpu"):
+        from wavebeat.dstcn import dsTCNModel
+
+        model = dsTCNModel.load_from_checkpoint(ckpt_path)
+        model.eval()
+
+        self.device = device
+        self.model = model
+
+    def extract_beats(self, signal: AudioSignal) -> Tuple[np.ndarray, np.ndarray]:
+        """returns beat and downbeat times, in seconds"""
+        # extract beats
+        beats, downbeats = self.model.predict_beats_from_array(
+            audio=signal.audio_data.squeeze(0),
+            sr=signal.sample_rate,
+            use_gpu=self.device is not "cpu",
+        )
+
+        return beats, downbeats
+
+
+class MadmomBeats(BeatTracker):
+    def __init__(self):
+        raise NotImplementedError
+
+    def extract_beats(self, signal: AudioSignal) -> Tuple[np.ndarray, np.ndarray]:
+        """returns beat and downbeat times, in seconds"""
+        pass
+
+
+BEAT_TRACKER_REGISTRY = {
+    "wavebeat": WaveBeat,
+    "madmom": MadmomBeats,
+}
+
+
+def list_beat_trackers() -> list:
+    return list(BEAT_TRACKER_REGISTRY.keys())
+
+
+def load_beat_tracker(beat_tracker: str, **kwargs) -> BeatTracker:
+    if beat_tracker not in BEAT_TRACKER_REGISTRY:
+        raise ValueError(
+            f"Unknown beat tracker {beat_tracker}. Available: {list_beat_trackers()}"
+        )
+
+    return BEAT_TRACKER_REGISTRY[beat_tracker](**kwargs)
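
For orientation, a minimal usage sketch of the module added above (not part of the commit; the checkpoint path, audio file, and output directory are hypothetical):

    # sketch: pick a tracker from the registry, track beats, and persist them
    from pathlib import Path
    from audiotools import AudioSignal
    from vampnet.beats import Beats, list_beat_trackers, load_beat_tracker

    print(list_beat_trackers())  # -> ['wavebeat', 'madmom']

    tracker = load_beat_tracker("wavebeat", ckpt_path="checkpoints/wavebeat")
    signal = AudioSignal("example.wav")

    beats = tracker(signal)                 # Beats object with beat + downbeat times (seconds)
    segments = beats.beat_segments(signal)  # List[BeatSegment] spanning 0 .. signal duration
    beats.save(Path("beats_out"))           # writes beats_out/beats.json
    beats = Beats.load(Path("beats_out"))   # round-trip via from_dict
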
vampnet/interface.py
CHANGED
@@ -3,14 +3,15 @@ from pathlib import Path
 import math
 
 import torch
+import numpy as np
 from audiotools import AudioSignal
 import tqdm
 
 from .modules.transformer import VampNet
+from .beats import WaveBeat
 from lac.model.lac import LAC
 
 
-
 def signal_concat(
     audio_signals: list,
 ):
@@ -50,7 +51,10 @@ class Interface:
 
     def s2t(self, seconds: float):
         """seconds to tokens"""
-        return math.ceil(seconds * self.codec.sample_rate / self.codec.hop_length)
+        if isinstance(seconds, np.ndarray):
+            return np.ceil(seconds * self.codec.sample_rate / self.codec.hop_length)
+        else:
+            return math.ceil(seconds * self.codec.sample_rate / self.codec.hop_length)
 
     def s2t2s(self, seconds: float):
         """seconds to tokens to seconds"""
@@ -83,12 +87,85 @@ class Interface:
             .ensure_max_of_audio(1.0)
         )
         return signal
+
     @torch.inference_mode()
     def encode(self, signal: AudioSignal):
         signal = self.preprocess(signal).to(self.device)
         z = self.codec.encode(signal.samples, signal.sample_rate)["codes"]
         return z
 
+    def make_beat_mask(self,
+        signal: AudioSignal,
+        before_beat_s: float = 0.1,
+        after_beat_s: float = 0.1,
+        mask_downbeats: bool = True,
+        mask_upbeats: bool = True,
+        downbeat_downsample_factor: int = None,
+        beat_downsample_factor: int = None,
+        dropout: float = 0.7,
+        invert: bool = True,
+    ):
+        """make a beat synced mask. that is, make a mask that
+        places 1s at and around the beat, and 0s everywhere else.
+        """
+        assert hasattr(self, "beat_tracker"), "No beat tracker loaded"
+
+        # get the beat times
+        beats, downbeats = self.beat_tracker.extract_beats(signal)
+
+        # get the beat indices in z
+        beats_z, downbeats_z = self.s2t(beats), self.s2t(downbeats)
+
+        # remove downbeats from beats
+        beats_z = torch.tensor(beats_z)[~torch.isin(torch.tensor(beats_z), torch.tensor(downbeats_z))]
+        beats_z = beats_z.tolist()
+        downbeats_z = downbeats_z.tolist()
+
+        # make the mask
+        seq_len = self.s2t(signal.duration)
+        mask = torch.zeros(seq_len, device=self.device)
+
+        mask_b4 = self.s2t(before_beat_s)
+        mask_after = self.s2t(after_beat_s)
+
+        if beat_downsample_factor is not None:
+            if beat_downsample_factor < 1:
+                raise ValueError("mask_beat_downsample_factor must be >= 1 or None")
+        else:
+            beat_downsample_factor = 1
+
+        if downbeat_downsample_factor is not None:
+            if downbeat_downsample_factor < 1:
+                raise ValueError("mask_beat_downsample_factor must be >= 1 or None")
+        else:
+            downbeat_downsample_factor = 1
+
+        beats_z = beats_z[::beat_downsample_factor]
+        downbeats_z = downbeats_z[::downbeat_downsample_factor]
+
+        if mask_upbeats:
+            for beat_idx in beats_z:
+                _slice = int(beat_idx - mask_b4), int(beat_idx + mask_after)
+                num_steps = mask[_slice[0]:_slice[1]].shape[0]
+                _m = torch.ones(num_steps, device=self.device)
+                _m = torch.nn.functional.dropout(_m, p=dropout)
+
+                mask[_slice[0]:_slice[1]] = _m
+
+        if mask_downbeats:
+            for downbeat_idx in downbeats_z:
+                _slice = int(downbeat_idx - mask_b4), int(downbeat_idx + mask_after)
+                num_steps = mask[_slice[0]:_slice[1]].shape[0]
+                _m = torch.ones(num_steps, device=self.device)
+                _m = torch.nn.functional.dropout(_m, p=dropout)
+
+                mask[_slice[0]:_slice[1]] = _m
+
+        if invert:
+            mask = 1 - mask
+
+        return mask[None, None, :].bool().long()
+
     def coarse_to_fine(
         self,
         coarse_z: torch.Tensor,
@@ -231,7 +308,9 @@ class Interface:
         downsample_factor: int = None,
         intensity: float = 1.0,
         debug=False,
-        swap_prefix_suffix=False,
+        swap_prefix_suffix=False,
+        ext_mask=None,
+        verbose=False,
         **kwargs
     ):
         z = self.encode(signal)
@@ -258,14 +337,16 @@ class Interface:
 
         _cz = cz.clone()
         cz_mask = None
-        for _ in range(num_vamps):
+        range_fn = tqdm.trange if verbose else range
+        for _ in range_fn(num_vamps):
             # add noise
             cz_masked, cz_mask = self.coarse.add_noise(
                 _cz, r=1.0-intensity,
                 n_prefix=n_prefix,
                 n_suffix=n_suffix,
                 downsample_factor=downsample_factor,
-                mask=cz_mask
+                mask=cz_mask,
+                ext_mask=ext_mask
             )
             if debug:
                 print("tokens to infer")
@@ -366,8 +447,9 @@ class Interface:
     def variation(
         self,
        signal: AudioSignal,
-        overlap_hop_ratio: float = 1.0, # TODO: should this be fixed to 1.0? or should we overlap and replace instead of overlap add
         verbose: bool = False,
+        beat_mask: bool = False,
+        beat_mask_kwargs: dict = {},
         **kwargs
     ):
         signal = signal.clone()
@@ -380,6 +462,9 @@ class Interface:
             math.ceil(signal.duration / self.coarse.chunk_size_s)
             * self.coarse.chunk_size_s
         )
+        # eventually we DO want overlap, but we want overlap-replace not
+        # overlap-add
+        overlap_hop_ratio = 1.0
         hop_duration = self.coarse.chunk_size_s * overlap_hop_ratio
         original_length = signal.length
 
@@ -398,10 +483,18 @@ class Interface:
                 signal.samples[i,...], signal.sample_rate
             )
             sig.to(self.device)
+
+            if beat_mask:
+                ext_mask = self.make_beat_mask(sig, **beat_mask_kwargs)
+            else:
+                ext_mask = None
+
             out_z = self.coarse_vamp_v2(
                 sig,
                 num_vamps=1,
                 swap_prefix_suffix=False,
+                ext_mask=ext_mask,
+                verbose=verbose,
                 **kwargs
             )
             if self.c2f is not None:
@@ -415,7 +508,6 @@ class Interface:
         output.truncate_samples(original_length)
         return output
 
-
     # create a loop of a single region with variations
     # TODO: this would work nicer if we could trim at the beat
     # otherwise the model has to awkwardly fill up space that won't match
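
Taken together, these interface changes route a beat-synced mask into the vamping loop: variation() builds an ext_mask per chunk with make_beat_mask and forwards it through coarse_vamp_v2 into add_noise. A hedged sketch of the intended call (not part of the commit; the Interface constructor arguments are elided, the beat_tracker attribute is assumed to be set by the caller since make_beat_mask only asserts that it exists, and the checkpoint/audio paths are hypothetical):

    from audiotools import AudioSignal
    from vampnet.beats import WaveBeat
    from vampnet.interface import Interface

    interface: Interface = ...  # an existing Interface instance (construction elided)
    interface.beat_tracker = WaveBeat(ckpt_path="checkpoints/wavebeat")  # required by make_beat_mask

    signal = AudioSignal("loop.wav")

    # with invert=True (the default), token positions around beats end up as 0s
    # in the mask, so add_noise leaves them untouched and only resamples the rest
    out = interface.variation(
        signal,
        beat_mask=True,
        beat_mask_kwargs=dict(before_beat_s=0.05, after_beat_s=0.05, dropout=0.5),
        verbose=True,
    )
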
vampnet/modules/base.py
CHANGED
@@ -31,17 +31,13 @@ class VampBase(at.ml.BaseModel):
     def forward(self, x: torch.Tensor, r: torch.Tensor):
         raise NotImplementedError
 
-    # TODO: add a beat tracking method
-    # that uses a beat tracking model to find beat positions
-    # and then unmask the codes in those poisitions (with some width)
-    # and drop them out with some randomness
-    # and have the option to DONT drop out downbeats for
     def add_noise(
         self,
         x: torch.Tensor,
         r: torch.Tensor,
         random_x: Optional[torch.Tensor] = None,
         mask: Optional[torch.Tensor] = None,
+        ext_mask: Optional[torch.Tensor] = None,
         n_prefix: Optional[torch.Tensor] = None,
         n_suffix: Optional[torch.Tensor] = None,
         downsample_factor: Optional[int] = None,
@@ -99,6 +95,11 @@ class VampBase(at.ml.BaseModel):
         else:
             raise ValueError(f"invalid noise mode {self.noise_mode}")
 
+        # add the external mask if we were given one
+        if ext_mask is not None:
+            assert ext_mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
+            mask = (mask * ext_mask).bool().long()
+
         x = x * (1 - mask) + random_x * mask
         return x, mask
 
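
The ext_mask hook is an elementwise gate on the mask add_noise already computed: positions where ext_mask is 0 can never be noised. A toy illustration of just that combination step (shapes follow the assert, (batch, n_codebooks, seq); the values are made up):

    import torch

    # mask built internally by add_noise: 1 means "replace this token with noise"
    mask = torch.randint(0, 2, (1, 4, 10))

    # external beat mask, e.g. from Interface.make_beat_mask: shape (1, 1, seq) broadcasts
    ext_mask = torch.ones(1, 1, 10, dtype=torch.long)
    ext_mask[..., 3:5] = 0  # protect the tokens around a beat

    combined = (mask * ext_mask).bool().long()
    assert (combined[..., 3:5] == 0).all()  # protected positions are never noised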