Spaces:

mimbres
/

YourMT3

Running on Zero

File size: 24,312 Bytes

a03c9b4

# Copyright 2024 The YourMT3 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Please see the details in the LICENSE file.
"""pitchshift.py"""
# import math
import numpy as np
# from scipy import special
from einops import rearrange
from typing import Optional, Literal, Dict, List, Tuple, Callable

import torch
from torch import nn
import torchaudio
from torchaudio import transforms
# from torchaudio import functional as F
# from torchaudio.functional.functional import (
#     _fix_waveform_shape,
#     _stretch_waveform,
# )
# from model.ops import adjust_b_to_gcd, check_all_elements_equal


class PitchShiftLayer(nn.Module):
    """Applying batch-wise pitch-shift to time-domain audio signals.

    One ``torchaudio.transforms.PitchShift`` module is created for every non-zero
    semitone in ``pshift_range`` and stored in an ``nn.ModuleDict`` keyed by
    ``str(semitone)``. Semitone 0 is stored as ``None`` and treated as identity.

    Args:
        pshift_range (List[int]): Inclusive ``[low, high]`` range of pitch shift in semitones.
            Default: ``[-2, 2]``.
        resample_source_fs (int): Sample rate handed to each ``PitchShift`` transform. Default is 4000.
        strecth_n_fft (int): FFT size handed to each ``PitchShift`` transform as ``n_fft``.
            The misspelled name is kept for backward compatibility. Default is 512.
        win_length (Optional[int]): Forwarded to ``PitchShift``. Default: None.
        hop_length (Optional[int]): Forwarded to ``PitchShift``. Default: None.
        window: (Optional[Literal['kaiser']]) ``None`` selects ``torch.hann_window``; a value
            containing ``'kaiser'`` selects a periodic ``torch.kaiser_window``. Default is None.
        beta: (Optional[float]): Parameter for 'kaiser' filter. When None, 12.0 (the default of
            ``torch.kaiser_window``) is used. Default: None.
        expected_input_shape (Optional[Tuple[int]]): If given, a random dummy tensor of this shape
            is passed to ``initialize_parameters`` of every shifter at construction time.
        device (Optional[torch.device]): Device the dummy tensor is moved to. Default: None.
        **kwargs: Accepted and ignored.

    Raises:
        ValueError: If ``window`` is neither None nor a value containing ``'kaiser'``.
    """

    def __init__(
        self,
        pshift_range: List[int] = [-2, 2],
        resample_source_fs: int = 4000,
        strecth_n_fft: int = 512,
        win_length: Optional[int] = None,
        hop_length: Optional[int] = None,
        window: Optional[Literal['kaiser']] = None,
        beta: Optional[float] = None,
        expected_input_shape: Optional[Tuple[int]] = None,
        device: Optional[torch.device] = None,
        **kwargs,
    ) -> None:
        super().__init__()
        # Copy so that neither the shared default list nor a caller-owned list is aliased.
        self.pshift_range = list(pshift_range)
        self.resample_source_fs = resample_source_fs
        self.strecth_n_fft = strecth_n_fft
        self.win_length = win_length
        self.hop_length = hop_length

        if window is None:
            self.window_fn = torch.hann_window
            self.window_kwargs = None
        elif 'kaiser' in window:
            self.window_fn = self._periodic_kaiser_window
            # torch.kaiser_window rejects beta=None; fall back to its documented default.
            self.window_kwargs = {'beta': 12.0 if beta is None else beta}
        else:
            # Fail early instead of hitting an AttributeError on self.window_fn later.
            raise ValueError(f"Unsupported window: {window!r}. Expected None or 'kaiser'.")

        # Initialize pitch shifters for every semitone
        self.pshifters = None
        self.frame_gaps = None
        self._initialize_pshifters(expected_input_shape, device=device)
        self.requires_grad_(False)

    @staticmethod
    def _periodic_kaiser_window(window_length, beta, **kwargs):
        """Return a periodic Kaiser window; a static method so the layer stays picklable."""
        return torch.kaiser_window(window_length, periodic=True, beta=beta, **kwargs)

    def _initialize_pshifters(self,
                              expected_input_shape: Optional[Tuple[int]] = None,
                              device: Optional[torch.device] = None) -> None:
        """Build one ``PitchShift`` module per semitone and store them in ``self.pshifters``.

        Args:
            expected_input_shape: Shape of the dummy input used to initialize the shifters'
                parameters. When None, no initialization call is made here.
            device: Device the dummy input is moved to, if given.
        """
        # DDP requires initializing parameters with a dummy input
        dummy_input = None
        if expected_input_shape is not None:
            dummy_input = torch.randn(expected_input_shape, requires_grad=False)
            if device is not None:
                dummy_input = dummy_input.to(device)

        pshifters = nn.ModuleDict()
        for semitone in range(self.pshift_range[0], self.pshift_range[1] + 1):
            if semitone == 0:
                # No need to shift and resample
                pshifters[str(semitone)] = None
                continue

            pshifter = transforms.PitchShift(self.resample_source_fs,
                                             n_steps=semitone,
                                             n_fft=self.strecth_n_fft,
                                             win_length=self.win_length,
                                             hop_length=self.hop_length,
                                             window_fn=self.window_fn,
                                             wkwargs=self.window_kwargs)
            pshifters[str(semitone)] = pshifter
            # Pass dummy input to initialize parameters
            if dummy_input is not None:
                with torch.no_grad():
                    pshifter.initialize_parameters(dummy_input)
        self.pshifters = pshifters

    def calculate_frame_gaps(self) -> Dict[int, float]:
        """Calculate the expected gap between the original and the stretched audio.

        Returns:
            Dict mapping each semitone in ``pshift_range`` to a gap in milliseconds, computed from
            the shifter's ``kernel`` shape and ``resample_source_fs``. Semitone 0 maps to 0.

        NOTE(review): this reads ``pshifter.kernel.shape``, so it presumably requires the shifters'
        parameters to be initialized already (e.g. via ``expected_input_shape``) - confirm.
        """
        frame_gaps = {}  # for debugging
        for semitone in range(self.pshift_range[0], self.pshift_range[1] + 1):
            if semitone == 0:
                # No need to shift and resample
                frame_gaps[semitone] = 0.
                continue

            kernel_shape = self.pshifters[str(semitone)].kernel.shape
            rate = 2.0**(-float(semitone) / 12)
            frame_gaps[semitone] = 1000. * (kernel_shape[2] - kernel_shape[0] / rate) / self.resample_source_fs
        return frame_gaps

    @torch.no_grad()
    def forward(self, x: torch.Tensor, semitone: int) -> torch.Tensor:
        """Pitch-shift ``x`` by ``semitone`` semitones.

        Args:
            x (torch.Tensor): (B, 1, T) or (B, T)
            semitone (int): Shift amount; 0 returns ``x`` unchanged.
        Returns:
            torch.Tensor: (B, 1, T) or (B, T)
        Raises:
            ValueError: If ``semitone`` is non-zero and outside ``pshift_range``.
        """
        if semitone == 0:
            return x
        if min(self.pshift_range) <= semitone <= max(self.pshift_range):
            return self.pshifters[str(semitone)](x)
        raise ValueError(f"semitone must be in range {self.pshift_range}")


def test_resampler_sinewave():
    """Pitch-shift two sine waves by +2 semitones and write input/output WAV files.

    Writes ``x.wav`` and ``y.wav`` to the current working directory for listening. The
    shifted signal is saved at the input sample rate: ``torchaudio.transforms.PitchShift``
    returns a waveform with the same shape as its input, so it is not resampled audio.
    """
    sample_rate = 16000

    # x: {440Hz, 220Hz} sine wave at 16kHz
    t = torch.arange(0, 2, 1 / sample_rate)  # 2 seconds at 16kHz
    x0 = torch.sin(2 * torch.pi * 440 * t) * 0.5
    x1 = torch.sin(2 * torch.pi * 220 * t) * 0.5
    x = torch.stack((x0, x1), dim=0)  # (2, 32000)

    # Pitch shift by +2 semitones
    psl = PitchShiftLayer(pshift_range=[-2, 2], resample_source_fs=4000)
    y = psl(x, 2)  # same shape as x: (2, 32000)

    # Export to wav (both at the original sample rate)
    torchaudio.save("x.wav", x, sample_rate, bits_per_sample=16)
    torchaudio.save("y.wav", y, sample_rate, bits_per_sample=16)


# class Resampler(nn.Module):
#     """
#     Resampling using conv1d operations, more memory-efficient than torchaudio's resampler.

#     Based on Dan Povey's resampler.py:
#     https://github.com/danpovey/filtering/blob/master/lilfilter/resampler.py
#     """

#     def __init__(self,
#                  input_sr: int,
#                  output_sr: int,
#                  dtype: torch.dtype = torch.float32,
#                  filter_width: int = 16,
#                  cutoff_ratio: float = 0.85,
#                  filter: Literal['kaiser', 'kaiser_best', 'kaiser_fast', 'hann'] = 'kaiser_fast',
#                  beta: float = 8.555504641634386) -> None:
#         super().__init__()  # init the base class
#         """
#         Initialize the Resampler.

#         Args:
#         - input_sr (int): Input sampling rate.
#         - output_sr (int): Output sampling rate.
#         - dtype (torch.dtype): Computation data type. Default: torch.float32.
#         - filter_width (int): Number of zeros per side in the sinc function. Default: 16.
#         - cutoff_ratio (float): Filter rolloff point as a fraction of Nyquist freq. Default: 0.85.
#         - filter (str): Filter type. One of ['kaiser', 'kaiser_best', 'kaiser_fast', 'hann']. Default: 'kaiser_fast'.
#         - beta (float): Parameter for 'kaiser' filter. Default: 8.555504641634386.

#         Note: Ratio between input_sr and output_sr should be reduced to simplest form.
#         """
#         assert isinstance(input_sr, int) and isinstance(output_sr, int)
#         if input_sr == output_sr:
#             self.resample_type = 'trivial'
#             return

#         d = math.gcd(input_sr, output_sr)
#         input_sr, output_sr = input_sr // d, output_sr // d

#         assert dtype in [torch.float32, torch.float64]
#         assert filter_width > 3  # a reasonable bare minimum
#         np_dtype = np.float32 if dtype == torch.float32 else np.float64

#         assert filter in ['hann', 'kaiser', 'kaiser_best', 'kaiser_fast']

#         if filter == 'kaiser_best':
#             filter_width = 64
#             beta = 14.769656459379492
#             cutoff_ratio = 0.9475937167399596
#             filter = 'kaiser'
#         elif filter == 'kaiser_fast':
#             filter_width = 16
#             beta = 8.555504641634386
#             cutoff_ratio = 0.85
#             filter = 'kaiser'
#         """
#         - Define a sample 'block' correlating `input_sr` input samples to `output_sr` output samples.
#         - Dividing samples into these blocks allows corresponding block alignment.
#         - On average, `zeros_per_block` zeros per block are present in the sinc function.
#         """
#         zeros_per_block = min(input_sr, output_sr) * cutoff_ratio
#         """
#         - Define conv kernel size n = (blocks_per_side*2 + 1), adding blocks to each side of the center.
#         - `blocks_per_side` blocks as window radius ensures each central block sample accesses its window.
#         - `blocks_per_side` is determined, rounding up if needed, as 1 + int(filter_width / zeros_per_block).
#         """
#         blocks_per_side = int(np.ceil(filter_width / zeros_per_block))

#         kernel_width = 2 * blocks_per_side + 1

#         # Shape of conv1d weights: (out_channels, in_channels, kernel_width)
#         """ Time computations are in units of 1 block, aligning with the `canonical` time axis,
#         since each block has input_sr input samples, adhering to our time unit."""

#         window_radius_in_blocks = blocks_per_side
#         """`times` will be sinc function arguments, expanding to shape (output_sr, input_sr, kernel_width)
#         via broadcasting. Ensuring t == 0 along the central block diagonal (when input_sr == output_sr)"""
#         times = (
#             np.arange(output_sr, dtype=np_dtype).reshape(
#                 (output_sr, 1, 1)) / output_sr - np.arange(input_sr, dtype=np_dtype).reshape(
#                     (1, input_sr, 1)) / input_sr - (np.arange(kernel_width, dtype=np_dtype).reshape(
#                         (1, 1, kernel_width)) - blocks_per_side))

#         def hann_window(a):
#             """
#             returning 0.5 + 0.5 cos(a*pi) on [-1,1] and 0 outside.
#             """
#             return np.heaviside(1 - np.abs(a), 0.0) * (0.5 + 0.5 * np.cos(a * np.pi))

#         def kaiser_window(a, beta):
#             w = special.i0(beta * np.sqrt(np.clip(1 - (
#                 (a - 0.0) / 1.0)**2.0, 0.0, 1.0))) / special.i0(beta)
#             return np.heaviside(1 - np.abs(a), 0.0) * w

#         """The weights are computed as a sinc function times a Hann-window function, normalized by
#         `zeros_per_block` (sinc) and `input_sr` (input function) to maintain integral and magnitude."""
#         if filter == 'hann':
#             weights = (
#                 np.sinc(times * zeros_per_block) * hann_window(times / window_radius_in_blocks) *
#                 zeros_per_block / input_sr)
#         else:
#             weights = (
#                 np.sinc(times * zeros_per_block) *
#                 kaiser_window(times / window_radius_in_blocks, beta) * zeros_per_block / input_sr)

#         self.input_sr = input_sr
#         self.output_sr = output_sr
#         """If output_sr == 1, merge input_sr into kernel_width for weights (shape: output_sr, input_sr,
#         kernel_width) to optimize convolution speed and avoid extra reshaping."""

#         assert weights.shape == (output_sr, input_sr, kernel_width)
#         if output_sr == 1:
#             self.resample_type = 'integer_downsample'
#             self.padding = input_sr * blocks_per_side
#             weights = torch.tensor(weights, dtype=dtype, requires_grad=False)
#             weights = weights.transpose(1, 2).contiguous().view(1, 1, input_sr * kernel_width)

#         elif input_sr == 1:
#             # For conv_transpose, use weights as if input_sr and output_sr were swapped, simulating downsampling.
#             self.resample_type = 'integer_upsample'
#             self.padding = output_sr * blocks_per_side
#             weights = torch.tensor(weights, dtype=dtype, requires_grad=False)
#             weights = weights.flip(2).transpose(0,
#                                                 2).contiguous().view(1, 1, output_sr * kernel_width)
#         else:
#             self.resample_type = 'general'
#             self.reshaped = False
#             self.padding = blocks_per_side
#             weights = torch.tensor(weights, dtype=dtype, requires_grad=False)

#         self.weights = torch.nn.Parameter(weights, requires_grad=False)

#     @torch.no_grad()
#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         """
#         Parameters:
#         - x: torch.Tensor, with shape (minibatch_size, sequence_length), dtype should match the instance's dtype.

#         Returns:
#         - A torch.Tensor with shape (minibatch_size, (sequence_length//input_sr)*output_sr), dtype matching the input,
#           and content resampled.
#         """
#         if self.resample_type == 'trivial':
#             return x
#         elif self.resample_type == 'integer_downsample':
#             (minibatch_size, seq_len) = x.shape  # (B, in_C, L) with in_C == 1
#             x = x.unsqueeze(1)
#             x = torch.nn.functional.conv1d(
#                 x, self.weights, stride=self.input_sr, padding=self.padding)  # (B, out_C, L)
#             return x.squeeze(1)  # (B, L)

#         elif self.resample_type == 'integer_upsample':
#             x = x.unsqueeze(1)
#             x = torch.nn.functional.conv_transpose1d(
#                 x, self.weights, stride=self.output_sr, padding=self.padding)

#             return x.squeeze(1)
#         else:
#             assert self.resample_type == 'general'
#             (minibatch_size, seq_len) = x.shape
#             num_blocks = seq_len // self.input_sr
#             if num_blocks == 0:
#                 # TODO: pad with zeros.
#                 raise RuntimeError("Signal is too short to resample")
#             # Truncate input
#             x = x[:, 0:(num_blocks * self.input_sr)].view(minibatch_size, num_blocks, self.input_sr)
#         x = x.transpose(1, 2)  # (B, in_C, L)
#         x = torch.nn.functional.conv1d(
#             x, self.weights, padding=self.padding)  # (B, out_C, num_blocks)
#         return x.transpose(1, 2).contiguous().view(minibatch_size, num_blocks * self.output_sr)

# def test_resampler_sinewave():
#     import torchaudio
#     # x: {440Hz, 220Hz} sine wave at 16kHz
#     t = torch.arange(0, 2, 1 / 16000)  # 2 seconds at 16kHz
#     x0 = torch.sin(2 * torch.pi * 440 * t) * 0.5
#     x1 = torch.sin(2 * torch.pi * 220 * t) * 0.5
#     x = torch.stack((x0, x1), dim=0)  # (2, 32000)

#     # Resample
#     resampler = Resampler(input_sr=16000, output_sr=12000)
#     y = resampler(x)  # (2, 24000)

#     # Export to wav
#     torchaudio.save("x.wav", x, 16000, bits_per_sample=16)
#     torchaudio.save("y.wav", y, 12000, bits_per_sample=16)

# def test_resampler_music():
#     import torchaudio
#     # x: music at 16kHz
#     x, _ = torchaudio.load("music.wav")
#     slice_length = 32000
#     n_slices = 80
#     slices = [x[0, i * slice_length:(i + 1) * slice_length] for i in range(n_slices)]
#     x = torch.stack(slices)  # (80, 32000)

#     # Resample
#     filter_width = 32
#     resampler = Resampler(16000, 12000, filter_width=filter_width)
#     y = resampler(x)  # (80, 24000)
#     y = y.reshape(1, -1)  # (1, 1920000)
#     torchaudio.save(f"y_filter_width{filter_width}.wav", y, 12000, bits_per_sample=16)

# class PitchShiftLayer(nn.Module):
#     """Applying batch-wise pitch-shift to time-domain audio signals.

#     Args:
#         expected_input_length (int): Expected input length. Default: ``32767``.
#         pshift_range (List[int]): Range of pitch shift in semitones. Default: ``[-2, 2]``.
#         min_gcd (int): Minimum GCD of input and output sampling rates for resampling. Setting high value can save GPU memory. Default: ``16``.
#         max_timing_error (float): Maximum allowed timing error in seconds. Default: ``0.002``.
#         fs (int): Sample rate of input waveform, x. Default: 16000.
#         bins_per_octave (int, optional): The number of steps per octave (Default : ``12``).
#         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``).
#         win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``).
#         hop_length (int or None, optional): Length of hop between STFT windows. If None, then ``win_length // 4``
#             is used (Default: ``None``).
#         window (Tensor or None, optional): Window tensor that is applied/multiplied to each frame/window.
#             If None, then ``torch.hann_window(win_length)`` is used (Default: ``None``).

#     """

#     def __init__(
#         self,
#         expected_input_length: int = 32767,
#         pshift_range: List[int] = [-2, 2],
#         min_gcd: int = 16,
#         max_timing_error: float = 0.002,
#         fs: int = 16000,
#         bins_per_octave: int = 12,
#         n_fft: int = 2048,
#         win_length: Optional[int] = None,
#         hop_length: Optional[int] = None,
#         window: Optional[torch.Tensor] = None,
#         filter_width: int = 16,
#         filter: Literal['kaiser', 'kaiser_best', 'kaiser_fast', 'hann'] = 'kaiser_fast',
#         cutoff_ratio: float = 0.85,
#         beta: float = 8.555504641634386,
#         **kwargs,
#     ):
#         super().__init__()
#         self.expected_input_length = expected_input_length
#         self.pshift_range = pshift_range
#         self.min_gcd = min_gcd
#         self.max_timing_error = max_timing_error
#         self.fs = fs
#         self.bins_per_octave = bins_per_octave
#         self.n_fft = n_fft
#         self.win_length = win_length
#         self.hop_length = hop_length
#         self.window = window
#         self.resample_args = {
#             "filter_width": filter_width,
#             "filter": filter,
#             "cutoff_ratio": cutoff_ratio,
#             "beta": beta,
#         }

#         # Initialize Resamplers
#         self._initialize_resamplers()

#     def _initialize_resamplers(self):
#         resamplers = nn.ModuleDict()
#         self.frame_gaps = {}  # for debugging
#         for i in range(self.pshift_range[0], self.pshift_range[1] + 1):
#             if i == 0:
#                 # No need to shift and resample
#                 resamplers[str(i)] = None
#             else:
#                 # Find optimal reconversion frames meeting the min_gcd
#                 stretched_frames, recon_frames, gap = self._find_optimal_reconversion_frames(i)
#                 self.frame_gaps[i] = gap
#                 resamplers[str(i)] = Resampler(stretched_frames, recon_frames, **self.resample_args)
#         self.resamplers = resamplers

#     def _find_optimal_reconversion_frames(self, semitone: int):
#         """
#         Find the optimal reconversion frames for a given source sample rate, input length, and semitone for stretch.

#         Parameters:
#         - sr (int): Input audio sample rate, which should be power of 2
#         - n_step (int): The number of pitch-shift steps in semi-tone.
#         - min_gcd (int): The minimum desired GCD, power of 2. Defaults to 16. 16 or 32 are good choices.
#         - max_timing_error (float): The maximum allowed timing error, in seconds. Defaults to 0.002 (2 ms).

#         Returns:
#         - int: The optimal target sample rate
#         """
#         stretch_rate = 1 / 2.0**(-float(semitone) / self.bins_per_octave)
#         stretched_frames = round(self.expected_input_length * stretch_rate)

#         gcd = math.gcd(self.expected_input_length, stretched_frames)
#         if gcd >= self.min_gcd:
#             return stretched_frames, self.expected_input_length, 0
#         else:
#             reconversion_frames = adjust_b_to_gcd(stretched_frames, self.expected_input_length,
#                                                   self.min_gcd)
#             gap = reconversion_frames - self.expected_input_length
#             gap_sec = gap / self.fs
#             if gap_sec > self.max_timing_error:
#                 # TODO: modifying vocoder of stretch_waveform to adjust pitch-shift rate in cents
#                 raise ValueError(
#                     gap_sec < self.max_timing_error,
#                     f"gap_sec={gap_sec} > max_timing_error={self.max_timing_error} with semitone={semitone}, stretched_frames={stretched_frames}, recon_frames={reconversion_frames}. Try adjusting input lenght or decreasing min_gcd."
#                 )
#             else:
#                 return stretched_frames, reconversion_frames, gap_sec

#     @torch.no_grad()
#     def forward(self,
#                 x: torch.Tensor,
#                 semitone: int,
#                 resample: bool = True,
#                 fix_shape: bool = True) -> torch.Tensor:
#         """
#         Args:
#             x (torch.Tensor): (B, 1, T)
#         Returns:
#             torch.Tensor: (B, 1, T)
#         """
#         if semitone == 0:
#             return x
#         elif semitone >= min(self.pshift_range) and semitone <= max(self.pshift_range):
#             x = x.squeeze(1)  # (B, T)
#             original_x_size = x.size()
#             x = _stretch_waveform(
#                 x,
#                 semitone,
#                 self.bins_per_octave,
#                 self.n_fft,
#                 self.win_length,
#                 self.hop_length,
#                 self.window,
#             )
#             if resample:
#                 x = self.resamplers[str(semitone)].forward(x)
#             # Fix waveform shape
#             if fix_shape:
#                 if x.size(1) != original_x_size[1]:
#                     # print(f"Warning: {x.size(1)} != {original_x_length}")
#                     x = _fix_waveform_shape(x, original_x_size)
#             return x.unsqueeze(1)  # (B, 1, T)
#         else:
#             raise ValueError(f"semitone must be in range {self.pshift_range}")

# def test_pitchshift_layer():
#     import torchaudio
#     # music
#     # x, _ = torchaudio.load("music.wav")
#     # slice_length = 32767
#     # n_slices = 80
#     # slices = [x[0, i * slice_length:(i + 1) * slice_length] for i in range(n_slices)]
#     # x = torch.stack(slices).unsqueeze(1)  # (80, 1, 32767)

#     # sine wave
#     t = torch.arange(0, 2.0479, 1 / 16000)  # 2.05 seconds at 16kHz
#     x = torch.sin(2 * torch.pi * 440 * t) * 0.5
#     x = x.reshape(1, 1, 32767).tile(80, 1, 1)

#     # Resample
#     pos = 0
#     ps = PitchShiftLayer(
#         pshift_range=[-3, 4],
#         expected_input_length=32767,
#         fs=16000,
#         min_gcd=16,
#         max_timing_error=0.002,
#         # filter_width=64,
#         filter='kaiser_fast',
#         n_fft=2048)
#     y = []
#     for i in range(-3, 4):
#         y.append(ps(x[[pos], :, :], i, resample=False, fix_shape=False)[0, 0, :])
#     y = torch.cat(y).unsqueeze(0)  # (1, 32767 * 7)
#     torchaudio.save("y_2048_kaiser_fast.wav", y, 16000, bits_per_sample=16)

#     # TorchAudio PitchShifter for comparison
#     y_ta = []
#     for i in range(-3, 4):
#         ta_transform = torchaudio.transforms.PitchShift(16000, n_steps=i)
#         y_ta.append(ta_transform(x[[pos], :, :])[0, 0, :])
#     y_ta = torch.cat(y_ta).unsqueeze(0)  # (1, 32767 * 7)
#     torchaudio.save("y_ta.wav", y_ta, 16000, bits_per_sample=16)

# def test_min_gcd_mem_usage():
#     min_gcd = 16
#     for i in range(-3, 4):
#         stretched_frames = _stretch_waveform(x, i).shape[1]
#         adjusted = adjust_b_to_gcd(stretched_frames, 32767, min_gcd)
#         gcd_val = math.gcd(adjusted, stretched_frames)
#         gap = adjusted - 32767
#         gap_ms = (gap / 16000) * 1000
#         mem_mb = (stretched_frames / gcd_val) * (adjusted / gcd_val) * 3 * 4 / 1000 / 1000
#         print(f'\033[92mmin_gcd={min_gcd}\033[0m', f'ps={i}', f'frames={stretched_frames}',
#               f'adjusted_frames={adjusted}', f'gap={gap}', f'\033[91mgap_ms={gap_ms}\033[0m',
#               f'gcd={gcd_val}', f'mem_MB={mem_mb}')