Spaces:

jhtonyKoo
/

ITO-Master

Running

File size: 17,683 Bytes

6fc042a

""" 
    Implementation of differentiable mastering effects based on DASP-pytorch and torchcomp libraries
        - Distortion
        - Multiband Compressor
        - Limiter
    DASP-pytorch: https://github.com/csteinmetz1/dasp-pytorch
    torchcomp: https://github.com/yoyololicon/torchcomp
"""
import dasp_pytorch
from dasp_pytorch.modules import Processor
import torchcomp
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import time


EPS = 1e-6

class Distortion(Processor):
    """Processor wrapper around the module-level ``distortion`` function.

    Declares the two parameters that ``distortion`` takes after ``x`` and
    ``sample_rate`` together with their (min, max) ranges.

    Args:
        sample_rate (int): Audio sample rate, stored on the instance.
        min_gain_db (float): Lower bound of the ``drive_db`` range.
        max_gain_db (float): Upper bound of the ``drive_db`` range.
    """

    def __init__(
        self,
        sample_rate: int,
        min_gain_db: float = 0.0,
        max_gain_db: float = 24.0,
    ):
        super().__init__()

        # (min, max) bounds per parameter; key order is kept: drive first,
        # then the blend weight.
        drive_bounds = (min_gain_db, max_gain_db)
        blend_bounds = (0.2, 0.7)
        self.param_ranges = dict(
            drive_db=drive_bounds,
            parallel_weight_factor=blend_bounds,
        )
        self.num_params = len(self.param_ranges)

        self.process_fn = distortion
        self.sample_rate = sample_rate

def distortion(
    x: torch.Tensor,
    sample_rate: int,
    drive_db: torch.Tensor,
    parallel_weight_factor: torch.Tensor,
):
    """Simple soft-clipping (tanh) distortion with drive control and dry/wet blend.

    Args:
        x (torch.Tensor): Input audio tensor with shape (bs, chs, seq_len)
        sample_rate (int): Audio sample rate (not used by the computation).
        drive_db (torch.Tensor): Drive in dB with shape (bs)
        parallel_weight_factor (torch.Tensor): Weight of the distorted signal
            in the output mix; the input receives ``1 - weight``. Reshaped to
            (-1, 1, 1) so it broadcasts over channels and time.

    Returns:
        torch.Tensor: Output audio tensor with shape (bs, chs, seq_len)

    """
    # unpacking three sizes also enforces a 3-D input
    bs, _, _ = x.size()
    wet_weight = parallel_weight_factor.view(-1, 1, 1)

    # one drive value per batch item, converted from dB to linear gain and
    # broadcast over channels and time
    drive_lin = 10 ** (drive_db.view(bs, 1, 1) / 20.0)
    x_dist = torch.tanh(x * drive_lin)

    # parallel computation: blend the distorted and the untouched signal
    return wet_weight * x_dist + (1 - wet_weight) * x



class Multiband_Compressor(Processor):
    """Processor wrapper around the module-level ``multiband_compressor``.

    Declares, in order, the two crossover cutoffs, the parallel blend weight
    and six controls for each of the three bands (low shelf, mid band, high
    shelf), each with its (min, max) range.

    NOTE: the ``*_attack_ms_comp`` and ``*_release_ms_comp`` arguments are
    accepted but never read here; the attack ("at") and release ("rt") ranges
    of every band are taken from the ``*_exp`` arguments.
    """

    def __init__(
        self,
        sample_rate: int,
        min_threshold_db_comp: float = -60.0,
        max_threshold_db_comp: float = 0.0-EPS,
        min_ratio_comp: float = 1.0+EPS,
        max_ratio_comp: float = 20.0,
        min_attack_ms_comp: float = 5.0,
        max_attack_ms_comp: float = 100.0,
        min_release_ms_comp: float = 5.0,
        max_release_ms_comp: float = 100.0,
        min_threshold_db_exp: float = -60.0,
        max_threshold_db_exp: float = 0.0-EPS,
        min_ratio_exp: float = 0.0+EPS,
        max_ratio_exp: float = 1.0-EPS,
        min_attack_ms_exp: float = 5.0,
        max_attack_ms_exp: float = 100.0,
        min_release_ms_exp: float = 5.0,
        max_release_ms_exp: float = 100.0,
    ):
        super().__init__()
        self.sample_rate = sample_rate
        self.process_fn = multiband_compressor

        # global controls: crossover cutoffs and dry/wet blend
        ranges = {
            "low_cutoff": (20, 300),
            "high_cutoff": (2000, 12000),
            "parallel_weight_factor": (0.2, 0.7),
        }

        # the six controls every band exposes, in declaration order
        band_controls = (
            ("comp_thresh", (min_threshold_db_comp, max_threshold_db_comp)),
            ("comp_ratio", (min_ratio_comp, max_ratio_comp)),
            ("exp_thresh", (min_threshold_db_exp, max_threshold_db_exp)),
            ("exp_ratio", (min_ratio_exp, max_ratio_exp)),
            ("at", (min_attack_ms_exp, max_attack_ms_exp)),
            ("rt", (min_release_ms_exp, max_release_ms_exp)),
        )
        for band_name in ("low_shelf", "mid_band", "high_shelf"):
            for control_name, bounds in band_controls:
                ranges[f"{band_name}_{control_name}"] = bounds

        self.param_ranges = ranges
        self.num_params = len(ranges)



def linkwitz_riley_4th_order(
    x: torch.Tensor,
    cutoff_freq: torch.Tensor,
    sample_rate: float,
    filter_type: str):
    """Filter ``x`` by running the same biquad over it twice in cascade.

    The biquad is designed by ``dasp_pytorch.signal.biquad`` with 0 dB gain
    and a Q factor of 1/sqrt(2) for every entry of ``cutoff_freq``.

    Args:
        x (torch.Tensor): Input signal; presumably (bs, chs, seq_len) as in
            the callers in this file.
        cutoff_freq (torch.Tensor): Cutoff frequency per batch item
            (callers pass shape (bs, 1, 1)).
        sample_rate (float): Audio sample rate.
        filter_type (str): Filter type forwarded to
            ``dasp_pytorch.signal.biquad`` (callers use "low_pass" and
            "high_pass").

    Returns:
        torch.Tensor: The twice-filtered signal.
    """
    device = x.device

    # Allocate the helper tensors directly on the input's device instead of
    # building them on the CPU and copying them over.
    q_factor = torch.ones(cutoff_freq.shape, device=device) / torch.sqrt(
        torch.tensor([2.0], device=device)
    )
    gain_db = torch.zeros(cutoff_freq.shape, device=device)

    b, a = dasp_pytorch.signal.biquad(
        gain_db,
        cutoff_freq,
        q_factor,
        sample_rate,
        filter_type
    )

    # a single second-order section: numerator and denominator coefficients
    # concatenated on the last axis, with a section axis inserted at dim 1
    sos = torch.cat((b, a), dim=-1).unsqueeze(1)

    # apply the same section twice (two cascaded passes of the biquad)
    x = dasp_pytorch.signal.sosfilt_via_fsm(sos, x)
    x_out = dasp_pytorch.signal.sosfilt_via_fsm(sos, x)

    return x_out


def _compexp_band(
    band: torch.Tensor,
    sample_rate: float,
    comp_thresh: torch.Tensor,
    comp_ratio: torch.Tensor,
    exp_thresh: torch.Tensor,
    exp_ratio: torch.Tensor,
    at: torch.Tensor,
    rt: torch.Tensor,
    band_label: str,
) -> torch.Tensor:
    """Apply torchcomp's compressor/expander gain to one band, best-effort.

    The gain is computed from the absolute value of the channel sum and
    broadcast back over the channel axis. If the computation raises, a
    message is printed and the band is returned unprocessed.
    """
    try:
        gain = torchcomp.compexp_gain(
            band.sum(axis=1).abs(),
            comp_thresh=comp_thresh,
            comp_ratio=comp_ratio,
            exp_thresh=exp_thresh,
            exp_ratio=exp_ratio,
            at=torchcomp.ms2coef(at, sample_rate),
            rt=torchcomp.ms2coef(rt, sample_rate),
        )
        return band * gain.unsqueeze(1)
    except Exception:
        # Deliberate best-effort fallback. `Exception` (not a bare except)
        # so KeyboardInterrupt / SystemExit still propagate.
        print(f'\t!!!failed computing {band_label}-band compression!!!')
        return band


def multiband_compressor(
    x: torch.Tensor,
    sample_rate: float,

    low_cutoff: torch.Tensor,
    high_cutoff: torch.Tensor,
    parallel_weight_factor: torch.Tensor,

    low_shelf_comp_thresh: torch.Tensor,
    low_shelf_comp_ratio: torch.Tensor,
    low_shelf_exp_thresh: torch.Tensor,
    low_shelf_exp_ratio: torch.Tensor,
    low_shelf_at: torch.Tensor,
    low_shelf_rt: torch.Tensor,

    mid_band_comp_thresh: torch.Tensor,
    mid_band_comp_ratio: torch.Tensor,
    mid_band_exp_thresh: torch.Tensor,
    mid_band_exp_ratio: torch.Tensor,
    mid_band_at: torch.Tensor,
    mid_band_rt: torch.Tensor,

    high_shelf_comp_thresh: torch.Tensor,
    high_shelf_comp_ratio: torch.Tensor,
    high_shelf_exp_thresh: torch.Tensor,
    high_shelf_exp_ratio: torch.Tensor,
    high_shelf_at: torch.Tensor,
    high_shelf_rt: torch.Tensor,
):
    """Multiband (Three-band) Compressor.

    The input is split into a low band (low-pass at ``low_cutoff``), a high
    band (high-pass at ``high_cutoff``) and a mid band (the input minus the
    other two). Each band is processed with ``torchcomp.compexp_gain``; the
    bands are summed and blended with the unprocessed input.

    If the gain computation of a band raises, a message is printed and that
    band is passed through unprocessed.

    Args:
        x (torch.Tensor): Time domain tensor with shape (bs, chs, seq_len)
        sample_rate (float): Audio sample rate.
        low_cutoff (torch.Tensor): Low-band crossover cutoff frequency in Hz.
        high_cutoff (torch.Tensor): High-band crossover cutoff frequency in Hz.
        parallel_weight_factor (torch.Tensor): Weight of the compressed signal
            in the output mix; the input receives ``1 - weight``.
        low_shelf_comp_thresh, mid_band_comp_thresh, high_shelf_comp_thresh
            (torch.Tensor): Passed as ``comp_thresh`` to ``compexp_gain``.
        low_shelf_comp_ratio, mid_band_comp_ratio, high_shelf_comp_ratio
            (torch.Tensor): Passed as ``comp_ratio`` to ``compexp_gain``.
        low_shelf_exp_thresh, mid_band_exp_thresh, high_shelf_exp_thresh
            (torch.Tensor): Passed as ``exp_thresh`` to ``compexp_gain``.
        low_shelf_exp_ratio, mid_band_exp_ratio, high_shelf_exp_ratio
            (torch.Tensor): Passed as ``exp_ratio`` to ``compexp_gain``.
        low_shelf_at, mid_band_at, high_shelf_at (torch.Tensor): Attack time,
            converted with ``torchcomp.ms2coef`` before use.
        low_shelf_rt, mid_band_rt, high_shelf_rt (torch.Tensor): Release time,
            converted with ``torchcomp.ms2coef`` before use.

    Returns:
        y (torch.Tensor): Processed signal with shape (bs, chs, seq_len).
    """
    bs, chs, seq_len = x.size()

    low_cutoff = low_cutoff.view(-1, 1, 1)
    high_cutoff = high_cutoff.view(-1, 1, 1)
    parallel_weight_factor = parallel_weight_factor.view(-1, 1, 1)

    # --- crossover filter ---
    # Low band (low frequencies)
    low_band = linkwitz_riley_4th_order(x, low_cutoff, sample_rate, filter_type="low_pass")
    # High band (high frequencies)
    high_band = linkwitz_riley_4th_order(x, high_cutoff, sample_rate, filter_type="high_pass")
    # Mid band: whatever remains after removing the low and high bands
    mid_band = x - low_band - high_band

    # --- compressor (processing order: low, high, mid) ---
    x_out_low = _compexp_band(
        low_band, sample_rate,
        low_shelf_comp_thresh, low_shelf_comp_ratio,
        low_shelf_exp_thresh, low_shelf_exp_ratio,
        low_shelf_at, low_shelf_rt,
        'low',
    )
    x_out_high = _compexp_band(
        high_band, sample_rate,
        high_shelf_comp_thresh, high_shelf_comp_ratio,
        high_shelf_exp_thresh, high_shelf_exp_ratio,
        high_shelf_at, high_shelf_rt,
        'high',
    )
    x_out_mid = _compexp_band(
        mid_band, sample_rate,
        mid_band_comp_thresh, mid_band_comp_ratio,
        mid_band_exp_thresh, mid_band_exp_ratio,
        mid_band_at, mid_band_rt,
        'mid',
    )
    x_out = x_out_low + x_out_high + x_out_mid

    # parallel computation: blend the compressed and the untouched signal
    x_out = parallel_weight_factor * x_out + (1-parallel_weight_factor) * x

    # make sure the output has the input's shape
    x_out = x_out.view(bs, chs, seq_len)

    return x_out




class Limiter(Processor):
    """Processor wrapper around the module-level ``limiter`` function.

    Declares the three limiter parameters ("threshold", "at", "rt") and the
    (min, max) range of each.

    Args:
        sample_rate (int): Audio sample rate, stored on the instance.
        min_threshold_db (float): Lower bound of the "threshold" range.
        max_threshold_db (float): Upper bound of the "threshold" range.
        min_attack_ms (float): Lower bound of the "at" range.
        max_attack_ms (float): Upper bound of the "at" range.
        min_release_ms (float): Lower bound of the "rt" range.
        max_release_ms (float): Upper bound of the "rt" range.
    """

    def __init__(
        self,
        sample_rate: int,
        min_threshold_db: float = -60.0,
        max_threshold_db: float = 0.0-EPS,
        min_attack_ms: float = 5.0,
        max_attack_ms: float = 100.0,
        min_release_ms: float = 5.0,
        max_release_ms: float = 100.0,
    ):
        super().__init__()

        # parameter names paired with their bounds, in declaration order
        names = ("threshold", "at", "rt")
        bounds = (
            (min_threshold_db, max_threshold_db),
            (min_attack_ms, max_attack_ms),
            (min_release_ms, max_release_ms),
        )
        self.param_ranges = dict(zip(names, bounds))
        self.num_params = len(self.param_ranges)

        self.process_fn = limiter
        self.sample_rate = sample_rate


def limiter(
    x: torch.Tensor,
    sample_rate: float,
    threshold: torch.Tensor,
    at: torch.Tensor,
    rt: torch.Tensor,
):
    """Limiter.

    from Chin-yun's paper

    The gain is computed by ``torchcomp.limiter_gain`` from the absolute
    value of the channel sum and then applied to every channel.

    Args:
        x (torch.Tensor): Time domain tensor with shape (bs, chs, seq_len)
        sample_rate (float): Audio sample rate.
        threshold (torch.Tensor): Limiter threshold in dB.
        at (torch.Tensor): Attack time, converted with ``torchcomp.ms2coef``.
        rt (torch.Tensor): Release time, converted with ``torchcomp.ms2coef``.

    Returns:
        y (torch.Tensor): Limited signal.
    """
    bs, chs, seq_len = x.size()

    # one gain curve per batch item, derived from the summed channels
    gain = torchcomp.limiter_gain(
        x.sum(axis=1).abs(),
        threshold=threshold,
        at=torchcomp.ms2coef(at, sample_rate),
        rt=torchcomp.ms2coef(rt, sample_rate),
    )

    # re-insert the channel axis so the gain broadcasts over channels
    x_out = x * gain.unsqueeze(1)

    # make sure the output has the input's shape
    x_out = x_out.view(bs, chs, seq_len)

    return x_out




class Random_Augmentation_Dasp(nn.Module):
    """Random chain of audio effects with per-item on/off masking.

    Each effect named in ``tgt_fx_names`` is applied in order. Every effect
    is always computed, and a boolean mask per batch item decides whether the
    processed or the unprocessed signal is kept. Effect parameters are given
    (or drawn) as normalized values in [0, 1).

    Args:
        sample_rate: Audio sample rate handed to every effect.
        tgt_fx_names: Ordered effect names. Supported: 'eq', 'distortion',
            'comp', 'multiband_comp', 'gain', 'imager', 'limiter'.
            Defaults to ['eq', 'comp', 'imager', 'gain'].

    Raises:
        AssertionError: If an unsupported effect name is given.
    """

    def __init__(self, sample_rate, tgt_fx_names=None):
        super(Random_Augmentation_Dasp, self).__init__()
        # None instead of a list literal default: a mutable default would be
        # shared between every instance created without the argument.
        if tgt_fx_names is None:
            tgt_fx_names = ['eq', 'comp', 'imager', 'gain']
        self.sample_rate = sample_rate
        self.tgt_fx_names = list(tgt_fx_names)

        # default device for masks requested directly by the caller
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # probability of keeping each effect's output for a batch item
        self.fx_prob = {'eq': 0.9,
                        'distortion': 0.3,
                        'comp': 0.8,
                        'multiband_comp': 0.8,
                        'gain': 0.85,
                        'imager': 0.6,
                        'limiter': 1.0}

        self.fx_processors = {}
        for cur_fx in self.tgt_fx_names:
            cur_fx_module = self._build_processor(cur_fx, sample_rate)
            # 'imager' has no processor object; it is called functionally in forward()
            if cur_fx_module is not None:
                self.fx_processors[cur_fx] = cur_fx_module

        total_num_param = sum(fx.num_params for fx in self.fx_processors.values())
        if 'imager' in self.tgt_fx_names:
            # the imager consumes a single parameter (width)
            total_num_param += 1
        self.total_num_param = total_num_param

    @staticmethod
    def _build_processor(cur_fx, sample_rate):
        """Create the processor for one effect name; returns None for 'imager'."""
        if cur_fx=='eq':
            return dasp_pytorch.ParametricEQ(sample_rate=sample_rate,
                                             min_gain_db = -10.0,
                                             max_gain_db = 10.0,
                                             min_q_factor = 0.5,
                                             max_q_factor=5.0)
        if cur_fx=='distortion':
            return Distortion(sample_rate=sample_rate,
                              min_gain_db = 0.0,
                              max_gain_db = 4.0)
        if cur_fx=='comp':
            return dasp_pytorch.Compressor(sample_rate=sample_rate)
        if cur_fx=='multiband_comp':
            return Multiband_Compressor(sample_rate=sample_rate,
                                        min_threshold_db_comp = -30.0,
                                        max_threshold_db_comp = -5.0,
                                        min_ratio_comp = 1.5,
                                        max_ratio_comp = 6.0,
                                        min_attack_ms_comp = 1.0,
                                        max_attack_ms_comp = 20.0,
                                        min_release_ms_comp = 20.0,
                                        max_release_ms_comp = 500.0,
                                        min_threshold_db_exp = -30.0,
                                        max_threshold_db_exp = -5.0,
                                        min_ratio_exp = 0.0+EPS,
                                        max_ratio_exp = 1.0-EPS,
                                        min_attack_ms_exp = 1.0,
                                        max_attack_ms_exp = 20.0,
                                        min_release_ms_exp = 20.0,
                                        max_release_ms_exp = 500.0,
            )
        if cur_fx=='gain':
            return dasp_pytorch.Gain(sample_rate=sample_rate,
                                     min_gain_db = 0.0,
                                     max_gain_db = 6.0,)
        if cur_fx=='imager':
            return None
        if cur_fx=='limiter':
            return Limiter(sample_rate=sample_rate,
                           min_threshold_db = -20.0,
                           max_threshold_db = 0.0-EPS,
                           min_attack_ms = 0.1,
                           max_attack_ms = 5.0,
                           min_release_ms = 20.0,
                           max_release_ms = 1000.0,)
        raise AssertionError(f"current fx name ({cur_fx}) not found")


    # network forward operation
    def forward(self, x, rand_param=None, use_mask=None):
        """Run the effect chain on ``x``.

        Args:
            x: Input audio; the first dimension is the batch.
            rand_param: Optional normalized parameters with shape
                (batch, total_num_param). Drawn uniformly when omitted.
            use_mask: Optional dict mapping each effect name to a boolean
                mask that broadcasts against ``x``. Drawn from ``fx_prob``
                when omitted.

        Returns:
            The processed audio.
        """
        batch_size = x.shape[0]
        if rand_param is None:
            # Drawn on the CPU (keeps the seeded random stream unchanged) and
            # moved to the input's device, so the chain also works when the
            # input does not live on the device picked at construction.
            rand_param = torch.rand((batch_size, self.total_num_param)).to(x.device)
        else:
            assert rand_param.shape[0]==x.shape[0] and rand_param.shape[1]==self.total_num_param
        if use_mask is None:
            use_mask = self.random_mask_generator(batch_size, device=x.device)

        # dafx chain
        cur_param_idx = 0
        for cur_fx in self.tgt_fx_names:
            cur_param_count = 1 if cur_fx=='imager' else self.fx_processors[cur_fx].num_params
            cur_input_param = rand_param[:, cur_param_idx:cur_param_idx+cur_param_count]
            if cur_fx=='imager':
                x_processed = dasp_pytorch.functional.stereo_widener(x,
                                                            sample_rate=self.sample_rate,
                                                            width=cur_input_param)
            else:
                x_processed = self.fx_processors[cur_fx].process_normalized(x, cur_input_param)
            # process all FX but decide to use the processed output based on probability
            cur_mask = use_mask[cur_fx]
            x = x_processed*cur_mask + x*~cur_mask
            # update param index
            cur_param_idx += cur_param_count

        return x


    def random_mask_generator(self, batch_size, repeat=1, device=None):
        """Draw one boolean keep-mask per effect.

        Args:
            batch_size: Number of independent draws per effect.
            repeat: If greater than 1, the drawn mask is tiled this many
                times along the batch dimension.
            device: Device for the returned masks; defaults to ``self.device``.

        Returns:
            dict: effect name -> bool tensor of shape (batch_size * repeat, 1, 1).
        """
        if device is None:
            device = self.device
        mask = {}
        for cur_fx in self.tgt_fx_names:
            cur_mask = self.fx_prob[cur_fx] > torch.rand(batch_size).view(-1, 1, 1)
            if repeat>1:
                cur_mask = cur_mask.repeat(repeat, 1, 1)
            mask[cur_fx] = cur_mask.to(device)
        return mask