File size: 2,747 Bytes
33e3a91
 
 
 
4f821f0
33e3a91
 
 
 
 
 
 
 
 
 
 
 
73e61ac
 
 
33e3a91
 
73e61ac
 
f7db087
73e61ac
33e3a91
 
b7e88e1
33e3a91
 
 
 
 
 
 
 
 
 
 
 
 
04d9b94
33e3a91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96c45a5
 
9ca6f22
 
 
 
 
96c45a5
04d9b94
33e3a91
 
28d63d4
33e3a91
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import json
from tqdm import tqdm
from copy import deepcopy

import soundfile as sf
import numpy as np
import gradio as gr
import torch

# Seed every RNG used by this process so inference is reproducible run-to-run.
import random
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

# Project-local helpers (model-size printer, denoising sampler) and the model class.
from util import print_size, sampling
from network import CleanUNet
import torchaudio
import torchaudio.transforms as T

# Target sample rate (Hz): all input audio is resampled to this before denoising,
# and the output file is written at this rate.
SAMPLE_RATE = 22050

def load_simple(filename, target_sr=SAMPLE_RATE):
    """Load an audio file and resample it to ``target_sr``.

    Parameters:
    filename (str):   path to an audio file readable by torchaudio
    target_sr (int):  desired sample rate in Hz (defaults to the module-wide
                      SAMPLE_RATE, so existing callers are unaffected)

    Returns:
    torch.Tensor: waveform of shape (channels, samples) at ``target_sr``.
    """
    wav, sr = torchaudio.load(filename)
    if sr == target_sr:
        # Already at the requested rate — no need to build a resampling kernel.
        return wav
    resampler = T.Resample(sr, target_sr, dtype=wav.dtype)
    return resampler(wav)

CONFIG = "configs/DNS-large-full.json"
CHECKPOINT = "./exp/DNS-large-full/checkpoint/pretrained.pkl"

# Parse configs once at import time. The sub-configs below are module globals
# read by denoise(). (The original `global x` statements were no-ops at module
# scope and have been dropped; json.load replaces read()+json.loads.)
with open(CONFIG) as f:
    config = json.load(f)

gen_config      = config["gen_config"]
network_config  = config["network_config"]      # CleanUNet constructor kwargs
train_config    = config["train_config"]        # training/experiment settings (exp_path, ...)
trainset_config = config["trainset_config"]     # trainset configuration

def denoise(filename, ckpt_path=CHECKPOINT, out="out.wav"):
    """
    Denoise a single audio file with a pretrained CleanUNet.

    Parameters:
    filename (str):   path to the noisy input audio (any sample rate; it is
                      resampled to SAMPLE_RATE by load_simple)
    ckpt_path (str):  checkpoint file containing a 'model_state_dict' entry
    out (str):        path the denoised waveform is written to

    Returns:
    str: the output path (``out``) — used by gradio as a filepath output.
    """
    # Informational only: report which experiment this config belongs to.
    exp_path = train_config["exp_path"]
    print('exp_path:', exp_path)

    # Build the model and report its parameter count.
    net = CleanUNet(**network_config)
    print_size(net)

    # Load checkpoint weights; map to CPU so this works without a GPU.
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    net.load_state_dict(checkpoint['model_state_dict'])
    net.eval()

    # Load and resample the noisy input.
    noisy_audio = load_simple(filename)

    # Inference: no_grad since only the forward pass is needed.
    # NOTE: the former torch.cuda.amp.autocast() context was removed — the
    # model and input live on the CPU here (map_location='cpu', never moved),
    # so CUDA autocast had no effect. An unused deepcopy of trainset_config
    # ("loader_config") was dead code and was removed as well.
    with torch.no_grad():
        generated_audio = sampling(net, noisy_audio)
        generated_audio = generated_audio[0].squeeze().cpu().numpy()
        sf.write(out, np.ravel(generated_audio), SAMPLE_RATE)

    return out

# Gradio UI wiring: one filepath-audio input feeds denoise(), which returns
# the path of the written file for the audio output widget.
# NOTE(review): gr.inputs / gr.outputs and the enable_queue kwarg are the
# legacy gradio 2.x API and were removed in gradio 3.x/4.x — confirm the
# pinned gradio version before upgrading this dependency.
audio = gr.inputs.Audio(label = "Audio to denoise", type = 'filepath')
inputs = [audio]
outputs = gr.outputs.Audio(label = "Denoised audio", type = 'filepath')

title = "Speech Denoising in the Waveform Domain with Self-Attention from Nvidia"

gr.Interface(denoise, inputs, outputs, title=title, enable_queue=True).launch()