File size: 2,469 Bytes
aeef433
 
 
 
 
 
 
 
 
950b34b
aeef433
950b34b
aeef433
 
 
4f41ac7
aeef433
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06f6cc4
 
 
 
 
4f41ac7
06f6cc4
 
aeef433
 
 
 
06f6cc4
aeef433
81ec9a1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import gradio as gr
from miipher.dataset.preprocess_for_infer import PreprocessForInfer
from miipher.lightning_module import MiipherLightningModule
from lightning_vocoders.models.hifigan.xvector_lightning_module import HiFiGANXvectorLightningModule
import torch
import torchaudio
import hydra
import tempfile

# Both checkpoints are mapped onto the CPU when loaded (map_location="cpu").
miipher_path = "miipher_v2.ckpt"
miipher = MiipherLightningModule.load_from_checkpoint(
    miipher_path,
    map_location="cpu",
)
vocoder = HiFiGANXvectorLightningModule.load_from_checkpoint(
    "vocoder_finetuned_v2.ckpt",
    map_location="cpu",
)

# The x-vector model is instantiated from the config stored on the vocoder
# and moved to the CPU as well.
xvector_model = hydra.utils.instantiate(vocoder.cfg.data.xvector.model).to("cpu")

# The preprocessor is built from the Miipher config.
preprocessor = PreprocessForInfer(miipher.cfg)
# NOTE(review): presumably keeps the text-to-phone model off the GPU — confirm
# that the flag is still read after the preprocessor has been constructed.
preprocessor.cfg.preprocess.text2phone_model.is_cuda = False
@torch.inference_mode()
def main(wav_path, transcript, lang_code):
    """Run the restoration pipeline on one audio file and return the result path.

    The input is loaded, reduced to its first channel, preprocessed together
    with the transcript, passed through the Miipher model and finally turned
    back into a waveform by the vocoder.

    Args:
        wav_path: Path of the input audio file, read with ``torchaudio.load``.
        transcript: Text handed to the preprocessor as ``word_segmented_text``.
        lang_code: Language code handed to the preprocessor as ``lang_code``.

    Returns:
        The path of a temporary ``.wav`` file containing the generated audio,
        saved with a sample rate of 22050. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    wav, sr = torchaudio.load(wav_path)
    # Keep only the first channel while preserving the leading channel axis.
    wav = wav[0].unsqueeze(0)
    # ``wav`` is already a tensor; wrapping it in ``torch.tensor`` again only
    # made a redundant copy and triggered PyTorch's copy-construct warning.
    batch = preprocessor.process(
        'test',
        (wav, sr),
        word_segmented_text=transcript,
        lang_code=lang_code,
    )

    # Extract the features exactly once; the fourth returned value is unused.
    (
        phone_feature,
        speaker_feature,
        degraded_ssl_feature,
        _,
    ) = miipher.feature_extractor(batch)
    cleaned_ssl_feature, _ = miipher(
        phone_feature, speaker_feature, degraded_ssl_feature
    )

    # The vocoder is conditioned on an x-vector computed from the degraded
    # 16 kHz waveform in the batch, flattened to a single row.
    degraded_wav = batch['degraded_wav_16k'].view(1, -1).cpu()
    vocoder_xvector = xvector_model.encode_batch(degraded_wav).squeeze(1)
    cleaned_wav = vocoder.generator_forward(
        {"input_feature": cleaned_ssl_feature, "xvector": vocoder_xvector}
    )[0].T

    # delete=False keeps the file on disk after the handle is closed so the
    # caller can read it by path.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
        torchaudio.save(fp, cleaned_wav.view(1, -1), sample_rate=22050, format='wav')
    return fp.name

# Markdown text passed to the Gradio interface as its description.
description = """
# Miipher demo
This repository provides pretrained weights and demo of Miipher implementation by [Wataru-Nakata](https://github.com/Wataru-Nakata/miipher)
Miipher was originally proposed by Koizumi et al. [arxiv](https://arxiv.org/abs/2303.01664)
Please note that the model differs in many ways from the paper.

**Non commercial use only** as the weights are provided in CC-BY-NC 2.0.
"""
# Input widgets, listed in the positional order of main's parameters:
# audio file path, transcript text, language code.
inputs = [
    gr.Audio(label="noisy audio", type="filepath"),
    gr.Textbox(label="Transcript", value="Your transcript here", max_lines=1),
    gr.Radio(label="Language", choices=["eng-us", "jpn"], value="eng-us"),
]
# main returns a file path, which is handed to this audio output widget.
outputs = gr.Audio(label="Output")

demo = gr.Interface(
    fn=main,
    inputs=inputs,
    outputs=outputs,
    description=description,
)

# Start the web app as soon as the script is executed.
demo.launch()