Spaces:

Emu-Academic
/

sovits-emu-voice-changer

Running

File size: 4,579 Bytes

import io
import os

# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
import gradio as gr
import librosa
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

config_path = "configs/config.json"

model = Svc("logs/44k/G_130400.pth", "configs/config.json", cluster_model_path="logs/44k/kmeans.pt")



def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale):
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    # print(audio.shape,sampling_rate)
    duration = audio.shape[0] / sampling_rate
    if duration > 90:
        return "请上传小于90s的音频，需要转换长音频请本地进行转换", None
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    print(audio.shape)
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 16000, format="wav")
    print( cluster_ratio, auto_f0, noise_scale)
    _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale)
    return "Success", (44100, _audio)


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""
                # sovits-emu-voice-transform ｜ OtoriEmu的在线变声器

                [![Visitors](https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FMashiroSA%2Fsovits-emu-voice-transform&labelColor=%23f47373&countColor=%23555555)](https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FMashiroSA%2Fsovits-emu-voice-transform)

                *Modified from public demo based on so-vits-svc 4.0. *
                基于so-vits-svc 4.0的公开demo修改而成。
                *The dialogue training model based on the role Otori Emu has shown good results in dialogue, but the vocal of music conversion is not good. *
                所使用的基于角色鳳えむ的对话训练的模型，在对话中具有良好效果，乐音转换欠佳。
                *Only authorized to run on huggingface, running with free instance so conversion is much slower. Please be patient. *
                仅授权在huggingface上运行，运行使用免费实例转换很慢很慢很慢很慢，请耐心等待。
                
                ```text
                For academic exchange only and not for illegal purposes. We have no relationship or interest with SEGA or related organizations.
                The model derivation output is only similar to Otori Emu and there is inevitable loss, which cannot be fully simulated. 
                If you have any questions, please send an email or forum for inquiry.
                ```
                """)
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(label="音色", choices=spks, value=spks[0])
            vc_input3 = gr.Audio(label="上传音频（长度小于90秒）")
            vc_transform = gr.Number(label="变调（整数，可以正负，半音数量，升高八度就是12，当你觉得音色不准确时可以适当调高或降低）", value=0)
            cluster_ratio = gr.Number(label="聚类模型混合比例，0-1之间，默认为0不启用聚类，能提升音色相似度，但会导致咬字下降（如果使用建议0.5左右）", value=0)
            auto_f0 = gr.Checkbox(label="自动f0预测，配合聚类模型f0预测效果更好,会导致变调功能失效（仅限转换语音，歌声不要勾选此项会究极跑调）", value=False)
            slice_db = gr.Number(label="切片阈值", value=-40)
            noise_scale = gr.Number(label="noise_scale 建议不要动，会影响音质，玄学参数", value=0.4)
            vc_submit = gr.Button("转换", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")
        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale], [vc_output1, vc_output2])

    app.launch()