Spaces:
Running
Running
File size: 3,893 Bytes
e3c945a 6e41012 e3c945a 6e41012 e3c945a 842c3cd 2f245d3 6e41012 842c3cd 6e41012 43440f3 6e41012 43440f3 e3c945a 5199f9b e3c945a c86b4c1 9847bf2 c86b4c1 842c3cd c86b4c1 f5dd311 c86b4c1 2a42a98 e3c945a 6e41012 2f245d3 43440f3 e3c945a 6e41012 e3c945a b29c6aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import io
import os
# Fetch the HuBERT checkpoint into hubert/ at startup (presumably loaded by
# inference.infer_tool -- confirm against that module).
#
# The URL must go through the Hub's "resolve" endpoint, which serves the raw
# file. The "blob" endpoint returns the HTML file-viewer page, so wget would
# store a web page under the checkpoint's name.
#
# wget never overwrites an existing file (it writes a ".1" copy next to it),
# so the download is skipped when the checkpoint is already present.
if not os.path.exists("hubert/checkpoint_best_legacy_500.pt"):
    os.system("wget -P hubert/ https://huggingface.co/spaces/MarcusSu1216/XingTong/resolve/main/hubert/checkpoint_best_legacy_500.pt")
import gradio as gr
import librosa
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging
# Raise the threshold of chatty third-party loggers to WARNING.
for _noisy_logger in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

config_path = "configs/config.json"

# Build the voice-conversion model once at import time so every request
# handled by the UI reuses the same instance.
model = Svc(
    "logs/44k/G_99200.pth",
    config_path,
    cluster_model_path="logs/44k/kmeans_10000.pt",
)
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, noise_scale, slice_db=-40):
    """Convert an uploaded clip with the loaded model and return it for Gradio.

    Args:
        sid: Speaker name forwarded to ``model.slice_inference``.
        input_audio: ``(sampling_rate, samples)`` tuple as delivered by
            ``gr.Audio``, or ``None`` when nothing was uploaded.
        vc_transform: Pitch-shift value forwarded to the model (the UI labels
            it as a number of semitones).
        auto_f0: Automatic f0 prediction flag forwarded to the model.
        cluster_ratio: Cluster-model mixing ratio forwarded to the model.
        noise_scale: Noise scale forwarded to the model.
        slice_db: Slicing threshold forwarded to ``model.slice_inference``.
            The body used this name without ever defining it, which raised
            ``NameError`` on every conversion. It is now a trailing parameter
            so existing six-argument callers keep working. The default of
            -40 is believed to match the upstream so-vits-svc demo -- confirm.

    Returns:
        A ``(message, audio)`` pair. ``audio`` is ``(44100, samples)`` on
        success and ``None`` when the input is missing or longer than 90 s.
    """
    if input_audio is None:
        return "You need to upload an audio", None

    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 90:
        return "请上传小于90s的音频,需要转换长音频请本地进行转换", None

    if np.issubdtype(audio.dtype, np.integer):
        # Scale integer PCM by the dtype's maximum value.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        # np.iinfo raises ValueError for float dtypes; float samples are
        # assumed to be already normalised -- TODO confirm with the caller.
        audio = audio.astype(np.float32)

    if len(audio.shape) > 1:
        # Samples arrive as (frames, channels); librosa.to_mono expects
        # channels on the first axis.
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    print(audio.shape)

    # slice_inference takes a file path, so the clip goes through disk.
    # NOTE(review): this fixed file name is shared by all requests.
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 16000, format="wav")
    print(cluster_ratio, auto_f0, noise_scale)

    converted = model.slice_inference(
        out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale
    )
    return "转换成功", (44100, converted)
# Text shown at the top of the introduction tab (rendered as Markdown).
_INTRO_MARKDOWN = """
星瞳_Official的语音在线合成,基于so-vits-svc-4.0生成。\n
使用须知:\n
1、请使用伴奏和声去除干净的人声素材,时长小于90秒,格式为mp3或wav。\n
2、去除伴奏推荐使用UVR5软件,B站上有详细教程。\n
3、条件不支持推荐使用以下几个去伴奏的网站:\n
https://vocalremover.org/zh/\n
https://tuanziai.com/vocal-remover/upload\n
https://www.lalal.ai/zh-hans/\n
4、在线版服务器为2核16G免费版,转换效率较慢请耐心等待。\n
5、使用此模型请标注作者:南下扶苏,以及该项目地址。\n
6、有问题可以在B站私聊我反馈:https://space.bilibili.com/38523418\n
7、语音模型转换出的音频请勿用于商业化。
"""

# Build the single-tab UI: intro text, conversion controls, and outputs.
with gr.Blocks() as app:
    with gr.Tabs():
        with gr.TabItem("介绍"):
            gr.Markdown(value=_INTRO_MARKDOWN)
            # Speaker names known to the model; the dropdown below offers a
            # fixed choice and does not read this list.
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(label="音色", choices=["XT4.0"], value="XT4.0")
            vc_input3 = gr.Audio(label="上传音频(长度建议小于90秒)")
            vc_transform = gr.Number(
                label="变调(整数,可以正负,半音数量,升高八度就是12)",
                value=0,
            )
            cluster_ratio = gr.Number(
                label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)",
                value=0,
            )
            auto_f0 = gr.Checkbox(
                label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会究极跑调)",
                value=False,
            )
            noise_scale = gr.Number(
                label="noise_scale 建议不要动,会影响音质,玄学参数",
                value=0.4,
            )
            vc_submit = gr.Button("转换", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")

        # Wire the button to vc_fn; the input order matches its parameters.
        _click_inputs = [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, noise_scale]
        _click_outputs = [vc_output1, vc_output2]
        vc_submit.click(vc_fn, _click_inputs, _click_outputs)

app.launch()
|