Spaces:

Emu-Academic
/

sovits-emu-voice-changer

Running

sovits-emu-voice-changer / inference /infer_tool_grad.py

MashiroSA

feat: init

b87af40 over 1 year ago

5.67 kB

	import hashlib
	import json
	import logging
	import os
	import time
	from pathlib import Path
	import io
	import librosa
	import maad
	import numpy as np
	from inference import slicer
	import parselmouth
	import soundfile
	import torch
	import torchaudio

	from hubert import hubert_model
	import utils
	from models import SynthesizerTrn
	# Raise the 'numba' and 'matplotlib' loggers to WARNING so their
	# DEBUG/INFO records are suppressed.
	logging.getLogger('numba').setLevel(logging.WARNING)
	logging.getLogger('matplotlib').setLevel(logging.WARNING)

	def resize2d_f0(x, target_len):
	    """Resample an F0 contour to ``target_len`` points by linear interpolation.

	    Values below 0.001 are replaced by NaN before interpolating, and every
	    NaN in the interpolated result is turned back into 0 on output.

	    Args:
	        x: 1-D sequence of F0 values (list or array, any numeric dtype).
	        target_len: Number of points in the returned contour.

	    Returns:
	        A float ``numpy.ndarray`` of length ``target_len``.
	    """
	    # Force a float copy: NaN cannot be stored in an integer array, so an
	    # integer input would otherwise raise on the assignment below. The copy
	    # also keeps the caller's data untouched.
	    source = np.array(x, dtype=float)
	    source[source < 0.001] = np.nan

	    n_source = len(source)
	    # target_len query positions spread evenly over [0, n_source).
	    query = np.arange(0, n_source * target_len, n_source) / target_len
	    target = np.interp(query, np.arange(0, n_source), source)
	    return np.nan_to_num(target)

	def get_f0(x, p_len, f0_up_key=0):
	    """Extract an F0 contour with Praat (parselmouth) and quantise it.

	    Args:
	        x: Audio samples; they are handed to ``parselmouth.Sound`` with a
	            sampling frequency of 16000 Hz.
	        p_len: Desired number of frames. When the extracted contour is
	            shorter it is zero-padded on both sides to this length; a
	            contour that is already longer is left as is.
	        f0_up_key: Pitch shift in semitones applied to the contour.

	    Returns:
	        ``(f0_coarse, f0)`` where ``f0`` is the (shifted) contour in Hz and
	        ``f0_coarse`` is its mel-scaled quantisation to integers in
	        ``[1, 255]``; frames with no pitch (0 Hz) map to 1.
	    """
	    time_step = 160 / 16000 * 1000  # analysis step in milliseconds
	    f0_min = 50
	    f0_max = 1100
	    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
	    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

	    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
	        time_step=time_step / 1000, voicing_threshold=0.6,
	        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

	    # Split the missing frames between the head (rounded up) and the tail.
	    pad_size = (p_len - len(f0) + 1) // 2
	    pad_tail = p_len - len(f0) - pad_size
	    if pad_size > 0 or pad_tail > 0:
	        f0 = np.pad(f0, [[pad_size, pad_tail]], mode='constant')

	    f0 *= pow(2, f0_up_key / 12)
	    f0_mel = 1127 * np.log(1 + f0 / 700)
	    voiced = f0_mel > 0
	    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
	    f0_mel[f0_mel <= 1] = 1
	    f0_mel[f0_mel > 255] = 255
	    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
	    # NumPy 1.24; the builtin yields the same dtype.
	    f0_coarse = np.rint(f0_mel).astype(int)
	    return f0_coarse, f0

	def clean_pitch(input_pitch):
	    """Flatten a pitch track that consists almost entirely of the value 1.

	    When more than 90% of the entries equal 1, every remaining entry is
	    overwritten with 1 as well. The array is modified in place and the same
	    object is returned.
	    """
	    ones_ratio = np.sum(input_pitch == 1) / len(input_pitch)
	    if not ones_ratio > 0.9:
	        return input_pitch
	    input_pitch[input_pitch != 1] = 1
	    return input_pitch


	def plt_pitch(input_pitch):
	    """Return a float copy of ``input_pitch`` with every 1 replaced by NaN.

	    The input array itself is left unchanged.
	    """
	    as_float = input_pitch.astype(float)
	    return np.where(as_float == 1, np.nan, as_float)


	def f0_to_pitch(ff):
	    """Convert frequency in Hz to a note number where 440 Hz maps to 69.

	    Each doubling of the frequency adds 12 to the result.
	    """
	    semitones_from_a4 = 12 * np.log2(ff / 440)
	    return 69 + semitones_from_a4


	def fill_a_to_b(a, b):
	    """Grow list ``a`` in place to the length of ``b`` by repeating ``a[0]``.

	    Nothing happens when ``a`` is already at least as long as ``b``.
	    Returns ``None``.
	    """
	    missing = len(b) - len(a)
	    if missing > 0:
	        a.extend([a[0]] * missing)

	def mkdir(paths: list):
	    """Create each directory in ``paths`` unless something already exists there.

	    Only the leaf directory is created (``os.mkdir``), so a missing parent
	    still raises ``FileNotFoundError``.

	    Args:
	        paths: Directory paths to create.
	    """
	    for path in paths:
	        # Attempt the creation directly instead of checking first: a separate
	        # exists() test can race with another process creating the same path.
	        try:
	            os.mkdir(path)
	        except FileExistsError:
	            pass


	class VitsSvc(object):
	    """Voice-conversion inference wrapper.

	    Holds a HuBERT model (``utils.get_hubert_model()``) used to extract
	    content units and, once ``loadCheckpoint`` has been called, a
	    ``SynthesizerTrn`` model that renders audio from units + F0.

	    Attributes:
	        device: ``torch.device`` that inputs and models are moved to.
	        SVCVITS: The loaded ``SynthesizerTrn`` or ``None`` before loading.
	        hps: Hyper-parameters read from the checkpoint's ``config.json``.
	        speakers: ``hps.spk``; indexed by the ``speaker_id`` given to
	            ``infer`` to obtain the numeric speaker id.
	        hubert_soft: The HuBERT model.
	    """

	    def __init__(self):
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.SVCVITS = None
	        self.hps = None
	        self.speakers = None
	        # NOTE(review): the HuBERT model is not moved to ``self.device`` here;
	        # confirm whether get_hubert_model() places it or whether callers are
	        # expected to call set_device() first.
	        self.hubert_soft = utils.get_hubert_model()

	    def set_device(self, device):
	        """Move the HuBERT model (and the synthesizer, if loaded) to ``device``."""
	        self.device = torch.device(device)
	        self.hubert_soft.to(self.device)
	        if self.SVCVITS is not None:
	            self.SVCVITS.to(self.device)

	    def loadCheckpoint(self, path):
	        """Load ``checkpoints/<path>/config.json`` and ``model.pth``.

	        Builds the synthesizer from the config, loads its weights, switches
	        it to eval mode on ``self.device`` and records the speaker table.
	        """
	        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
	        self.SVCVITS = SynthesizerTrn(
	            self.hps.data.filter_length // 2 + 1,
	            self.hps.train.segment_size // self.hps.data.hop_length,
	            **self.hps.model)
	        utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
	        self.SVCVITS.eval().to(self.device)
	        self.speakers = self.hps.spk

	    def get_units(self, source, sr):
	        """Run HuBERT on ``source`` (a batch axis is prepended) and return its units.

	        ``sr`` is accepted for interface compatibility and is not used.
	        """
	        source = source.unsqueeze(0).to(self.device)
	        with torch.inference_mode():
	            units = self.hubert_soft.units(source)
	            return units

	    def get_unit_pitch(self, in_path, tran):
	        """Load audio from ``in_path`` and return ``(units, f0)``.

	        The audio is resampled to 16 kHz and down-mixed to one channel.
	        ``tran`` is the pitch shift in semitones forwarded to ``get_f0``.
	        The F0 track is requested with twice as many frames as there are
	        unit frames.
	        """
	        source, sr = torchaudio.load(in_path)
	        source = torchaudio.functional.resample(source, sr, 16000)
	        # torchaudio.load returns (channels, frames) by default, so the channel
	        # count lives on axis 0; average only when there is more than one.
	        if source.dim() == 2 and source.shape[0] > 1:
	            source = torch.mean(source, dim=0).unsqueeze(0)
	        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
	        _, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
	        return soft, f0

	    def infer(self, speaker_id, tran, raw_path):
	        """Convert one audio clip.

	        Args:
	            speaker_id: Key into ``self.speakers``.
	            tran: Pitch shift in semitones.
	            raw_path: Path or file-like object accepted by ``torchaudio.load``.

	        Returns:
	            ``(audio, n_samples)`` where ``audio`` is a 1-D float tensor and
	            ``n_samples`` its length.
	        """
	        speaker_id = self.speakers[speaker_id]
	        sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
	        soft, pitch = self.get_unit_pitch(raw_path, tran)
	        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
	        stn_tst = torch.FloatTensor(soft)
	        with torch.no_grad():
	            x_tst = stn_tst.unsqueeze(0).to(self.device)
	            # Repeat every unit frame twice so the unit sequence has as many
	            # frames as the F0 track requested in get_unit_pitch.
	            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
	            audio = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
	        return audio, audio.shape[-1]

	    @staticmethod
	    def _to_float32(audio):
	        """Return ``audio`` as float32; integer PCM is scaled by its dtype maximum."""
	        audio = np.asarray(audio)
	        if np.issubdtype(audio.dtype, np.integer):
	            return (audio / np.iinfo(audio.dtype).max).astype(np.float32)
	        # Non-integer input is taken as already normalised
	        # (np.iinfo would raise on a floating dtype).
	        return audio.astype(np.float32)

	    @staticmethod
	    def _to_int16(audio):
	        """Scale float audio by 32768 and convert to int16, clipping instead of wrapping."""
	        scaled = np.asarray(audio, dtype=np.float64) * 32768.0
	        # +1.0 scales to 32768, which does not fit in int16 and would wrap to
	        # -32768 without the clip.
	        return np.clip(scaled, -32768, 32767).astype('int16')

	    def inference(self, srcaudio, chara, tran, slice_db):
	        """Convert a whole recording, slice by slice.

	        Args:
	            srcaudio: ``(sampling_rate, samples)``; ``samples`` is a NumPy
	                array of integer PCM or float audio, optionally with a
	                second (channel) axis.
	            chara: Speaker key forwarded to ``infer``.
	            tran: Pitch shift in semitones.
	            slice_db: Threshold passed to ``slicer.cut`` as ``db_thresh``.

	        Returns:
	            ``(self.hps.data.sampling_rate, int16 samples)``.
	        """
	        import tempfile

	        sampling_rate, audio = srcaudio
	        audio = self._to_float32(audio)
	        if audio.ndim > 1:
	            audio = librosa.to_mono(audio.transpose(1, 0))
	        if sampling_rate != 16000:
	            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

	        pieces = []
	        # A private temporary directory keeps concurrent calls from overwriting
	        # each other's intermediate file and removes it afterwards.
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            wav_path = os.path.join(tmp_dir, "tmpwav.wav")
	            soundfile.write(wav_path, audio, 16000, format="wav")
	            chunks = slicer.cut(wav_path, db_thresh=slice_db)
	            audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
	            for slice_tag, data in audio_data:
	                if slice_tag:
	                    # Tagged slices are not converted; emit zeros of the
	                    # same duration at the output sampling rate.
	                    length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
	                    pieces.append(np.zeros(length))
	                    continue
	                raw_path = io.BytesIO()
	                soundfile.write(raw_path, data, audio_sr, format="wav")
	                raw_path.seek(0)
	                out_audio, _ = self.infer(chara, tran, raw_path)
	                pieces.append(out_audio.cpu().numpy())

	        merged = np.concatenate(pieces) if pieces else np.zeros(0)
	        return (self.hps.data.sampling_rate, self._to_int16(merged))