Spaces:
Build error
Build error
from copy import deepcopy | |
from dataclasses import dataclass | |
from itertools import chain | |
from typing import Dict, List, Tuple | |
import numpy as np | |
import pyworld as pw | |
from scipy.signal import resample | |
from .metas.Metas import Speaker, SpeakerSupportPermittedSynthesisMorphing, StyleInfo | |
from .metas.MetasStore import construct_lookup | |
from .model import AudioQuery, MorphableTargetInfo, SpeakerNotFoundError | |
from .synthesis_engine import SynthesisEngine | |
# FIXME: ndarray type hint, https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/blob/2b64f86197573497c685c785c6e0e743f407b63e/pyworld/pyworld.pyx#L398 # noqa | |
class MorphingParameter: | |
fs: int | |
frame_period: float | |
base_f0: np.ndarray | |
base_aperiodicity: np.ndarray | |
base_spectrogram: np.ndarray | |
target_spectrogram: np.ndarray | |
def create_morphing_parameter( | |
base_wave: np.ndarray, | |
target_wave: np.ndarray, | |
fs: int, | |
) -> MorphingParameter: | |
frame_period = 1.0 | |
base_f0, base_time_axis = pw.harvest(base_wave, fs, frame_period=frame_period) | |
base_spectrogram = pw.cheaptrick(base_wave, base_f0, base_time_axis, fs) | |
base_aperiodicity = pw.d4c(base_wave, base_f0, base_time_axis, fs) | |
target_f0, morph_time_axis = pw.harvest(target_wave, fs, frame_period=frame_period) | |
target_spectrogram = pw.cheaptrick(target_wave, target_f0, morph_time_axis, fs) | |
target_spectrogram.resize(base_spectrogram.shape) | |
return MorphingParameter( | |
fs=fs, | |
frame_period=frame_period, | |
base_f0=base_f0, | |
base_aperiodicity=base_aperiodicity, | |
base_spectrogram=base_spectrogram, | |
target_spectrogram=target_spectrogram, | |
) | |
def get_morphable_targets( | |
speakers: List[Speaker], | |
base_speakers: List[int], | |
) -> List[Dict[int, MorphableTargetInfo]]: | |
""" | |
speakers: 全話者の情報 | |
base_speakers: モーフィング可能か判定したいベースの話者リスト(スタイルID) | |
""" | |
speaker_lookup = construct_lookup(speakers) | |
morphable_targets_arr = [] | |
for base_speaker in base_speakers: | |
morphable_targets = dict() | |
for style in chain.from_iterable(speaker.styles for speaker in speakers): | |
morphable_targets[style.id] = MorphableTargetInfo( | |
is_morphable=is_synthesis_morphing_permitted( | |
speaker_lookup=speaker_lookup, | |
base_speaker=base_speaker, | |
target_speaker=style.id, | |
) | |
) | |
morphable_targets_arr.append(morphable_targets) | |
return morphable_targets_arr | |
def is_synthesis_morphing_permitted( | |
speaker_lookup: Dict[int, Tuple[Speaker, StyleInfo]], | |
base_speaker: int, | |
target_speaker: int, | |
) -> bool: | |
""" | |
指定されたspeakerがモーフィング可能かどうか返す | |
speakerが見つからない場合はSpeakerNotFoundErrorを送出する | |
""" | |
base_speaker_data = speaker_lookup[base_speaker] | |
target_speaker_data = speaker_lookup[target_speaker] | |
if base_speaker_data is None or target_speaker_data is None: | |
raise SpeakerNotFoundError( | |
base_speaker if base_speaker_data is None else target_speaker | |
) | |
base_speaker_info, _ = base_speaker_data | |
target_speaker_info, _ = target_speaker_data | |
base_speaker_uuid = base_speaker_info.speaker_uuid | |
target_speaker_uuid = target_speaker_info.speaker_uuid | |
base_speaker_morphing_info: SpeakerSupportPermittedSynthesisMorphing = ( | |
base_speaker_info.supported_features.permitted_synthesis_morphing | |
) | |
target_speaker_morphing_info: SpeakerSupportPermittedSynthesisMorphing = ( | |
target_speaker_info.supported_features.permitted_synthesis_morphing | |
) | |
# 禁止されている場合はFalse | |
if ( | |
base_speaker_morphing_info == SpeakerSupportPermittedSynthesisMorphing.NOTHING | |
or target_speaker_morphing_info | |
== SpeakerSupportPermittedSynthesisMorphing.NOTHING | |
): | |
return False | |
# 同一話者のみの場合は同一話者判定 | |
if ( | |
base_speaker_morphing_info == SpeakerSupportPermittedSynthesisMorphing.SELF_ONLY | |
or target_speaker_morphing_info | |
== SpeakerSupportPermittedSynthesisMorphing.SELF_ONLY | |
): | |
return base_speaker_uuid == target_speaker_uuid | |
# 念のため許可されているかチェック | |
return ( | |
base_speaker_morphing_info == SpeakerSupportPermittedSynthesisMorphing.ALL | |
and target_speaker_morphing_info == SpeakerSupportPermittedSynthesisMorphing.ALL | |
) | |
def synthesis_morphing_parameter( | |
engine: SynthesisEngine, | |
query: AudioQuery, | |
base_speaker: int, | |
target_speaker: int, | |
) -> MorphingParameter: | |
query = deepcopy(query) | |
# 不具合回避のためデフォルトのサンプリングレートでWORLDに掛けた後に指定のサンプリングレートに変換する | |
query.outputSamplingRate = engine.default_sampling_rate | |
# WORLDに掛けるため合成はモノラルで行う | |
query.outputStereo = False | |
base_wave = engine.synthesis(query=query, speaker_id=base_speaker).astype("float") | |
target_wave = engine.synthesis(query=query, speaker_id=target_speaker).astype( | |
"float" | |
) | |
return create_morphing_parameter( | |
base_wave=base_wave, | |
target_wave=target_wave, | |
fs=query.outputSamplingRate, | |
) | |
def synthesis_morphing( | |
morph_param: MorphingParameter, | |
morph_rate: float, | |
output_fs: int, | |
output_stereo: bool = False, | |
) -> np.ndarray: | |
""" | |
指定した割合で、パラメータをもとにモーフィングした音声を生成します。 | |
Parameters | |
---------- | |
morph_param : MorphingParameter | |
`synthesis_morphing_parameter`または`create_morphing_parameter`で作成したパラメータ | |
morph_rate : float | |
モーフィングの割合 | |
0.0でベースの話者、1.0でターゲットの話者に近づきます。 | |
Returns | |
------- | |
generated : np.ndarray | |
モーフィングした音声 | |
Raises | |
------- | |
ValueError | |
morph_rate ∈ [0, 1] | |
""" | |
if morph_rate < 0.0 or morph_rate > 1.0: | |
raise ValueError("morph_rateは0.0から1.0の範囲で指定してください") | |
morph_spectrogram = ( | |
morph_param.base_spectrogram * (1.0 - morph_rate) | |
+ morph_param.target_spectrogram * morph_rate | |
) | |
y_h = pw.synthesize( | |
morph_param.base_f0, | |
morph_spectrogram, | |
morph_param.base_aperiodicity, | |
morph_param.fs, | |
morph_param.frame_period, | |
) | |
# TODO: synthesis_engine.py でのリサンプル処理と共通化する | |
if output_fs != morph_param.fs: | |
y_h = resample(y_h, output_fs * len(y_h) // morph_param.fs) | |
if output_stereo: | |
y_h = np.array([y_h, y_h]).T | |
return y_h | |