from logging import getLogger
from typing import Any, Dict, List, Optional

import numpy as np
from pyopenjtalk import tts
from scipy.signal import resample

from ...model import AccentPhrase, AudioQuery
from ...synthesis_engine import SynthesisEngineBase
from ...synthesis_engine.synthesis_engine import to_flatten_moras

class MockSynthesisEngine(SynthesisEngineBase):
    """
    SynthesisEngine [Mock]
    """

    def __init__(
        self,
        speakers: str,
        supported_devices: Optional[str] = None,
    ):
        """
        __init__ [Mock]
        """
        super().__init__()
        self._speakers = speakers
        self._supported_devices = supported_devices
        self.default_sampling_rate = 24000

    @property
    def speakers(self) -> str:
        return self._speakers

    @property
    def supported_devices(self) -> Optional[str]:
        return self._supported_devices

    def replace_phoneme_length(
        self, accent_phrases: List[AccentPhrase], speaker_id: int
    ) -> List[AccentPhrase]:
        """
        replace_phoneme_length: return the input accent_phrases unchanged [Mock]

        Parameters
        ----------
        accent_phrases : List[AccentPhrase]
            list of accent phrases
        speaker_id : int
            speaker ID

        Returns
        -------
        List[AccentPhrase]
            list of accent phrases (unchanged)
        """
        return accent_phrases

    def replace_mora_pitch(
        self, accent_phrases: List[AccentPhrase], speaker_id: int
    ) -> List[AccentPhrase]:
        """
        replace_mora_pitch: return the input accent_phrases unchanged [Mock]

        Parameters
        ----------
        accent_phrases : List[AccentPhrase]
            list of accent phrases
        speaker_id : int
            speaker ID

        Returns
        -------
        List[AccentPhrase]
            list of accent phrases (unchanged)
        """
        return accent_phrases

    def _synthesis_impl(self, query: AudioQuery, speaker_id: int) -> np.ndarray:
        """
        synthesis: synthesize speech without using voicevox core [Mock]

        Parameters
        ----------
        query : AudioQuery
            JSON obtained from the /audio_query API
        speaker_id : int
            speaker ID

        Returns
        -------
        wave [npt.NDArray[np.int16]]
            speech waveform as a NumPy array
        """
        # recover the text in katakana from the accent phrases
        flatten_moras = to_flatten_moras(query.accent_phrases)
        kana_text = "".join([mora.text for mora in flatten_moras])

        wave = self.forward(kana_text)

        # apply volume scaling
        wave *= query.volumeScale

        return wave.astype("int16")

    def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
        """
        forward: TTS via pyopenjtalk.tts()
        see also the docstring of SynthesisEngine [Mock]

        Parameters
        ----------
        text : str
            input string (e.g. the text to read aloud, converted to katakana)

        Returns
        -------
        wave [npt.NDArray[np.int16]]
            speech waveform as a NumPy array

        Note
        -------
        The synthesis done here does not reflect tuning parameters (pitch etc.).

        # output spec of pyopenjtalk.tts()
        dtype=np.float64, 16 bit, mono, 48000 Hz

        # about the resampling
        The output is resampled to 24 kHz to match the non-mock implementation
        (decode_forward).
        """
        logger = getLogger("uvicorn")  # use the uvicorn logger, since this runs inside FastAPI / Uvicorn
        logger.info("[Mock] input text: %s" % text)
        wave, sr = tts(text)
        wave = resample(wave, 24000 * len(wave) // 48000)
        return wave
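

if __name__ == "__main__":
    # Minimal usage sketch (an illustration, not part of the original module):
    # synthesize a katakana string with the mock engine and print the waveform
    # length and the default sampling rate. The speaker metadata string "[]"
    # passed below is a hypothetical placeholder; real callers obtain it from
    # the engine's configuration.
    engine = MockSynthesisEngine(speakers="[]")
    wave = engine.forward("コンニチワ")
    print(len(wave), engine.default_sampling_rate)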