from typing import List, Union from unittest import TestCase from unittest.mock import Mock import numpy from voicevox_engine.model import AccentPhrase, AudioQuery, Mora from voicevox_engine.synthesis_engine import SynthesisEngine def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray): result = [] # mockとしての適当な処理、特に意味はない for i in range(length): result.append(round(float(phoneme_list[i] * 0.0625 + speaker_id), 2)) return numpy.array(result) def yukarin_sa_mock( length: int, vowel_phoneme_list: numpy.ndarray, consonant_phoneme_list: numpy.ndarray, start_accent_list: numpy.ndarray, end_accent_list: numpy.ndarray, start_accent_phrase_list: numpy.ndarray, end_accent_phrase_list: numpy.ndarray, speaker_id: numpy.ndarray, ): result = [] # mockとしての適当な処理、特に意味はない for i in range(length): result.append( round( float( ( vowel_phoneme_list[0][i] + consonant_phoneme_list[0][i] + start_accent_list[0][i] + end_accent_list[0][i] + start_accent_phrase_list[0][i] + end_accent_phrase_list[0][i] ) * 0.0625 + speaker_id ), 2, ) ) return numpy.array(result)[numpy.newaxis] def decode_mock( length: int, phoneme_size: int, f0: numpy.ndarray, phoneme: numpy.ndarray, speaker_id: Union[numpy.ndarray, int], ): result = [] # mockとしての適当な処理、特に意味はない for i in range(length): # decode forwardはデータサイズがlengthの256倍になるのでとりあえず256回データをresultに入れる for _ in range(256): result.append( float( f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size) + speaker_id ) ) return numpy.array(result) def koreha_arimasuka_base_expected(): return [ AccentPhrase( moras=[ Mora( text="コ", consonant="k", consonant_length=2.44, vowel="o", vowel_length=2.88, pitch=4.38, ), Mora( text="レ", consonant="r", consonant_length=3.06, vowel="e", vowel_length=1.88, pitch=4.0, ), Mora( text="ワ", consonant="w", consonant_length=3.62, vowel="a", vowel_length=1.44, pitch=4.19, ), ], accent=3, pause_mora=None, is_interrogative=False, ), AccentPhrase( moras=[ Mora( text="ア", consonant=None, consonant_length=None, vowel="a", vowel_length=1.44, pitch=1.44, ), Mora( text="リ", consonant="r", consonant_length=3.06, vowel="i", vowel_length=2.31, pitch=4.44, ), Mora( text="マ", consonant="m", consonant_length=2.62, vowel="a", vowel_length=1.44, pitch=3.12, ), Mora( text="ス", consonant="s", consonant_length=3.19, vowel="U", vowel_length=1.38, pitch=0.0, ), Mora( text="カ", consonant="k", consonant_length=2.44, vowel="a", vowel_length=1.44, pitch=2.94, ), ], accent=3, pause_mora=None, is_interrogative=False, ), ] def create_mock_query(accent_phrases): return AudioQuery( accent_phrases=accent_phrases, speedScale=1, pitchScale=0, intonationScale=1, volumeScale=1, prePhonemeLength=0.1, postPhonemeLength=0.1, outputSamplingRate=24000, outputStereo=False, kana="", ) class MockCore: yukarin_s_forward = Mock(side_effect=yukarin_s_mock) yukarin_sa_forward = Mock(side_effect=yukarin_sa_mock) decode_forward = Mock(side_effect=decode_mock) def metas(self): return "" def supported_devices(self): return "" def is_model_loaded(self, speaker_id): return True class TestSynthesisEngineBase(TestCase): def setUp(self): super().setUp() self.synthesis_engine = SynthesisEngine( core=MockCore(), ) self.synthesis_engine._synthesis_impl = Mock() def create_accent_phrases_test_base(self, text: str, expected: List[AccentPhrase]): actual = self.synthesis_engine.create_accent_phrases(text, 1) self.assertEqual( expected, actual, "case(text:" + text + ")", ) def create_synthesis_test_base( self, text: str, expected: List[AccentPhrase], enable_interrogative_upspeak: bool, ): """音声合成時に疑問文モーラ処理を行っているかどうかを検証 (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866) """ accent_phrases = self.synthesis_engine.create_accent_phrases(text, 1) query = create_mock_query(accent_phrases=accent_phrases) self.synthesis_engine.synthesis( query, 0, enable_interrogative_upspeak=enable_interrogative_upspeak ) # _synthesis_implの第一引数に与えられたqueryを検証 actual = self.synthesis_engine._synthesis_impl.call_args[0][0].accent_phrases self.assertEqual( expected, actual, "case(text:" + text + ")", ) def test_create_accent_phrases(self): """accent_phrasesの作成時では疑問文モーラ処理を行わない (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866) """ expected = koreha_arimasuka_base_expected() expected[-1].is_interrogative = True self.create_accent_phrases_test_base(text="これはありますか?", expected=expected) def test_synthesis_interrogative(self): expected = koreha_arimasuka_base_expected() expected[-1].is_interrogative = True expected[-1].moras += [ Mora( text="ア", consonant=None, consonant_length=None, vowel="a", vowel_length=0.15, pitch=expected[-1].moras[-1].pitch + 0.3, ) ] self.create_synthesis_test_base( text="これはありますか?", expected=expected, enable_interrogative_upspeak=True, ) expected = koreha_arimasuka_base_expected() expected[-1].is_interrogative = True self.create_synthesis_test_base( text="これはありますか?", expected=expected, enable_interrogative_upspeak=False, ) expected = koreha_arimasuka_base_expected() self.create_synthesis_test_base( text="これはありますか", expected=expected, enable_interrogative_upspeak=True, ) def nn_base_expected(): return [ AccentPhrase( moras=[ Mora( text="ン", consonant=None, consonant_length=None, vowel="N", vowel_length=1.25, pitch=1.44, ) ], accent=1, pause_mora=None, is_interrogative=False, ) ] expected = nn_base_expected() self.create_synthesis_test_base( text="ん", expected=expected, enable_interrogative_upspeak=True, ) expected = nn_base_expected() expected[-1].is_interrogative = True expected[-1].moras += [ Mora( text="ン", consonant=None, consonant_length=None, vowel="N", vowel_length=0.15, pitch=expected[-1].moras[-1].pitch + 0.3, ) ] self.create_synthesis_test_base( text="ん?", expected=expected, enable_interrogative_upspeak=True, ) expected = nn_base_expected() expected[-1].is_interrogative = True self.create_synthesis_test_base( text="ん?", expected=expected, enable_interrogative_upspeak=False, ) def ltu_base_expected(): return [ AccentPhrase( moras=[ Mora( text="ッ", consonant=None, consonant_length=None, vowel="cl", vowel_length=1.69, pitch=0.0, ) ], accent=1, pause_mora=None, is_interrogative=False, ) ] expected = ltu_base_expected() self.create_synthesis_test_base( text="っ", expected=expected, enable_interrogative_upspeak=True, ) expected = ltu_base_expected() expected[-1].is_interrogative = True self.create_synthesis_test_base( text="っ?", expected=expected, enable_interrogative_upspeak=True, ) expected = ltu_base_expected() expected[-1].is_interrogative = True self.create_synthesis_test_base( text="っ?", expected=expected, enable_interrogative_upspeak=False, ) def su_base_expected(): return [ AccentPhrase( moras=[ Mora( text="ス", consonant="s", consonant_length=3.19, vowel="u", vowel_length=3.5, pitch=5.94, ) ], accent=1, pause_mora=None, is_interrogative=False, ) ] expected = su_base_expected() self.create_synthesis_test_base( text="す", expected=expected, enable_interrogative_upspeak=True, ) expected = su_base_expected() expected[-1].is_interrogative = True expected[-1].moras += [ Mora( text="ウ", consonant=None, consonant_length=None, vowel="u", vowel_length=0.15, pitch=expected[-1].moras[-1].pitch + 0.3, ) ] self.create_synthesis_test_base( text="す?", expected=expected, enable_interrogative_upspeak=True, ) expected = su_base_expected() expected[-1].is_interrogative = True self.create_synthesis_test_base( text="す?", expected=expected, enable_interrogative_upspeak=False, )