import math
from copy import deepcopy
from random import random
from typing import Union
from unittest import TestCase
from unittest.mock import Mock

import numpy

from voicevox_engine.acoustic_feature_extractor import OjtPhoneme
from voicevox_engine.model import AccentPhrase, AudioQuery, Mora
from voicevox_engine.synthesis_engine import SynthesisEngine

# TODO: import from voicevox_engine.synthesis_engine.mora
from voicevox_engine.synthesis_engine.synthesis_engine import (
    mora_phoneme_list,
    pre_process,
    split_mora,
    to_flatten_moras,
    to_phoneme_data_list,
    unvoiced_mora_phoneme_list,
)
def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray):
    result = []
    # Arbitrary mock computation; the values themselves have no meaning
    for i in range(length):
        result.append(float(phoneme_list[i] * 0.5 + speaker_id))
    return numpy.array(result)
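# NOTE: for every phoneme this mock returns phoneme_id * 0.5 + speaker_id; the
# expected lengths in test_replace_phoneme_length are rebuilt from the same formula.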
def yukarin_sa_mock(
    length: int,
    vowel_phoneme_list: numpy.ndarray,
    consonant_phoneme_list: numpy.ndarray,
    start_accent_list: numpy.ndarray,
    end_accent_list: numpy.ndarray,
    start_accent_phrase_list: numpy.ndarray,
    end_accent_phrase_list: numpy.ndarray,
    speaker_id: numpy.ndarray,
):
    result = []
    # Arbitrary mock computation; the values themselves have no meaning
    for i in range(length):
        result.append(
            float(
                (
                    vowel_phoneme_list[0][i]
                    + consonant_phoneme_list[0][i]
                    + start_accent_list[0][i]
                    + end_accent_list[0][i]
                    + start_accent_phrase_list[0][i]
                    + end_accent_phrase_list[0][i]
                )
                * 0.5
                + speaker_id
            )
        )
    return numpy.array(result)[numpy.newaxis]
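# NOTE: inputs are read as x[0][i] and the result is wrapped with numpy.newaxis,
# matching the leading batch axis of the arrays handed to yukarin_sa_forward;
# test_replace_mora_pitch reconstructs the same sum * 0.5 + speaker_id formula.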
def decode_mock(
    length: int,
    phoneme_size: int,
    f0: numpy.ndarray,
    phoneme: numpy.ndarray,
    speaker_id: Union[numpy.ndarray, int],
):
    result = []
    # Arbitrary mock computation; the values themselves have no meaning
    for i in range(length):
        # decode forward makes the data 256 times the length, so for now
        # append the value to result 256 times
        for _ in range(256):
            result.append(
                float(
                    f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size)
                    + speaker_id
                )
            )
    return numpy.array(result)
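# NOTE: decode_forward upsamples by a factor of 256, so this mock emits 256
# samples per input frame; synthesis_test_base relies on this together with the
# round(phoneme_length * 24000 / 256) frame count used when checking lengths.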
class MockCore:
    yukarin_s_forward = Mock(side_effect=yukarin_s_mock)
    yukarin_sa_forward = Mock(side_effect=yukarin_sa_mock)
    decode_forward = Mock(side_effect=decode_mock)

    def metas(self):
        return ""

    def supported_devices(self):
        return ""

    def is_model_loaded(self, speaker_id):
        return True
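# Wrapping the functions above in Mock(side_effect=...) keeps their behaviour
# while recording call arguments, so the tests can read them back via call_args.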
class TestSynthesisEngine(TestCase):
    def setUp(self):
        super().setUp()
        self.str_list_hello_hiho = (
            "sil k o N n i ch i w a pau h i h o d e s U sil".split()
        )
        self.phoneme_data_list_hello_hiho = [
            OjtPhoneme(phoneme=p, start=i, end=i + 1)
            for i, p in enumerate(
                "pau k o N n i ch i w a pau h i h o d e s U pau".split()
            )
        ]
        self.accent_phrases_hello_hiho = [
            AccentPhrase(
                moras=[
                    Mora(
                        text="コ",
                        consonant="k",
                        consonant_length=0.0,
                        vowel="o",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ン",
                        consonant=None,
                        consonant_length=None,
                        vowel="N",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ニ",
                        consonant="n",
                        consonant_length=0.0,
                        vowel="i",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="チ",
                        consonant="ch",
                        consonant_length=0.0,
                        vowel="i",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ワ",
                        consonant="w",
                        consonant_length=0.0,
                        vowel="a",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                ],
                accent=5,
                pause_mora=Mora(
                    text="、",
                    consonant=None,
                    consonant_length=None,
                    vowel="pau",
                    vowel_length=0.0,
                    pitch=0.0,
                ),
            ),
            AccentPhrase(
                moras=[
                    Mora(
                        text="ヒ",
                        consonant="h",
                        consonant_length=0.0,
                        vowel="i",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ホ",
                        consonant="h",
                        consonant_length=0.0,
                        vowel="o",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="デ",
                        consonant="d",
                        consonant_length=0.0,
                        vowel="e",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ス",
                        consonant="s",
                        consonant_length=0.0,
                        vowel="U",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                ],
                accent=1,
                pause_mora=None,
            ),
        ]

        core = MockCore()
        self.yukarin_s_mock = core.yukarin_s_forward
        self.yukarin_sa_mock = core.yukarin_sa_forward
        self.decode_mock = core.decode_forward
        self.synthesis_engine = SynthesisEngine(
            core=core,
        )
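        # The attributes above keep direct handles on MockCore's Mock objects so
        # that each test can inspect the keyword arguments of the engine's most
        # recent forward call via call_args.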
    def test_to_flatten_moras(self):
        flatten_moras = to_flatten_moras(self.accent_phrases_hello_hiho)
        self.assertEqual(
            flatten_moras,
            self.accent_phrases_hello_hiho[0].moras
            + [self.accent_phrases_hello_hiho[0].pause_mora]
            + self.accent_phrases_hello_hiho[1].moras,
        )

    def test_to_phoneme_data_list(self):
        phoneme_data_list = to_phoneme_data_list(self.str_list_hello_hiho)
        self.assertEqual(phoneme_data_list, self.phoneme_data_list_hello_hiho)
    def test_split_mora(self):
        consonant_phoneme_list, vowel_phoneme_list, vowel_indexes = split_mora(
            self.phoneme_data_list_hello_hiho
        )
        self.assertEqual(vowel_indexes, [0, 2, 3, 5, 7, 9, 10, 12, 14, 16, 18, 19])
        self.assertEqual(
            vowel_phoneme_list,
            [
                OjtPhoneme(phoneme="pau", start=0, end=1),
                OjtPhoneme(phoneme="o", start=2, end=3),
                OjtPhoneme(phoneme="N", start=3, end=4),
                OjtPhoneme(phoneme="i", start=5, end=6),
                OjtPhoneme(phoneme="i", start=7, end=8),
                OjtPhoneme(phoneme="a", start=9, end=10),
                OjtPhoneme(phoneme="pau", start=10, end=11),
                OjtPhoneme(phoneme="i", start=12, end=13),
                OjtPhoneme(phoneme="o", start=14, end=15),
                OjtPhoneme(phoneme="e", start=16, end=17),
                OjtPhoneme(phoneme="U", start=18, end=19),
                OjtPhoneme(phoneme="pau", start=19, end=20),
            ],
        )
        self.assertEqual(
            consonant_phoneme_list,
            [
                None,
                OjtPhoneme(phoneme="k", start=1, end=2),
                None,
                OjtPhoneme(phoneme="n", start=4, end=5),
                OjtPhoneme(phoneme="ch", start=6, end=7),
                OjtPhoneme(phoneme="w", start=8, end=9),
                None,
                OjtPhoneme(phoneme="h", start=11, end=12),
                OjtPhoneme(phoneme="h", start=13, end=14),
                OjtPhoneme(phoneme="d", start=15, end=16),
                OjtPhoneme(phoneme="s", start=17, end=18),
                None,
            ],
        )
    def test_pre_process(self):
        flatten_moras, phoneme_data_list = pre_process(
            deepcopy(self.accent_phrases_hello_hiho)
        )
        mora_index = 0
        phoneme_index = 1
        self.assertEqual(phoneme_data_list[0], OjtPhoneme("pau", 0, 1))
        for accent_phrase in self.accent_phrases_hello_hiho:
            moras = accent_phrase.moras
            for mora in moras:
                self.assertEqual(flatten_moras[mora_index], mora)
                mora_index += 1
                if mora.consonant is not None:
                    self.assertEqual(
                        phoneme_data_list[phoneme_index],
                        OjtPhoneme(mora.consonant, phoneme_index, phoneme_index + 1),
                    )
                    phoneme_index += 1
                self.assertEqual(
                    phoneme_data_list[phoneme_index],
                    OjtPhoneme(mora.vowel, phoneme_index, phoneme_index + 1),
                )
                phoneme_index += 1
            if accent_phrase.pause_mora:
                self.assertEqual(flatten_moras[mora_index], accent_phrase.pause_mora)
                mora_index += 1
                self.assertEqual(
                    phoneme_data_list[phoneme_index],
                    OjtPhoneme("pau", phoneme_index, phoneme_index + 1),
                )
                phoneme_index += 1
        self.assertEqual(
            phoneme_data_list[phoneme_index],
            OjtPhoneme("pau", phoneme_index, phoneme_index + 1),
        )
    def test_replace_phoneme_length(self):
        result = self.synthesis_engine.replace_phoneme_length(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
        )

        # Verify the values passed to yukarin_s
        yukarin_s_args = self.yukarin_s_mock.call_args[1]
        list_length = yukarin_s_args["length"]
        phoneme_list = yukarin_s_args["phoneme_list"]
        self.assertEqual(list_length, 20)
        self.assertEqual(list_length, len(phoneme_list))
        numpy.testing.assert_array_equal(
            phoneme_list,
            numpy.array(
                [
                    0,
                    23,
                    30,
                    4,
                    28,
                    21,
                    10,
                    21,
                    42,
                    7,
                    0,
                    19,
                    21,
                    19,
                    30,
                    12,
                    14,
                    35,
                    6,
                    0,
                ],
                dtype=numpy.int64,
            ),
        )
        self.assertEqual(yukarin_s_args["speaker_id"], 1)

        # Naively apply the values to accent_phrases without using flatten_moras
        true_result = deepcopy(self.accent_phrases_hello_hiho)
        index = 1

        def result_value(i: int):
            return float(phoneme_list[i] * 0.5 + 1)

        for accent_phrase in true_result:
            moras = accent_phrase.moras
            for mora in moras:
                if mora.consonant is not None:
                    mora.consonant_length = result_value(index)
                    index += 1
                mora.vowel_length = result_value(index)
                index += 1
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.vowel_length = result_value(index)
                index += 1

        self.assertEqual(result, true_result)
    def test_replace_mora_pitch(self):
        # Check that an empty list does not raise an error
        empty_accent_phrases = []
        self.assertEqual(
            self.synthesis_engine.replace_mora_pitch(
                accent_phrases=empty_accent_phrases, speaker_id=1
            ),
            [],
        )

        result = self.synthesis_engine.replace_mora_pitch(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
        )

        # Verify the values passed to yukarin_sa
        yukarin_sa_args = self.yukarin_sa_mock.call_args[1]
        list_length = yukarin_sa_args["length"]
        vowel_phoneme_list = yukarin_sa_args["vowel_phoneme_list"][0]
        consonant_phoneme_list = yukarin_sa_args["consonant_phoneme_list"][0]
        start_accent_list = yukarin_sa_args["start_accent_list"][0]
        end_accent_list = yukarin_sa_args["end_accent_list"][0]
        start_accent_phrase_list = yukarin_sa_args["start_accent_phrase_list"][0]
        end_accent_phrase_list = yukarin_sa_args["end_accent_phrase_list"][0]
        self.assertEqual(list_length, 12)
        self.assertEqual(list_length, len(vowel_phoneme_list))
        self.assertEqual(list_length, len(consonant_phoneme_list))
        self.assertEqual(list_length, len(start_accent_list))
        self.assertEqual(list_length, len(end_accent_list))
        self.assertEqual(list_length, len(start_accent_phrase_list))
        self.assertEqual(list_length, len(end_accent_phrase_list))
        self.assertEqual(yukarin_sa_args["speaker_id"], 1)
        numpy.testing.assert_array_equal(
            vowel_phoneme_list,
            numpy.array(
                [
                    0,
                    30,
                    4,
                    21,
                    21,
                    7,
                    0,
                    21,
                    30,
                    14,
                    6,
                    0,
                ]
            ),
        )
        numpy.testing.assert_array_equal(
            consonant_phoneme_list,
            numpy.array(
                [
                    -1,
                    23,
                    -1,
                    28,
                    10,
                    42,
                    -1,
                    19,
                    19,
                    12,
                    35,
                    -1,
                ]
            ),
        )
        numpy.testing.assert_array_equal(
            start_accent_list, numpy.array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            end_accent_list, numpy.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            start_accent_phrase_list, numpy.array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            end_accent_phrase_list, numpy.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
        )

        # Naively apply the values to accent_phrases without using flatten_moras
        true_result = deepcopy(self.accent_phrases_hello_hiho)
        index = 1

        def result_value(i: int):
            # Phoneme-ID version of unvoiced_mora_phoneme_list
            unvoiced_mora_phoneme_id_list = [
                OjtPhoneme(p, 0, 0).phoneme_id for p in unvoiced_mora_phoneme_list
            ]
            if vowel_phoneme_list[i] in unvoiced_mora_phoneme_id_list:
                return 0
            return (
                vowel_phoneme_list[i]
                + consonant_phoneme_list[i]
                + start_accent_list[i]
                + end_accent_list[i]
                + start_accent_phrase_list[i]
                + end_accent_phrase_list[i]
            ) * 0.5 + 1

        for accent_phrase in true_result:
            moras = accent_phrase.moras
            for mora in moras:
                mora.pitch = result_value(index)
                index += 1
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.pitch = result_value(index)
                index += 1

        self.assertEqual(result, true_result)
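        # A mora whose vowel is in unvoiced_mora_phoneme_list (such as the "U"
        # of "ス" in the fixture) is expected to come back with pitch 0, which
        # is what result_value() above mirrors.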
    def synthesis_test_base(self, audio_query: AudioQuery):
        accent_phrases = audio_query.accent_phrases

        # Set arbitrary pitches and lengths for decode forward, and keep them in
        # flat lists for later verification
        phoneme_length_list = [0.0]
        phoneme_id_list = [0]
        f0_list = [0.0]
        for accent_phrase in accent_phrases:
            moras = accent_phrase.moras
            for mora in moras:
                if mora.consonant is not None:
                    mora.consonant_length = 0.1
                    phoneme_length_list.append(0.1)
                    phoneme_id_list.append(OjtPhoneme(mora.consonant, 0, 0).phoneme_id)
                mora.vowel_length = 0.2
                phoneme_length_list.append(0.2)
                phoneme_id_list.append(OjtPhoneme(mora.vowel, 0, 0).phoneme_id)
                if mora.vowel not in unvoiced_mora_phoneme_list:
                    mora.pitch = 5.0 + random()
                f0_list.append(mora.pitch)
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.vowel_length = 0.2
                phoneme_length_list.append(0.2)
                phoneme_id_list.append(OjtPhoneme("pau", 0, 0).phoneme_id)
                f0_list.append(0.0)
        phoneme_length_list.append(0.0)
        phoneme_id_list.append(0)
        f0_list.append(0.0)

        phoneme_length_list[0] = audio_query.prePhonemeLength
        phoneme_length_list[-1] = audio_query.postPhonemeLength
        for i in range(len(phoneme_length_list)):
            phoneme_length_list[i] /= audio_query.speedScale

        result = self.synthesis_engine.synthesis(query=audio_query, speaker_id=1)

        # Verify the values passed to decode
        decode_args = self.decode_mock.call_args[1]
        list_length = decode_args["length"]
        self.assertEqual(
            list_length,
            int(sum([round(p * 24000 / 256) for p in phoneme_length_list])),
        )
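        # Each phoneme contributes round(length_in_seconds * 24000 / 256) frames,
        # i.e. the check assumes 256-sample frames at 24 kHz; decode_mock then
        # emits 256 samples per frame.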
        num_phoneme = OjtPhoneme.num_phoneme
        # Phoneme-ID version of mora_phoneme_list
        mora_phoneme_id_list = [
            OjtPhoneme(p, 0, 0).phoneme_id for p in mora_phoneme_list
        ]

        # Do what numpy.repeat would do, with a plain for loop
        f0 = []
        phoneme = []
        f0_index = 0
        mean_f0 = []
        for i, phoneme_length in enumerate(phoneme_length_list):
            f0_single = numpy.array(f0_list[f0_index], dtype=numpy.float32) * (
                2**audio_query.pitchScale
            )
            for _ in range(int(round(phoneme_length * (24000 / 256)))):
                f0.append([f0_single])
                phoneme_s = []
                for _ in range(num_phoneme):
                    phoneme_s.append(0)
                # one hot
                phoneme_s[phoneme_id_list[i]] = 1
                phoneme.append(phoneme_s)
            # Distinguish consonant from vowel; advance f0_index by one on a vowel
            if phoneme_id_list[i] in mora_phoneme_id_list:
                if f0_single > 0:
                    mean_f0.append(f0_single)
                f0_index += 1

        mean_f0 = numpy.array(mean_f0, dtype=numpy.float32).mean()
        f0 = numpy.array(f0, dtype=numpy.float32)
        for i in range(len(f0)):
            if f0[i][0] != 0.0:
                f0[i][0] = (f0[i][0] - mean_f0) * audio_query.intonationScale + mean_f0
        phoneme = numpy.array(phoneme, dtype=numpy.float32)
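        # Intonation scaling shifts voiced frames around the mean voiced F0:
        # f0' = (f0 - mean_f0) * intonationScale + mean_f0; frames with f0 == 0
        # (silence / unvoiced) are left untouched.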
        # Because of the random pitches, value positions can drift slightly, so it
        # is enough if the large majority (4/5) of them match.
        # In addition, int(round(phoneme_length * (24000 / 256))) above can make the
        # f0/phoneme arrays generated by this test slightly longer than the ones the
        # engine produced, so the assertions use the engine's lengths as the reference.
        assert_f0_count = 0
        decode_f0 = decode_args["f0"]
        for i in range(len(decode_f0)):
            # Randomness etc. introduce small numeric differences, so values within
            # rel_tol=10e-5 are accepted as equal.
            assert_f0_count += math.isclose(f0[i][0], decode_f0[i][0], rel_tol=10e-5)
        self.assertTrue(assert_f0_count >= int(len(decode_f0) / 5) * 4)
        assert_phoneme_count = 0
        decode_phoneme = decode_args["phoneme"]
        for i in range(len(decode_phoneme)):
            assert_true_count = 0
            for j in range(len(decode_phoneme[i])):
                assert_true_count += bool(phoneme[i][j] == decode_phoneme[i][j])
            assert_phoneme_count += assert_true_count == num_phoneme
        self.assertTrue(assert_phoneme_count >= int(len(decode_phoneme) / 5) * 4)
        self.assertEqual(decode_args["speaker_id"], 1)
        # Use the decode forwarder mock to build the expected waveform
        true_result = decode_mock(list_length, num_phoneme, f0, phoneme, 1)
        true_result *= audio_query.volumeScale

        # TODO: the resampy part cannot be value-checked, so skip it
        if audio_query.outputSamplingRate != 24000:
            return

        assert_result_count = 0
        for i in range(len(true_result)):
            if audio_query.outputStereo:
                assert_result_count += math.isclose(
                    true_result[i], result[i][0], rel_tol=10e-5
                ) and math.isclose(true_result[i], result[i][1], rel_tol=10e-5)
            else:
                assert_result_count += math.isclose(
                    true_result[i], result[i], rel_tol=10e-5
                )
        self.assertTrue(assert_result_count >= int(len(true_result) / 5) * 4)
    def test_synthesis(self):
        audio_query = AudioQuery(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho),
            speedScale=1.0,
            pitchScale=1.0,
            intonationScale=1.0,
            volumeScale=1.0,
            prePhonemeLength=0.1,
            postPhonemeLength=0.1,
            outputSamplingRate=24000,
            outputStereo=False,
            # not used in this test, so no need to generate it
            kana="",
        )

        self.synthesis_test_base(audio_query)

        # Test speed scale
        audio_query.speedScale = 1.2
        self.synthesis_test_base(audio_query)

        # Test pitch scale
        audio_query.pitchScale = 1.5
        audio_query.speedScale = 1.0
        self.synthesis_test_base(audio_query)

        # Test intonation scale
        audio_query.pitchScale = 1.0
        audio_query.intonationScale = 1.4
        self.synthesis_test_base(audio_query)

        # Test volume scale
        audio_query.intonationScale = 1.0
        audio_query.volumeScale = 2.0
        self.synthesis_test_base(audio_query)

        # Test pre/post phoneme length
        audio_query.volumeScale = 1.0
        audio_query.prePhonemeLength = 0.5
        audio_query.postPhonemeLength = 0.5
        self.synthesis_test_base(audio_query)

        # Test output sampling rate
        audio_query.prePhonemeLength = 0.1
        audio_query.postPhonemeLength = 0.1
        audio_query.outputSamplingRate = 48000
        self.synthesis_test_base(audio_query)

        # Test output stereo
        audio_query.outputSamplingRate = 24000
        audio_query.outputStereo = True
        self.synthesis_test_base(audio_query)
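# To run just these tests, something like the following should work from the
# repository root (the module path is an assumption; adjust it to wherever this
# file actually lives):
#
#     python -m unittest test.test_synthesis_engine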