import math
from copy import deepcopy
from random import random
from typing import Union
from unittest import TestCase
from unittest.mock import Mock

import numpy

from voicevox_engine.acoustic_feature_extractor import OjtPhoneme
from voicevox_engine.model import AccentPhrase, AudioQuery, Mora
from voicevox_engine.synthesis_engine import SynthesisEngine

# TODO: import from voicevox_engine.synthesis_engine.mora
from voicevox_engine.synthesis_engine.synthesis_engine import (
    mora_phoneme_list,
    pre_process,
    split_mora,
    to_flatten_moras,
    to_phoneme_data_list,
    unvoiced_mora_phoneme_list,
)
def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray):
    result = []
    # Arbitrary mock computation; the values themselves have no meaning
    for i in range(length):
        result.append(float(phoneme_list[i] * 0.5 + speaker_id))
    return numpy.array(result)
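# NOTE: for every phoneme this mock returns phoneme_id * 0.5 + speaker_id; the
# expected lengths in test_replace_phoneme_length are rebuilt from the same formula.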
def yukarin_sa_mock(
    length: int,
    vowel_phoneme_list: numpy.ndarray,
    consonant_phoneme_list: numpy.ndarray,
    start_accent_list: numpy.ndarray,
    end_accent_list: numpy.ndarray,
    start_accent_phrase_list: numpy.ndarray,
    end_accent_phrase_list: numpy.ndarray,
    speaker_id: numpy.ndarray,
):
    result = []
    # Arbitrary mock computation; the values themselves have no meaning
    for i in range(length):
        result.append(
            float(
                (
                    vowel_phoneme_list[0][i]
                    + consonant_phoneme_list[0][i]
                    + start_accent_list[0][i]
                    + end_accent_list[0][i]
                    + start_accent_phrase_list[0][i]
                    + end_accent_phrase_list[0][i]
                )
                * 0.5
                + speaker_id
            )
        )
    return numpy.array(result)[numpy.newaxis]
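# NOTE: inputs are read as x[0][i] and the result is wrapped with numpy.newaxis,
# matching the leading batch axis of the arrays handed to yukarin_sa_forward;
# test_replace_mora_pitch reconstructs the same sum * 0.5 + speaker_id formula.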
def decode_mock(
    length: int,
    phoneme_size: int,
    f0: numpy.ndarray,
    phoneme: numpy.ndarray,
    speaker_id: Union[numpy.ndarray, int],
):
    result = []
    # Arbitrary mock computation; the values themselves have no meaning
    for i in range(length):
        # decode forward makes the data 256 times the length, so for now
        # append the value to result 256 times
        for _ in range(256):
            result.append(
                float(
                    f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size)
                    + speaker_id
                )
            )
    return numpy.array(result)
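# NOTE: decode_forward upsamples by a factor of 256, so this mock emits 256
# samples per input frame; synthesis_test_base relies on this together with the
# round(phoneme_length * 24000 / 256) frame count used when checking lengths.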
class MockCore:
    yukarin_s_forward = Mock(side_effect=yukarin_s_mock)
    yukarin_sa_forward = Mock(side_effect=yukarin_sa_mock)
    decode_forward = Mock(side_effect=decode_mock)

    def metas(self):
        return ""

    def supported_devices(self):
        return ""

    def is_model_loaded(self, speaker_id):
        return True
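# Wrapping the functions above in Mock(side_effect=...) keeps their behaviour
# while recording call arguments, so the tests can read them back via call_args.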
class TestSynthesisEngine(TestCase):
    def setUp(self):
        super().setUp()
        self.str_list_hello_hiho = (
            "sil k o N n i ch i w a pau h i h o d e s U sil".split()
        )
        self.phoneme_data_list_hello_hiho = [
            OjtPhoneme(phoneme=p, start=i, end=i + 1)
            for i, p in enumerate(
                "pau k o N n i ch i w a pau h i h o d e s U pau".split()
            )
        ]
        self.accent_phrases_hello_hiho = [
            AccentPhrase(
                moras=[
                    Mora(
                        text="コ",
                        consonant="k",
                        consonant_length=0.0,
                        vowel="o",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ン",
                        consonant=None,
                        consonant_length=None,
                        vowel="N",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ニ",
                        consonant="n",
                        consonant_length=0.0,
                        vowel="i",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="チ",
                        consonant="ch",
                        consonant_length=0.0,
                        vowel="i",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ワ",
                        consonant="w",
                        consonant_length=0.0,
                        vowel="a",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                ],
                accent=5,
                pause_mora=Mora(
                    text="、",
                    consonant=None,
                    consonant_length=None,
                    vowel="pau",
                    vowel_length=0.0,
                    pitch=0.0,
                ),
            ),
            AccentPhrase(
                moras=[
                    Mora(
                        text="ヒ",
                        consonant="h",
                        consonant_length=0.0,
                        vowel="i",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ホ",
                        consonant="h",
                        consonant_length=0.0,
                        vowel="o",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="デ",
                        consonant="d",
                        consonant_length=0.0,
                        vowel="e",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                    Mora(
                        text="ス",
                        consonant="s",
                        consonant_length=0.0,
                        vowel="U",
                        vowel_length=0.0,
                        pitch=0.0,
                    ),
                ],
                accent=1,
                pause_mora=None,
            ),
        ]

        core = MockCore()
        self.yukarin_s_mock = core.yukarin_s_forward
        self.yukarin_sa_mock = core.yukarin_sa_forward
        self.decode_mock = core.decode_forward
        self.synthesis_engine = SynthesisEngine(
            core=core,
        )
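        # The attributes above keep direct handles on MockCore's Mock objects so
        # that each test can inspect the keyword arguments of the engine's most
        # recent forward call via call_args.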
    def test_to_flatten_moras(self):
        flatten_moras = to_flatten_moras(self.accent_phrases_hello_hiho)
        self.assertEqual(
            flatten_moras,
            self.accent_phrases_hello_hiho[0].moras
            + [self.accent_phrases_hello_hiho[0].pause_mora]
            + self.accent_phrases_hello_hiho[1].moras,
        )

    def test_to_phoneme_data_list(self):
        phoneme_data_list = to_phoneme_data_list(self.str_list_hello_hiho)
        self.assertEqual(phoneme_data_list, self.phoneme_data_list_hello_hiho)
    def test_split_mora(self):
        consonant_phoneme_list, vowel_phoneme_list, vowel_indexes = split_mora(
            self.phoneme_data_list_hello_hiho
        )
        self.assertEqual(vowel_indexes, [0, 2, 3, 5, 7, 9, 10, 12, 14, 16, 18, 19])
        self.assertEqual(
            vowel_phoneme_list,
            [
                OjtPhoneme(phoneme="pau", start=0, end=1),
                OjtPhoneme(phoneme="o", start=2, end=3),
                OjtPhoneme(phoneme="N", start=3, end=4),
                OjtPhoneme(phoneme="i", start=5, end=6),
                OjtPhoneme(phoneme="i", start=7, end=8),
                OjtPhoneme(phoneme="a", start=9, end=10),
                OjtPhoneme(phoneme="pau", start=10, end=11),
                OjtPhoneme(phoneme="i", start=12, end=13),
                OjtPhoneme(phoneme="o", start=14, end=15),
                OjtPhoneme(phoneme="e", start=16, end=17),
                OjtPhoneme(phoneme="U", start=18, end=19),
                OjtPhoneme(phoneme="pau", start=19, end=20),
            ],
        )
        self.assertEqual(
            consonant_phoneme_list,
            [
                None,
                OjtPhoneme(phoneme="k", start=1, end=2),
                None,
                OjtPhoneme(phoneme="n", start=4, end=5),
                OjtPhoneme(phoneme="ch", start=6, end=7),
                OjtPhoneme(phoneme="w", start=8, end=9),
                None,
                OjtPhoneme(phoneme="h", start=11, end=12),
                OjtPhoneme(phoneme="h", start=13, end=14),
                OjtPhoneme(phoneme="d", start=15, end=16),
                OjtPhoneme(phoneme="s", start=17, end=18),
                None,
            ],
        )
    def test_pre_process(self):
        flatten_moras, phoneme_data_list = pre_process(
            deepcopy(self.accent_phrases_hello_hiho)
        )
        mora_index = 0
        phoneme_index = 1
        self.assertEqual(phoneme_data_list[0], OjtPhoneme("pau", 0, 1))
        for accent_phrase in self.accent_phrases_hello_hiho:
            moras = accent_phrase.moras
            for mora in moras:
                self.assertEqual(flatten_moras[mora_index], mora)
                mora_index += 1
                if mora.consonant is not None:
                    self.assertEqual(
                        phoneme_data_list[phoneme_index],
                        OjtPhoneme(mora.consonant, phoneme_index, phoneme_index + 1),
                    )
                    phoneme_index += 1
                self.assertEqual(
                    phoneme_data_list[phoneme_index],
                    OjtPhoneme(mora.vowel, phoneme_index, phoneme_index + 1),
                )
                phoneme_index += 1
            if accent_phrase.pause_mora:
                self.assertEqual(flatten_moras[mora_index], accent_phrase.pause_mora)
                mora_index += 1
                self.assertEqual(
                    phoneme_data_list[phoneme_index],
                    OjtPhoneme("pau", phoneme_index, phoneme_index + 1),
                )
                phoneme_index += 1
        self.assertEqual(
            phoneme_data_list[phoneme_index],
            OjtPhoneme("pau", phoneme_index, phoneme_index + 1),
        )
    def test_replace_phoneme_length(self):
        result = self.synthesis_engine.replace_phoneme_length(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
        )

        # Verify the values passed to yukarin_s
        yukarin_s_args = self.yukarin_s_mock.call_args[1]
        list_length = yukarin_s_args["length"]
        phoneme_list = yukarin_s_args["phoneme_list"]
        self.assertEqual(list_length, 20)
        self.assertEqual(list_length, len(phoneme_list))
        numpy.testing.assert_array_equal(
            phoneme_list,
            numpy.array(
                [
                    0,
                    23,
                    30,
                    4,
                    28,
                    21,
                    10,
                    21,
                    42,
                    7,
                    0,
                    19,
                    21,
                    19,
                    30,
                    12,
                    14,
                    35,
                    6,
                    0,
                ],
                dtype=numpy.int64,
            ),
        )
        self.assertEqual(yukarin_s_args["speaker_id"], 1)

        # Naively apply the values to accent_phrases without using flatten_moras
        true_result = deepcopy(self.accent_phrases_hello_hiho)
        index = 1

        def result_value(i: int):
            return float(phoneme_list[i] * 0.5 + 1)

        for accent_phrase in true_result:
            moras = accent_phrase.moras
            for mora in moras:
                if mora.consonant is not None:
                    mora.consonant_length = result_value(index)
                    index += 1
                mora.vowel_length = result_value(index)
                index += 1
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.vowel_length = result_value(index)
                index += 1

        self.assertEqual(result, true_result)
    def test_replace_mora_pitch(self):
        # Check that an empty list does not raise an error
        empty_accent_phrases = []
        self.assertEqual(
            self.synthesis_engine.replace_mora_pitch(
                accent_phrases=empty_accent_phrases, speaker_id=1
            ),
            [],
        )

        result = self.synthesis_engine.replace_mora_pitch(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho), speaker_id=1
        )

        # Verify the values passed to yukarin_sa
        yukarin_sa_args = self.yukarin_sa_mock.call_args[1]
        list_length = yukarin_sa_args["length"]
        vowel_phoneme_list = yukarin_sa_args["vowel_phoneme_list"][0]
        consonant_phoneme_list = yukarin_sa_args["consonant_phoneme_list"][0]
        start_accent_list = yukarin_sa_args["start_accent_list"][0]
        end_accent_list = yukarin_sa_args["end_accent_list"][0]
        start_accent_phrase_list = yukarin_sa_args["start_accent_phrase_list"][0]
        end_accent_phrase_list = yukarin_sa_args["end_accent_phrase_list"][0]
        self.assertEqual(list_length, 12)
        self.assertEqual(list_length, len(vowel_phoneme_list))
        self.assertEqual(list_length, len(consonant_phoneme_list))
        self.assertEqual(list_length, len(start_accent_list))
        self.assertEqual(list_length, len(end_accent_list))
        self.assertEqual(list_length, len(start_accent_phrase_list))
        self.assertEqual(list_length, len(end_accent_phrase_list))
        self.assertEqual(yukarin_sa_args["speaker_id"], 1)
        numpy.testing.assert_array_equal(
            vowel_phoneme_list,
            numpy.array(
                [
                    0,
                    30,
                    4,
                    21,
                    21,
                    7,
                    0,
                    21,
                    30,
                    14,
                    6,
                    0,
                ]
            ),
        )
        numpy.testing.assert_array_equal(
            consonant_phoneme_list,
            numpy.array(
                [
                    -1,
                    23,
                    -1,
                    28,
                    10,
                    42,
                    -1,
                    19,
                    19,
                    12,
                    35,
                    -1,
                ]
            ),
        )
        numpy.testing.assert_array_equal(
            start_accent_list, numpy.array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            end_accent_list, numpy.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            start_accent_phrase_list, numpy.array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
        )
        numpy.testing.assert_array_equal(
            end_accent_phrase_list, numpy.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
        )

        # Naively apply the values to accent_phrases without using flatten_moras
        true_result = deepcopy(self.accent_phrases_hello_hiho)
        index = 1

        def result_value(i: int):
            # Phoneme-ID version of unvoiced_mora_phoneme_list
            unvoiced_mora_phoneme_id_list = [
                OjtPhoneme(p, 0, 0).phoneme_id for p in unvoiced_mora_phoneme_list
            ]
            if vowel_phoneme_list[i] in unvoiced_mora_phoneme_id_list:
                return 0
            return (
                vowel_phoneme_list[i]
                + consonant_phoneme_list[i]
                + start_accent_list[i]
                + end_accent_list[i]
                + start_accent_phrase_list[i]
                + end_accent_phrase_list[i]
            ) * 0.5 + 1

        for accent_phrase in true_result:
            moras = accent_phrase.moras
            for mora in moras:
                mora.pitch = result_value(index)
                index += 1
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.pitch = result_value(index)
                index += 1

        self.assertEqual(result, true_result)
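        # A mora whose vowel is in unvoiced_mora_phoneme_list (such as the "U"
        # of "ス" in the fixture) is expected to come back with pitch 0, which
        # is what result_value() above mirrors.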
    def synthesis_test_base(self, audio_query: AudioQuery):
        accent_phrases = audio_query.accent_phrases

        # Set arbitrary pitches and lengths for decode forward, and keep them in
        # flat lists for later verification
        phoneme_length_list = [0.0]
        phoneme_id_list = [0]
        f0_list = [0.0]
        for accent_phrase in accent_phrases:
            moras = accent_phrase.moras
            for mora in moras:
                if mora.consonant is not None:
                    mora.consonant_length = 0.1
                    phoneme_length_list.append(0.1)
                    phoneme_id_list.append(OjtPhoneme(mora.consonant, 0, 0).phoneme_id)
                mora.vowel_length = 0.2
                phoneme_length_list.append(0.2)
                phoneme_id_list.append(OjtPhoneme(mora.vowel, 0, 0).phoneme_id)
                if mora.vowel not in unvoiced_mora_phoneme_list:
                    mora.pitch = 5.0 + random()
                f0_list.append(mora.pitch)
            if accent_phrase.pause_mora is not None:
                accent_phrase.pause_mora.vowel_length = 0.2
                phoneme_length_list.append(0.2)
                phoneme_id_list.append(OjtPhoneme("pau", 0, 0).phoneme_id)
                f0_list.append(0.0)
        phoneme_length_list.append(0.0)
        phoneme_id_list.append(0)
        f0_list.append(0.0)

        phoneme_length_list[0] = audio_query.prePhonemeLength
        phoneme_length_list[-1] = audio_query.postPhonemeLength
        for i in range(len(phoneme_length_list)):
            phoneme_length_list[i] /= audio_query.speedScale

        result = self.synthesis_engine.synthesis(query=audio_query, speaker_id=1)

        # Verify the values passed to decode
        decode_args = self.decode_mock.call_args[1]
        list_length = decode_args["length"]
        self.assertEqual(
            list_length,
            int(sum([round(p * 24000 / 256) for p in phoneme_length_list])),
        )
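        # Each phoneme contributes round(length_in_seconds * 24000 / 256) frames,
        # i.e. the check assumes 256-sample frames at 24 kHz; decode_mock then
        # emits 256 samples per frame.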
        num_phoneme = OjtPhoneme.num_phoneme
        # Phoneme-ID version of mora_phoneme_list
        mora_phoneme_id_list = [
            OjtPhoneme(p, 0, 0).phoneme_id for p in mora_phoneme_list
        ]

        # Do what numpy.repeat would do, with a plain for loop
        f0 = []
        phoneme = []
        f0_index = 0
        mean_f0 = []
        for i, phoneme_length in enumerate(phoneme_length_list):
            f0_single = numpy.array(f0_list[f0_index], dtype=numpy.float32) * (
                2**audio_query.pitchScale
            )
            for _ in range(int(round(phoneme_length * (24000 / 256)))):
                f0.append([f0_single])
                phoneme_s = []
                for _ in range(num_phoneme):
                    phoneme_s.append(0)
                # one hot
                phoneme_s[phoneme_id_list[i]] = 1
                phoneme.append(phoneme_s)
            # Distinguish consonant from vowel; advance f0_index by one on a vowel
            if phoneme_id_list[i] in mora_phoneme_id_list:
                if f0_single > 0:
                    mean_f0.append(f0_single)
                f0_index += 1

        mean_f0 = numpy.array(mean_f0, dtype=numpy.float32).mean()
        f0 = numpy.array(f0, dtype=numpy.float32)
        for i in range(len(f0)):
            if f0[i][0] != 0.0:
                f0[i][0] = (f0[i][0] - mean_f0) * audio_query.intonationScale + mean_f0
        phoneme = numpy.array(phoneme, dtype=numpy.float32)
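        # Intonation scaling shifts voiced frames around the mean voiced F0:
        # f0' = (f0 - mean_f0) * intonationScale + mean_f0; frames with f0 == 0
        # (silence / unvoiced) are left untouched.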
        # Because of the random pitches, value positions can drift slightly, so it
        # is enough if the large majority (4/5) of them match.
        # In addition, int(round(phoneme_length * (24000 / 256))) above can make the
        # f0/phoneme arrays generated by this test slightly longer than the ones the
        # engine produced, so the assertions use the engine's lengths as the reference.
        assert_f0_count = 0
        decode_f0 = decode_args["f0"]
        for i in range(len(decode_f0)):
            # Randomness etc. introduce small numeric differences, so values within
            # rel_tol=10e-5 are accepted as equal.
            assert_f0_count += math.isclose(f0[i][0], decode_f0[i][0], rel_tol=10e-5)
        self.assertTrue(assert_f0_count >= int(len(decode_f0) / 5) * 4)
        assert_phoneme_count = 0
        decode_phoneme = decode_args["phoneme"]
        for i in range(len(decode_phoneme)):
            assert_true_count = 0
            for j in range(len(decode_phoneme[i])):
                assert_true_count += bool(phoneme[i][j] == decode_phoneme[i][j])
            assert_phoneme_count += assert_true_count == num_phoneme
        self.assertTrue(assert_phoneme_count >= int(len(decode_phoneme) / 5) * 4)
        self.assertEqual(decode_args["speaker_id"], 1)
        # Use the decode forwarder mock to build the expected waveform
        true_result = decode_mock(list_length, num_phoneme, f0, phoneme, 1)
        true_result *= audio_query.volumeScale

        # TODO: the resampy part cannot be value-checked, so skip it
        if audio_query.outputSamplingRate != 24000:
            return

        assert_result_count = 0
        for i in range(len(true_result)):
            if audio_query.outputStereo:
                assert_result_count += math.isclose(
                    true_result[i], result[i][0], rel_tol=10e-5
                ) and math.isclose(true_result[i], result[i][1], rel_tol=10e-5)
            else:
                assert_result_count += math.isclose(
                    true_result[i], result[i], rel_tol=10e-5
                )
        self.assertTrue(assert_result_count >= int(len(true_result) / 5) * 4)
    def test_synthesis(self):
        audio_query = AudioQuery(
            accent_phrases=deepcopy(self.accent_phrases_hello_hiho),
            speedScale=1.0,
            pitchScale=1.0,
            intonationScale=1.0,
            volumeScale=1.0,
            prePhonemeLength=0.1,
            postPhonemeLength=0.1,
            outputSamplingRate=24000,
            outputStereo=False,
            # not used in this test, so no need to generate it
            kana="",
        )

        self.synthesis_test_base(audio_query)

        # Test speed scale
        audio_query.speedScale = 1.2
        self.synthesis_test_base(audio_query)

        # Test pitch scale
        audio_query.pitchScale = 1.5
        audio_query.speedScale = 1.0
        self.synthesis_test_base(audio_query)

        # Test intonation scale
        audio_query.pitchScale = 1.0
        audio_query.intonationScale = 1.4
        self.synthesis_test_base(audio_query)

        # Test volume scale
        audio_query.intonationScale = 1.0
        audio_query.volumeScale = 2.0
        self.synthesis_test_base(audio_query)

        # Test pre/post phoneme length
        audio_query.volumeScale = 1.0
        audio_query.prePhonemeLength = 0.5
        audio_query.postPhonemeLength = 0.5
        self.synthesis_test_base(audio_query)

        # Test output sampling rate
        audio_query.prePhonemeLength = 0.1
        audio_query.postPhonemeLength = 0.1
        audio_query.outputSamplingRate = 48000
        self.synthesis_test_base(audio_query)

        # Test output stereo
        audio_query.outputSamplingRate = 24000
        audio_query.outputStereo = True
        self.synthesis_test_base(audio_query)
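# To run just these tests, something like the following should work from the
# repository root (the module path is an assumption; adjust it to wherever this
# file actually lives):
#
#     python -m unittest test.test_synthesis_engine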