# voicevox/voicevox_engine/acoustic_feature_extractor.py
# Duplicated from 2ndelement/voicevox (commit 5cda731).
from abc import abstractmethod
from enum import Enum
from pathlib import Path
from typing import List, Sequence
import numpy
class BasePhoneme(object):
    """
    Abstract base class for the concrete phoneme classes.

    An instance holds one phoneme label together with the start and end
    position of the interval it covers. Both positions are rounded to two
    decimals on construction.

    Subclasses are expected to define the three class attributes below and
    to implement ``convert``.

    Attributes
    ----------
    phoneme_list : Sequence[str]
        List of valid phonemes
    num_phoneme : int
        Number of elements in the phoneme list
    space_phoneme : str
        Phoneme that stands for a pause (the equivalent of a comma)
    """

    # Declared only; concrete values are supplied by subclasses.
    phoneme_list: Sequence[str]
    num_phoneme: int
    space_phoneme: str

    def __init__(
        self,
        phoneme: str,
        start: float,
        end: float,
    ):
        """
        Parameters
        ----------
        phoneme : str
            Phoneme label
        start : float
            Start of the phoneme (presumably seconds -- confirm with callers)
        end : float
            End of the phoneme, in the same unit as ``start``
        """
        self.phoneme = phoneme
        # Rounding here keeps equality checks and the two-decimal lab file
        # output consistent with the in-memory values.
        self.start = numpy.round(start, decimals=2)
        self.end = numpy.round(end, decimals=2)

    def __repr__(self):
        # The name "Phoneme" is fixed, regardless of the concrete subclass.
        return f"Phoneme(phoneme='{self.phoneme}', start={self.start}, end={self.end})"

    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
    def __eq__(self, o: object):
        """Compare by phoneme label, start and end (already rounded)."""
        return isinstance(o, BasePhoneme) and (
            self.phoneme == o.phoneme and self.start == o.start and self.end == o.end
        )

    def verify(self):
        """
        Assert that the data is valid for this phoneme class.

        Raises
        ------
        AssertionError
            If ``self.phoneme`` is not contained in ``phoneme_list``.
            (Like every ``assert``, the check is skipped under ``python -O``.)
        """
        assert self.phoneme in self.phoneme_list, f"{self.phoneme} is not defined."

    @property
    def phoneme_id(self):
        """
        Get the phoneme id (the index inside the phoneme list).

        Returns
        -------
        id : int
            Index of ``self.phoneme`` in ``phoneme_list``
        """
        return self.phoneme_list.index(self.phoneme)

    @property
    def duration(self):
        """
        Get the duration of the phoneme.

        Returns
        -------
        duration : float
            ``end - start``
        """
        return self.end - self.start

    @property
    def onehot(self):
        """
        Get the one-hot encoding of the phoneme.

        Returns
        -------
        onehot : numpy.ndarray
            Boolean array of length ``num_phoneme`` that is all ``False``
            except for ``True`` at index ``phoneme_id``
        """
        array = numpy.zeros(self.num_phoneme, dtype=bool)
        array[self.phoneme_id] = True
        return array

    @classmethod
    def parse(cls, s: str):
        """
        Build a phoneme instance from one line of text.

        The line must contain at least three whitespace-separated fields:
        start, end and phoneme label, in that order. Any further fields are
        ignored.

        Parameters
        ----------
        s : str
            Text to parse

        Returns
        -------
        phoneme : BasePhoneme
            Instance of ``cls`` created from the parsed fields

        Raises
        ------
        IndexError
            If the line has fewer than three fields.
        ValueError
            If start or end cannot be converted to ``float``.

        Examples
        --------
        >>> BasePhoneme.parse('1.7425000 1.9125000 o:')
        Phoneme(phoneme='o:', start=1.74, end=1.91)
        """
        words = s.split()
        return cls(
            start=float(words[0]),
            end=float(words[1]),
            phoneme=words[2],
        )

    @classmethod
    @abstractmethod
    def convert(cls, phonemes: List["BasePhoneme"]) -> List["BasePhoneme"]:
        """
        Hook that post-processes a freshly parsed phoneme list.

        Must be overridden by subclasses. This class does not use
        ``ABCMeta``, so ``@abstractmethod`` is not enforced at instantiation
        time; calling the base implementation raises instead.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError

    @classmethod
    def load_lab_list(cls, path: Path):
        """
        Load a lab file.

        Every non-blank line is parsed with ``parse``, the resulting list is
        passed through ``convert`` and each phoneme is then checked with
        ``verify``.

        Parameters
        ----------
        path : Path
            Path of the lab file to read

        Returns
        -------
        phonemes : List[BasePhoneme]
            Phoneme instances built from the file contents
        """
        lines = path.read_text().split("\n")
        # Skip empty *and* whitespace-only lines: parse() would otherwise
        # raise IndexError on a line that splits into zero fields.
        phonemes = [cls.parse(line) for line in lines if line.strip()]
        phonemes = cls.convert(phonemes)

        for phoneme in phonemes:
            phoneme.verify()
        return phonemes

    @classmethod
    def save_lab_list(cls, phonemes: List["BasePhoneme"], path: Path):
        """
        Save a list of phonemes in lab file format.

        One line is written per phoneme: start, end (both with two decimals)
        and the label, separated by tabs. Lines are joined with a newline and
        no trailing newline is written.

        Parameters
        ----------
        phonemes : List[BasePhoneme]
            Phonemes to save
        path : Path
            Destination path of the lab file
        """
        text = "\n".join(
            [
                f"{numpy.round(p.start, decimals=2):.2f}\t"
                f"{numpy.round(p.end, decimals=2):.2f}\t"
                f"{p.phoneme}"
                for p in phonemes
            ]
        )
        path.write_text(text)
class JvsPhoneme(BasePhoneme):
    """
    Phoneme set contained in the JVS (Japanese versatile speech) corpus.

    Attributes
    ----------
    phoneme_list : Sequence[str]
        List of valid phonemes
    num_phoneme : int
        Number of elements in the phoneme list
    space_phoneme : str
        Phoneme that stands for a pause (the equivalent of a comma)
    """

    phoneme_list = (
        "pau",
        "I",
        "N",
        "U",
        "a",
        "b",
        "by",
        "ch",
        "cl",
        "d",
        "dy",
        "e",
        "f",
        "g",
        "gy",
        "h",
        "hy",
        "i",
        "j",
        "k",
        "ky",
        "m",
        "my",
        "n",
        "ny",
        "o",
        "p",
        "py",
        "r",
        "ry",
        "s",
        "sh",
        "t",
        "ts",
        "u",
        "v",
        "w",
        "y",
        "z",
    )
    num_phoneme = len(phoneme_list)
    space_phoneme = "pau"

    @classmethod
    def convert(cls, phonemes: List["JvsPhoneme"]) -> List["JvsPhoneme"]:
        """
        Replace a leading and trailing sil (silence) with space_phoneme (pau).

        Only the first and the last element are examined. An element is
        replaced when its label contains the substring ``"sil"``. The list is
        modified in place and the same list object is returned.

        Parameters
        ----------
        phonemes : List[JvsPhoneme]
            Phonemes to convert; may be empty

        Returns
        -------
        phonemes : List[JvsPhoneme]
            The converted phoneme list
        """
        # An empty list has no first/last element; indexing it would raise
        # IndexError, so return it unchanged.
        if not phonemes:
            return phonemes

        for edge in (phonemes[0], phonemes[-1]):
            if "sil" in edge.phoneme:
                edge.phoneme = cls.space_phoneme
        return phonemes
class OjtPhoneme(BasePhoneme):
    """
    Phoneme set used by OpenJTalk.

    Attributes
    ----------
    phoneme_list : Sequence[str]
        List of valid phonemes
    num_phoneme : int
        Number of elements in the phoneme list
    space_phoneme : str
        Phoneme that stands for a pause (the equivalent of a comma)
    """

    phoneme_list = (
        "pau",
        "A",
        "E",
        "I",
        "N",
        "O",
        "U",
        "a",
        "b",
        "by",
        "ch",
        "cl",
        "d",
        "dy",
        "e",
        "f",
        "g",
        "gw",
        "gy",
        "h",
        "hy",
        "i",
        "j",
        "k",
        "kw",
        "ky",
        "m",
        "my",
        "n",
        "ny",
        "o",
        "p",
        "py",
        "r",
        "ry",
        "s",
        "sh",
        "t",
        "ts",
        "ty",
        "u",
        "v",
        "w",
        "y",
        "z",
    )
    num_phoneme = len(phoneme_list)
    space_phoneme = "pau"

    @classmethod
    def convert(cls, phonemes: List["OjtPhoneme"]) -> List["OjtPhoneme"]:
        """
        Replace a leading and trailing sil (silence) with space_phoneme (pau).

        Only the first and the last element are examined. An element is
        replaced when its label contains the substring ``"sil"``. The list is
        modified in place and the same list object is returned.

        Parameters
        ----------
        phonemes : List[OjtPhoneme]
            Phonemes to convert; may be empty

        Returns
        -------
        phonemes : List[OjtPhoneme]
            The converted phoneme list
        """
        # An empty list has no first/last element; indexing it would raise
        # IndexError, so return it unchanged.
        if not phonemes:
            return phonemes

        for edge in (phonemes[0], phonemes[-1]):
            if "sil" in edge.phoneme:
                edge.phoneme = cls.space_phoneme
        return phonemes
class PhonemeType(str, Enum):
    """
    Identifiers of the supported phoneme sets.

    The enum mixes in ``str``, so every member is also a plain string equal
    to its value.
    """

    # JVS corpus phoneme set
    jvs = "jvs"
    # OpenJTalk phoneme set
    openjtalk = "openjtalk"
# Lookup table: phoneme-set identifier -> class implementing that set.
phoneme_type_to_class = {
    PhonemeType.jvs: JvsPhoneme,
    PhonemeType.openjtalk: OjtPhoneme,
}