from nemo.collections.tts.torch.tts_tokenizers import BaseCharsTokenizer
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import any_locale_text_preprocessing


def lowercase_text_preprocessing(text):
    """Apply NeMo's locale-agnostic text preprocessing, then lowercase the result."""
    text = any_locale_text_preprocessing(text)
    text = text.lower()
    return text


class CharsTokenizer(BaseCharsTokenizer):
    PUNCT_LIST = BaseCharsTokenizer.PUNCT_LIST + ('+', '—')

    def __init__(
        self,
        chars,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=lowercase_text_preprocessing,
    ):
"""Char-based tokenizer. |
|
Args: |
|
chars: string that represents all possible characters. |
|
punct: Whether to reserve grapheme for basic punctuation or not. |
|
apostrophe: Whether to use apostrophe or not. |
|
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), |
|
if None then no blank in labels. |
|
pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. |
|
non_default_punct_list: List of punctuation marks which will be used instead default. |
|
text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. |
|
""" |
|
        super().__init__(
            chars=chars,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            text_preprocessing_func=text_preprocessing_func,
        )
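

# A minimal usage sketch (illustrative only): the character set and input text below
# are assumptions, and `encode` is assumed to be inherited from BaseCharsTokenizer;
# adjust both to match your actual dataset configuration.
if __name__ == "__main__":
    tokenizer = CharsTokenizer(
        chars="abcdefghijklmnopqrstuvwxyz",
        punct=True,
        apostrophe=True,
        pad_with_space=True,
    )
    # Input is lowercased by lowercase_text_preprocessing before tokenization,
    # so mixed-case text maps onto the lowercase character set above.
    token_ids = tokenizer.encode("Hello, world!")
    print(token_ids)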