# coding=utf-8
# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for Bert VITS2
"""
import os
import re
from typing import Dict

from transformers import AutoTokenizer, PreTrainedTokenizer
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding
from transformers.utils import logging

logger = logging.get_logger(__name__)
CHINESE_DIGITS = ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九"]
# Positional unit for each digit place, indexed by the number of digits that
# follow the current one (supports numbers up to 17 digits, i.e. 京).
CHINESE_UNITS = ["", "十", "百", "千", "萬", "十", "百", "千", "億", "十", "百", "千", "兆", "十", "百", "千", "京"]


def chinese_number_to_words(text):
    """Convert an ASCII number literal (optionally signed and/or decimal) into Chinese numerals."""
    out = ""
    if text[0] == "-":
        out += "負"
        text = text[1:]
    elif text[0] == "+":
        out += "正"
        text = text[1:]
    if "." in text:
        integer, decimal = text.split(".")
        out += chinese_number_to_words(integer)
        out += "點"
        # digits after the decimal point are read one by one
        for c in decimal:
            out += chinese_number_to_words(c)
        return out
    length = len(text)
    for i, c in enumerate(text):
        if c == "0":
            # "零" is spoken once between non-zero digits (105 -> 一百零五) and is
            # never a trailing placeholder (100 -> 一百), except for "0" itself.
            if length == 1:
                out += CHINESE_DIGITS[0]
            elif out and out[-1] not in CHINESE_DIGITS and any(d != "0" for d in text[i + 1:]):
                out += CHINESE_DIGITS[0]
            continue
        out += CHINESE_DIGITS[int(c)]
        place = length - i - 1
        if place < len(CHINESE_UNITS):
            out += CHINESE_UNITS[place]
    return out
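
# A few illustrative conversions (assuming the zero handling above):
#   chinese_number_to_words("105")   -> "一百零五"
#   chinese_number_to_words("100")   -> "一百"
#   chinese_number_to_words("-3.14") -> "負三點一四"
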
class BertVits2Processor(ProcessorMixin):
r"""
Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.
Args:
tokenizers ([`PreTrainedTokenizer`]):
An instance of [`PreTrainedTokenizer`].
bert_tokenizer ([`PreTrainedTokenizer`]):
An instance of [`PreTrainedTokenizer`].
"""
tokenizer_class = "AutoTokenizer"
attributes = ["tokenizer"]
def __init__(self, tokenizer: PreTrainedTokenizer, bert_tokenizers: Dict[str, PreTrainedTokenizer]):
super().__init__(tokenizer)
self.__bert_tokenizers = bert_tokenizers
    @property
    def bert_tokenizers(self):
        return self.__bert_tokenizers

    @bert_tokenizers.setter
    def bert_tokenizers(self, value):
        # needed so `save_pretrained` can temporarily swap the tokenizers for their
        # subfolder names (a read-only property would raise AttributeError there)
        self.__bert_tokenizers = value
    def preprocess_stage1(self, text, language=None):
        # normalize punctuation (fullwidth -> ASCII, "..." -> ellipsis)
        text = text.replace(",", ",").replace("。", ".").replace("?", "?").replace("!", "!").replace("...", "…")
        # normalize whitespace
        text = re.sub(r"\s+", " ", text).strip()
        # convert numbers to words; the optional decimal part is matched too,
        # since `chinese_number_to_words` handles it
        if language == "zh":
            text = re.sub(r"[+-]?\d+(?:\.\d+)?", lambda x: chinese_number_to_words(x.group()), text)
        return text
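
    # e.g. preprocess_stage1("你好,  世界!共 100 元", language="zh")
    #      -> "你好, 世界!共 一百 元" (punctuation normalized, numbers verbalized)
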
    def preprocess_stage2(self, text, language=None):
        # replace each remaining whitespace character with the "SP" phoneme marker
        # (stage 1 already collapsed and stripped whitespace, so no further strip is needed)
        return re.sub(r"\s", "SP", text)
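
    # e.g. preprocess_stage2("你好, 世界") -> "你好,SP世界"
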
def __call__(
self,
text=None,
language=None,
return_tensors="pt",
max_length=256,
add_special_tokens=True,
return_attention_mask=True,
padding="longest",
**kwargs,
):
"""
Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs`
arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a
voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded
to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
Returns:
Tuple([`BatchEncoding`], [`BatchFeature`]): A tuple composed of a [`BatchEncoding`], i.e the output of the
`tokenizer` and a [`BatchFeature`], i.e the voice preset with the right tensors type.
"""
if language is None:
raise ValueError("The language argument is required for BertVits2Processor.")
if language not in self.bert_tokenizers:
raise ValueError(f"Language '{language}' not supported by BertVits2Processor.")
bert_text = self.preprocess_stage1(text, language)
g2p_text = self.preprocess_stage2(bert_text, language)
phone_text, tone_ids, lang_ids, word2ph = self.tokenizer.convert_g2p(g2p_text, language, add_special_tokens)
encoded_text = self.tokenizer(
phone_text,
return_tensors=return_tensors,
padding=padding,
max_length=max_length,
return_attention_mask=return_attention_mask,
**kwargs,
)
bert_tokenizer = self.bert_tokenizers[language]
bert_encoded_text = bert_tokenizer(
bert_text,
return_tensors=return_tensors,
padding=padding,
max_length=max_length,
return_attention_mask=return_attention_mask,
add_special_tokens=add_special_tokens,
return_token_type_ids=False,
**kwargs,
)
return BatchEncoding({
**encoded_text,
**{ f"bert_{k}": v for k, v in bert_encoded_text.items() },
"tone_ids": [tone_ids],
"language_ids": [lang_ids],
"word_to_phoneme": [word2ph],
}, tensor_type=return_tensors)
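
    # Example usage (a sketch; the checkpoint path is hypothetical and "zh" must be one
    # of the languages configured in `bert_tokenizers`):
    #
    #   processor = BertVits2Processor.from_pretrained("path/to/bert-vits2-checkpoint")
    #   inputs = processor("你好,世界!", language="zh")
    #   # -> phoneme encoding (`input_ids`, `attention_mask`), BERT encoding
    #   #    (`bert_input_ids`, `bert_attention_mask`), `tone_ids`, `language_ids`,
    #   #    `word_to_phoneme`
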
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
        processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
        # the processor config stores subfolder names; load the actual tokenizers from them
        processor_dict["bert_tokenizers"] = {
            key: AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder=val)
            for key, val in processor_dict["bert_tokenizers"].items()
        }
        return cls.from_args_and_dict(args, processor_dict, **kwargs)
def save_pretrained(
self,
save_directory,
**kwargs,
):
"""
Save the processor to the `save_directory` directory. If the processor has been created from a
repository, the method will push the model to the `save_directory` repository.
Args:
save_directory (`str`):
Directory where the processor will be saved.
push_to_hub (`bool`, `optional`, defaults to `False`):
Whether or not to push the model to the Hugging Face Hub after saving it.
kwargs:
Additional attributes to be saved with the processor.
"""
        os.makedirs(save_directory, exist_ok=True)
        for language, tokenizer in self.bert_tokenizers.items():
            tokenizer.save_pretrained(os.path.join(save_directory, f"bert_{language}"))
        # temporarily swap the tokenizer objects for their subfolder names so the
        # serialized processor config stays JSON-serializable, then restore them
        bert_tokenizers = self.bert_tokenizers
        self.bert_tokenizers = {language: f"bert_{language}" for language in bert_tokenizers}
        outputs = super().save_pretrained(save_directory, **kwargs)
        self.bert_tokenizers = bert_tokenizers
        return outputs
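
# Save / load round-trip (a sketch; the directory name is hypothetical):
#
#   processor.save_pretrained("./my_bert_vits2_processor")  # writes bert_<language>/ subfolders
#   reloaded = BertVits2Processor.from_pretrained("./my_bert_vits2_processor")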