add data_gen
- data_gen/tts/__pycache__/base_binarizer.cpython-36.pyc +0 -0
- data_gen/tts/__pycache__/base_binarizer.cpython-37.pyc +0 -0
- data_gen/tts/__pycache__/base_binarizer.cpython-39.pyc +0 -0
- data_gen/tts/__pycache__/base_preprocess.cpython-36.pyc +0 -0
- data_gen/tts/__pycache__/base_preprocess.cpython-37.pyc +0 -0
- data_gen/tts/base_binarizer.py +324 -0
- data_gen/tts/base_preprocess.py +251 -0
- data_gen/tts/runs/align_and_binarize.py +12 -0
- data_gen/tts/runs/binarize.py +17 -0
- data_gen/tts/runs/preprocess.py +17 -0
- data_gen/tts/runs/train_mfa_align.py +46 -0
- data_gen/tts/txt_processors/__init__.py +1 -0
- data_gen/tts/txt_processors/__pycache__/__init__.cpython-36.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/__init__.cpython-37.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-36.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-37.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/en.cpython-36.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/en.cpython-37.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/syntactic_graph_buider.cpython-36.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/zh.cpython-36.pyc +0 -0
- data_gen/tts/txt_processors/__pycache__/zh.cpython-37.pyc +0 -0
- data_gen/tts/txt_processors/base_text_processor.py +50 -0
- data_gen/tts/txt_processors/en.py +78 -0
- data_gen/tts/txt_processors/zh.py +110 -0
- data_gen/tts/wav_processors/__init__.py +2 -0
- data_gen/tts/wav_processors/__pycache__/__init__.cpython-36.pyc +0 -0
- data_gen/tts/wav_processors/__pycache__/__init__.cpython-37.pyc +0 -0
- data_gen/tts/wav_processors/__pycache__/base_processor.cpython-36.pyc +0 -0
- data_gen/tts/wav_processors/__pycache__/base_processor.cpython-37.pyc +0 -0
- data_gen/tts/wav_processors/__pycache__/common_processors.cpython-36.pyc +0 -0
- data_gen/tts/wav_processors/__pycache__/common_processors.cpython-37.pyc +0 -0
- data_gen/tts/wav_processors/base_processor.py +25 -0
- data_gen/tts/wav_processors/common_processors.py +86 -0
data_gen/tts/__pycache__/base_binarizer.cpython-36.pyc
ADDED
Binary file (11.3 kB)
data_gen/tts/__pycache__/base_binarizer.cpython-37.pyc
ADDED
Binary file (11.2 kB)
data_gen/tts/__pycache__/base_binarizer.cpython-39.pyc
ADDED
Binary file (11.1 kB)
data_gen/tts/__pycache__/base_preprocess.cpython-36.pyc
ADDED
Binary file (10.8 kB)
data_gen/tts/__pycache__/base_preprocess.cpython-37.pyc
ADDED
Binary file (10.8 kB)
data_gen/tts/base_binarizer.py
ADDED
@@ -0,0 +1,324 @@
import json
import os
import random
from re import L
import traceback
from functools import partial

import numpy as np
from resemblyzer import VoiceEncoder
from tqdm import tqdm

import utils.commons.single_thread_env  # NOQA
from utils.audio import librosa_wav2spec
from utils.audio.align import get_mel2ph, mel2token_to_dur
from utils.audio.cwt import get_lf0_cwt, get_cont_lf0
from utils.audio.pitch.utils import f0_to_coarse
from utils.audio.pitch_extractors import extract_pitch_simple
from utils.commons.hparams import hparams
from utils.commons.indexed_datasets import IndexedDatasetBuilder
from utils.commons.multiprocess_utils import multiprocess_run_tqdm
from utils.os_utils import remove_file, copy_file

np.seterr(divide='ignore', invalid='ignore')


class BinarizationError(Exception):
    pass

sentence2graph_parser = None


class BaseBinarizer:
    def __init__(self, processed_data_dir=None):
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dir = processed_data_dir
        self.binarization_args = hparams['binarization_args']
        self.items = {}
        self.item_names = []

        global sentence2graph_parser
        from modules.tts.syntaspeech.syntactic_graph_buider import Sentence2GraphParser

        if hparams['ds_name'] == 'libritts':
            # Unfortunately, processing LibriTTS with multi-processing triggers PyTorch multiprocessing issues,
            # so we use a single thread together with the CUDA graph builder.
            # It takes about 20 hours to process the whole LibriTTS on a PC with a 24-core CPU and an RTX 2080 Ti,
            # so start the binarization and take a break!
            sentence2graph_parser = Sentence2GraphParser("en", use_gpu=True)
        elif hparams['ds_name'] == 'ljspeech':
            # use multi-processing, so the GPU is disabled;
            # binarization takes about 30 minutes
            sentence2graph_parser = Sentence2GraphParser("en", use_gpu=False)
        elif hparams['preprocess_args']['txt_processor'] == 'zh':
            # use multi-processing, so the GPU is disabled;
            # binarization takes about 30 minutes
            sentence2graph_parser = Sentence2GraphParser("zh", use_gpu=False)
        else:
            raise NotImplementedError

    def load_meta_data(self):
        processed_data_dir = self.processed_data_dir
        items_list = json.load(open(f"{processed_data_dir}/metadata.json"))
        for r in tqdm(items_list, desc='Loading meta data.'):
            item_name = r['item_name']
            self.items[item_name] = r
            self.item_names.append(item_name)
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        range_ = self._convert_range(self.binarization_args['train_range'])
        return self.item_names[range_[0]:range_[1]]

    @property
    def valid_item_names(self):
        range_ = self._convert_range(self.binarization_args['valid_range'])
        return self.item_names[range_[0]:range_[1]]

    @property
    def test_item_names(self):
        range_ = self._convert_range(self.binarization_args['test_range'])
        return self.item_names[range_[0]:range_[1]]

    def _convert_range(self, range_):
        if range_[1] == -1:
            range_[1] = len(self.item_names)
        return range_

    def meta_data(self, prefix):
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            yield self.items[item_name]

    def process(self):
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        for fn in ['phone_set.json', 'word_set.json', 'spk_map.json']:
            remove_file(f"{hparams['binary_data_dir']}/{fn}")
            copy_file(f"{hparams['processed_data_dir']}/{fn}", f"{hparams['binary_data_dir']}/{fn}")
        if hparams['ds_name'] in ['ljspeech', 'biaobei']:
            self.process_data('valid')
            self.process_data('test')
            self.process_data('train')
        elif hparams['ds_name'] in ['libritts']:
            self.process_data_single_processing('valid')
            self.process_data_single_processing('test')
            self.process_data_single_processing('train')
        else:
            raise NotImplementedError

    def process_data(self, prefix):
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        meta_data = list(self.meta_data(prefix))
        process_item = partial(self.process_item, binarization_args=self.binarization_args)
        ph_lengths = []
        mel_lengths = []
        total_sec = 0
        items = []
        args = [{'item': item} for item in meta_data]

        for item_id, item in multiprocess_run_tqdm(process_item, args, desc='Processing data'):
            if item is not None:
                items.append(item)
        if self.binarization_args['with_spk_embed']:
            args = [{'wav': item['wav']} for item in items]
            for item_id, spk_embed in multiprocess_run_tqdm(
                    self.get_spk_embed, args,
                    init_ctx_func=lambda wid: {'voice_encoder': VoiceEncoder().cuda()}, num_workers=4,
                    desc='Extracting spk embed'):
                items[item_id]['spk_embed'] = spk_embed

        for item in items:
            if not self.binarization_args['with_wav'] and 'wav' in item:
                del item['wav']
            builder.add_item(item)
            mel_lengths.append(item['len'])
            assert item['len'] > 0, (item['item_name'], item['txt'], item['mel2ph'])
            if 'ph_len' in item:
                ph_lengths.append(item['ph_len'])
            total_sec += item['sec']
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
        if len(ph_lengths) > 0:
            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    def process_data_single_processing(self, prefix):
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        meta_data = list(self.meta_data(prefix))
        ph_lengths = []
        mel_lengths = []
        total_sec = 0
        items = []
        args = [{'item': item} for item in meta_data]

        for raw_item in tqdm(meta_data):
            item = self.process_item(raw_item, self.binarization_args)
            if item is not None:
                if item['dgl_graph'].num_nodes() != np.array(item['ph2word']).max():
                    print(f"Skip Item: {item['item_name']} word nodes number incorrect!")
                    continue
                items.append(item)

        if self.binarization_args['with_spk_embed']:
            args = [{'wav': item['wav']} for item in items]
            for item_id, spk_embed in multiprocess_run_tqdm(
                    self.get_spk_embed, args,
                    init_ctx_func=lambda wid: {'voice_encoder': VoiceEncoder().cuda()}, num_workers=4,
                    desc='Extracting spk embed'):
                items[item_id]['spk_embed'] = spk_embed

        for item in items:
            if not self.binarization_args['with_wav'] and 'wav' in item:
                del item['wav']
            builder.add_item(item)
            mel_lengths.append(item['len'])
            assert item['len'] > 0, (item['item_name'], item['txt'], item['mel2ph'])
            if 'ph_len' in item:
                ph_lengths.append(item['ph_len'])
            total_sec += item['sec']
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
        if len(ph_lengths) > 0:
            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @classmethod
    def process_item(cls, item, binarization_args):
        try:
            item['ph_len'] = len(item['ph_token'])
            item_name = item['item_name']
            wav_fn = item['wav_fn']
            wav, mel = cls.process_audio(wav_fn, item, binarization_args)
        except Exception as e:
            print(f"| Skip item ({e}) for index error. item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        try:
            n_bos_frames, n_eos_frames = 0, 0
            if binarization_args['with_align']:
                tg_fn = f"{hparams['processed_data_dir']}/mfa_outputs/{item_name}.TextGrid"
                item['tg_fn'] = tg_fn
                cls.process_align(tg_fn, item)
                if binarization_args['trim_eos_bos']:
                    n_bos_frames = item['dur'][0]
                    n_eos_frames = item['dur'][-1]
                    T = len(mel)
                    item['mel'] = mel[n_bos_frames:T - n_eos_frames]
                    item['mel2ph'] = item['mel2ph'][n_bos_frames:T - n_eos_frames]
                    item['mel2word'] = item['mel2word'][n_bos_frames:T - n_eos_frames]
                    item['dur'] = item['dur'][1:-1]
                    item['dur_word'] = item['dur_word'][1:-1]
                    item['len'] = item['mel'].shape[0]
                    item['wav'] = wav[n_bos_frames * hparams['hop_size']:len(wav) - n_eos_frames * hparams['hop_size']]
            if binarization_args['with_f0']:
                cls.process_pitch(item, n_bos_frames, n_eos_frames)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        except Exception as e:
            traceback.print_exc()
            print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
            return None

        if item['mel'].shape[0] < 128:
            print(f"Skip Item: {item['item_name']} Mel-spectrogram is shorter than 128!")
            return None
        # fix one bad case of stanza
        if item['txt'].endswith('yn .'):
            item['txt'] = item['txt'][:-4] + 'y .'
        try:
            language = sentence2graph_parser.language
            if language == 'en':
                dgl_graph, etypes = sentence2graph_parser.parse(item['txt'])
            elif language == 'zh':
                dgl_graph, etypes = sentence2graph_parser.parse(item['txt'], item['word'].split(" "), item['ph_gb_word'].split(" "))
            else:
                raise NotImplementedError
            item['dgl_graph'] = dgl_graph
            item['edge_types'] = etypes
        except:
            print(f"| Dependency Parsing Error! Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return item

    @classmethod
    def process_audio(cls, wav_fn, res, binarization_args):
        wav2spec_dict = librosa_wav2spec(
            wav_fn,
            fft_size=hparams['fft_size'],
            hop_size=hparams['hop_size'],
            win_length=hparams['win_size'],
            num_mels=hparams['audio_num_mel_bins'],
            fmin=hparams['fmin'],
            fmax=hparams['fmax'],
            sample_rate=hparams['audio_sample_rate'],
            loud_norm=hparams['loud_norm'])
        mel = wav2spec_dict['mel']
        wav = wav2spec_dict['wav'].astype(np.float16)
        if binarization_args['with_linear']:
            res['linear'] = wav2spec_dict['linear']
        res.update({'mel': mel, 'wav': wav, 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
        return wav, mel

    @staticmethod
    def process_align(tg_fn, item):
        ph = item['ph']
        mel = item['mel']
        ph_token = item['ph_token']
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams['hop_size'], hparams['audio_sample_rate'],
                                     hparams['binarization_args']['min_sil_duration'])
        else:
            raise BinarizationError(f"Align not found")
        if np.array(mel2ph).max() - 1 >= len(ph_token):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {np.array(mel2ph).max() - 1}, len(phone_encoded): {len(ph_token)}")
        item['mel2ph'] = mel2ph
        item['dur'] = dur

        ph2word = item['ph2word']
        mel2word = [ph2word[p - 1] for p in item['mel2ph']]
        item['mel2word'] = mel2word  # [T_mel]
        dur_word = mel2token_to_dur(mel2word, len(item['word_token']))
        item['dur_word'] = dur_word.tolist()  # [T_word]

    @staticmethod
    def process_pitch(item, n_bos_frames, n_eos_frames):
        wav, mel = item['wav'], item['mel']
        f0 = extract_pitch_simple(item['wav'])
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        assert len(mel) == len(f0), (len(mel), len(f0))
        pitch_coarse = f0_to_coarse(f0)
        item['f0'] = f0
        item['pitch'] = pitch_coarse
        if hparams['binarization_args']['with_f0cwt']:
            uv, cont_lf0_lpf = get_cont_lf0(f0)
            logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
            cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
            cwt_spec, scales = get_lf0_cwt(cont_lf0_lpf_norm)
            item['cwt_spec'] = cwt_spec
            item['cwt_mean'] = logf0s_mean_org
            item['cwt_std'] = logf0s_std_org

    @staticmethod
    def get_spk_embed(wav, ctx):
        return ctx['voice_encoder'].embed_utterance(wav.astype(float))

    @property
    def num_workers(self):
        return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
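
A quick illustration of the split logic above (a standalone sketch, not part of the committed file): `train_range`, `valid_range` and `test_range` in `binarization_args` are index pairs into the shuffled `item_names`, and `_convert_range` treats an end index of -1 as "up to the last item".

# Standalone sketch of BaseBinarizer's range handling; the ranges below are made up.
item_names = [f"item_{i}" for i in range(100)]

def convert_range(range_, n_items):
    # mirrors BaseBinarizer._convert_range: -1 means "up to the last item"
    if range_[1] == -1:
        range_[1] = n_items
    return range_

valid_range = convert_range([0, 10], len(item_names))   # first 10 items
test_range = convert_range([10, 20], len(item_names))   # next 10 items
train_range = convert_range([20, -1], len(item_names))  # everything else
train_items = item_names[train_range[0]:train_range[1]]
print(len(train_items))  # 80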
data_gen/tts/base_preprocess.py
ADDED
@@ -0,0 +1,251 @@
import json
import os
import random
import re
import traceback
from collections import Counter
from functools import partial

import librosa
from tqdm import tqdm
from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
from utils.commons.hparams import hparams
from utils.commons.multiprocess_utils import multiprocess_run_tqdm
from utils.os_utils import link_file, move_file, remove_file
from utils.text.text_encoder import is_sil_phoneme, build_token_encoder


class BasePreprocessor:
    def __init__(self):
        self.preprocess_args = hparams['preprocess_args']
        txt_processor = self.preprocess_args['txt_processor']
        self.txt_processor = get_txt_processor_cls(txt_processor)
        self.raw_data_dir = hparams['raw_data_dir']
        self.processed_dir = hparams['processed_data_dir']
        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"

    def meta_data(self):
        """
        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
        """
        raise NotImplementedError

    def process(self):
        processed_dir = self.processed_dir
        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
        remove_file(wav_processed_tmp_dir)
        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
        remove_file(wav_processed_dir)
        os.makedirs(wav_processed_dir, exist_ok=True)

        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
        item_names = [d['item_name'] for d in meta_data]
        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be unique.'

        # preprocess data
        phone_list = []
        word_list = []
        spk_names = set()
        process_item = partial(self.preprocess_first_pass,
                               txt_processor=self.txt_processor,
                               wav_processed_dir=wav_processed_dir,
                               wav_processed_tmp=wav_processed_tmp_dir,
                               preprocess_args=self.preprocess_args)
        items = []
        args = [{
            'item_name': item_raw['item_name'],
            'txt_raw': item_raw['txt'],
            'wav_fn': item_raw['wav_fn'],
            'txt_loader': item_raw.get('txt_loader'),
            'others': item_raw.get('others', None)
        } for item_raw in meta_data]
        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
            if item is not None:
                item_.update(item)
                item = item_
                if 'txt_loader' in item:
                    del item['txt_loader']
                item['id'] = item_id
                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
                item['others'] = item.get('others', None)
                phone_list += item['ph'].split(" ")
                word_list += item['word'].split(" ")
                spk_names.add(item['spk_name'])
                items.append(item)

        # add encoded tokens
        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
        spk_map = self.build_spk_map(spk_names)
        args = [{
            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
        } for item in items]
        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
            items[idx].update(item_new_kv)

        # build mfa data
        if self.preprocess_args['use_mfa']:
            mfa_dict = set()
            mfa_input_dir = f'{processed_dir}/mfa_inputs'
            remove_file(mfa_input_dir)
            # group MFA inputs for better parallelism
            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
            if self.preprocess_args['mfa_group_shuffle']:
                random.seed(hparams['seed'])
                random.shuffle(mfa_groups)
            args = [{
                'item': item, 'mfa_input_dir': mfa_input_dir,
                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
                'preprocess_args': self.preprocess_args
            } for item, mfa_group in zip(items, mfa_groups)]
            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
                    self.build_mfa_inputs, args, desc='Build MFA data'):
                items[i]['wav_align_fn'] = new_wav_align_fn
                for w in ph_gb_word_nosil.split(" "):
                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
            mfa_dict = sorted(mfa_dict)
            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
                f.writelines([f'{l}\n' for l in mfa_dict])
        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
        remove_file(wav_processed_tmp_dir)

    @classmethod
    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
                              wav_fn, wav_processed_dir, wav_processed_tmp,
                              preprocess_args, txt_loader=None, others=None):
        try:
            if txt_loader is not None:
                txt_raw = txt_loader(txt_raw)
            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
            wav_fn, wav_align_fn = cls.process_wav(
                item_name, wav_fn,
                hparams['processed_data_dir'],
                wav_processed_tmp, preprocess_args)

            # wav for binarization
            ext = os.path.splitext(wav_fn)[1]
            os.makedirs(wav_processed_dir, exist_ok=True)
            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
            move_link_func(wav_fn, new_wav_fn)
            return {
                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
                'others': others
            }
        except:
            traceback.print_exc()
            print(f"| Error is caught. item_name: {item_name}.")
            return None

    @staticmethod
    def txt_to_ph(txt_processor, txt_raw, preprocess_args):
        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
        ph = [p for w in txt_struct for p in w[1]]
        ph_gb_word = ["_".join(w[1]) for w in txt_struct]
        words = [w[0] for w in txt_struct]
        # word_id=0 is reserved for padding
        ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
        return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)

    @staticmethod
    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
        processors = [k() for k in processors if k is not None]
        if len(processors) >= 1:
            sr_file = librosa.core.get_samplerate(wav_fn)
            output_fn_for_align = None
            ext = os.path.splitext(wav_fn)[1]
            input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
            link_file(wav_fn, input_fn)
            for p in processors:
                outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
                if len(outputs) == 3:
                    input_fn, sr, output_fn_for_align = outputs
                else:
                    input_fn, sr = outputs
            return input_fn, output_fn_for_align
        else:
            return wav_fn, wav_fn

    def _phone_encoder(self, ph_set):
        ph_set_fn = f"{self.processed_dir}/phone_set.json"
        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            ph_set = sorted(set(ph_set))
            json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
            print("| Build phone set: ", ph_set)
        else:
            ph_set = json.load(open(ph_set_fn, 'r'))
            print("| Load phone set: ", ph_set)
        return build_token_encoder(ph_set_fn)

    def _word_encoder(self, word_set):
        word_set_fn = f"{self.processed_dir}/word_set.json"
        if self.preprocess_args['reset_word_dict']:
            word_set = Counter(word_set)
            total_words = sum(word_set.values())
            word_set = word_set.most_common(hparams['word_dict_size'])
            num_unk_words = total_words - sum([x[1] for x in word_set])
            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
            word_set = sorted(set(word_set))
            json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
        else:
            word_set = json.load(open(word_set_fn, 'r'))
            print("| Load word set. Size: ", len(word_set), word_set[:10])
        return build_token_encoder(word_set_fn)

    @classmethod
    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
        word_token = word_encoder.encode(word)
        ph_token = ph_encoder.encode(ph)
        spk_id = spk_map[spk_name]
        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}

    def build_spk_map(self, spk_names):
        spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
        json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
        return spk_map

    @classmethod
    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
        item_name = item['item_name']
        wav_align_fn = item['wav_align_fn']
        ph_gb_word = item['ph_gb_word']
        ext = os.path.splitext(wav_align_fn)[1]
        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
        os.makedirs(mfa_input_group_dir, exist_ok=True)
        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
        move_link_func(wav_align_fn, new_wav_align_fn)
        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
            f_txt.write(ph_gb_word_nosil)
        return ph_gb_word_nosil, new_wav_align_fn

    def load_spk_map(self, base_dir):
        spk_map_fn = f"{base_dir}/spk_map.json"
        spk_map = json.load(open(spk_map_fn, 'r'))
        return spk_map

    def load_dict(self, base_dir):
        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
        word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
        return ph_encoder, word_encoder

    @property
    def meta_csv_filename(self):
        return 'metadata'

    @property
    def wav_processed_dirname(self):
        return 'wav_processed'
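
To make the `txt_to_ph` return values concrete, here is a self-contained sketch that feeds the same comprehensions a hand-written `txt_struct` (the `[word, [phonemes]]` pairs a text processor returns); the `+ 1` offset exists because word_id 0 is reserved for padding. The sentence and phonemes below are illustrative, not output from this commit.

# Hypothetical txt_struct as produced by a text processor: [word, [phonemes]] pairs.
txt_struct = [['<BOS>', ['<BOS>']],
              ['hello', ['HH', 'AH0', 'L', 'OW1']],
              ['world', ['W', 'ER1', 'L', 'D']],
              ['<EOS>', ['<EOS>']]]

ph = [p for w in txt_struct for p in w[1]]         # flat phoneme sequence
ph_gb_word = ["_".join(w[1]) for w in txt_struct]  # phonemes grouped per word
words = [w[0] for w in txt_struct]
ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]

print(" ".join(ph))          # <BOS> HH AH0 L OW1 W ER1 L D <EOS>
print(" ".join(ph_gb_word))  # <BOS> HH_AH0_L_OW1 W_ER1_L_D <EOS>
print(ph2word)               # [1, 2, 2, 2, 2, 3, 3, 3, 3, 4]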
data_gen/tts/runs/align_and_binarize.py
ADDED
@@ -0,0 +1,12 @@
import utils.commons.single_thread_env  # NOQA
from utils.commons.hparams import set_hparams, hparams
from data_gen.tts.runs.binarize import binarize
from data_gen.tts.runs.preprocess import preprocess
from data_gen.tts.runs.train_mfa_align import train_mfa_align

if __name__ == '__main__':
    set_hparams()
    preprocess()
    if hparams['preprocess_args']['use_mfa']:
        train_mfa_align()
    binarize()
data_gen/tts/runs/binarize.py
ADDED
@@ -0,0 +1,17 @@
import utils.commons.single_thread_env  # NOQA
from utils.commons.hparams import hparams, set_hparams
import importlib


def binarize():
    binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    pkg = ".".join(binarizer_cls.split(".")[:-1])
    cls_name = binarizer_cls.split(".")[-1]
    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()


if __name__ == '__main__':
    set_hparams()
    binarize()
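
The class-path lookup in `binarize()` (and in `preprocess.py` below) is plain `importlib`; the same pattern, sketched with a standard-library class so it runs anywhere:

import importlib

# Same resolution pattern as binarize(): split "pkg.module.ClassName" into module path and class name.
cls_path = "collections.OrderedDict"  # stand-in for e.g. "data_gen.tts.base_binarizer.BaseBinarizer"
pkg = ".".join(cls_path.split(".")[:-1])
cls_name = cls_path.split(".")[-1]
resolved_cls = getattr(importlib.import_module(pkg), cls_name)
print(resolved_cls)  # <class 'collections.OrderedDict'>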
data_gen/tts/runs/preprocess.py
ADDED
@@ -0,0 +1,17 @@
import utils.commons.single_thread_env  # NOQA
from utils.commons.hparams import hparams, set_hparams
import importlib


def preprocess():
    assert hparams['preprocess_cls'] != ''

    pkg = ".".join(hparams["preprocess_cls"].split(".")[:-1])
    cls_name = hparams["preprocess_cls"].split(".")[-1]
    process_cls = getattr(importlib.import_module(pkg), cls_name)
    process_cls().process()


if __name__ == '__main__':
    set_hparams()
    preprocess()
data_gen/tts/runs/train_mfa_align.py
ADDED
@@ -0,0 +1,46 @@
import utils.commons.single_thread_env  # NOQA
import glob
import subprocess
from textgrid import TextGrid
import os
from utils.commons.hparams import hparams, set_hparams


def train_mfa_align(mfa_outputs="mfa_outputs",
                    mfa_inputs="mfa_inputs",
                    model_name=None, pretrain_model_name=None,
                    mfa_cmd='train'):
    CORPUS = hparams['processed_data_dir'].split("/")[-1]
    NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
    env_vars = [f'CORPUS={CORPUS}', f'NUM_JOB={NUM_JOB}']
    if mfa_outputs is not None:
        env_vars.append(f'MFA_OUTPUTS={mfa_outputs}')
    if mfa_inputs is not None:
        env_vars.append(f'MFA_INPUTS={mfa_inputs}')
    if model_name is not None:
        env_vars.append(f'MODEL_NAME={model_name}')
    if pretrain_model_name is not None:
        env_vars.append(f'PRETRAIN_MODEL_NAME={pretrain_model_name}')
    if mfa_cmd is not None:
        env_vars.append(f'MFA_CMD={mfa_cmd}')
    env_str = ' '.join(env_vars)
    print(f"| Run MFA for {CORPUS}. Env vars: {env_str}")
    subprocess.check_call(f'{env_str} bash mfa_usr/run_mfa_train_align.sh', shell=True)
    mfa_offset = hparams['preprocess_args']['mfa_offset']
    if mfa_offset > 0:
        for tg_fn in glob.glob(f'{hparams["processed_data_dir"]}/{mfa_outputs}/*.TextGrid'):
            tg = TextGrid.fromFile(tg_fn)
            max_time = tg.maxTime
            for tier in tg.tiers:
                for interval in tier.intervals:
                    interval.maxTime = min(interval.maxTime + mfa_offset, max_time)
                    interval.minTime = min(interval.minTime + mfa_offset, max_time)
                tier.intervals[0].minTime = 0
                tier.maxTime = min(tier.maxTime + mfa_offset, max_time)
            tg.write(tg_fn)
            TextGrid.fromFile(tg_fn)


if __name__ == '__main__':
    set_hparams(print_hparams=False)
    train_mfa_align()
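
The `mfa_offset` block shifts every TextGrid interval to the right and clamps it at the file's `maxTime`, then pins the first interval back to 0. A toy illustration of that arithmetic with made-up numbers and plain tuples (no TextGrid objects involved):

# Toy version of the offset/clamp applied to each TextGrid tier (values are made up).
mfa_offset = 0.02
max_time = 3.00
intervals = [(0.00, 0.50), (0.50, 1.20), (1.20, 2.99)]

shifted = [(round(min(start + mfa_offset, max_time), 2), round(min(end + mfa_offset, max_time), 2))
           for start, end in intervals]
shifted[0] = (0.0, shifted[0][1])  # mirrors `tier.intervals[0].minTime = 0`
print(shifted)  # [(0.0, 0.52), (0.52, 1.22), (1.22, 3.0)]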
data_gen/tts/txt_processors/__init__.py
ADDED
@@ -0,0 +1 @@
from . import en, zh
data_gen/tts/txt_processors/__pycache__/__init__.cpython-36.pyc
ADDED
Binary file (200 Bytes)
data_gen/tts/txt_processors/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (206 Bytes)
data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-36.pyc
ADDED
Binary file (1.82 kB)
data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-37.pyc
ADDED
Binary file (1.82 kB)
data_gen/tts/txt_processors/__pycache__/en.cpython-36.pyc
ADDED
Binary file (2.66 kB)
data_gen/tts/txt_processors/__pycache__/en.cpython-37.pyc
ADDED
Binary file (2.66 kB)
data_gen/tts/txt_processors/__pycache__/syntactic_graph_buider.cpython-36.pyc
ADDED
Binary file (8.23 kB)
data_gen/tts/txt_processors/__pycache__/zh.cpython-36.pyc
ADDED
Binary file (3.96 kB)
data_gen/tts/txt_processors/__pycache__/zh.cpython-37.pyc
ADDED
Binary file (3.96 kB)
data_gen/tts/txt_processors/base_text_processor.py
ADDED
@@ -0,0 +1,50 @@
from utils.text.text_encoder import is_sil_phoneme

REGISTERED_TEXT_PROCESSORS = {}


def register_txt_processors(name):
    def _f(cls):
        REGISTERED_TEXT_PROCESSORS[name] = cls
        return cls

    return _f


def get_txt_processor_cls(name):
    return REGISTERED_TEXT_PROCESSORS.get(name, None)


class BaseTxtProcessor:
    @staticmethod
    def sp_phonemes():
        return ['|']

    @classmethod
    def process(cls, txt, preprocess_args):
        raise NotImplementedError

    @classmethod
    def postprocess(cls, txt_struct, preprocess_args):
        # remove sil phonemes at the head and tail
        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
            txt_struct = txt_struct[1:]
        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
            txt_struct = txt_struct[:-1]
        if preprocess_args['with_phsep']:
            txt_struct = cls.add_bdr(txt_struct)
        if preprocess_args['add_eos_bos']:
            txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
        return txt_struct

    @classmethod
    def add_bdr(cls, txt_struct):
        txt_struct_ = []
        for i, ts in enumerate(txt_struct):
            txt_struct_.append(ts)
            if i != len(txt_struct) - 1 and \
                    not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
                # txt_struct_.append(['|', ['|']])
                # We disable the sep token because it is incompatible with the syntactic graph.
                pass
        return txt_struct_
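
`register_txt_processors` / `get_txt_processor_cls` form a small decorator registry; a standalone sketch of the same pattern (the 'demo' processor below is hypothetical, not one of the processors in this commit):

# Minimal standalone version of the text-processor registry.
REGISTRY = {}

def register(name):
    def _f(cls):
        REGISTRY[name] = cls
        return cls
    return _f

def get_cls(name):
    return REGISTRY.get(name, None)

@register('demo')
class DemoProcessor:
    @classmethod
    def process(cls, txt, preprocess_args):
        # toy "phonemization": one character per phoneme
        return [[w, list(w)] for w in txt.split()], txt

processor = get_cls('demo')
print(processor.process("hi there", {}))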
data_gen/tts/txt_processors/en.py
ADDED
@@ -0,0 +1,78 @@
import re
import unicodedata

from g2p_en import G2p
from g2p_en.expand import normalize_numbers
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer

from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
from utils.text.text_encoder import PUNCS, is_sil_phoneme


class EnG2p(G2p):
    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        # preprocessing
        words = EnG2p.word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word, pos in tokens:
            if re.search("[a-z]", word) is None:
                pron = [word]
            elif word in self.homograph2features:  # check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.cmu:  # lookup CMU dict
                pron = self.cmu[word][0]
            else:  # predict for OOV
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1]


@register_txt_processors('en')
class TxtProcessor(BaseTxtProcessor):
    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # strip accents
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # strip spaces around punctuation
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        text = text.replace("i.e.", "that is")
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(rf"\s+", r" ", text)
        return text

    @classmethod
    def process(cls, txt, preprocess_args):
        txt = cls.preprocess_text(txt).strip()
        phs = cls.g2p(txt)
        txt_struct = [[w, []] for w in txt.split(" ")]
        i_word = 0
        for p in phs:
            if p == ' ':
                i_word += 1
            else:
                txt_struct[i_word][1].append(p)
        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt
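
`EnG2p` returns a flat phoneme list with a single ' ' token between words, and `process` folds it back into per-word lists. A self-contained sketch with a hand-written phoneme list standing in for a real g2p_en output:

# Hand-written stand-in for cls.g2p(txt); g2p_en separates words with a " " token.
txt = "printing machines"
phs = ['P', 'R', 'IH1', 'N', 'T', 'IH0', 'NG', ' ', 'M', 'AH0', 'SH', 'IY1', 'N', 'Z']

txt_struct = [[w, []] for w in txt.split(" ")]
i_word = 0
for p in phs:
    if p == ' ':
        i_word += 1
    else:
        txt_struct[i_word][1].append(p)
print(txt_struct)
# [['printing', ['P', 'R', 'IH1', 'N', 'T', 'IH0', 'NG']], ['machines', ['M', 'AH0', 'SH', 'IY1', 'N', 'Z']]]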
data_gen/tts/txt_processors/zh.py
ADDED
@@ -0,0 +1,110 @@
import re
import jieba
from pypinyin import pinyin, Style
from utils.text.text_norm import NSWNormalizer
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
from utils.text.text_encoder import PUNCS, is_sil_phoneme

ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
              'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']


@register_txt_processors('zh')
class TxtProcessor(BaseTxtProcessor):
    table = {ord(f): ord(t) for f, t in zip(
        u':,。!?【】()%#@&1234567890',
        u':,.!?[]()%#@&1234567890')}

    @staticmethod
    def sp_phonemes():
        return ['|', '#']

    @staticmethod
    def preprocess_text(text):
        text = text.translate(TxtProcessor.table)
        text = NSWNormalizer(text).normalize(remove_punc=False).lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(rf"\s+", r"", text)
        text = re.sub(rf"[A-Za-z]+", r"$", text)
        return text

    @classmethod
    def pinyin_with_en(cls, txt, style):
        x = pinyin(txt, style)
        x = [t[0] for t in x]
        x_ = []
        for t in x:
            if '$' not in t:
                x_.append(t)
            else:
                x_ += list(t)
        x_ = [t if t != '$' else 'ENG' for t in x_]
        return x_

    @classmethod
    def process(cls, txt, pre_align_args):
        txt = cls.preprocess_text(txt)
        # https://blog.csdn.net/zhoulei124/article/details/89055403
        pre_align_args['use_tone'] = True
        shengmu = cls.pinyin_with_en(txt, style=Style.INITIALS)
        yunmu = cls.pinyin_with_en(txt, style=
        Style.FINALS_TONE3 if pre_align_args['use_tone'] else Style.FINALS)
        assert len(shengmu) == len(yunmu)
        ph_list = []
        for a, b in zip(shengmu, yunmu):
            if a == b:
                ph_list += [a]
            else:
                ph_list += [a + "%" + b]
        seg_list = '#'.join(jieba.cut(txt))
        assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list)

        # insert the word-boundary marker '#'
        ph_list_ = []
        seg_idx = 0
        for p in ph_list:
            if seg_list[seg_idx] == '#':
                ph_list_.append('#')
                seg_idx += 1
            elif len(ph_list_) > 0:
                ph_list_.append("|")
            seg_idx += 1
            finished = False
            if not finished:
                ph_list_ += [x for x in p.split("%") if x != '']

        ph_list = ph_list_

        # remove word-boundary markers around silence symbols, e.g. [..., '#', ',', '#', ...]
        sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes()
        ph_list_ = []
        for i in range(0, len(ph_list), 1):
            if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes):
                ph_list_.append(ph_list[i])
        ph_list = ph_list_
        txt_struct = [[w, []] for w in txt]
        i = 0
        for ph in ph_list:
            if ph == '|' or ph == '#':
                i += 1
                continue
            elif ph in [',', '.']:
                i += 1
                txt_struct[i][1].append(ph)
                i += 1
                continue
            txt_struct[i][1].append(ph)
        # return ph_list, txt
        txt_struct.insert(0, ['<BOS>', ['<BOS>']])
        txt_struct.append(['<EOS>', ['<EOS>']])
        return txt_struct, txt


if __name__ == '__main__':
    t = 'simon演唱过后,simon还进行了simon精彩的文艺演出simon.'
    phs, txt = TxtProcessor.process(t, {'use_tone': True})
    print(phs, txt)
data_gen/tts/wav_processors/__init__.py
ADDED
@@ -0,0 +1,2 @@
from . import base_processor
from . import common_processors
data_gen/tts/wav_processors/__pycache__/__init__.cpython-36.pyc
ADDED
Binary file (239 Bytes)
data_gen/tts/wav_processors/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (245 Bytes)
data_gen/tts/wav_processors/__pycache__/base_processor.cpython-36.pyc
ADDED
Binary file (1.26 kB)
data_gen/tts/wav_processors/__pycache__/base_processor.cpython-37.pyc
ADDED
Binary file (1.26 kB)
data_gen/tts/wav_processors/__pycache__/common_processors.cpython-36.pyc
ADDED
Binary file (3.77 kB)
data_gen/tts/wav_processors/__pycache__/common_processors.cpython-37.pyc
ADDED
Binary file (3.77 kB)
data_gen/tts/wav_processors/base_processor.py
ADDED
@@ -0,0 +1,25 @@
REGISTERED_WAV_PROCESSORS = {}


def register_wav_processors(name):
    def _f(cls):
        REGISTERED_WAV_PROCESSORS[name] = cls
        return cls

    return _f


def get_wav_processor_cls(name):
    return REGISTERED_WAV_PROCESSORS.get(name, None)


class BaseWavProcessor:
    @property
    def name(self):
        raise NotImplementedError

    def output_fn(self, input_fn):
        return f'{input_fn[:-4]}_{self.name}.wav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        raise NotImplementedError
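
The wav-processor registry mirrors the text-processor one: `preprocess_args['wav_processors']` lists names, and `get_wav_processor_cls` maps them to classes. A sketch of registering a custom pass-through processor against this API (the 'identity' name and class are made up, not part of this commit):

# Sketch: registering a hypothetical pass-through processor with the registry defined above.
from data_gen.tts.wav_processors.base_processor import (
    BaseWavProcessor, register_wav_processors, get_wav_processor_cls)

@register_wav_processors(name='identity')
class IdentityProcessor(BaseWavProcessor):
    @property
    def name(self):
        return 'Identity'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        # return (path, sample_rate); BasePreprocessor.process_wav unpacks 2- or 3-tuples
        return input_fn, sr

assert get_wav_processor_cls('identity') is IdentityProcessor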
data_gen/tts/wav_processors/common_processors.py
ADDED
@@ -0,0 +1,86 @@
import os
import subprocess
import librosa
import numpy as np
from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
from utils.audio import trim_long_silences
from utils.audio.io import save_wav
from utils.audio.rnnoise import rnnoise
from utils.commons.hparams import hparams


@register_wav_processors(name='sox_to_wav')
class ConvertToWavProcessor(BaseWavProcessor):
    @property
    def name(self):
        return 'ToWav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        if input_fn[-4:] == '.wav':
            return input_fn, sr
        else:
            output_fn = self.output_fn(input_fn)
            subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True)
            return output_fn, sr


@register_wav_processors(name='sox_resample')
class ResampleProcessor(BaseWavProcessor):
    @property
    def name(self):
        return 'Resample'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        output_fn = self.output_fn(input_fn)
        sr_file = librosa.core.get_samplerate(input_fn)
        if sr != sr_file:
            subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True)
            y, _ = librosa.core.load(input_fn, sr=sr)
            y, _ = librosa.effects.trim(y)
            save_wav(y, output_fn, sr)
            return output_fn, sr
        else:
            return input_fn, sr


@register_wav_processors(name='trim_sil')
class TrimSILProcessor(BaseWavProcessor):
    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        output_fn = self.output_fn(input_fn)
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        # return (path, sample_rate) so BasePreprocessor.process_wav can unpack the chain output
        return output_fn, sr


@register_wav_processors(name='trim_all_sil')
class TrimAllSILProcessor(BaseWavProcessor):
    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        output_fn = self.output_fn(input_fn)
        y, audio_mask, _ = trim_long_silences(
            input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12))
        save_wav(y, output_fn, sr)
        if preprocess_args['save_sil_mask']:
            os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True)
            np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
        return output_fn, sr


@register_wav_processors(name='denoise')
class DenoiseProcessor(BaseWavProcessor):
    @property
    def name(self):
        return 'Denoise'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        output_fn = self.output_fn(input_fn)
        rnnoise(input_fn, output_fn, out_sample_rate=sr)
        return output_fn, sr
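
For reference, this is how `BasePreprocessor.process_wav` would assemble a chain from these registered names; the processor list below is only an example, not a value taken from this commit's configs:

# Example chain resolution, mirroring BasePreprocessor.process_wav (processor list is illustrative).
from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
import data_gen.tts.wav_processors.common_processors  # noqa: registers sox_to_wav, sox_resample, ...

preprocess_args = {'wav_processors': ['sox_to_wav', 'sox_resample', 'trim_all_sil']}
processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
processors = [k() for k in processors if k is not None]
print([p.name for p in processors])  # ['ToWav', 'Resample', 'TrimSIL']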