add device argument
Files changed:
- app.py (+14, -12)
- text/cleaners.py (+25, -21)
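
Summary: app.py threads a new --device command-line flag through the whole app, moving the VITS models, the HuBERT encoder, and every inference input tensor onto the chosen torch.device across the TTS, voice-conversion, and soft-VC paths. text/cleaners.py eagerly initializes pyopenjtalk at import time; its remaining changes only normalize spacing around the string-concatenation operator.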
app.py CHANGED
@@ -62,9 +62,9 @@ def create_tts_fn(model, hps, speaker_ids):
         speaker_id = speaker_ids[speaker]
         stn_tst = get_text(text, hps, is_symbol)
         with no_grad():
-            x_tst = stn_tst.unsqueeze(0)
-            x_tst_lengths = LongTensor([stn_tst.size(0)])
-            sid = LongTensor([speaker_id])
+            x_tst = stn_tst.unsqueeze(0).to(device)
+            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
+            sid = LongTensor([speaker_id]).to(device)
             audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                                 length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
         del stn_tst, x_tst, x_tst_lengths, sid
@@ -94,10 +94,10 @@ def create_vc_fn(model, hps, speaker_ids):
             y = y.unsqueeze(0)
             spec = spectrogram_torch(y, hps.data.filter_length,
                                      hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                                     center=False)
-            spec_lengths = LongTensor([spec.size(-1)])
-            sid_src = LongTensor([original_speaker_id])
-            sid_tgt = LongTensor([target_speaker_id])
+                                     center=False).to(device)
+            spec_lengths = LongTensor([spec.size(-1)]).to(device)
+            sid_src = LongTensor([original_speaker_id]).to(device)
+            sid_tgt = LongTensor([target_speaker_id]).to(device)
             audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                 0, 0].data.cpu().float().numpy()
         del y, spec, spec_lengths, sid_src, sid_tgt
@@ -125,10 +125,10 @@ def create_soft_vc_fn(model, hps, speaker_ids):
         if sampling_rate != 16000:
             audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
         with torch.inference_mode():
-            units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0))
+            units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0).to(device))
         with no_grad():
-            unit_lengths = LongTensor([units.size(1)])
-            sid = LongTensor([target_speaker_id])
+            unit_lengths = LongTensor([units.size(1)]).to(device)
+            sid = LongTensor([target_speaker_id]).to(device)
             audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
                                 noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
         del units, unit_lengths, sid
@@ -147,9 +147,11 @@ def create_to_symbol_fn(hps):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
+    parser.add_argument('--device', type=str, default='cpu')
     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
     args = parser.parse_args()
 
+    device = torch.device(args.device)
     models_tts = []
     models_vc = []
     models_soft_vc = []
@@ -171,7 +173,7 @@ if __name__ == '__main__':
             n_speakers=hps.data.n_speakers,
             **hps.model)
         utils.load_checkpoint(model_path, model, None)
-        model.eval()
+        model.eval().to(device)
         speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
         speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
 
@@ -184,7 +186,7 @@ if __name__ == '__main__':
         elif t == "soft-vits-vc":
            models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
 
-    hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
+    hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)
 
     app = gr.Blocks()
 
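The app.py changes all follow one pattern: build a torch.device once from the new flag, move each model onto it with .to(device), send every input tensor to the same device, and pull results back with .data.cpu() before converting to numpy. Below is a minimal runnable sketch of that pattern, assuming nothing from this repo (TinyModel is a hypothetical stand-in for SynthesizerTrn):

import argparse

import torch
from torch import LongTensor, nn, no_grad


class TinyModel(nn.Module):
    # Hypothetical stand-in for the repo's SynthesizerTrn; just enough to run.
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(256, 8)

    def infer(self, x, x_lengths):
        return self.emb(x).mean(dim=-1)


parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cpu')  # e.g. 'cpu', 'cuda', 'cuda:0'
args = parser.parse_args()
device = torch.device(args.device)

model = TinyModel()
model.eval().to(device)  # parameters now live on the chosen device

with no_grad():
    x = LongTensor([1, 2, 3])      # stand-in for a get_text(...) symbol tensor
    x = x.unsqueeze(0).to(device)  # inputs must sit on the same device as the model
    x_lengths = LongTensor([x.size(-1)]).to(device)
    audio = model.infer(x, x_lengths)[0].data.cpu().float().numpy()  # back to CPU for numpy

Because the flag defaults to 'cpu', existing deployments behave exactly as before; running python app.py --device cuda is enough to move inference onto a GPU.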
text/cleaners.py CHANGED
@@ -1,4 +1,7 @@
 import re
+import pyopenjtalk
+
+pyopenjtalk._lazy_init()
 
 
 def japanese_cleaners(text):
@@ -36,9 +39,9 @@ def zh_ja_mixture_cleaners(text):
     from text.mandarin import chinese_to_romaji
     from text.japanese import japanese_to_romaji_with_accent
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_romaji(x.group(1))+' ', text)
+                  lambda x: chinese_to_romaji(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
-        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
+        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -58,15 +61,15 @@ def cjks_cleaners(text):
     from text.sanskrit import devanagari_to_ipa
     from text.english import english_to_lazy_ipa
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: chinese_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa(x.group(1))+' ', text)
+                  lambda x: japanese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[SA\](.*?)\[SA\]',
-                  lambda x: devanagari_to_ipa(x.group(1))+' ', text)
+                  lambda x: devanagari_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: english_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -78,13 +81,13 @@ def cjke_cleaners(text):
     from text.korean import korean_to_ipa
     from text.english import english_to_ipa2
     text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
-        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
+        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
-        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
+        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
-        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
+        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -96,13 +99,13 @@ def cjke_cleaners2(text):
     from text.korean import korean_to_ipa
     from text.english import english_to_ipa2
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
+                  lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
+                  lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_ipa2(x.group(1))+' ', text)
+                  lambda x: english_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -130,17 +133,18 @@ def chinese_dialect_cleaners(text):
     from text.english import english_to_lazy_ipa2
     from text.ngu_dialect import ngu_dialect_to_ipa
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_ipa2(x.group(1))+' ', text)
+                  lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
+                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
     text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
-        '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text)
+                                                                                                             '˧˧˦').replace(
+        '6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e') + ' ', text)
     text = re.sub(r'\[GD\](.*?)\[GD\]',
-                  lambda x: cantonese_to_ipa(x.group(1))+' ', text)
+                  lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
+                  lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
-        1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
+        1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
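
The only functional change in text/cleaners.py is the eager pyopenjtalk initialization, which plausibly exists so the OpenJTalk dictionary setup happens once at startup rather than on the first Japanese TTS request. Note that _lazy_init() is a private pyopenjtalk helper, so a more defensive variant (a sketch, not part of this commit) could fall back to the public API:

import pyopenjtalk

try:
    # Private helper used by this commit; could be renamed in future releases.
    pyopenjtalk._lazy_init()
except AttributeError:
    # Any public call also initializes pyopenjtalk's global OpenJTalk state.
    pyopenjtalk.g2p('テスト')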