skytnt committed
Commit 58493c0 • 1 Parent(s): b310615

add soft-vits and more models

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 😊🎙️
 colorFrom: red
 colorTo: pink
 sdk: gradio
-sdk_version: 3.2
+sdk_version: 3.3
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED
@@ -84,6 +84,34 @@ def create_vc_fn(model, hps, speaker_ids):
     return vc_fn
 
 
+def create_soft_vc_fn(model, hps, speaker_ids):
+    def soft_vc_fn(target_speaker, input_audio):
+        if input_audio is None:
+            return "You need to upload an audio", None
+        sampling_rate, audio = input_audio
+        duration = audio.shape[0] / sampling_rate
+        if limitation and duration > 15:
+            return "Error: Audio is too long", None
+        target_speaker_id = speaker_ids[target_speaker]
+
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
+        if sampling_rate != 16000:
+            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+        with torch.inference_mode():
+            units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0))
+        with no_grad():
+            unit_lengths = LongTensor([units.size(1)])
+            sid = LongTensor([target_speaker_id])
+            audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
+                                noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
+            del units, unit_lengths, sid
+        return "Success", (hps.data.sampling_rate, audio)
+
+    return soft_vc_fn
+
+
 def create_to_phoneme_fn(hps):
     def to_phoneme_fn(text):
         return _clean_text(text, hps.data.text_cleaners) if text != "" else ""
@@ -110,7 +138,9 @@ css = """
 """
 
 if __name__ == '__main__':
-    models = []
+    models_tts = []
+    models_vc = []
+    models_soft_vc = []
     with open("saved_model/info.json", "r", encoding="utf-8") as f:
         models_info = json.load(f)
     for i, info in models_info.items():
@@ -132,9 +162,16 @@ if __name__ == '__main__':
         speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
         speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
 
-        models.append((name, lang, example, cover_path, speakers, hps.symbols,
-                       create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids),
-                       create_to_phoneme_fn(hps)))
+        t = info["type"]
+        if t == "vits":
+            models_tts.append((name, cover_path, speakers, lang, example,
+                               hps.symbols, create_tts_fn(model, hps, speaker_ids),
+                               create_to_phoneme_fn(hps)))
+            models_vc.append((name, cover_path, speakers, create_vc_fn(model, hps, speaker_ids)))
+        elif t == "soft-vits-vc":
+            models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
+
+    hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
 
     app = gr.Blocks(css=css)
 
@@ -144,13 +181,14 @@ if __name__ == '__main__':
                     "unofficial demo for \n\n"
                     "- [https://github.com/CjangCjengh/MoeGoe](https://github.com/CjangCjengh/MoeGoe)\n"
                     "- [https://github.com/Francis-Komizu/VITS](https://github.com/Francis-Komizu/VITS)\n"
-                    "- [https://github.com/luoyily/MoeTTS](https://github.com/luoyily/MoeTTS)"
+                    "- [https://github.com/luoyily/MoeTTS](https://github.com/luoyily/MoeTTS)\n"
+                    "- [https://github.com/Francis-Komizu/Sovits](https://github.com/Francis-Komizu/Sovits)"
                     )
        with gr.Tabs():
            with gr.TabItem("TTS"):
                with gr.Tabs():
-                    for i, (name, lang, example, cover_path, speakers,
-                            symbols, tts_fn, vc_fn, to_phoneme_fn) in enumerate(models):
+                    for i, (name, cover_path, speakers, lang, example, symbols, tts_fn,
+                            to_phoneme_fn) in enumerate(models_tts):
                        with gr.TabItem(f"model{i}"):
                            with gr.Column():
                                gr.Markdown(f"## {name}\n\n"
@@ -204,8 +242,7 @@ if __name__ == '__main__':
 
            with gr.TabItem("Voice Conversion"):
                with gr.Tabs():
-                    for i, (name, lang, example, cover_path, speakers,
-                            symbols, tts_fn, vc_fn, to_phoneme_fn) in enumerate(models):
+                    for i, (name, cover_path, speakers, vc_fn) in enumerate(models_vc):
                        with gr.TabItem(f"model{i}"):
                            gr.Markdown(f"## {name}\n\n"
                                        f"![cover](file/{cover_path})")
@@ -218,4 +255,17 @@ if __name__ == '__main__':
                            vc_output1 = gr.Textbox(label="Output Message")
                            vc_output2 = gr.Audio(label="Output Audio")
                            vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
+           with gr.TabItem("Soft Voice Conversion"):
+               with gr.Tabs():
+                   for i, (name, cover_path, speakers, soft_vc_fn) in enumerate(models_soft_vc):
+                       with gr.TabItem(f"model{i}"):
+                           gr.Markdown(f"## {name}\n\n"
+                                       f"![cover](file/{cover_path})")
+                           vc_input1 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
+                                                   value=speakers[0])
+                           vc_input2 = gr.Audio(label="Input Audio (15s limitation)")
+                           vc_submit = gr.Button("Convert", variant="primary")
+                           vc_output1 = gr.Textbox(label="Output Message")
+                           vc_output2 = gr.Audio(label="Output Audio")
+                           vc_submit.click(soft_vc_fn, [vc_input1, vc_input2], [vc_output1, vc_output2])
    app.launch()
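
For readers skimming the diff: the new soft-vits-vc path resamples the upload to 16 kHz mono, encodes it to soft speech units with the `hubert_soft` model from bshall/hubert (loaded via torch.hub exactly as above), and feeds those units to the checkpoint's `infer`. A minimal sketch of that pipeline, not part of the commit: `input.wav` is a placeholder path, and `model` is assumed to be an already-loaded soft-vits-vc SynthesizerTrn with a valid `speaker_id`.

```python
# Minimal soft-VC sketch, assuming `model` is a loaded soft-vits-vc checkpoint
# whose infer() matches SynthesizerTrn in models.py.
import librosa
import torch

hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")  # soft speech-unit encoder

wav, _ = librosa.load("input.wav", sr=16000, mono=True)  # placeholder input file
with torch.inference_mode():
    # units() takes [batch, 1, samples] and returns [batch, frames, 256]
    units = hubert.units(torch.FloatTensor(wav).unsqueeze(0).unsqueeze(0))

speaker_id = 0  # placeholder: index into the checkpoint's speaker table
with torch.no_grad():
    unit_lengths = torch.LongTensor([units.size(1)])
    sid = torch.LongTensor([speaker_id])
    audio = model.infer(units, unit_lengths, sid=sid, noise_scale=0.667,
                        noise_scale_w=0.8)[0][0, 0].cpu().numpy()
```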
export_model.py ADDED
@@ -0,0 +1,13 @@
+import torch
+
+if __name__ == '__main__':
+    model_path = "saved_model/11/model.pth"
+    output_path = "saved_model/11/model1.pth"
+    checkpoint_dict = torch.load(model_path, map_location='cpu')
+    checkpoint_dict_new = {}
+    for k, v in checkpoint_dict.items():
+        if k == "optimizer":
+            print("remove optimizer")
+            continue
+        checkpoint_dict_new[k] = v
+    torch.save(checkpoint_dict_new, output_path)
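
export_model.py drops the `optimizer` entry from a training checkpoint so only inference state ships with the Space, which is why the committed model.pth files above are much smaller than raw training checkpoints. A quick sanity check, assuming the hard-coded paths above (a VITS training checkpoint typically also carries `model`, `iteration`, and `learning_rate` entries):

```python
# Verify the stripped checkpoint: optimizer state gone, weights kept.
import os
import torch

slim = torch.load("saved_model/11/model1.pth", map_location="cpu")
assert "optimizer" not in slim
print(sorted(slim.keys()))  # typically ['iteration', 'learning_rate', 'model']
print(f"{os.path.getsize('saved_model/11/model1.pth') / 1e6:.0f} MB on disk")
```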
models.py CHANGED
@@ -14,234 +14,239 @@ from commons import init_weights, get_padding
 
 
 class StochasticDurationPredictor(nn.Module):
-    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
-        super().__init__()
-        filter_channels = in_channels  # it needs to be removed from future version.
-        self.in_channels = in_channels
-        self.filter_channels = filter_channels
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.log_flow = modules.Log()
-        self.flows = nn.ModuleList()
-        self.flows.append(modules.ElementwiseAffine(2))
-        for i in range(n_flows):
-            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-            self.flows.append(modules.Flip())
-
-        self.post_pre = nn.Conv1d(1, filter_channels, 1)
-        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
-        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-        self.post_flows = nn.ModuleList()
-        self.post_flows.append(modules.ElementwiseAffine(2))
-        for i in range(4):
-            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-            self.post_flows.append(modules.Flip())
-
-        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
-        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
-        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-        if gin_channels != 0:
-            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
-
-    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
-        x = torch.detach(x)
-        x = self.pre(x)
-        if g is not None:
-            g = torch.detach(g)
-            x = x + self.cond(g)
-        x = self.convs(x, x_mask)
-        x = self.proj(x) * x_mask
-
-        if not reverse:
-            flows = self.flows
-            assert w is not None
-
-            logdet_tot_q = 0
-            h_w = self.post_pre(w)
-            h_w = self.post_convs(h_w, x_mask)
-            h_w = self.post_proj(h_w) * x_mask
-            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
-            z_q = e_q
-            for flow in self.post_flows:
-                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
-                logdet_tot_q += logdet_q
-            z_u, z1 = torch.split(z_q, [1, 1], 1)
-            u = torch.sigmoid(z_u) * x_mask
-            z0 = (w - u) * x_mask
-            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
-            logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
-
-            logdet_tot = 0
-            z0, logdet = self.log_flow(z0, x_mask)
-            logdet_tot += logdet
-            z = torch.cat([z0, z1], 1)
-            for flow in flows:
-                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
-                logdet_tot = logdet_tot + logdet
-            nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
-            return nll + logq  # [b]
-        else:
-            flows = list(reversed(self.flows))
-            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
-            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
-            for flow in flows:
-                z = flow(z, x_mask, g=x, reverse=reverse)
-            z0, z1 = torch.split(z, [1, 1], 1)
-            logw = z0
-            return logw
+    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
+        super().__init__()
+        filter_channels = in_channels  # it needs to be removed from future version.
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.log_flow = modules.Log()
+        self.flows = nn.ModuleList()
+        self.flows.append(modules.ElementwiseAffine(2))
+        for i in range(n_flows):
+            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
+            self.flows.append(modules.Flip())
+
+        self.post_pre = nn.Conv1d(1, filter_channels, 1)
+        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
+        self.post_flows = nn.ModuleList()
+        self.post_flows.append(modules.ElementwiseAffine(2))
+        for i in range(4):
+            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
+            self.post_flows.append(modules.Flip())
+
+        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
+        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
+
+    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
+        x = torch.detach(x)
+        x = self.pre(x)
+        if g is not None:
+            g = torch.detach(g)
+            x = x + self.cond(g)
+        x = self.convs(x, x_mask)
+        x = self.proj(x) * x_mask
+
+        if not reverse:
+            flows = self.flows
+            assert w is not None
+
+            logdet_tot_q = 0
+            h_w = self.post_pre(w)
+            h_w = self.post_convs(h_w, x_mask)
+            h_w = self.post_proj(h_w) * x_mask
+            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
+            z_q = e_q
+            for flow in self.post_flows:
+                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
+                logdet_tot_q += logdet_q
+            z_u, z1 = torch.split(z_q, [1, 1], 1)
+            u = torch.sigmoid(z_u) * x_mask
+            z0 = (w - u) * x_mask
+            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
+            logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q
+
+            logdet_tot = 0
+            z0, logdet = self.log_flow(z0, x_mask)
+            logdet_tot += logdet
+            z = torch.cat([z0, z1], 1)
+            for flow in flows:
+                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
+                logdet_tot = logdet_tot + logdet
+            nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
+            return nll + logq  # [b]
+        else:
+            flows = list(reversed(self.flows))
+            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
+            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
+            for flow in flows:
+                z = flow(z, x_mask, g=x, reverse=reverse)
+            z0, z1 = torch.split(z, [1, 1], 1)
+            logw = z0
+            return logw
 
 
 class DurationPredictor(nn.Module):
-    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
-        super().__init__()
-
-        self.in_channels = in_channels
-        self.filter_channels = filter_channels
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.gin_channels = gin_channels
-
-        self.drop = nn.Dropout(p_dropout)
-        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
-        self.norm_1 = modules.LayerNorm(filter_channels)
-        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
-        self.norm_2 = modules.LayerNorm(filter_channels)
-        self.proj = nn.Conv1d(filter_channels, 1, 1)
-
-        if gin_channels != 0:
-            self.cond = nn.Conv1d(gin_channels, in_channels, 1)
-
-    def forward(self, x, x_mask, g=None):
-        x = torch.detach(x)
-        if g is not None:
-            g = torch.detach(g)
-            x = x + self.cond(g)
-        x = self.conv_1(x * x_mask)
-        x = torch.relu(x)
-        x = self.norm_1(x)
-        x = self.drop(x)
-        x = self.conv_2(x * x_mask)
-        x = torch.relu(x)
-        x = self.norm_2(x)
-        x = self.drop(x)
-        x = self.proj(x * x_mask)
-        return x * x_mask
+    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+
+        self.drop = nn.Dropout(p_dropout)
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
+        self.norm_1 = modules.LayerNorm(filter_channels)
+        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
+        self.norm_2 = modules.LayerNorm(filter_channels)
+        self.proj = nn.Conv1d(filter_channels, 1, 1)
+
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+
+    def forward(self, x, x_mask, g=None):
+        x = torch.detach(x)
+        if g is not None:
+            g = torch.detach(g)
+            x = x + self.cond(g)
+        x = self.conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_1(x)
+        x = self.drop(x)
+        x = self.conv_2(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_2(x)
+        x = self.drop(x)
+        x = self.proj(x * x_mask)
+        return x * x_mask
 
 
 class TextEncoder(nn.Module):
-    def __init__(self,
-                 n_vocab,
-                 out_channels,
-                 hidden_channels,
-                 filter_channels,
-                 n_heads,
-                 n_layers,
-                 kernel_size,
-                 p_dropout):
-        super().__init__()
-        self.n_vocab = n_vocab
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-
-        self.emb = nn.Embedding(n_vocab, hidden_channels)
-        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
-
-        self.encoder = attentions.Encoder(
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout)
-        self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths):
-        x = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
-        x = torch.transpose(x, 1, -1)  # [b, h, t]
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-
-        x = self.encoder(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        return x, m, logs, x_mask
+    def __init__(self,
+                 n_vocab,
+                 out_channels,
+                 hidden_channels,
+                 filter_channels,
+                 n_heads,
+                 n_layers,
+                 kernel_size,
+                 p_dropout):
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+
+        if self.n_vocab != 0:
+            self.emb = nn.Embedding(n_vocab, hidden_channels)
+            nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)
+
+        self.encoder = attentions.Encoder(
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths):
+        if self.n_vocab != 0:
+            x = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+
+        x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return x, m, logs, x_mask
 
 
 class ResidualCouplingBlock(nn.Module):
-    def __init__(self,
-                 channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 n_flows=4,
-                 gin_channels=0):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-        for i in range(n_flows):
-            self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for i in range(n_flows):
+            self.flows.append(
+                modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                              gin_channels=gin_channels, mean_only=True))
+            self.flows.append(modules.Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
 
 
 class PosteriorEncoder(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 hidden_channels,
-                 kernel_size,
-                 dilation_rate,
-                 n_layers,
-                 gin_channels=0):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths, g=None):
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 gin_channels=0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
 
 
 class Generator(torch.nn.Module):
-    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
+    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
+                 upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
         super(Generator, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
         self.num_upsamples = len(upsample_rates)
@@ -251,12 +256,12 @@ class Generator(torch.nn.Module):
         self.ups = nn.ModuleList()
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             self.ups.append(weight_norm(
-                ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
-                                k, u, padding=(k-u)//2)))
+                ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
+                                k, u, padding=(k - u) // 2)))
 
         self.resblocks = nn.ModuleList()
         for i in range(len(self.ups)):
-            ch = upsample_initial_channel//(2**(i+1))
+            ch = upsample_initial_channel // (2 ** (i + 1))
             for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                 self.resblocks.append(resblock(ch, k, d))
@@ -269,7 +274,7 @@ class Generator(torch.nn.Module):
     def forward(self, x, g=None):
         x = self.conv_pre(x)
         if g is not None:
-            x = x + self.cond(g)
+            x = x + self.cond(g)
 
         for i in range(self.num_upsamples):
             x = F.leaky_relu(x, modules.LRELU_SLOPE)
@@ -277,9 +282,9 @@
             xs = None
             for j in range(self.num_kernels):
                 if xs is None:
-                    xs = self.resblocks[i*self.num_kernels+j](x)
+                    xs = self.resblocks[i * self.num_kernels + j](x)
                 else:
-                    xs += self.resblocks[i*self.num_kernels+j](x)
+                    xs += self.resblocks[i * self.num_kernels + j](x)
             x = xs / self.num_kernels
         x = F.leaky_relu(x)
         x = self.conv_post(x)
@@ -315,7 +320,7 @@ class DiscriminatorP(torch.nn.Module):
 
         # 1d to 2d
         b, c, t = x.shape
-        if t % self.period != 0:  # pad first
+        if t % self.period != 0:  # pad first
             n_pad = self.period - (t % self.period)
             x = F.pad(x, (0, n_pad), "reflect")
             t = t + n_pad
@@ -363,7 +368,7 @@ class DiscriminatorS(torch.nn.Module):
 class MultiPeriodDiscriminator(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(MultiPeriodDiscriminator, self).__init__()
-        periods = [2,3,5,7,11]
+        periods = [2, 3, 5, 7, 11]
 
         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
         discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
@@ -385,149 +390,151 @@ class MultiPeriodDiscriminator(torch.nn.Module):
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
 
 
-
 class SynthesizerTrn(nn.Module):
-    """
+    """
     Synthesizer for Training
     """
 
-    def __init__(self,
-                 n_vocab,
-                 spec_channels,
-                 segment_size,
-                 inter_channels,
-                 hidden_channels,
-                 filter_channels,
-                 n_heads,
-                 n_layers,
-                 kernel_size,
-                 p_dropout,
-                 resblock,
-                 resblock_kernel_sizes,
-                 resblock_dilation_sizes,
-                 upsample_rates,
-                 upsample_initial_channel,
-                 upsample_kernel_sizes,
-                 n_speakers=0,
-                 gin_channels=0,
-                 use_sdp=True,
-                 **kwargs):
-
-        super().__init__()
-        self.n_vocab = n_vocab
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.n_speakers = n_speakers
-        self.gin_channels = gin_channels
-
-        self.use_sdp = use_sdp
-
-        self.enc_p = TextEncoder(n_vocab,
-                                 inter_channels,
-                                 hidden_channels,
-                                 filter_channels,
-                                 n_heads,
-                                 n_layers,
-                                 kernel_size,
-                                 p_dropout)
-        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
-        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-
-        if use_sdp:
-            self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
-        else:
-            self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
-
-        if n_speakers > 1:
-            self.emb_g = nn.Embedding(n_speakers, gin_channels)
-
-    def forward(self, x, x_lengths, y, y_lengths, sid=None):
-
-        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
-        if self.n_speakers > 0:
-            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
-        else:
-            g = None
-
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-        z_p = self.flow(z, y_mask, g=g)
-
-        with torch.no_grad():
-            # negative cross-entropy
-            s_p_sq_r = torch.exp(-2 * logs_p)  # [b, d, t]
-            neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True)  # [b, 1, t_s]
-            neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r)  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
-            neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r))  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
-            neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True)  # [b, 1, t_s]
-            neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
-
-            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
-            attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
-
-        w = attn.sum(2)
-        if self.use_sdp:
-            l_length = self.dp(x, x_mask, w, g=g)
-            l_length = l_length / torch.sum(x_mask)
-        else:
-            logw_ = torch.log(w + 1e-6) * x_mask
-            logw = self.dp(x, x_mask, g=g)
-            l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask)  # for averaging
-
-        # expand prior
-        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
-        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
-
-        z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
-        o = self.dec(z_slice, g=g)
-        return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-    def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
-        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
-        if self.n_speakers > 0:
-            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
-        else:
-            g = None
-
-        if self.use_sdp:
-            logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
-        else:
-            logw = self.dp(x, x_mask, g=g)
-        w = torch.exp(logw) * x_mask * length_scale
-        w_ceil = torch.ceil(w)
-        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
-        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
-        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
-        attn = commons.generate_path(w_ceil, attn_mask)
-
-        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
-        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
-
-        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
-        z = self.flow(z_p, y_mask, g=g, reverse=True)
-        o = self.dec((z * y_mask)[:,:,:max_len], g=g)
-        return o, attn, y_mask, (z, z_p, m_p, logs_p)
-
-    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
-        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
-        g_src = self.emb_g(sid_src).unsqueeze(-1)
-        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
-        z_p = self.flow(z, y_mask, g=g_src)
-        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
-        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
-        return o_hat, y_mask, (z, z_p, z_hat)
-
+    def __init__(self,
+                 n_vocab,
+                 spec_channels,
+                 segment_size,
+                 inter_channels,
+                 hidden_channels,
+                 filter_channels,
+                 n_heads,
+                 n_layers,
+                 kernel_size,
+                 p_dropout,
+                 resblock,
+                 resblock_kernel_sizes,
+                 resblock_dilation_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 upsample_kernel_sizes,
+                 n_speakers=0,
+                 gin_channels=0,
+                 use_sdp=True,
+                 **kwargs):
+
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.n_speakers = n_speakers
+        self.gin_channels = gin_channels
+
+        self.use_sdp = use_sdp
+
+        self.enc_p = TextEncoder(n_vocab,
+                                 inter_channels,
+                                 hidden_channels,
+                                 filter_channels,
+                                 n_heads,
+                                 n_layers,
+                                 kernel_size,
+                                 p_dropout)
+        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
+                             upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
+                                      gin_channels=gin_channels)
+        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+
+        if use_sdp:
+            self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
+        else:
+            self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
+
+        if n_speakers > 1:
+            self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+    def forward(self, x, x_lengths, y, y_lengths, sid=None):
+
+        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+        if self.n_speakers > 1:
+            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+        else:
+            g = None
+
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+
+        with torch.no_grad():
+            # negative cross-entropy
+            s_p_sq_r = torch.exp(-2 * logs_p)  # [b, d, t]
+            neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True)  # [b, 1, t_s]
+            neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2),
+                                     s_p_sq_r)  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
+            neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r))  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
+            neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True)  # [b, 1, t_s]
+            neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
+
+            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+            attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
+
+        w = attn.sum(2)
+        if self.use_sdp:
+            l_length = self.dp(x, x_mask, w, g=g)
+            l_length = l_length / torch.sum(x_mask)
+        else:
+            logw_ = torch.log(w + 1e-6) * x_mask
+            logw = self.dp(x, x_mask, g=g)
+            l_length = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum(x_mask)  # for averaging
+
+        # expand prior
+        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
+        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
+
+        z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
+        o = self.dec(z_slice, g=g)
+        return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
+        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+        if self.n_speakers > 1:
+            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+        else:
+            g = None
+
+        if self.use_sdp:
+            logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
+        else:
+            logw = self.dp(x, x_mask, g=g)
+        w = torch.exp(logw) * x_mask * length_scale
+        w_ceil = torch.ceil(w)
+        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
+        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+        attn = commons.generate_path(w_ceil, attn_mask)
+
+        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
+        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1,
+                                                                                 2)  # [b, t', t], [b, t, d] -> [b, d, t']
+
+        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+        z = self.flow(z_p, y_mask, g=g, reverse=True)
+        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
+        return o, attn, y_mask, (z, z_p, m_p, logs_p)
+
+    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
+        assert self.n_speakers > 1, "n_speakers have to be larger than 1."
+        g_src = self.emb_g(sid_src).unsqueeze(-1)
+        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+        z_p = self.flow(z, y_mask, g=g_src)
+        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
+        return o_hat, y_mask, (z, z_p, z_hat)
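
The new `n_vocab != 0` guard in TextEncoder is what lets a single models.py serve both model types: with `n_vocab=0` no token embedding is created and `forward` treats its input as precomputed frame-level features rather than symbol ids, which is how `model.infer(units, ...)` in app.py can consume HuBERT soft units. A minimal sketch of that path, assuming the repo's models.py is importable, a 256-dimensional unit feature (the size of bshall/hubert soft units), and otherwise illustrative hyperparameters not taken from any shipped config:

```python
# Hypothetical n_vocab=0 instantiation; hyperparameters are illustrative only.
import torch
from models import TextEncoder  # assumes this repo is on the import path

enc = TextEncoder(n_vocab=0, out_channels=192, hidden_channels=256,
                  filter_channels=768, n_heads=2, n_layers=6,
                  kernel_size=3, p_dropout=0.1)
units = torch.randn(1, 100, 256)          # [batch, frames, hidden]: no embedding lookup
lengths = torch.LongTensor([units.size(1)])
x, m, logs, x_mask = enc(units, lengths)  # x: [1, 256, 100] after the transpose
```

Note that `hidden_channels` must match the unit dimension in this mode, since the embedding projection that would normally map ids to `hidden_channels` is skipped.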
saved_model/10/config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06b3c77565155ac550a3264e24f5c59627c6f8e4f9953a5f2423f6d375823e52
+size 1228
saved_model/10/cover.jpg ADDED

Git LFS Details

  • SHA256: cb5d83e14c8cd74a20185d8b9535f9a1699a15057f7ebce87a32f32f5aad94ba
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
saved_model/10/model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d7d3dc42ad38c3479b41c1060c442ba33018069be637e664fefafb4bb4ad764
+size 220972879
saved_model/11/config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2aa2128f54f61bf1b01951f7d2e0e2d5a835a9750a4a9ef8b4854ac25324823
+size 1187
saved_model/11/cover.jpg ADDED

Git LFS Details

  • SHA256: 5ce5e75924dca82bb7cddbe9715f1254fe7aa0fc068085f72ff893c9324c586e
  • Pointer size: 130 Bytes
  • Size of remote file: 30.2 kB
saved_model/11/model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56d55e4672c5f335ebae30728529e5efb8a9c3975a9b63e6590454ef8769ae70
+size 203264375
saved_model/8/config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4304293bb85d90daa3b5fa2dc3a35ce0842f0282f54298df68103932fee0e9f2
+size 1873
saved_model/8/cover.jpg ADDED

Git LFS Details

  • SHA256: 090dd3b832004b22ac58c54075890f7484fe6989a9ce91d234af35f1adf27e0a
  • Pointer size: 130 Bytes
  • Size of remote file: 37.2 kB
saved_model/8/model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70ba2e6a192f836d58dcb4e36b8d5dde2e2a06c88d03dda107c07b9aa35ee4db
+size 158902605
saved_model/9/config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2040ad22b30868bb031f4d2e2af91fdcfe057753f68e8cb135be5459374cba73
+size 816
saved_model/9/cover.jpg ADDED

Git LFS Details

  • SHA256: 090dd3b832004b22ac58c54075890f7484fe6989a9ce91d234af35f1adf27e0a
  • Pointer size: 130 Bytes
  • Size of remote file: 37.2 kB
saved_model/9/model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20b38cc55191ec02c2809e80d758ff0d56bd44760841704feb9921aa58a4d9de
+size 203264375
saved_model/info.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:43443cfa806bfad9cbb429c96ba440913d01da1bdff63daa564c824037e8070b
-size 1015
+oid sha256:33b7a2042589990eb609c4e87044b7d5d6d80da206c88f54b70175ce0d2a535c
+size 1616