Update audio_foundation_models.py
audio_foundation_models.py  CHANGED  (+33 -29)
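This commit moves each tool's @prompts decorator off the internal helper methods (T2A.txt2audio, I2A.img2audio, Inpaint.make_batch_sd) and onto the corresponding inference methods, hoists T2S.set_model_hparams above its decorator, and removes the `global temp_audio_filename` line from TTS.inference. The decorator's own definition is not part of the changed hunks; as a point of reference, a minimal sketch of a decorator consistent with how @prompts(name=..., description=...) is used below would be (an assumption, not the file's actual definition):

def prompts(name, description):
    # Attach tool metadata to the decorated method so a registry can find it later.
    def decorator(func):
        func.name = name
        func.description = description
        return func
    return decorator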
@@ -135,11 +135,6 @@ class T2A:
         self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/text-to-audio/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
 
-    @prompts(name="Generate Audio From User Input Text",
-             description="useful for when you want to generate an audio "
-                         "from a user input text and it saved it to a file."
-                         "The input to this tool should be a string, "
-                         "representing the text used to generate audio.")
 
     def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         SAMPLE_RATE = 16000
@@ -168,6 +163,12 @@ class T2A:
         best_wav = select_best_audio(text, wav_list)
         return best_wav
 
+    @prompts(name="Generate Audio From User Input Text",
+             description="useful for when you want to generate an audio "
+                         "from a user input text and it saved it to a file."
+                         "The input to this tool should be a string, "
+                         "representing the text used to generate audio.")
+
     def inference(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         melbins,mel_len = 80,624
         with torch.no_grad():
@@ -188,11 +189,6 @@ class I2A:
         self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
 
-    @prompts(name="Generate Audio From The Image",
-             description="useful for when you want to generate an audio "
-                         "based on an image. "
-                         "The input to this tool should be a string, "
-                         "representing the image_path. ")
 
     def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         SAMPLE_RATE = 16000
@@ -224,6 +220,13 @@ class I2A:
         wav_list.append((SAMPLE_RATE,wav))
         best_wav = wav_list[0]
         return best_wav
+
+    @prompts(name="Generate Audio From The Image",
+             description="useful for when you want to generate an audio "
+                         "based on an image. "
+                         "The input to this tool should be a string, "
+                         "representing the image_path. ")
+
     def inference(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         melbins,mel_len = 80,624
         with torch.no_grad():
@@ -247,7 +250,6 @@ class TTS:
                      "representing the text used to be converted to speech.")
 
     def inference(self, text):
-        global temp_audio_filename
         inp = {"text": text}
         out = self.inferencer.infer_once(inp)
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
@@ -270,6 +272,11 @@ class T2S:
             'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
         }
 
+
+    def set_model_hparams(self):
+        set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
+        self.hp = hp
+
     @prompts(name="Generate Singing Voice From User Input Text, Note and Duration Sequence",
              description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) "
                          "and save it to a file."
@@ -278,11 +285,7 @@ class T2S:
                          "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
                          "The input to this tool should be a comma seperated string of three, "
                          "representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided. ")
-
-    def set_model_hparams(self):
-        set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
-        self.hp = hp
-
+
     def inference(self, inputs):
         self.set_model_hparams()
         val = inputs.split(",")
@@ -311,13 +314,6 @@ class TTS_OOD:
         self.set_model_hparams()
         self.pipe = GenerSpeechInfer(self.hp, device)
 
-    @prompts(name="Style Transfer",
-             description="useful for when you want to generate speech samples with styles "
-                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
-                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
-                         "The input to this tool should be a comma seperated string of two, "
-                         "representing reference audio path and input text. " )
-
     def set_model_hparams(self):
         set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
         f0_stats_fn = f'{hp["binary_data_dir"]}/train_f0s_mean_std.npy'
@@ -328,6 +324,13 @@ class TTS_OOD:
         hp['emotion_encoder_path'] = 'checkpoints/Emotion_encoder.pt'
         self.hp = hp
 
+    @prompts(name="Style Transfer",
+             description="useful for when you want to generate speech samples with styles "
+                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
+                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
+                         "The input to this tool should be a comma seperated string of two, "
+                         "representing reference audio path and input text. " )
+
     def inference(self, inputs):
         self.set_model_hparams()
         key = ['ref_audio', 'text']
@@ -349,12 +352,6 @@ class Inpaint:
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
         self.cmap_transform = matplotlib.cm.viridis
 
-    @prompts(name="Audio Inpainting",
-             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
-                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
-                         "The input to this tool should be a string, "
-                         "representing the audio_path. " )
-
     def make_batch_sd(self, mel, mask, num_samples=1):
 
         mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
@@ -471,6 +468,13 @@ class Inpaint:
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         soundfile.write(audio_filename, gen_wav, samplerate = 16000)
         return image_filename, audio_filename
+
+    @prompts(name="Audio Inpainting",
+             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
+                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
     def inference(self, input_audio_path):
         crop_len = 500 # the full mel cannot be showed due to gradio's Image bug when using tool='sketch'
         crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
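Why the placement matters: in this style of codebase the agent typically discovers tools by scanning instances for methods that carry the name/description attributes attached by @prompts, and the decorated method becomes the tool's callable. With the decorators now sitting on inference, such a scan would expose, for example, T2A.inference (which accepts the user-facing text string) rather than the lower-level txt2audio helper. A hedged sketch of that kind of scan, using hypothetical names (collect_tools is not code from this repository):

def collect_tools(instances):
    # Gather every bound method that @prompts has tagged with tool metadata.
    tools = []
    for instance in instances:
        for attr_name in dir(instance):
            method = getattr(instance, attr_name)
            if callable(method) and hasattr(method, "name") and hasattr(method, "description"):
                tools.append({"name": method.name,
                              "description": method.description,
                              "func": method})
    return tools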