Update audio_foundation_models.py
audio_foundation_models.py  +27 -10  CHANGED
@@ -190,7 +190,7 @@ class I2A:
 
     @prompts(name="Generate Audio From The Image",
              description="useful for when you want to generate an audio "
-                         "based on an image."
+                         "based on an image. "
                          "The input to this tool should be a string, "
                          "representing the image_path. ")
 
@@ -237,6 +237,23 @@ class I2A:
         print(f"Processed I2a.run, image_filename: {image}, audio_filename: {audio_filename}")
         return audio_filename
 
+class TTS:
+    def __init__(self, device=None):
+        self.inferencer = TTSInference(device)
+
+    @prompts(name="Synthesize Speech Given the User Input Text",
+             description="useful for when you want to convert a user input text into speech audio it saved it to a file."
+                         "The input to this tool should be a string, "
+                         "representing the text used to be converted to speech.")
+
+    def inference(self, text):
+        global temp_audio_filename
+        inp = {"text": text}
+        out = self.inferencer.infer_once(inp)
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        soundfile.write(audio_filename, out, samplerate = 22050)
+        return audio_filename
+
 class T2S:
     def __init__(self, device= None):
         if device is None:
@@ -394,14 +411,6 @@ class Inpaint:
         input_wav = ori_wav[:input_len]
         mel = TRANSFORMS_16000(input_wav)
         return mel
-    def show_mel_fn(self, input_audio_path):
-        crop_len = 500 # the full mel cannot be showed due to gradio's Image bug when using tool='sketch'
-        crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
-        color_mel = self.cmap_transform(crop_mel)
-        image = Image.fromarray((color_mel*255).astype(np.uint8))
-        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
-        image.save(image_filename)
-        return image_filename
     def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
         model = self.sampler.model
 
@@ -432,7 +441,7 @@ class Inpaint:
         inapint_wav = self.vocoder.vocode(inpainted)
 
         return inpainted, inapint_wav
-    def
+    def predict(self, input_audio, mel_and_mask, seed = 55, ddim_steps = 100):
         SAMPLE_RATE = 16000
         torch.set_grad_enabled(False)
         mel_img = Image.open(mel_and_mask['image'])
@@ -462,6 +471,14 @@ class Inpaint:
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         soundfile.write(audio_filename, gen_wav, samplerate = 16000)
         return image_filename, audio_filename
+    def inference(self, input_audio_path):
+        crop_len = 500 # the full mel cannot be showed due to gradio's Image bug when using tool='sketch'
+        crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
+        color_mel = self.cmap_transform(crop_mel)
+        image = Image.fromarray((color_mel*255).astype(np.uint8))
+        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
+        image.save(image_filename)
+        return image_filename
 
 class ASR:
     def __init__(self, device):
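A minimal usage sketch of the newly added TTS tool, for orientation only: it assumes the module-level imports of audio_foundation_models.py are in place, that an audio/ directory exists relative to the working directory, and the device string is just an example.

    from audio_foundation_models import TTS  # assumes this module layout

    tts_tool = TTS(device="cuda:0")          # device=None is also accepted by __init__
    wav_path = tts_tool.inference("Hello from the new text-to-speech tool.")
    print(wav_path)                          # e.g. audio/1a2b3c4d.wav, written by soundfile at 22050 Hz

The tool wraps TTSInference.infer_once and returns the path of the generated .wav file, which matches the path-passing convention used by the other tool classes in this file.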
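For the reshuffled Inpaint methods, a minimal sketch of the intended round trip. Names such as inpainter and the hand-built mel_and_mask dict are illustrative, not part of the diff; in the app the dict comes from a gradio Image(tool='sketch') component.

    inpainter = Inpaint(device="cuda:0")              # assumed instance; the constructor is not shown in these hunks
    mel_png = inpainter.inference("audio/input.wav")  # saves a colorized mel crop (500 frames) under image/

    # The user paints a mask over mel_png in the UI; predict() reads the masked
    # image back (it opens mel_and_mask['image']) together with the original audio.
    # The 'mask' key mirrors gradio's sketch output; only 'image' is visible in this hunk.
    mel_and_mask = {"image": mel_png, "mask": "image/mask.png"}
    image_path, audio_path = inpainter.predict(
        input_audio="audio/input.wav",
        mel_and_mask=mel_and_mask,
        seed=55,                                      # defaults from the restored signature
        ddim_steps=100,
    )

Here inference() is the former show_mel_fn renamed, and predict() restores the signature that had been truncated to a bare "def" (a syntax error) in the previous revision.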