Spaces:
Build error
Build error
Rongjiehuang
committed on
Commit
•
1400424
1
Parent(s):
3075f9b
update huggingface
Browse files- README.md +3 -3
- app.py +1 -1
- audio_foundation_models.py +7 -7
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
title: AudioGPT
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.23.0
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
title: AudioGPT
|
3 |
+
emoji: 🚀
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: pink
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.23.0
|
8 |
app_file: app.py
|
app.py
CHANGED
@@ -6,7 +6,7 @@ from audio_foundation_models import *
|
|
6 |
import gradio as gr
|
7 |
|
8 |
_DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
|
9 |
-
_DESCRIPTION += '\n<p>This is a demo to the work
|
10 |
_DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
|
11 |
|
12 |
|
|
|
6 |
import gradio as gr
|
7 |
|
8 |
_DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
|
9 |
+
_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'
|
10 |
_DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
|
11 |
|
12 |
|
audio_foundation_models.py
CHANGED
@@ -212,7 +212,7 @@ class I2A:
|
|
212 |
image = Image.open(image)
|
213 |
image = self.sampler.model.cond_stage_model.preprocess(image).unsqueeze(0)
|
214 |
image_embedding = self.sampler.model.cond_stage_model.forward_img(image)
|
215 |
-
c = image_embedding.repeat(n_samples, 1, 1)
|
216 |
shape = [self.sampler.model.first_stage_model.embed_dim, H//8, W//8] # (z_dim, 80//2^x, 848//2^x)
|
217 |
samples_ddim, _ = self.sampler.sample(S=ddim_steps,
|
218 |
conditioning=c,
|
@@ -384,7 +384,7 @@ class Inpaint:
|
|
384 |
sr, ori_wav = wavfile.read(input_audio_path)
|
385 |
print("gen_mel")
|
386 |
print(sr,ori_wav.shape,ori_wav)
|
387 |
-
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
|
388 |
if len(ori_wav.shape)==2:# stereo
|
389 |
ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
|
390 |
print(sr,ori_wav.shape,ori_wav)
|
@@ -405,7 +405,7 @@ class Inpaint:
|
|
405 |
print("gen_mel_audio")
|
406 |
print(sr,ori_wav.shape,ori_wav)
|
407 |
|
408 |
-
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
|
409 |
if len(ori_wav.shape)==2:# stereo
|
410 |
ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
|
411 |
print(sr,ori_wav.shape,ori_wav)
|
@@ -454,11 +454,11 @@ class Inpaint:
|
|
454 |
torch.set_grad_enabled(False)
|
455 |
mel_img = Image.open(mel_and_mask['image'])
|
456 |
mask_img = Image.open(mel_and_mask["mask"])
|
457 |
-
show_mel = np.array(mel_img.convert("L"))/255
|
458 |
mask = np.array(mask_img.convert("L"))/255
|
459 |
mel_bins,mel_len = 80,848
|
460 |
-
input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]
|
461 |
-
mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)
|
462 |
print(mask.shape,input_mel.shape)
|
463 |
with torch.no_grad():
|
464 |
batch = self.make_batch_sd(input_mel,mask,num_samples=1)
|
@@ -487,7 +487,7 @@ class Inpaint:
|
|
487 |
"representing the audio_path. " )
|
488 |
|
489 |
def inference(self, input_audio_path):
|
490 |
-
crop_len = 500
|
491 |
crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
|
492 |
color_mel = self.cmap_transform(crop_mel)
|
493 |
image = Image.fromarray((color_mel*255).astype(np.uint8))
|
|
|
212 |
image = Image.open(image)
|
213 |
image = self.sampler.model.cond_stage_model.preprocess(image).unsqueeze(0)
|
214 |
image_embedding = self.sampler.model.cond_stage_model.forward_img(image)
|
215 |
+
c = image_embedding.repeat(n_samples, 1, 1)
|
216 |
shape = [self.sampler.model.first_stage_model.embed_dim, H//8, W//8] # (z_dim, 80//2^x, 848//2^x)
|
217 |
samples_ddim, _ = self.sampler.sample(S=ddim_steps,
|
218 |
conditioning=c,
|
|
|
384 |
sr, ori_wav = wavfile.read(input_audio_path)
|
385 |
print("gen_mel")
|
386 |
print(sr,ori_wav.shape,ori_wav)
|
387 |
+
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
|
388 |
if len(ori_wav.shape)==2:# stereo
|
389 |
ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
|
390 |
print(sr,ori_wav.shape,ori_wav)
|
|
|
405 |
print("gen_mel_audio")
|
406 |
print(sr,ori_wav.shape,ori_wav)
|
407 |
|
408 |
+
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
|
409 |
if len(ori_wav.shape)==2:# stereo
|
410 |
ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
|
411 |
print(sr,ori_wav.shape,ori_wav)
|
|
|
454 |
torch.set_grad_enabled(False)
|
455 |
mel_img = Image.open(mel_and_mask['image'])
|
456 |
mask_img = Image.open(mel_and_mask["mask"])
|
457 |
+
show_mel = np.array(mel_img.convert("L"))/255
|
458 |
mask = np.array(mask_img.convert("L"))/255
|
459 |
mel_bins,mel_len = 80,848
|
460 |
+
input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]
|
461 |
+
mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)
|
462 |
print(mask.shape,input_mel.shape)
|
463 |
with torch.no_grad():
|
464 |
batch = self.make_batch_sd(input_mel,mask,num_samples=1)
|
|
|
487 |
"representing the audio_path. " )
|
488 |
|
489 |
def inference(self, input_audio_path):
|
490 |
+
crop_len = 500
|
491 |
crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
|
492 |
color_mel = self.cmap_transform(crop_mel)
|
493 |
image = Image.fromarray((color_mel*255).astype(np.uint8))
|