# Spaces: Running  (page-header residue from the hosting site; not part of the program)
import torch | |
import numpy as np | |
import gradio as gr | |
from PIL import Image | |
from omegaconf import OmegaConf | |
from pathlib import Path | |
from vocoder.bigvgan.models import VocoderBigVGAN | |
from ldm.models.diffusion.ddim import DDIMSampler | |
from ldm.util import instantiate_from_config | |
# Sample rate paired with every waveform returned to the UI.
SAMPLE_RATE = 16000

# This script only runs inference, so autograd is switched off process-wide.
torch.set_grad_enabled(False)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def initialize_model(config, ckpt=None):
    """Instantiate the model described by ``config`` and wrap it in a DDIM sampler.

    Args:
        config: Path to a YAML file loadable by ``OmegaConf.load``. Its
            ``model`` node is handed to ``instantiate_from_config``.
        ckpt: Optional checkpoint path. When given, its ``"state_dict"``
            entry is loaded non-strictly (missing/unexpected keys are
            tolerated). When ``None``, no weights are loaded.

    Returns:
        A ``DDIMSampler`` wrapping the model, after the model and its
        conditioning stage have been moved to the module-level ``device``.
    """
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)

    if ckpt is None:
        print("not load state dict")
    else:
        # NOTE(security): torch.load unpickles the file, so only trusted
        # checkpoints must be passed here.
        checkpoint = torch.load(ckpt, map_location='cpu')
        model.load_state_dict(checkpoint["state_dict"], strict=False)

    model = model.to(device)
    # The conditioning stage tracks its own ``device`` attribute; keep it in
    # sync with the device the model reports.
    model.cond_stage_model.to(model.device)
    model.cond_stage_model.device = model.device
    print(model.device, device, model.cond_stage_model.device)

    return DDIMSampler(model)
# Build the sampler and vocoder once at import time; every request reuses them.
sampler = initialize_model(
    config='configs/img_to_audio/img2audio_args.yaml',
    ckpt='useful_ckpts/ta54_epoch=000216.ckpt',
)
vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w', device=device)
def img2audio(sampler, vocoder, image, seed, scale, ddim_steps, W=624, H=80):
    """Generate a single waveform conditioned on ``image``.

    Args:
        sampler: DDIM sampler whose ``.model`` exposes ``first_stage_model``,
            ``cond_stage_model``, ``get_learned_conditioning`` and
            ``decode_first_stage``.
        vocoder: Object with a ``vocode(spec)`` method that converts one
            decoded sample into a waveform.
        image: Input picture as an array accepted by ``PIL.Image.fromarray``.
        seed: Seed for the initial latent noise; coerced to ``int``.
        scale: Unconditional guidance scale. When it equals ``1.0`` no
            unconditional conditioning is computed.
        ddim_steps: Number of DDIM sampling steps.
        W: Width of the generated sample before the 8x latent downscale.
        H: Height of the generated sample before the 8x latent downscale.

    Returns:
        A ``(SAMPLE_RATE, wav)`` tuple for the one generated sample.
    """
    n_samples = 1  # only one sample per call is supported
    model = sampler.model

    # Latent shape (z_dim, H // 8, W // 8), shared by the start noise and
    # the sampler.
    latent_shape = [model.first_stage_model.embed_dim, H // 8, W // 8]

    # RandomState rejects float seeds, which a UI slider may deliver.
    prng = np.random.RandomState(int(seed))
    start_code = prng.randn(n_samples, *latent_shape)
    start_code = torch.from_numpy(start_code).to(device=device, dtype=torch.float32)

    uc = None
    if scale != 1.0:
        uc = model.get_learned_conditioning(n_samples * [""])

    image = Image.fromarray(image)
    image = model.cond_stage_model.preprocess(image).unsqueeze(0)
    image_embedding = model.cond_stage_model.forward_img(image)
    # Original author's note (translated): shape [1, 77, 1280], i.e. still
    # per-token embeddings rather than a pooled sentence embedding.
    # NOTE(review): that note reads like it came from the text pipeline;
    # confirm the shape for image embeddings.
    c = image_embedding.repeat(n_samples, 1, 1)

    samples_ddim, _ = sampler.sample(
        S=ddim_steps,
        conditioning=c,
        batch_size=n_samples,
        shape=latent_shape,
        verbose=False,
        unconditional_guidance_scale=scale,
        unconditional_conditioning=uc,
        x_T=start_code,
    )

    x_samples_ddim = model.decode_first_stage(samples_ddim)
    # Map the decoder output from [-1, 1] to [0, 1].
    x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)

    # The batch holds exactly one sample, so vocode it directly.
    wav = vocoder.vocode(x_samples_ddim[0])
    return SAMPLE_RATE, wav
def predict(image, ddim_steps, scale, seed):
    """Gradio callback: turn the uploaded image into audio.

    Args:
        image: Uploaded image as delivered by the ``gr.Image`` input, or
            ``None`` when nothing has been uploaded.
        ddim_steps: Number of DDIM steps from the slider; coerced to ``int``.
        scale: Guidance scale from the slider.
        seed: Random seed from the slider; coerced to ``int``.

    Returns:
        The ``(sample_rate, waveform)`` result of ``img2audio``.

    Raises:
        gr.Error: If no image was provided.
    """
    # Original author's note (translated): empirically the input image needs
    # to be 256x256 or 512x512 for good results. Ideally it would be resized
    # first and the output resized back, but the upstream code pads instead;
    # the reason is unknown.
    if image is None:
        # Without this guard the failure surfaces deep inside
        # Image.fromarray with an unhelpful message.
        raise gr.Error("Please upload an image before pressing Run.")

    melbins, mel_len = 80, 624
    with torch.no_grad():
        result = img2audio(
            sampler=sampler,
            vocoder=vocoder,
            image=image,
            # Sliders may deliver floats; the RNG seed and step count must
            # be integers.
            seed=int(seed),
            scale=scale,
            ddim_steps=int(ddim_steps),
            H=melbins,
            W=mel_len,
        )
    return result
# Gradio UI: inputs on the left (image, Run button, advanced sliders), the
# generated audio on the right, and clickable examples underneath.
with gr.Blocks() as demo:
    with gr.Row():
        # The demo is conditioned on an image, so the title says so.
        gr.Markdown("## Make-An-Audio: Image-to-Audio Generation")

    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Input Image: Select one image to upload")
            # The button text is set through ``value``; ``label`` is not the
            # button-text parameter.
            run_button = gr.Button(value="Run")

            with gr.Accordion("Advanced options", open=False):
                ddim_steps = gr.Slider(
                    label="Steps",
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                )
                scale = gr.Slider(
                    label="Guidance Scale:(Large => more relevant to image but the quality may drop)",
                    minimum=0.1,
                    maximum=4.0,
                    value=1.5,
                    step=0.1,
                )
                seed = gr.Slider(
                    label="Seed:Change this value (any integer number) will lead to a different generation result.",
                    minimum=0,
                    maximum=2147483647,
                    step=1,
                    value=44,
                )

        with gr.Column():
            outaudio = gr.Audio()

    # ``inputs`` must be Gradio components, not plain Python values.
    run_button.click(
        fn=predict,
        inputs=[image, ddim_steps, scale, seed],
        outputs=[outaudio],
    )

    with gr.Row():
        with gr.Column():
            gr.Examples(
                examples=[
                    ['./example_imgs/cat.png', 100, 3, 55],
                    ['./example_imgs/violin.png', 100, 3, 55],
                ],
                inputs=[image, ddim_steps, scale, seed],
                outputs=[outaudio],
            )
        with gr.Column():
            # Empty column, kept so the examples occupy half the row.
            pass

demo.launch()