Upload 5 files
- .gitignore +2 -0
- app.py +11 -9
.gitignore
ADDED
@@ -0,0 +1,2 @@
+*.pyc
+__pycache__
app.py
CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 from PIL import Image
 from omegaconf import OmegaConf
 from pathlib import Path
-from vocoder.
+from vocoder.bigvgan.models import VocoderBigVGAN
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.util import instantiate_from_config
 from wav_evaluation.models.CLAPWrapper import CLAPWrapper
@@ -29,7 +29,7 @@ def initialize_model(config, ckpt):
     return sampler
 
 sampler = initialize_model('configs/text_to_audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt')
-vocoder =
+vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w',device=device)
 clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
 
 def select_best_audio(prompt,wav_list):
@@ -52,7 +52,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     uc = None
     if scale != 1.0:
         uc = sampler.model.get_learned_conditioning(n_samples * [""])
-    c = sampler.model.get_learned_conditioning(n_samples * [prompt])
+    c = sampler.model.get_learned_conditioning(n_samples * [prompt])  # shape: [1,77,1280], i.e. still one embedding per token, not yet pooled into a sentence embedding
     shape = [sampler.model.first_stage_model.embed_dim, H//8, W//8]  # (z_dim, 80//2^x, 848//2^x)
     samples_ddim, _ = sampler.sample(S=ddim_steps,
                                      conditioning=c,
@@ -74,7 +74,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     return best_wav
 
 
-def predict(prompt, ddim_steps, num_samples, scale, seed)
+def predict(prompt, ddim_steps, num_samples, scale, seed):  # From experiments, this input_image needs to be 256x256 or 512x512 for the output to look right; it should really be resized on input and resized back on output, but they pad instead, not sure why
     melbins,mel_len = 80,624
     with torch.no_grad():
         result = txt2audio(
@@ -97,21 +97,23 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your text here
+            prompt = gr.Textbox(label="Prompt: Input your text here.")
             run_button = gr.Button(label="Run")
 
 
             with gr.Accordion("Advanced options", open=False):
                 num_samples = gr.Slider(
-                    label="
+                    label="Number of candidate audios. This controls how many candidates are generated \
+                    (e.g., generate three audios and show you the best one). A larger value usually leads to \
+                    better quality with heavier computation.", minimum=1, maximum=10, value=3, step=1)
                 # num_samples = 1
                 ddim_steps = gr.Slider(label="Steps", minimum=1,
                                        maximum=150, value=100, step=1)
                 scale = gr.Slider(
-                    label="Guidance Scale", minimum=0.1, maximum=4.0, value=1.5, step=0.1
+                    label="Guidance Scale (larger => more relevant to the text, but quality may drop)", minimum=0.1, maximum=4.0, value=1.5, step=0.1
                 )
                 seed = gr.Slider(
-                    label="Seed",
+                    label="Seed: changing this value (any integer) gives a different generation result.",
                     minimum=0,
                     maximum=2147483647,
                     step=1,
@@ -138,4 +140,4 @@ with gr.Blocks() as demo:
         with gr.Column():
             pass
 
-demo.launch()
+demo.launch(share=True)
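
The num_samples slider added in this commit describes a generate-several-candidates-and-keep-the-best flow: txt2audio produces several waveforms and select_best_audio picks the one that best matches the prompt using the CLAP model loaded above. The Space's own select_best_audio is not shown in this diff, so the snippet below is only a minimal sketch of that pattern; the similarity callable is a stand-in for whatever text-audio score CLAPWrapper actually exposes.

# Minimal sketch of the "generate N candidates, keep the best" pattern described by
# the num_samples slider. NOT the Space's actual implementation: the similarity
# function is a placeholder for a CLAP-style text-audio score.
from typing import Callable, List, Tuple
import numpy as np

Wave = Tuple[np.ndarray, int]  # (waveform, sample_rate)

def select_best_audio(prompt: str,
                      wav_list: List[Wave],
                      similarity_fn: Callable[[str, np.ndarray, int], float]) -> Wave:
    """Score every candidate against the prompt and return the highest-scoring one."""
    scores = [similarity_fn(prompt, wav, sr) for wav, sr in wav_list]
    return wav_list[int(np.argmax(scores))]

if __name__ == "__main__":
    # Usage with a dummy scorer (a real app would call into a CLAP wrapper here).
    rng = np.random.default_rng(0)
    candidates = [(rng.standard_normal(16000).astype(np.float32), 16000) for _ in range(3)]
    dummy_score = lambda prompt, wav, sr: float(np.abs(wav).mean())  # placeholder metric
    best_wav, best_sr = select_best_audio("a dog barking", candidates, dummy_score)
    print("picked candidate with mean amplitude", float(np.abs(best_wav).mean()))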
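
For context on how the widgets touched in this commit usually hang together: in a gr.Blocks app the Textbox, Sliders and Button are inert components until they are bound to a callback with run_button.click(...), and launch(share=True), as added at the end of the diff, additionally requests a temporary public URL. The snippet below is a self-contained illustration of that wiring, not the Space's real app.py; its predict() is a sine-wave stub standing in for the diffusion pipeline.

# Self-contained illustration of the Blocks wiring used by this app; predict() is a
# stub that returns a short sine wave instead of running the diffusion sampler.
import numpy as np
import gradio as gr

def predict(prompt, ddim_steps, num_samples, scale, seed):
    # Stand-in for the Space's txt2audio pipeline: returns (sample_rate, waveform).
    sr = 16000
    rng = np.random.default_rng(int(seed))
    t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
    wav = (0.1 * np.sin(2 * np.pi * rng.uniform(200, 800) * t)).astype(np.float32)
    return sr, wav

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label="Prompt: Input your text here.")
            run_button = gr.Button("Run")
            with gr.Accordion("Advanced options", open=False):
                num_samples = gr.Slider(label="Number of candidate audios",
                                        minimum=1, maximum=10, value=3, step=1)
                ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=150, value=100, step=1)
                scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=4.0, value=1.5, step=0.1)
                seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, value=42, step=1)
        with gr.Column():
            audio_out = gr.Audio(label="Generated audio", type="numpy")

    # Bind the button to the callback; inputs/outputs map positionally to predict().
    run_button.click(fn=predict,
                     inputs=[prompt, ddim_steps, num_samples, scale, seed],
                     outputs=[audio_out])

# share=True also serves the demo through a temporary public gradio.live URL,
# in addition to the local server.
demo.launch(share=True)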