lmzjms committed
Commit b63f63b
1 Parent(s): 8e99fb3

Upload 5 files

Files changed (2)
  1. .gitignore +2 -0
  2. app.py +11 -9
.gitignore ADDED
@@ -0,0 +1,2 @@
+*.pyc
+__pycache__
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 from PIL import Image
 from omegaconf import OmegaConf
 from pathlib import Path
-from vocoder.hifigan.modules import VocoderHifigan
+from vocoder.bigvgan.models import VocoderBigVGAN
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.util import instantiate_from_config
 from wav_evaluation.models.CLAPWrapper import CLAPWrapper
@@ -29,7 +29,7 @@ def initialize_model(config, ckpt):
     return sampler
 
 sampler = initialize_model('configs/text_to_audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt')
-vocoder = VocoderHifigan('vocoder/logs/hifi_0127',device=device)
+vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w',device=device)
 clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
 
 def select_best_audio(prompt,wav_list):
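The hunk above swaps the HiFi-GAN vocoder for BigVGAN; both serve the same role of turning the model's mel spectrogram output back into a waveform. A minimal sketch of that step, assuming VocoderBigVGAN exposes a vocode(mel) method and a 16 kHz native sample rate (the method name, the 'example_mel.npy' input, and the sample rate are assumptions, not confirmed by this diff):

import numpy as np
import soundfile as sf
from vocoder.bigvgan.models import VocoderBigVGAN

# Assumed API: vocode(mel) maps an [n_mels, T] spectrogram to a 1-D
# float32 waveform at the vocoder's native sample rate (assumed 16 kHz).
vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w', device='cpu')
mel = np.load('example_mel.npy')   # hypothetical [80, T] mel spectrogram
wav = vocoder.vocode(mel)
sf.write('out.wav', wav, 16000)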
@@ -52,7 +52,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     uc = None
     if scale != 1.0:
         uc = sampler.model.get_learned_conditioning(n_samples * [""])
-    c = sampler.model.get_learned_conditioning(n_samples * [prompt])
+    c = sampler.model.get_learned_conditioning(n_samples * [prompt])  # shape: [1,77,1280], i.e., still one embedding per token, not yet pooled into a sentence embedding
     shape = [sampler.model.first_stage_model.embed_dim, H//8, W//8] # (z_dim, 80//2^x, 848//2^x)
     samples_ddim, _ = sampler.sample(S=ddim_steps,
                                      conditioning=c,
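The uc/c pair drives classifier-free guidance inside the DDIM sampler: the denoiser is evaluated under both the empty-prompt conditioning and the real prompt, and the two noise predictions are extrapolated by scale. A sketch of that standard mixing rule (apply_model is the usual LDM denoiser entry point; this is the generic formula, not code copied from this repo's DDIMSampler):

import torch

def guided_noise(model, x_t, t, c, uc, scale):
    # Classifier-free guidance: push the conditional noise prediction
    # away from the unconditional one by a factor of `scale`.
    e_cond = model.apply_model(x_t, t, c)
    e_uncond = model.apply_model(x_t, t, uc)
    return e_uncond + scale * (e_cond - e_uncond)

With scale=1.0 this reduces to the plain conditional prediction, which is why the code above skips computing uc in that case.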
@@ -74,7 +74,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     return best_wav
 
 
-def predict(prompt, ddim_steps, num_samples, scale, seed):
+def predict(prompt, ddim_steps, num_samples, scale, seed):  # Empirically, input_image must be 256x256 or 512x512 for the results to look right; it should really be resized on input and the output resized back, but they pad instead, for unknown reasons
     melbins,mel_len = 80,624
     with torch.no_grad():
         result = txt2audio(
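txt2audio returns best_wav via select_best_audio, which uses clap_model to rerank the num_samples candidates against the prompt. A sketch of that reranking under assumed CLAPWrapper method names (get_text_embeddings and get_audio_embeddings are guesses; only the CLAPWrapper constructor appears in this diff):

import torch
import torch.nn.functional as F

def select_best_audio_sketch(prompt, wav_paths, clap_model):
    # Score each candidate by text-audio cosine similarity, keep the argmax.
    text_emb = clap_model.get_text_embeddings([prompt])  # assumed method
    scores = []
    for path in wav_paths:
        audio_emb = clap_model.get_audio_embeddings([path], resample=True)  # assumed method
        scores.append(F.cosine_similarity(audio_emb, text_emb).item())
    return wav_paths[int(torch.tensor(scores).argmax())]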
@@ -97,21 +97,23 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your text here:")
+            prompt = gr.Textbox(label="Prompt: Input your text here.")
             run_button = gr.Button(label="Run")
 
 
             with gr.Accordion("Advanced options", open=False):
                 num_samples = gr.Slider(
-                    label="Candidates", minimum=1, maximum=10, value=3, step=1)
+                    label="Number of candidates. This controls how many audios are generated \
+                    (e.g., generate three audios and show you the best one). A larger value usually leads to \
+                    better quality but heavier computation.", minimum=1, maximum=10, value=3, step=1)
                 # num_samples = 1
                 ddim_steps = gr.Slider(label="Steps", minimum=1,
                                        maximum=150, value=100, step=1)
                 scale = gr.Slider(
-                    label="Guidance Scale", minimum=0.1, maximum=4.0, value=1.5, step=0.1
+                    label="Guidance scale (larger => more relevant to the text, but quality may drop)", minimum=0.1, maximum=4.0, value=1.5, step=0.1
                 )
                 seed = gr.Slider(
-                    label="Seed",
+                    label="Seed: changing this value (any integer) leads to a different generation result.",
                     minimum=0,
                     maximum=2147483647,
                     step=1,
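For readers unfamiliar with the gr.Blocks pattern used here, a self-contained toy version of this UI wiring, with a stub in place of the real predict (fake_predict and the layout details are illustrative, not from the repo):

import gradio as gr

def fake_predict(prompt, ddim_steps, num_samples, scale, seed):
    # Stand-in for the real predict(): just echoes the settings back.
    return f"{prompt!r} | steps={ddim_steps} candidates={num_samples} scale={scale} seed={seed}"

with gr.Blocks() as toy:
    prompt = gr.Textbox(label="Prompt: Input your text here.")
    run_button = gr.Button("Run")
    with gr.Accordion("Advanced options", open=False):
        num_samples = gr.Slider(label="Number of candidates", minimum=1, maximum=10, value=3, step=1)
        ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=150, value=100, step=1)
        scale = gr.Slider(label="Guidance scale", minimum=0.1, maximum=4.0, value=1.5, step=0.1)
        seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, value=42, step=1)
    result = gr.Textbox(label="Result")
    run_button.click(fn=fake_predict,
                     inputs=[prompt, ddim_steps, num_samples, scale, seed],
                     outputs=result)

toy.launch()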
@@ -138,4 +140,4 @@ with gr.Blocks() as demo:
     with gr.Column():
         pass
 
-demo.launch()
+demo.launch(share=True)
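share=True additionally opens a temporary public Gradio share link on top of the local server, which is handy for quick demos. A local-only alternative using Gradio's standard launch parameters, for reference:

demo.launch(server_name="0.0.0.0", server_port=7860)  # LAN-reachable, no public share link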