ymzhang319 committed • Commit e562afd
Parent(s): 3af5e96

update app.py

app.py CHANGED
@@ -130,7 +130,7 @@ class FoleyController:
         prompt_textbox,
         negative_prompt_textbox,
         ip_adapter_scale,
-
+        temporal_scale,
         sampler_dropdown,
         sample_step_slider,
         cfg_scale_slider,
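This hunk threads a new temporal_scale argument into FoleyController.foley. A minimal sketch of the resulting signature, assuming the parameter order mirrors the inputs list wired to generate_button.click() below (init_img first, seed_textbox last); the surrounding code is not shown in the diff.

# Sketch only: parameter order inferred from the click() inputs list below;
# init_img and seed_textbox are not visible in this hunk.
class FoleyController:
    def foley(
        self,
        input_video,              # bound to the init_img video component
        prompt_textbox,
        negative_prompt_textbox,
        ip_adapter_scale,
        temporal_scale,           # new in this commit
        sampler_dropdown,
        sample_step_slider,
        cfg_scale_slider,
        seed_textbox,
    ):
        ...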
@@ -154,7 +154,7 @@ class FoleyController:
         if seed_textbox != "":
             torch.manual_seed(int(seed_textbox))
             generator.manual_seed(int(seed_textbox))
-        max_frame_nums =
+        max_frame_nums = 150
         frames, duration = read_frames_with_moviepy(input_video, max_frame_nums=max_frame_nums)
         if duration >= 10:
             duration = 10
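The frame budget is now capped at 150. read_frames_with_moviepy is the repository's own helper; a hypothetical sketch of its contract, assuming a moviepy-based reader that subsamples evenly when a video exceeds the cap:

# Hypothetical sketch of the helper's contract; the real implementation
# ships with the repository and may differ.
import numpy as np
from moviepy.editor import VideoFileClip

def read_frames_with_moviepy(video_path, max_frame_nums=None):
    clip = VideoFileClip(video_path)
    duration = clip.duration                     # seconds
    frames = np.array(list(clip.iter_frames()))  # (T, H, W, 3), RGB
    if max_frame_nums is not None and len(frames) > max_frame_nums:
        # Evenly subsample so long videos stay within the frame budget.
        idx = np.linspace(0, len(frames) - 1, max_frame_nums).astype(int)
        frames = frames[idx]
    return frames, duration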
@@ -169,7 +169,9 @@ class FoleyController:
         time_condition = time_condition + [-1] * (1024 - len(time_condition))
         # w -> b c h w
         time_condition = torch.FloatTensor(time_condition).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(1, 1, 256, 1)
-
+
+        # Note that clip need fewer frames
+        frames = frames[::10]
         images = self.image_processor(images=frames, return_tensors="pt").to(device)
         image_embeddings = self.image_encoder(**images).image_embeds
         image_embeddings = torch.mean(image_embeddings, dim=0, keepdim=True).unsqueeze(0).unsqueeze(0)
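The new frames[::10] keeps only every tenth frame before CLIP encoding, since the semantic embedding is averaged over frames anyway. A minimal sketch of the embedding step, assuming a transformers CLIP vision encoder (the .image_embeds attribute matches CLIPVisionModelWithProjection; the checkpoint name and dummy frames here are assumptions, not taken from the repository):

# Sketch under assumptions: dummy frames stand in for the decoded video.
import numpy as np
import torch
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

device = "cuda" if torch.cuda.is_available() else "cpu"
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14").to(device)

frames = [np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8) for _ in range(150)]
frames = frames[::10]  # CLIP only needs a sparse sample of the video frames
inputs = image_processor(images=frames, return_tensors="pt").to(device)
with torch.no_grad():
    embeds = image_encoder(**inputs).image_embeds  # (num_sampled_frames, proj_dim)
# Average across frames, then add broadcast dims, as in the hunk above.
image_embeddings = embeds.mean(dim=0, keepdim=True).unsqueeze(0).unsqueeze(0)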
@@ -253,18 +255,20 @@ with gr.Blocks(css=css) as demo:
             negative_prompt_textbox = gr.Textbox(value=N_PROMPT, label="Negative prompt", lines=1)
 
             with gr.Row():
-
-
-
-
-                    )
-
-
-
-
-
-
-
+                ip_adapter_scale = gr.Slider(label="Visual Content Scale", value=1.0, minimum=0, maximum=1)
+                temporal_scale = gr.Slider(label="Temporal Align Scale", value=0.2, minimum=0., maximum=1.0)
+
+            with gr.Accordion("Sampling Settings", open=False):
+                with gr.Row():
+                    sampler_dropdown = gr.Dropdown(
+                        label="Sampling method",
+                        choices=list(scheduler_dict.keys()),
+                        value=list(scheduler_dict.keys())[0],
+                    )
+                    sample_step_slider = gr.Slider(
+                        label="Sampling steps", value=25, minimum=10, maximum=100, step=1
+                    )
+                    cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20)
 
             with gr.Row():
                 seed_textbox = gr.Textbox(label="Seed", value=42)
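The sampling dropdown is populated from scheduler_dict, which is defined elsewhere in app.py and not shown in this diff. A hypothetical sketch of its shape, assuming the common diffusers pattern of mapping a UI name to a scheduler class ('DDIM' is grounded in the examples below; the other entries are assumptions):

# Hypothetical sketch; the real scheduler_dict lives in app.py.
from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler

scheduler_dict = {
    "DDIM": DDIMScheduler,       # used by the cached examples below
    "Euler": EulerDiscreteScheduler,
    "PNDM": PNDMScheduler,
}

# The dropdown then uses choices=list(scheduler_dict.keys()) and defaults
# to the first key, as in the hunk above.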
@@ -273,7 +277,12 @@ with gr.Blocks(css=css) as demo:
 
             generate_button = gr.Button(value="Generate", variant="primary")
 
-
+        with gr.Column():
+            result_video = gr.Video(label="Generated Audio", interactive=False)
+            gr.Markdown('**Tips**: <br> \
+            1. With strong temporal visual cues in input video, you can scale up the **Temporal Align Scale**. <br>\
+            2. **Visual content scale** is the level of semantic alignment with visual content. \
+            ')
 
         generate_button.click(
             fn=controller.foley,
@@ -282,7 +291,7 @@ with gr.Blocks(css=css) as demo:
             prompt_textbox,
             negative_prompt_textbox,
             ip_adapter_scale,
-
+            temporal_scale,
             sampler_dropdown,
             sample_step_slider,
             cfg_scale_slider,
@@ -292,13 +301,22 @@ with gr.Blocks(css=css) as demo:
         )
 
         gr.Examples(
-            examples= [
-                ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
-                ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
-                ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
-                ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+            # examples= [
+            #     ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
+            #     ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
+            #     ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
+            #     ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+            # ],
+            examples=[
+                ['examples/input/case1.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 33817921],
+                ['examples/input/case3.mp4', '', '', 1.0, 0.2,'DDIM', 25, 7.5, 94667578],
+                ['examples/input/case5.mp4', '', '', 0.75, 0.2,'DDIM', 25, 7.5, 92890876],
+                ['examples/input/case6.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 77015909],
             ],
-            inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+            inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,temporal_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+            cache_examples=True,
+            outputs=[result_video],
+            fn=controller.foley,
        )
 
demo.queue(10)
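Beyond the new temporal_scale column in each example row, this hunk turns on example caching. With cache_examples=True, gr.Examples also needs fn and outputs so Gradio can run the function on every example once at startup and serve the stored results, which is why all three keyword arguments land in the same commit. A minimal self-contained sketch with a hypothetical stand-in function (the example file must exist on disk for caching to succeed):

# Sketch with a hypothetical fake_foley; the real app passes controller.foley.
import gradio as gr

def fake_foley(video, prompt):
    return video  # stand-in for the real audio generation

with gr.Blocks() as demo:
    init_img = gr.Video(label="Input")
    prompt_textbox = gr.Textbox(label="Prompt")
    result_video = gr.Video(label="Generated Audio")
    gr.Examples(
        examples=[["examples/input/case1.mp4", ""]],
        inputs=[init_img, prompt_textbox],
        outputs=[result_video],
        fn=fake_foley,
        cache_examples=True,  # precomputes each example's output at launch
    )

demo.launch()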