Update app.py
app.py CHANGED
@@ -14,6 +14,7 @@ import torch
 
 from spectro import wav_bytes_from_spectrogram_image
 from diffusers import StableDiffusionPipeline
+from diffusers import EulerAncestralDiscreteScheduler
 
 import io
 from os import path
@@ -38,8 +39,10 @@ tips = {"en": "Tips: The input text will be translated into English for generati
 
 count = 0
 
+
 model_id = "runwayml/stable-diffusion-v1-5"
-pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+eulera = EulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, scheduler=eulera)
 pipe = pipe.to("cuda")
 
 model_id2 = "riffusion/riffusion-model-v1"
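The new lines build EulerAncestralDiscreteScheduler from hand-copied beta values. For reference, a minimal sketch (assuming a diffusers release that exposes scheduler configs) that derives the same sampler from the checkpoint's own config instead, so the betas don't need to be hard-coded:

```python
import torch
from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# from_config reuses the beta schedule the model was trained with,
# so the explicit beta_start/beta_end arguments become unnecessary.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")
```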
@@ -60,23 +63,23 @@ def translate_language(text_prompts):
         text_prompts = language_translation_model.translate(text_prompts, language_code, 'en')
     except Exception as e:
         error_text = str(e)
-        return {status_text:error_text, language_tips_text:gr.update(visible=False)}
+        return {status_text:error_text, language_tips_text:gr.update(visible=False), translated_language:text_prompts, trigger_component: gr.update(value=count, visible=False)}
     if language_code in tips:
         tips_text = tips[language_code]
     else:
         tips_text = tips['en']
-    if language_code == 'en':
+    if language_code == 'en':
         return {language_tips_text:gr.update(visible=False), translated_language:text_prompts, trigger_component: gr.update(value=count, visible=False)}
     else:
         return {language_tips_text:gr.update(visible=True, value=tips_text), translated_language:text_prompts, trigger_component: gr.update(value=count, visible=False)}
 
 
 
-def get_result(text_prompts, style_indx, musicAI_indx):
+def get_result(text_prompts, style_indx, musicAI_indx, duration):
     style = style_list_EN[style_indx]
     prompt = style + "," + text_prompts
 
-    sdresult = pipe(prompt)
+    sdresult = pipe(prompt, negative_prompt = "out of focus, scary, creepy, evil, disfigured, missing limbs, ugly, gross, missing fingers", num_inference_steps=50, guidance_scale=7, width=576, height=576)
     image_output = sdresult.images[0] if not sdresult.nsfw_content_detected[0] else Image.open("nsfw_placeholder.jpg")
 
     print("Generated image with prompt " + prompt)
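The widened pipe() call returns a StableDiffusionPipelineOutput: images holds the PIL images and nsfw_content_detected holds the per-image safety-checker flags that the next line consults. A hedged sketch of the same call pattern, assuming the pipe built earlier in the file:

```python
from PIL import Image

def generate(prompt: str) -> Image.Image:
    # Parameters mirror the new pipe() call in the hunk above.
    result = pipe(
        prompt,
        negative_prompt="out of focus, scary, creepy, evil, disfigured, missing limbs, ugly, gross, missing fingers",
        num_inference_steps=50,
        guidance_scale=7,
        width=576,
        height=576,
    )
    # Fall back to a placeholder when the safety checker flags the image.
    if result.nsfw_content_detected and result.nsfw_content_detected[0]:
        return Image.open("nsfw_placeholder.jpg")
    return result.images[0]
```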
@@ -91,15 +94,18 @@ def get_result(text_prompts, style_indx, musicAI_indx):
 
     interrogate_prompt = img_to_text(imagefile, "ViT-L (best for Stable Diffusion 1.*)", "fast", fn_index=1)[0]
     print(interrogate_prompt)
-    spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx], musicAI_indx)
+    spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx], musicAI_indx, duration)
 
     video_merged = merge_video(music_output, image_output)
-    return {spec_result:spec_image, video_result:video_merged, status_text:'Success'}
-
+    return {spec_result:spec_image, video_result:video_merged, status_text:'Success', share_button:gr.update(visible=True), community_icon:gr.update(visible=True), loading_icon:gr.update(visible=True)}
 
-def get_music(prompt, musicAI_indx):
+def get_music(prompt, musicAI_indx, duration):
     if musicAI_indx == 0:
-        spec = pipe2(prompt).images[0]
+        if duration == 5:
+            width_duration=512
+        else :
+            width_duration = 512 + ((int(duration)-5) * 128)
+        spec = pipe2(prompt, height=512, width=width_duration).images[0]
         print(spec)
         wav = wav_bytes_from_spectrogram_image(spec)
         with open("output.wav", "wb") as f:
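Riffusion decodes a 512×512 spectrogram to roughly five seconds of audio, so the hunk widens the image by 128 px per extra second: duration = 8 gives 512 + (8 − 5) × 128 = 896 px. A sketch of that mapping as a helper; the 128 px/s ratio is this app's assumption, not a documented Riffusion constant:

```python
def spectrogram_width(duration_s: int, base_s: int = 5,
                      base_px: int = 512, px_per_s: int = 128) -> int:
    """Pixel width for a Riffusion spectrogram of the requested length."""
    if duration_s <= base_s:
        return base_px
    return base_px + (duration_s - base_s) * px_per_s

assert spectrogram_width(5) == 512
assert spectrogram_width(8) == 896
```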
@@ -148,7 +154,9 @@ def merge_video(mp3file_name, image):
     fps = 12
     slide_time = audio_length
     fourcc = cv2.VideoWriter.fourcc(*'MJPG')
-    out = cv2.VideoWriter(file_name, fourcc, fps, (512, 512))
+
+    #W, H should be the same as input image
+    out = cv2.VideoWriter(file_name, fourcc, fps, (576, 576))
 
     # for image in img_list:
     #     cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
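The writer's frame size must match the frames later passed to out.write(), which is why it moves to 576×576 in step with the new 576×576 pipe() output. A minimal sketch of writing a still image for the length of the audio; the function and file names here are illustrative, not the app's (which uses file_name and audio_length):

```python
import cv2
import numpy as np
from PIL import Image

def still_to_video(image: Image.Image, seconds: float,
                   path: str = "temp.avi", fps: int = 12) -> str:
    # PIL gives RGB; OpenCV expects BGR.
    frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    h, w = frame.shape[:2]
    out = cv2.VideoWriter(path, cv2.VideoWriter.fourcc(*"MJPG"), fps, (w, h))
    for _ in range(int(seconds * fps)):
        out.write(frame)  # repeat the same frame for the whole duration
    out.release()
    return path
```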
@@ -188,6 +196,11 @@ def merge_video(mp3file_name, image):
     mergedclip.to_videofile('mergedvideo.mp4')
     return 'mergedvideo.mp4'
 
+def change_music_generator(current_choice):
+    if current_choice == 0:
+        return gr.update(visible=True)
+    return gr.update(visible=False)
+
 title="文生图生音乐视频 Text to Image to Music to Video with Riffusion"
 
 description="An AI art generation pipeline, which supports text-to-image-to-music task."
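change_music_generator receives the dropdown's integer index (the component is declared with type="index") and returns a visibility update: Riffusion (index 0) shows the duration slider, Mubert AI hides it, since only the spectrogram path uses the duration. The hookup appears later in this diff as:

```python
musicAI.change(fn=change_music_generator, inputs=[musicAI], outputs=[duration_input])
```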
@@ -263,6 +276,22 @@ css = """
         font-weight: bold;
         font-size: 115%;
     }
+    #share-btn-container {
+        display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
+    }
+    #share-btn {
+        all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
+    }
+    #share-btn * {
+        all: unset;
+    }
+    #share-btn-container div:nth-child(-n+2){
+        width: auto !important;
+        min-height: 0px !important;
+    }
+    #share-btn-container .wrap {
+        display: none !important;
+    }
 """
 
 block = gr.Blocks(css=css)
@@ -429,6 +458,7 @@ with block:
                </div>
            """
        )
+
        with gr.Group():
            with gr.Box():
                with gr.Row().style(mobile_collapse=False, equal_height=True):
@@ -437,6 +467,7 @@
                        show_label=False,
                        max_lines=1,
                        placeholder="Enter your prompt, multiple languages are supported now.",
+                        elem_id="input-prompt",
                    ).style(
                        border=(True, False, True, True),
                        rounded=(True, False, False, True),
@@ -453,6 +484,7 @@
            '像素风格(Pixel Style)', '概念艺术(Conceptual Art)', '未来主义(Futurism)', '赛博朋克(Cyberpunk)', '写实风格(Realistic style)',
            '洛丽塔风格(Lolita style)', '巴洛克风格(Baroque style)', '超现实主义(Surrealism)', '默认(Default)'], value='默认(Default)', type="index")
        musicAI = gr.Dropdown(label="音乐生成技术(AI Music Generator)", choices=['Riffusion', 'Mubert AI'], value='Riffusion', type="index")
+        duration_input = gr.Slider(label="Duration in seconds", minimum=5, maximum=10, step=1, value=5, elem_id="duration-slider", visible=True)
        status_text = gr.Textbox(
            label="处理状态(Process status)",
            show_label=True,
@@ -460,35 +492,45 @@
            interactive=False
        )
 
-        video_result = gr.Video(type=None, label='Final Merged video')
-        spec_result = gr.Image()
-
-        trigger_component = gr.Textbox(vaule="", visible=False) # This component is used for triggering inference funtion.
-        translated_language = gr.Textbox(vaule="", visible=False)
-
-
-        ex = gr.Examples(examples=examples, fn=translate_language_example, inputs=[text, styles], outputs=[language_tips_text, status_text, trigger_component, translated_language], cache_examples=False)
-        ex.dataset.headers = [""]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with gr.Column(elem_id="col-container"):
+            with gr.Group(elem_id="share-btn-container"):
+                community_icon = gr.HTML(community_icon_html, visible=False)
+                loading_icon = gr.HTML(loading_icon_html, visible=False)
+                share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)
+
+        share_button.click(None, [], [], _js=share_js)
+
+        video_result = gr.Video(type=None, label='Final Merged video', elem_id="output-video")
+        spec_result = gr.Image()
+
+        trigger_component = gr.Textbox(vaule="", visible=False) # This component is used for triggering inference funtion.
+        translated_language = gr.Textbox(vaule="", visible=False)
+
+
+        ex = gr.Examples(examples=examples, fn=translate_language_example, inputs=[text, styles], outputs=[language_tips_text, status_text, trigger_component, translated_language], cache_examples=False)
+        ex.dataset.headers = [""]
+
+
+        musicAI.change(fn=change_music_generator, inputs=[musicAI], outputs=[duration_input])
+        text.submit(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
+        btn.click(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
+        trigger_component.change(fn=get_result, inputs=[translated_language, styles, musicAI, duration_input], outputs=[spec_result, video_result, status_text, share_button, community_icon, loading_icon])
+
 
+        gr.Markdown(
+            """
+            Space by [@DGSpitzer](https://www.youtube.com/channel/UCzzsYBF4qwtMwJaPJZ5SuPg)❤️ [@大谷的游戏创作小屋](https://space.bilibili.com/176003)
+            [![Twitter Follow](https://img.shields.io/twitter/follow/DGSpitzer?label=%40DGSpitzer&style=social)](https://twitter.com/DGSpitzer)
+            ![visitors](https://visitor-badge.glitch.me/badge?page_id=dgspitzer_txt2img2video)
+            """
+        )
+        gr.HTML('''
+            <div class="footer">
+                <p>Model:<a href="https://huggingface.co/riffusion/riffusion-model-v1" style="text-decoration: underline;" target="_blank">Riffusion</a>
+                </p>
+            </div>
+        ''')
+
 
-block.queue(
+block.queue().launch()
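The wiring forms a two-stage chain: text.submit and btn.click run translate_language, which writes gr.update(value=count) into the hidden trigger_component, and that textbox's change event then runs get_result. Note that both hidden textboxes pass vaule="", presumably a typo for value=""; depending on the Gradio version the unknown keyword is silently ignored or rejected at startup. A minimal standalone sketch of the hidden-relay pattern with hypothetical two-stage functions, not the app's own:

```python
import gradio as gr

def stage_one(text):
    # Writing a new value into the hidden relay fires its .change event.
    return {trigger: text.upper()}

def stage_two(text):
    return {result: "processed: " + text}

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="prompt")
    result = gr.Textbox(label="result")
    trigger = gr.Textbox(value="", visible=False)  # hidden event relay

    prompt.submit(stage_one, inputs=[prompt], outputs=[trigger])
    trigger.change(stage_two, inputs=[trigger], outputs=[result])

demo.launch()
```

Relaying through a changing value (the app uses the count variable for this) ensures the second stage fires even when the UI event itself carries no new payload.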