Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
# coding: utf-8 | |
import argparse | |
import glob | |
import os | |
from pathlib import Path | |
import gradio as gr | |
from cloud_task_executor import CloudTaskExecutor | |
from elevenlabs_helper import ElevenLabsHelper | |
# --- | |
# Prefix shared by the base motions that involve talking.
talk_key = "talk"

# Base motions offered in the UI, in display order: the talking variants come
# first, followed by the standalone facial expressions.
_talking_motions = [f"{talk_key}-{variant}" for variant in ("head", "neutral")]
_facial_expressions = ["smile", "approve", "disapprove", "confused", "sad", "surprised"]
valid_base_motion_expressions = _talking_motions + _facial_expressions


def get_default_base_motion_expression():
    """Return the base motion selected by default: the first valid expression."""
    first_index = 0
    return valid_base_motion_expressions[first_index]
# --- | |
def get_sorted_filenames_in_dir(dir_path: str, ext: str = ".jpg", throw_if_empty: bool = True) -> list:
    """Return the sorted paths of the files in ``dir_path`` whose names end with ``ext``.

    Args:
        dir_path: Directory to search. Sub-directories are not searched.
        ext: Filename suffix to match; it is appended to a ``*`` glob wildcard.
        throw_if_empty: Controls what happens when the directory has no entries
            at all: raise if True, return an empty list if False.

    Returns:
        The matching paths, each prefixed with ``dir_path``, in sorted order.
        The list can be empty when the directory has entries but none match ``ext``.

    Raises:
        RuntimeError: If ``dir_path`` does not exist, is not a directory, or is
            empty while ``throw_if_empty`` is True.
    """
    p = Path(dir_path)
    if not p.exists():
        raise RuntimeError(f"The path: {dir_path} does not exist")
    # An existing regular file must be rejected here as well; otherwise
    # os.listdir() below fails with NotADirectoryError instead of RuntimeError.
    if not p.is_dir():
        raise RuntimeError(f"The path: {dir_path} is not a directory")
    if not os.listdir(dir_path):
        if throw_if_empty:
            raise RuntimeError(f"The path: {dir_path} is empty")
        return []
    search_string = str(dir_path) + "/*" + ext
    return sorted(glob.glob(search_string))
# --- | |
# Introductory text rendered below the page title.
description = (
    "Experience a demo of the world's most advanced Text/Audio To Video (TTV) system, crafted by Two AI.\n"
    "Sign up with Two AI to gain rapid, long-form generation, API keys, and more!"
)

# Core constants
tmp_dir = "/tmp/gradio"
data_dir = "./data"  # root directory of the bundled example media
media_height = 512  # height passed to the image/video components

# Keys used to index the per-gender dictionaries (voices, example media).
male_key, female_key, unknown_key = "male", "female", "unknown"

# Male/Female
# Terms searched for in a media path to guess which voice list to offer.
female_terms = ["Female", "Lady", "Woman"]
male_terms = ["Male", "Lad", "Man"]
# ElevenLabs voices
# The voice catalogue is fetched once, when this module is loaded.
all_voices = ElevenLabsHelper.get_voices()
# Keep only voices whose name has no space in it and is shorter than 10 characters.
voices_ = [voice for voice in all_voices.voices if len(voice.name.split(" ")) < 2 and len(voice.name) < 10]
# NOTE(review): select_voices is assumed to return a list of voice names that
# match the given labels -- confirm against ElevenLabsHelper.
female_voice_names = ElevenLabsHelper.select_voices(voices_, labels={"gender": female_key, "age": "young"})
male_voice_names = ElevenLabsHelper.select_voices(voices_, labels={"gender": male_key, "age": "young"})
# "Priya" is dropped from the male list. The removal is guarded because an
# unconditional list.remove() raises ValueError -- and aborts startup -- as soon
# as that voice is missing from the fetched catalogue.
if "Priya" in male_voice_names:
    male_voice_names.remove("Priya")
# Voice names per gender key; the "unknown" entry offers both lists, female first.
voices = {
    female_key: female_voice_names,
    male_key: male_voice_names,
    unknown_key: female_voice_names + male_voice_names,
}
# Examples
# Base images: sorted .jpg paths per gender key.
example_base_image_dir = os.path.join(data_dir, "input_image_bases")
example_base_images = {
    gender: get_sorted_filenames_in_dir(os.path.join(example_base_image_dir, gender), ext=".jpg")
    for gender in (female_key, male_key)
}

# Base videos: sorted .mp4 paths per gender key.
example_base_video_dir = os.path.join(data_dir, "input_video_bases")
example_source_videos = {
    gender: get_sorted_filenames_in_dir(os.path.join(example_base_video_dir, gender), ext=".mp4")
    for gender in (female_key, male_key)
}

# Driving audio: sorted .mp3 paths per gender key.
example_driving_audio_dir = os.path.join(data_dir, "input_audio/gradio")
male_audio_dir = os.path.join(example_driving_audio_dir, male_key)
example_driving_audios_male = get_sorted_filenames_in_dir(male_audio_dir, ext=".mp3")
female_audio_dir = os.path.join(example_driving_audio_dir, female_key)
example_driving_audios_female = get_sorted_filenames_in_dir(female_audio_dir, ext=".mp3")
example_driving_audios = {
    female_key: example_driving_audios_female,
    male_key: example_driving_audios_male,
}
# Driving Text
# Group names, in the order their example galleries are rendered.
audio_text_groups = ["General", "Promotional Messages", "Pronunciation Practice"]

# The same three sentences in English, Korean and Hindi.
_general_texts = [
    "The 2026 World Cup final match is in New York.",
    "Enhance efficiency and cut costs with AI.",
    "A bee's wings beat more than 200 times per second.",
    "2026년 월드컵 결승전은 뉴욕에서 열립니다.",
    "AI로 효율성을 높이고 비용을 절감하세요.",
    "벌은 초당 200회 이상의 날개짓을 합니다.",
    "2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
    "AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
    "मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
]
_promotional_texts = [
    "Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
    "Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
    "This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
    "Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
]
_pronunciation_texts = [
    "A big black bug bit a big black dog on his big black nose.",
    "Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
]

# Example sentences per group, keyed by the group names above (same order).
example_driving_audio_texts = dict(
    zip(audio_text_groups, (_general_texts, _promotional_texts, _pronunciation_texts))
)
# Showcase clips: each group lives in its own sub-directory of the showcase
# folder, and the sub-directory name doubles as the dictionary key.
example_showcase_dir = os.path.join(data_dir, "showcase_examples")
_showcase_group_names = (
    "make_image_talk_multilingual",
    "make_image_talk_cartoon",
    "make_image_talk_diff_angles",
    "make_image_talk_hb",
    "make_video_talk_multilingual",
    "make_video_talk_corp_msg",
    "make_video_talk_rap_multii",
    "dubbing_superpowerman",
    "make_image_talk_selfie",
    "dubbing_coffee",
)
# Sorted .mp4 paths per showcase group.
examples_showcase = {
    group_name: get_sorted_filenames_in_dir(os.path.join(example_showcase_dir, group_name), ext=".mp4")
    for group_name in _showcase_group_names
}
def update_voices(media_path):
    """Build the voice dropdown matching the gender hinted at by ``media_path``.

    The path is searched for every gender term, both as written and in
    lowercase. Female terms are tested before male ones (note that "female"
    contains "male"). An empty path, or a path with no match, selects the
    combined "unknown" voice list.

    Args:
        media_path: Path of the selected base image/video; may be empty or None.

    Returns:
        An interactive ``gr.Dropdown`` listing the voices of the detected
        category, with the first voice pre-selected.
    """
    category = unknown_key
    if media_path:
        for candidate_key, terms in ((female_key, female_terms), (male_key, male_terms)):
            if any(term in media_path or term.lower() in media_path for term in terms):
                category = candidate_key
                break

    return gr.Dropdown(
        choices=voices[category],
        value=voices[category][0],
        interactive=True,
    )
def task_executor_fn(
    input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
):
    """Forward a generation request to the module-level ``task_executor``.

    The five arguments are passed to ``task_executor.execute_task`` positionally
    and unchanged, and its result is returned as is. ``task_executor`` is
    looked up at call time, so it has to exist before the first request.
    """
    request_args = (
        input_base_path,
        base_motion_expression,
        input_driving_audio_path,
        driving_text_input,
        driving_voice_input,
    )
    return task_executor.execute_task(*request_args)
# "Drive Image" tab: choose a base image, choose a motion plus driving audio
# (file or text-to-speech), then generate the talking video.
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_image:
    with gr.Row():
        # Step 1: Choose Image
        with gr.Column(scale=4):
            gr.Markdown("### Step 1: Choose Image")
            gr.Markdown("Upload or select an example image to drive.")
            with gr.Accordion(open=True, label="Base Image"):
                base_image_input = gr.Image(type="filepath", sources="upload", height=media_height)
                # One example gallery per gender, female first.
                for gallery_label, gender in (("Female", female_key), ("Male", male_key)):
                    gr.Examples(
                        examples=[[image_path] for image_path in example_base_images[gender]],
                        inputs=[base_image_input],
                        cache_examples=False,
                        label=gallery_label,
                    )

        # Step 2: Motion and Audio/TTS
        with gr.Column(scale=4):
            gr.Markdown("### Step 2: Motion and Audio/TTS")
            gr.Markdown("Select motion and provide audio or text for lip-sync.")
            with gr.Accordion(open=True, label="Base Motion"):
                base_motion_expression = gr.Radio(
                    choices=valid_base_motion_expressions,
                    label="Select base motion",
                    value=get_default_base_motion_expression(),
                )
            with gr.Tabs():
                with gr.TabItem("Driving Audio: File") as tab_audio_file:
                    with gr.Accordion(open=True, label="Driving Audio: From File"):
                        driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
                        for gallery_label, gender in (("Female", female_key), ("Male", male_key)):
                            gr.Examples(
                                examples=[[audio_path] for audio_path in example_driving_audios[gender]],
                                inputs=[driving_audio_input],
                                cache_examples=False,
                                examples_per_page=18,
                                label=gallery_label,
                            )
                with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
                    with gr.Accordion(open=True, label="Driving Audio: From Text"):
                        # Starts with the combined voice list; narrowed when an image is chosen.
                        driving_input_voice = gr.Dropdown(
                            choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
                        )
                        driving_text_input = gr.Textbox(
                            label="Input Text (300 characters max)",
                            lines=2,
                        )
                        # One gallery of example sentences per text group.
                        for group in audio_text_groups:
                            gr.Examples(
                                examples=[[sentence] for sentence in example_driving_audio_texts[group]],
                                inputs=[driving_text_input],
                                cache_examples=False,
                                label=group,
                            )

        # Step 3: Result
        with gr.Column(scale=4):
            gr.Markdown("### Step 3: Result")
            gr.Markdown("Generate and view the output video.")
            process_button_animation = gr.Button("🌟 Generate", variant="primary")
            output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
            message = gr.Textbox(label="Info")
            # Clears every input as well as the generated video.
            process_button_reset = gr.ClearButton(
                [
                    base_image_input,
                    driving_audio_input,
                    driving_text_input,
                    driving_input_voice,
                    output_video_i2v,
                ],
                value="🧹 Clear",
            )

    # Refresh the voice dropdown whenever the base image changes.
    base_image_input.change(fn=update_voices, inputs=[base_image_input], outputs=[driving_input_voice])

    # binding functions for buttons
    process_button_animation.click(
        fn=task_executor_fn,
        inputs=[
            base_image_input,
            base_motion_expression,
            driving_audio_input,
            driving_text_input,
            driving_input_voice,
        ],
        # NOTE(review): the video component is listed twice, so the executor
        # presumably returns three values -- confirm against CloudTaskExecutor.
        outputs=[output_video_i2v, output_video_i2v, message],
        show_progress=True,
    )
# "Drive Video" tab: choose a base video, provide driving audio (file or
# text-to-speech), then generate the lip-synced video.
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_video:
    with gr.Row():
        # Step 1: Choose Video
        with gr.Column(scale=4):
            gr.Markdown("### Step 1: Choose Video")
            gr.Markdown("Upload or select an example video to drive.")
            with gr.Accordion(open=True, label="Base Video"):
                base_video_input = gr.Video(sources="upload", height=media_height, interactive=True)
                # One example gallery per gender, female first.
                for gallery_label, gender in (("Female", female_key), ("Male", male_key)):
                    gr.Examples(
                        examples=[[video_path] for video_path in example_source_videos[gender]],
                        inputs=[base_video_input],
                        cache_examples=False,
                        label=gallery_label,
                    )

        # Step 2: Audio/TTS
        with gr.Column(scale=4):
            gr.Markdown("### Step 2: Audio/TTS")
            gr.Markdown("Provide audio or text for lip-sync.")
            with gr.Tabs():
                with gr.TabItem("Driving Audio: File") as tab_audio_file:
                    with gr.Accordion(open=True, label="Driving Audio: From File"):
                        driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
                        for gallery_label, gender in (("Female", female_key), ("Male", male_key)):
                            gr.Examples(
                                examples=[[audio_path] for audio_path in example_driving_audios[gender]],
                                inputs=[driving_audio_input],
                                cache_examples=False,
                                examples_per_page=18,
                                label=gallery_label,
                            )
                with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
                    with gr.Accordion(open=True, label="Driving Audio: From Text"):
                        # Starts with the combined voice list; narrowed when a video is chosen.
                        driving_input_voice = gr.Dropdown(
                            choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
                        )
                        driving_text_input = gr.Textbox(
                            label="Input Text (300 characters max)",
                            lines=2,
                        )
                        # One gallery of example sentences per text group.
                        for group in audio_text_groups:
                            gr.Examples(
                                examples=[[sentence] for sentence in example_driving_audio_texts[group]],
                                inputs=[driving_text_input],
                                cache_examples=False,
                                label=group,
                            )

        # Step 3: Result
        with gr.Column(scale=4):
            gr.Markdown("### Step 3: Result")
            gr.Markdown("Generate and view the output video.")
            process_button_animation = gr.Button("🌟 Generate", variant="primary")
            output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
            message = gr.Textbox(label="Info")
            # Clears every input as well as the generated video.
            process_button_reset = gr.ClearButton(
                [base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v],
                value="🧹 Clear",
            )

    # Refresh the voice dropdown whenever the base video changes.
    base_video_input.change(fn=update_voices, inputs=[base_video_input], outputs=[driving_input_voice])

    # binding functions for buttons
    # This tab has no motion selector; a hidden radio with no value fills the
    # base-motion slot so that task_executor_fn still receives five inputs.
    base_motion_expression = gr.Radio(value=None, visible=False)
    process_button_animation.click(
        fn=task_executor_fn,
        inputs=[
            base_video_input,
            base_motion_expression,
            driving_audio_input,
            driving_text_input,
            driving_input_voice,
        ],
        # NOTE(review): the video component is listed twice, so the executor
        # presumably returns three values -- confirm against CloudTaskExecutor.
        outputs=[output_video_i2v, output_video_i2v, message],
        show_progress=True,
    )
def _render_showcase_videos(group_name, height):
    """Add one ``gr.Video`` per clip of the given showcase group to the current layout.

    Clips are rendered in the stored (sorted) order and labelled with their
    file name.
    """
    for clip_path in examples_showcase[group_name]:
        gr.Video(value=clip_path, label=os.path.basename(clip_path), height=height)


# "Showcase Examples" tab: read-only galleries of pre-rendered clips.
with gr.Blocks() as showcase_examples:
    gr.Markdown("# Make Image Talk")
    with gr.Row():
        with gr.Column(scale=7):
            _render_showcase_videos("make_image_talk_multilingual", 300)
        with gr.Column(scale=3):
            _render_showcase_videos("make_image_talk_cartoon", 616)
    with gr.Row():
        with gr.Column(scale=7):
            _render_showcase_videos("make_image_talk_diff_angles", 350)
        with gr.Column(scale=3):
            _render_showcase_videos("make_image_talk_hb", 350)
    with gr.Row():
        _render_showcase_videos("make_image_talk_selfie", 430)

    gr.Markdown("# Make Video Talk")
    with gr.Row():
        with gr.Column(scale=7):
            _render_showcase_videos("make_video_talk_multilingual", 300)
        with gr.Column(scale=3):
            _render_showcase_videos("make_video_talk_corp_msg", 616)
    with gr.Row():
        _render_showcase_videos("make_video_talk_rap_multii", 500)

    gr.Markdown("# Dubbing")
    with gr.Row():
        _render_showcase_videos("dubbing_superpowerman", 320)
    with gr.Row():
        _render_showcase_videos("dubbing_coffee", 440)
# Top-level page: logo, title and description above the three tabs.
with gr.Blocks(analytics_enabled=False, css="footer{display:none !important}", title="SUTRA Avatar v2") as demo:
    gr.Markdown(
        """
        ## <img src="https://playground.two.ai/sutra.svg" height="20"/>
        """
    )
    title = "# 🌟 SUTRA Avatar v2 🌟\n## Drive Image or Video with LipSync from Audio or Text"
    gr.Markdown(title)
    gr.Markdown(description)
    # Tab label paired with the Blocks it displays, in display order.
    tab_pages = [
        ("Drive Image", demo_image),
        ("Drive Video", demo_video),
        ("Showcase Examples", showcase_examples),
    ]
    gr.TabbedInterface(
        interface_list=[page for _, page in tab_pages],
        tab_names=[tab_label for tab_label, _ in tab_pages],
    )
# Script entry point: create the task executor and start the web server.
if __name__ == "__main__":
    # No options are defined; parsing still provides --help and rejects
    # unexpected command-line arguments.
    parser = argparse.ArgumentParser(description="SUTRA AVATAR CLIENT")
    args = parser.parse_args()
    # Module-level on purpose: task_executor_fn looks this name up when called.
    task_executor = CloudTaskExecutor()
    demo.queue(default_concurrency_limit=10).launch(
        # Listen on every network interface.
        server_name="0.0.0.0",
        # Only let Gradio serve files from the example-data and temp
        # directories. The former allowed_paths=["/"] exposed every file the
        # process can read to any visitor of the app.
        # NOTE(review): if the task executor writes results outside these two
        # directories, add that directory here explicitly.
        allowed_paths=[os.path.abspath(data_dir), tmp_dir],
    )