Spaces:
Runtime error
Runtime error
File size: 8,846 Bytes
5c4a11c e5f9b65 54ad002 5c4a11c 54ad002 5c4a11c 54ad002 4997010 54ad002 5c4a11c d29f4c0 b6320af a92b0d1 b6320af 1bfbba7 21c2481 b6320af c1fcf29 a92b0d1 5d778f1 b6320af 1bfbba7 5c4a11c c353fff a92b0d1 c1fcf29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
import gradio as gr
import os
import time
import argparse
import yaml, math
from tqdm import trange
import torch
import numpy as np
from omegaconf import OmegaConf
import torch.distributed as dist
from pytorch_lightning import seed_everything
from lvdm.samplers.ddim import DDIMSampler
from lvdm.utils.common_utils import str2bool
from lvdm.utils.dist_utils import setup_dist, gather_data
from lvdm.utils.saving_utils import npz_to_video_grid, npz_to_imgsheet_5d
from utils import load_model, get_conditions, make_model_input_shape, torch_to_np
from huggingface_hub import hf_hub_url, cached_download
# --- Model setup (runs once at import time) ---------------------------------
# Load the model configuration from the YAML file shipped next to this script.
config_path = "model_config.yaml"
config = OmegaConf.load(config_path)

# Resolve the checkpoint URL on the Hugging Face Hub and fetch it.
# NOTE(review): cached_download/hf_hub_url looks like the older huggingface_hub
# download path; presumably newer releases expect hf_hub_download instead --
# confirm against the pinned huggingface_hub version.
REPO_ID = "RamAnanth1/videocrafter-text2video"
_ckpt_url = hf_hub_url(REPO_ID, "model.ckpt")
ckpt_path = cached_download(_ckpt_url)

# Build the model (LoRA injection disabled) and the DDIM sampler wrapping it.
# load_model returns three values; only the first one is used here.
model, _, _ = load_model(config, ckpt_path, inject_lora=False, lora_scale=None)
ddim_sampler = DDIMSampler(model)
def sample_denoising_batch(model, noise_shape, condition, *args,
                           sample_type="ddim", sampler=None,
                           ddim_steps=None, eta=None,
                           unconditional_guidance_scale=1.0, uc=None,
                           denoising_progress=False,
                           **kwargs,
                           ):
    """Run one denoising pass with ``sampler`` and return the sampled batch.

    Args:
        model: Accepted for call-site symmetry; not read by this function.
        noise_shape: Sequence whose first entry is used as the batch size and
            whose remaining entries are passed to the sampler as ``shape``.
        condition: Conditioning forwarded to the sampler as ``conditioning``.
        *args: Extra positional arguments; accepted and ignored.
        sample_type: Accepted but not read by this function.
        sampler: Object exposing ``sample(...)``. Required.
        ddim_steps: Number of sampling steps, forwarded as ``S``. Required.
        eta: Forwarded to the sampler as ``eta``. Required.
        unconditional_guidance_scale: Forwarded unchanged to the sampler.
        uc: Forwarded to the sampler as ``unconditional_conditioning``.
        denoising_progress: Forwarded to the sampler as ``verbose``.
        **kwargs: Any additional keyword arguments for ``sampler.sample``.

    Returns:
        The first element of the pair returned by ``sampler.sample``.

    Raises:
        ValueError: If ``sampler``, ``ddim_steps`` or ``eta`` is ``None``.
    """
    # Explicit checks instead of ``assert`` so the validation still runs
    # under ``python -O`` and the failure names the missing argument.
    if sampler is None:
        raise ValueError("sample_denoising_batch() requires a sampler")
    if ddim_steps is None:
        raise ValueError("sample_denoising_batch() requires ddim_steps")
    if eta is None:
        raise ValueError("sample_denoising_batch() requires eta")

    # The sampler returns a pair; only the first element is used.
    samples, _ = sampler.sample(
        S=ddim_steps,
        conditioning=condition,
        batch_size=noise_shape[0],
        shape=noise_shape[1:],
        verbose=denoising_progress,
        unconditional_guidance_scale=unconditional_guidance_scale,
        unconditional_conditioning=uc,
        eta=eta,
        **kwargs,
    )
    return samples
@torch.no_grad()
def sample_text2video(model, prompt, n_samples, batch_size,
                      sample_type="ddim", sampler=None,
                      ddim_steps=50, eta=1.0, cfg_scale=7.5,
                      decode_frame_bs=1,
                      ddp=False, all_gather=True,
                      batch_progress=True, show_denoising_progress=False,
                      ):
    """Sample videos for a text prompt and return them as one numpy array.

    Runs under ``torch.no_grad()``. Sampling is done in
    ``ceil(n_samples / batch_size)`` batches; each batch is denoised with
    ``sample_denoising_batch`` and decoded with ``model.decode_first_stage``.

    Args:
        model: Model exposing ``cond_stage_model`` and ``decode_first_stage``.
        prompt: Text prompt passed to ``get_conditions``.
        n_samples: Minimum number of samples to produce (must be >= 1).
        batch_size: Samples per batch (must be >= 1).
        sample_type: Forwarded to ``sample_denoising_batch``.
        sampler: Sampler forwarded to ``sample_denoising_batch``.
        ddim_steps: Number of sampling steps.
        eta: Sampler ``eta`` value.
        cfg_scale: Guidance scale; when it equals 1.0 no unconditional
            embedding is computed and ``None`` is passed instead.
        decode_frame_bs: Forwarded to ``decode_first_stage`` as ``decode_bs``.
        ddp: With ``all_gather``, collect samples via ``gather_data``.
        all_gather: See ``ddp``.
        batch_progress: Show a tqdm progress bar over batches.
        show_denoising_progress: Forwarded as ``denoising_progress``.

    Returns:
        numpy array of all batches concatenated along axis 0. It may hold
        more than ``n_samples`` entries; only "at least ``n_samples``" is
        checked.

    Raises:
        ValueError: If ``n_samples`` or ``batch_size`` is smaller than 1, or
            if the model has no conditioning stage.
    """
    # Fail early with a clear message instead of a ZeroDivisionError or an
    # "empty concatenate" error further down.
    if n_samples < 1 or batch_size < 1:
        raise ValueError(
            f"n_samples and batch_size must be >= 1 "
            f"(got n_samples={n_samples}, batch_size={batch_size})"
        )
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if model.cond_stage_model is None:
        raise ValueError("model.cond_stage_model is required for text-to-video sampling")

    # get cond vector
    cond_embd = get_conditions(prompt, model, batch_size)
    uncond_embd = get_conditions("", model, batch_size) if cfg_scale != 1.0 else None

    # sample batches
    all_videos = []
    n_iter = math.ceil(n_samples / batch_size)
    iterator = trange(n_iter, desc="Sampling Batches (text-to-video)") if batch_progress else range(n_iter)
    for _ in iterator:
        noise_shape = make_model_input_shape(model, batch_size)
        samples_latent = sample_denoising_batch(
            model, noise_shape, cond_embd,
            sample_type=sample_type,
            sampler=sampler,
            ddim_steps=ddim_steps,
            eta=eta,
            unconditional_guidance_scale=cfg_scale,
            uc=uncond_embd,
            denoising_progress=show_denoising_progress,
        )
        samples = model.decode_first_stage(samples_latent, decode_bs=decode_frame_bs, return_cpu=False)

        # gather samples from multiple gpus
        if ddp and all_gather:
            data_list = gather_data(samples, return_np=False)
            all_videos.extend(torch_to_np(data) for data in data_list)
        else:
            all_videos.append(torch_to_np(samples))

    all_videos = np.concatenate(all_videos, axis=0)
    # Internal sanity check on the batching arithmetic above.
    assert all_videos.shape[0] >= n_samples
    return all_videos
def save_results(videos,
                 save_name="results", save_fps=8, save_mp4=True,
                 save_npz=False, save_mp4_sheet=False, save_jpg=False
                 ):
    """Write each video in ``videos`` to ``videos/<save_name>_<idx>.mp4``.

    Every entry along axis 0 is written with ``npz_to_video_grid`` as a
    single-item slice. File names are deterministic, so repeated calls with
    the same ``save_name`` write to the same paths.

    Args:
        videos: Array-like with a ``shape`` attribute, sliceable along axis 0.
        save_name: File name prefix.
        save_fps: Frames per second passed to ``npz_to_video_grid``.
        save_mp4: Accepted but not read by this function.
        save_npz: Accepted but not read by this function.
        save_mp4_sheet: Accepted but not read by this function.
        save_jpg: Accepted but not read by this function.

    Returns:
        Path of the last video file written.

    Raises:
        ValueError: If ``videos`` has no entries along axis 0.
    """
    n_videos = videos.shape[0]
    # An empty batch previously fell through to the return statement with the
    # loop index unbound; reject it before touching the filesystem.
    if n_videos == 0:
        raise ValueError("save_results() received an empty batch; nothing to save")

    save_subdir = "videos"
    os.makedirs(save_subdir, exist_ok=True)

    save_path = None
    for i in range(n_videos):
        save_path = os.path.join(save_subdir, f"{save_name}_{i:03d}.mp4")
        # Slice (rather than index) so the leading batch axis is preserved.
        npz_to_video_grid(videos[i:i + 1, ...], save_path, fps=save_fps)
    return save_path
def get_video(prompt, seed):
    """Generate one video for ``prompt`` and return the saved file path.

    Args:
        prompt: Text prompt for the video.
        seed: Seed for the global RNGs. ``None`` or any negative value (the
            UI uses -1) means "pick a fresh random seed for this call".

    Returns:
        The path returned by ``save_results`` for the generated video.
    """
    if seed is None or int(seed) < 0:
        import random

        # Negative seeds are outside the range numpy accepts (0 .. 2**32 - 1),
        # so resolve them here instead of relying on how seed_everything
        # treats out-of-range values. SystemRandom draws from OS entropy and
        # is therefore unaffected by an earlier seed_everything() call.
        seed = random.SystemRandom().randrange(2 ** 32)
    seed_everything(int(seed))

    samples = sample_text2video(
        model, prompt, n_samples=1, batch_size=1,
        sampler=ddim_sampler,
    )
    return save_results(samples)
# Markdown/HTML header rendered at the top of the demo page.
# The leading emoji had been corrupted into mis-decoded characters; restored.
DESCRIPTION = '# [Latent Video Diffusion Models](https://github.com/VideoCrafter/VideoCrafter)'
DESCRIPTION += '\n<p>🤗🤗🤗 VideoCrafter is an open-source video generation and editing toolbox for crafting video content. This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://github.com/VideoCrafter/VideoCrafter" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
# --- Gradio UI ---------------------------------------------------------------
# NOTE(review): the source had lost its indentation; the container nesting
# below is reconstructed from the statement order -- confirm the layout.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Group():
        # Prompt text box and the generate button share one row.
        with gr.Box():
            with gr.Row(elem_id="prompt-container").style(equal_height=True):
                prompt = gr.Text(
                    label="Prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your prompt",
                    elem_id="prompt-text-input",
                ).style(container=False)
                run_button = gr.Button("Generate video").style(full_width=False)

        # Output video player.
        result = gr.Video(label="Result", show_label=False, elem_id="gallery")

        with gr.Accordion("Advanced options", open=False):
            seed = gr.Slider(
                label="Seed",
                minimum=-1,
                maximum=1000000,
                step=1,
                value=-1,
                info="If set to -1, a different seed will be used each time.",
            )
            # num_frames = gr.Slider(
            #     label='Number of frames',
            #     minimum=16,
            #     maximum=MAX_NUM_FRAMES,
            #     step=1,
            #     value=16,
            #     info=
            #     'Note that the content of the video also changes when you change the number of frames.'
            # )
            # num_inference_steps = gr.Slider(label='Number of inference steps',
            #                                 minimum=10,
            #                                 maximum=50,
            #                                 step=1,
            #                                 value=25)

    # Components passed to get_video, in positional order (prompt, seed).
    inputs = [
        prompt,
        seed,
        # num_frames,
        # num_inference_steps,
    ]

    # With cache_examples=True the example output is computed via get_video
    # and cached rather than generated on each click.
    gr.Examples(
        examples=[["Astronaut riding a horse", 1000]],
        inputs=inputs,
        outputs=result,
        fn=get_video,
        cache_examples=True,
    )

    # Pressing Enter in the prompt box and clicking the button both generate.
    for bind_event in (prompt.submit, run_button.click):
        bind_event(fn=get_video, inputs=inputs, outputs=result)

    # with gr.Accordion(label='Biases and content acknowledgment', open=False):
    #     gr.HTML("""<div class="acknowledgments">
    #         <h4>Biases and content acknowledgment</h4>
    #         <p>
    #         Despite how impressive being able to turn text into video is, beware to the fact that this model may output content that reinforces or exacerbates societal biases. The training data includes LAION5B, ImageNet, Webvid and other public datasets. The model was not trained to realistically represent people or events, so using it to generate such content is beyond the model's capabilities.
    #         </p>
    #         <p>
    #         It is not intended to generate content that is demeaning or harmful to people or their environment, culture, religion, etc. Similarly, it is not allowed to generate pornographic, violent and bloody content generation. <b>The model is meant for research purposes</b>.
    #         </p>
    #         <p>
    #         To learn more about the model, head to its <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.
    #         </p>
    #         </div>
    #     """)

# Queue requests (at most 15 waiting, API access closed) and start the app.
demo.queue(api_open=False, max_size=15).launch()
|