Spaces:

RamAnanth1
/

videocrafter

Runtime error

App Files Files Community

videocrafter / app.py

RamAnanth1

Update app.py

c353fff over 1 year ago

raw

history blame

No virus

8.85 kB

	import gradio as gr
	import os
	import time
	import argparse
	import yaml, math
	from tqdm import trange
	import torch
	import numpy as np
	from omegaconf import OmegaConf
	import torch.distributed as dist
	from pytorch_lightning import seed_everything

	from lvdm.samplers.ddim import DDIMSampler
	from lvdm.utils.common_utils import str2bool
	from lvdm.utils.dist_utils import setup_dist, gather_data
	from lvdm.utils.saving_utils import npz_to_video_grid, npz_to_imgsheet_5d
	from utils import load_model, get_conditions, make_model_input_shape, torch_to_np
	from huggingface_hub import hf_hub_url, cached_download

	# Model hyper-parameters are read from a YAML file expected in the working
	# directory.
	config_path = "model_config.yaml"
	config = OmegaConf.load(config_path)

	REPO_ID = "RamAnanth1/videocrafter-text2video"
	# `cached_download(hf_hub_url(...))` is a deprecated huggingface_hub API that
	# has been removed from recent releases. `hf_hub_download` is the documented
	# replacement and likewise returns the local path of the cached file.
	# Imported here so this block does not depend on the top-of-file import list.
	from huggingface_hub import hf_hub_download
	ckpt_path = hf_hub_download(repo_id=REPO_ID, filename='model.ckpt')

	# Build the model and its DDIM sampler once, at import time, so that every
	# request handled by the UI reuses the same objects. `load_model` returns
	# three values; only the first (the model) is used here.
	model, _, _ = load_model(
	    config,
	    ckpt_path,
	    inject_lora=False,
	    lora_scale=None,
	)
	ddim_sampler = DDIMSampler(model)

	def sample_denoising_batch(model, noise_shape, condition, *args,
	                           sample_type="ddim", sampler=None,
	                           ddim_steps=None, eta=None,
	                           unconditional_guidance_scale=1.0, uc=None,
	                           denoising_progress=False,
	                           **kwargs,
	                           ):
	    """Run a single sampling pass through ``sampler`` and return its samples.

	    Args:
	        model: Accepted for call-site symmetry; not used in this function.
	        noise_shape: Indexable sequence. Element 0 is passed to the sampler
	            as ``batch_size`` and the remainder (``noise_shape[1:]``) as
	            ``shape``.
	        condition: Passed to the sampler as ``conditioning``.
	        *args: Accepted and ignored.
	        sample_type: Accepted and ignored.
	        sampler: Object exposing ``sample(...)``; must not be ``None``.
	        ddim_steps: Passed to the sampler as ``S``; must not be ``None``.
	        eta: Passed through as ``eta``; must not be ``None``.
	        unconditional_guidance_scale: Passed through unchanged.
	        uc: Passed to the sampler as ``unconditional_conditioning``.
	        denoising_progress: Passed to the sampler as ``verbose``.
	        **kwargs: Forwarded verbatim to ``sampler.sample``.

	    Returns:
	        The first element of the pair returned by ``sampler.sample``.

	    Raises:
	        AssertionError: If ``sampler``, ``ddim_steps`` or ``eta`` is ``None``.
	    """
	    # Checked in the same order as the keyword list: sampler, steps, eta.
	    for required in (sampler, ddim_steps, eta):
	        assert required is not None

	    draw = sampler.sample
	    n_in_batch = noise_shape[0]
	    per_sample_shape = noise_shape[1:]
	    outcome = draw(
	        S=ddim_steps,
	        eta=eta,
	        batch_size=n_in_batch,
	        shape=per_sample_shape,
	        conditioning=condition,
	        unconditional_conditioning=uc,
	        unconditional_guidance_scale=unconditional_guidance_scale,
	        verbose=denoising_progress,
	        **kwargs,
	    )
	    # The sampler returns a pair; only the first element is needed here.
	    latents, _discarded = outcome
	    return latents

	@torch.no_grad()
	def sample_text2video(model, prompt, n_samples, batch_size,
	                      sample_type="ddim", sampler=None,
	                      ddim_steps=50, eta=1.0, cfg_scale=7.5,
	                      decode_frame_bs=1,
	                      ddp=False, all_gather=True,
	                      batch_progress=True, show_denoising_progress=False,
	                      ):
	    """Generate videos for ``prompt`` in batches and return them as one array.

	    Sampling always runs in whole batches of ``batch_size``, for
	    ``ceil(n_samples / batch_size)`` iterations, so the result may hold more
	    than ``n_samples`` entries along axis 0. It is not trimmed here.

	    Args:
	        model: Must have a non-``None`` ``cond_stage_model`` and provide
	            ``decode_first_stage``.
	        prompt: Text passed to ``get_conditions``.
	        n_samples: Minimum number of samples to produce.
	        batch_size: Samples generated per iteration.
	        sample_type, sampler, ddim_steps, eta: Forwarded to
	            ``sample_denoising_batch``.
	        cfg_scale: Guidance scale. When it equals 1.0 no unconditional
	            embedding is computed and ``None`` is passed instead.
	        decode_frame_bs: Forwarded to ``decode_first_stage`` as ``decode_bs``.
	        ddp, all_gather: When both are truthy, decoded samples are collected
	            through ``gather_data`` before conversion.
	        batch_progress: Show a tqdm progress bar over batches.
	        show_denoising_progress: Forwarded as ``denoising_progress``.

	    Returns:
	        The per-batch arrays concatenated along axis 0.

	    Raises:
	        AssertionError: If the model has no conditioning stage, or fewer than
	            ``n_samples`` results were produced.
	    """
	    assert model.cond_stage_model is not None

	    # Conditioning embeddings are computed once and reused for every batch.
	    text_cond = get_conditions(prompt, model, batch_size)
	    if cfg_scale != 1.0:
	        null_cond = get_conditions("", model, batch_size)
	    else:
	        null_cond = None

	    num_batches = math.ceil(n_samples / batch_size)
	    if batch_progress:
	        batch_iter = trange(num_batches, desc="Sampling Batches (text-to-video)")
	    else:
	        batch_iter = range(num_batches)

	    use_gather = ddp and all_gather
	    collected = []
	    for _batch_idx in batch_iter:
	        latent_shape = make_model_input_shape(model, batch_size)
	        latents = sample_denoising_batch(
	            model,
	            latent_shape,
	            text_cond,
	            sample_type=sample_type,
	            sampler=sampler,
	            ddim_steps=ddim_steps,
	            eta=eta,
	            unconditional_guidance_scale=cfg_scale,
	            uc=null_cond,
	            denoising_progress=show_denoising_progress,
	        )
	        decoded = model.decode_first_stage(
	            latents, decode_bs=decode_frame_bs, return_cpu=False
	        )

	        if use_gather:
	            # Distributed run: gather every process's samples first.
	            gathered = gather_data(decoded, return_np=False)
	            collected.extend(torch_to_np(part) for part in gathered)
	        else:
	            collected.append(torch_to_np(decoded))

	    stacked = np.concatenate(collected, axis=0)
	    assert stacked.shape[0] >= n_samples
	    return stacked

	def save_results(videos,
	                 save_name="results", save_fps=8, save_mp4=True,
	                 save_npz=False, save_mp4_sheet=False, save_jpg=False
	                 ):
	    """Write each video in ``videos`` to ``videos/<save_name>_<idx>.mp4``.

	    Args:
	        videos: Array-like with a ``shape`` attribute, sliceable along its
	            first axis; one file is written per entry of that axis.
	        save_name: Filename prefix for the written files.
	        save_fps: Frame rate passed to ``npz_to_video_grid``.
	        save_mp4, save_npz, save_mp4_sheet, save_jpg: Accepted for interface
	            compatibility; this function does not read them.

	    Returns:
	        Path of the last file written.

	    Raises:
	        ValueError: If ``videos`` has no entries. Previously this case failed
	            with an ``UnboundLocalError``, because the return statement
	            relied on the loop variable.
	    """
	    num_videos = videos.shape[0]
	    if num_videos == 0:
	        raise ValueError("save_results() received an empty batch; nothing to save")

	    save_subdir = "videos"
	    os.makedirs(save_subdir, exist_ok=True)

	    last_path = None
	    for i in range(num_videos):
	        last_path = os.path.join(save_subdir, f"{save_name}_{i:03d}.mp4")
	        # Slice with i:i+1 so the leading axis is kept for the writer.
	        npz_to_video_grid(videos[i:i+1, ...], last_path, fps=save_fps)

	    return last_path

	def get_video(prompt, seed):
	    """Generate one video for ``prompt`` and return the path of the saved file.

	    Args:
	        prompt: Text prompt forwarded to ``sample_text2video``.
	        seed: Seed for the random number generators. A negative value (the
	            UI slider's -1 setting) or ``None`` means "pick a fresh seed for
	            this call".

	    Returns:
	        The path returned by ``save_results`` for the generated sample.
	    """
	    # Local import keeps this block self-contained.
	    import secrets

	    # The seed slider advertises -1 as "a different seed each time". Handle
	    # that explicitly instead of passing an out-of-range value on:
	    # NOTE(review): how `seed_everything` treats out-of-range seeds appears
	    # to differ between pytorch_lightning versions - confirm for the pinned
	    # one. `secrets` is used rather than `random` because `seed_everything`
	    # is documented to seed Python's `random` as well, which would make a
	    # "random" pick after a fixed-seed request repeatable.
	    if seed is None or int(seed) < 0:
	        seed = secrets.randbelow(2 ** 32)
	    else:
	        seed = int(seed)

	    seed_everything(seed)
	    samples = sample_text2video(
	        model,
	        prompt,
	        n_samples=1,
	        batch_size=1,
	        sampler=ddim_sampler,
	    )
	    return save_results(samples)

	# Markdown/HTML shown at the top of the page: a linked title line followed by
	# a short paragraph about the model and its usage restriction.
	DESCRIPTION = "".join([
	    '# [Latent Video Diffusion Models](https://github.com/VideoCrafter/VideoCrafter)',
	    '\n<p>🤗🤗🤗 VideoCrafter is an open-source video generation and editing toolbox for crafting video content. This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://github.com/VideoCrafter/VideoCrafter" style="text-decoration: underline;" target="_blank">model card</a>.</p>',
	])

	# Gradio UI: a prompt box with a "Generate video" button, a video output, and
	# an "Advanced options" accordion holding the seed slider.
	# NOTE(review): the nesting of the `with` blocks below cannot be recovered
	# from this listing (indentation was flattened) - confirm the layout against
	# the original file before editing.
	# NOTE(review): `gr.Box` and the `.style(...)` calls look like the Gradio 3.x
	# API - verify against the pinned gradio version.
	with gr.Blocks(css='style.css') as demo:
	gr.Markdown(DESCRIPTION)
	with gr.Group():
	with gr.Box():
	with gr.Row(elem_id='prompt-container').style(equal_height=True):
	# Single-line prompt input; its own label is hidden.
	prompt = gr.Text(
	label='Prompt',
	show_label=False,
	max_lines=1,
	placeholder='Enter your prompt',
	elem_id='prompt-text-input').style(container=False)
	run_button = gr.Button('Generate video').style(
	full_width=False)
	# Output component that receives the file path returned by get_video.
	result = gr.Video(label='Result', show_label=False, elem_id='gallery')
	with gr.Accordion('Advanced options', open=False):
	# Seed defaults to -1, which the info text describes as "different seed
	# each time"; the value is passed to get_video as its second argument.
	seed = gr.Slider(
	label='Seed',
	minimum=-1,
	maximum=1000000,
	step=1,
	value=-1,
	info='If set to -1, a different seed will be used each time.')
	# num_frames = gr.Slider(
	# label='Number of frames',
	# minimum=16,
	# maximum=MAX_NUM_FRAMES,
	# step=1,
	# value=16,
	# info=
	# 'Note that the content of the video also changes when you change the number of frames.'
	# )
	# num_inference_steps = gr.Slider(label='Number of inference steps',
	# minimum=10,
	# maximum=50,
	# step=1,
	# value=25)

	# Components whose values are passed, in this order, to get_video(prompt, seed).
	inputs = [
	prompt,
	seed,
	# num_frames,
	# num_inference_steps,
	]
	# One example row: (prompt, seed).
	# NOTE(review): cache_examples=True presumably runs get_video on the example
	# ahead of time to cache its output - confirm against the Gradio docs.
	gr.Examples(examples=[["Astronaut riding a horse", 1000]],
	inputs=inputs,
	outputs=result,
	fn=get_video,
	cache_examples=True)

	# Submitting the prompt box and clicking the button trigger the same handler.
	prompt.submit(fn=get_video, inputs=inputs, outputs=result)
	run_button.click(fn=get_video, inputs=inputs, outputs=result)

	# with gr.Accordion(label='Biases and content acknowledgment', open=False):
	# gr.HTML("""<div class="acknowledgments">
	# <h4>Biases and content acknowledgment</h4>
	# <p>
	# Despite how impressive being able to turn text into video is, beware to the fact that this model may output content that reinforces or exacerbates societal biases. The training data includes LAION5B, ImageNet, Webvid and other public datasets. The model was not trained to realistically represent people or events, so using it to generate such content is beyond the model's capabilities.
	# </p>
	# <p>
	# It is not intended to generate content that is demeaning or harmful to people or their environment, culture, religion, etc. Similarly, it is not allowed to generate pornographic, violent and bloody content generation. <b>The model is meant for research purposes</b>.
	# </p>
	# <p>
	# To learn more about the model, head to its <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.
	# </p>
	# </div>
	# """)


	# Enable the request queue (max_size=15, api_open=False) and start the app.
	demo.queue(api_open=False, max_size=15).launch()