import torch
import gradio as gr
import diffusers
from modules import scripts, processing, shared, images, sd_models, devices
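
# Image-to-Video script for the img2img tab: wraps diffusers image-to-video pipelines
# (currently I2VGen-XL, with PIA support stubbed out below) around the standard
# processing flow and optionally encodes the resulting frames as GIF/PNG/MP4.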
MODELS = [
    { 'name': 'None', 'info': '' },
    # { 'name': 'PIA', 'url': 'openmmlab/PIA-condition-adapter', 'info': 'Open MMLab Personalized Image Animator' },
    { 'name': 'VGen', 'url': 'ali-vilab/i2vgen-xl', 'info': 'Alibaba VGen' },
]
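# Each entry's 'url' is the Hugging Face repository id passed to from_pretrained() in run().
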
class Script(scripts.Script):
    def title(self):
        return 'Image-to-Video'

    def show(self, is_img2img):
        return is_img2img if shared.backend == shared.Backend.DIFFUSERS else False
        # return False

    # return signature is array of gradio components
    def ui(self, _is_img2img):
        def video_change(video_type):
            return [
                gr.update(visible=video_type != 'None'),
                gr.update(visible=video_type == 'GIF' or video_type == 'PNG'),
                gr.update(visible=video_type == 'MP4'),
                gr.update(visible=video_type == 'MP4'),
            ]

        def model_change(model_name):
            model = next(m for m in MODELS if m['name'] == model_name)
            return gr.update(value=model['info']), gr.update(visible=model_name == 'PIA'), gr.update(visible=model_name == 'VGen')

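        # Layout: generic controls first, then model-specific accordions whose visibility
        # is toggled by model_change (FreeInit for PIA, decode/fps sliders for VGen),
        # and video-output controls toggled by video_change.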
        with gr.Row():
            model_name = gr.Dropdown(label='Model', value='None', choices=[m['name'] for m in MODELS])
        with gr.Row():
            model_info = gr.HTML()
        with gr.Row():
            num_frames = gr.Slider(label='Frames', minimum=0, maximum=50, step=1, value=16)
        with gr.Row():
            video_type = gr.Dropdown(label='Video file', choices=['None', 'GIF', 'PNG', 'MP4'], value='None')
            duration = gr.Slider(label='Duration', minimum=0.25, maximum=10, step=0.25, value=2, visible=False)
        with gr.Accordion('FreeInit', open=False, visible=False) as fi_accordion:
            with gr.Row():
                fi_method = gr.Dropdown(label='Method', choices=['none', 'butterworth', 'ideal', 'gaussian'], value='none')
            with gr.Row():
                # fi_fast = gr.Checkbox(label='Fast sampling', value=False)
                fi_iters = gr.Slider(label='Iterations', minimum=1, maximum=10, step=1, value=3)
                fi_order = gr.Slider(label='Order', minimum=1, maximum=10, step=1, value=4)
            with gr.Row():
                fi_spatial = gr.Slider(label='Spatial frequency', minimum=0.0, maximum=1.0, step=0.05, value=0.25)
                fi_temporal = gr.Slider(label='Temporal frequency', minimum=0.0, maximum=1.0, step=0.05, value=0.25)
        with gr.Accordion('VGen params', open=True, visible=False) as vgen_accordion:
            with gr.Row():
                vg_chunks = gr.Slider(label='Decode chunks', minimum=0.1, maximum=1.0, step=0.1, value=0.5)
                vg_fps = gr.Slider(label='Change rate', minimum=0.1, maximum=1.0, step=0.1, value=0.5)
        with gr.Row():
            gif_loop = gr.Checkbox(label='Loop', value=True, visible=False)
            mp4_pad = gr.Slider(label='Pad frames', minimum=0, maximum=24, step=1, value=1, visible=False)
            mp4_interpolate = gr.Slider(label='Interpolate frames', minimum=0, maximum=24, step=1, value=0, visible=False)
        model_name.change(fn=model_change, inputs=[model_name], outputs=[model_info, fi_accordion, vgen_accordion])
        video_type.change(fn=video_change, inputs=[video_type], outputs=[duration, gif_loop, mp4_pad, mp4_interpolate])
        return [model_name, num_frames, video_type, duration, gif_loop, mp4_pad, mp4_interpolate, fi_method, fi_iters, fi_order, fi_spatial, fi_temporal, vg_chunks, vg_fps]

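    # run() is invoked when this script is selected in the img2img scripts dropdown;
    # its extra arguments arrive in the same order as the components returned by ui().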
    def run(self, p: processing.StableDiffusionProcessing, model_name, num_frames, video_type, duration, gif_loop, mp4_pad, mp4_interpolate, fi_method, fi_iters, fi_order, fi_spatial, fi_temporal, vg_chunks, vg_fps): # pylint: disable=arguments-differ, unused-argument
        if model_name == 'None':
            return
        if p.init_images is None or len(p.init_images) == 0:
            return
        model = [m for m in MODELS if m['name'] == model_name][0]
        repo_id = model['url']
        shared.log.debug(f'Image2Video: model={model_name} frames={num_frames} video={video_type} duration={duration} loop={gif_loop} pad={mp4_pad} interpolate={mp4_interpolate}')
        p.ops.append('image2video')
        p.do_not_save_grid = True
        processed = None # guard against referencing an unset result when no branch runs
        orig_pipeline = shared.sd_model # remember the loaded pipeline so it can be restored afterwards
        # PIA: attach a MotionAdapter to the already-loaded SD15 pipeline
        if model_name == 'PIA':
            if shared.sd_model_type != 'sd':
                shared.log.error('Image2Video PIA: base model must be SD15')
                return
            shared.log.info(f'Image2Video PIA load: model={repo_id}')
            motion_adapter = diffusers.MotionAdapter.from_pretrained(repo_id)
            sd_models.move_model(motion_adapter, devices.device)
            shared.sd_model = sd_models.switch_pipe(diffusers.PIAPipeline, shared.sd_model, { 'motion_adapter': motion_adapter })
            sd_models.move_model(shared.sd_model, devices.device, force=True) # move pipeline to device
            if num_frames > 0:
                p.task_args['num_frames'] = num_frames
            p.task_args['image'] = p.init_images[0]
            if hasattr(shared.sd_model, 'enable_free_init') and fi_method != 'none':
                shared.sd_model.enable_free_init(
                    num_iters=fi_iters,
                    use_fast_sampling=False, # 'Fast sampling' checkbox is currently disabled in the UI
                    method=fi_method,
                    order=fi_order,
                    spatial_stop_frequency=fi_spatial,
                    temporal_stop_frequency=fi_temporal,
                )
            shared.log.debug(f'Image2Video PIA: args={p.task_args}')
            processed = processing.process_images(p)
            shared.sd_model.motion_adapter = None
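        # I2VGen-XL: unlike PIA, this is a standalone pipeline rather than an adapter on top of
        # the loaded checkpoint, so it is loaded once and reused on subsequent runs.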
        if model_name == 'VGen':
            if not isinstance(shared.sd_model, diffusers.I2VGenXLPipeline):
                shared.log.info(f'Image2Video VGen load: model={repo_id}')
                pipe = diffusers.I2VGenXLPipeline.from_pretrained(repo_id, torch_dtype=devices.dtype, cache_dir=shared.opts.diffusers_dir)
                sd_models.copy_diffuser_options(pipe, shared.sd_model)
                sd_models.set_diffuser_options(pipe)
                shared.sd_model = pipe
                sd_models.move_model(shared.sd_model, devices.device) # move pipeline to device
                shared.sd_model.to(dtype=torch.float32)
            if num_frames > 0:
                p.task_args['num_frames'] = num_frames
            p.task_args['image'] = p.init_images[0]
            p.task_args['target_fps'] = max(1, int(num_frames * vg_fps))
            p.task_args['decode_chunk_size'] = max(1, int(num_frames * vg_chunks))
            p.task_args['output_type'] = 'pil'
            shared.log.debug(f'Image2Video VGen: args={p.task_args}')
            processed = processing.process_images(p)
        shared.sd_model = orig_pipeline # restore the original pipeline
        if video_type != 'None' and processed is not None:
            images.save_video(p, filename=None, images=processed.images, video_type=video_type, duration=duration, loop=gif_loop, pad=mp4_pad, interpolate=mp4_interpolate)
        return processed