Spaces:

hilamanor
/

audioEditing

Running on Zero

App Files Files Community

audioEditing / inversion_utils.py

hilamanor

Stable Audio Open + progbars + mp3 + batched forward + cleanup

7c56def about 2 months ago

raw

history blame

11.2 kB

	import torch
	from tqdm import tqdm
	from typing import List, Optional, Tuple
	from models import PipelineWrapper
	import gradio as gr


	def inversion_forward_process(model: PipelineWrapper,
	x0: torch.Tensor,
	etas: Optional[float] = None,
	prompts: List[str] = [""],
	cfg_scales: List[float] = [3.5],
	num_inference_steps: int = 50,
	numerical_fix: bool = False,
	duration: Optional[float] = None,
	first_order: bool = False,
	save_compute: bool = True,
	progress=gr.Progress()) -> Tuple:
	if len(prompts) > 1 or prompts[0] != "":
	text_embeddings_hidden_states, text_embeddings_class_labels, \
	text_embeddings_boolean_prompt_mask = model.encode_text(prompts)

	# In the forward negative prompts are not supported currently (TODO)
	uncond_embeddings_hidden_states, uncond_embeddings_class_lables, uncond_boolean_prompt_mask = model.encode_text(
	[""], negative=True, save_compute=save_compute, cond_length=text_embeddings_class_labels.shape[1]
	if text_embeddings_class_labels is not None else None)
	else:
	uncond_embeddings_hidden_states, uncond_embeddings_class_lables, uncond_boolean_prompt_mask = model.encode_text(
	[""], negative=True, save_compute=False)

	timesteps = model.model.scheduler.timesteps.to(model.device)
	variance_noise_shape = model.get_noise_shape(x0, num_inference_steps)

	if type(etas) in [int, float]:
	etas = [etas]*model.model.scheduler.num_inference_steps
	xts = model.sample_xts_from_x0(x0, num_inference_steps=num_inference_steps)
	zs = torch.zeros(size=variance_noise_shape, device=model.device)
	extra_info = [None] * len(zs)

	if timesteps[0].dtype == torch.int64:
	t_to_idx = {int(v): k for k, v in enumerate(timesteps)}
	elif timesteps[0].dtype == torch.float32:
	t_to_idx = {float(v): k for k, v in enumerate(timesteps)}
	xt = x0
	op = tqdm(timesteps, desc="Inverting")
	model.setup_extra_inputs(xt, init_timestep=timesteps[0], audio_end_in_s=duration,
	save_compute=save_compute and prompts[0] != "")
	app_op = progress.tqdm(timesteps, desc="Inverting")
	for t, _ in zip(op, app_op):
	idx = num_inference_steps - t_to_idx[int(t) if timesteps[0].dtype == torch.int64 else float(t)] - 1

	# 1. predict noise residual
	xt = xts[idx+1][None]
	xt_inp = model.model.scheduler.scale_model_input(xt, t)

	with torch.no_grad():
	if save_compute and prompts[0] != "":
	comb_out, _, _ = model.unet_forward(
	xt_inp.expand(2, -1, -1, -1) if hasattr(model.model, 'unet') else xt_inp.expand(2, -1, -1),
	timestep=t,
	encoder_hidden_states=torch.cat([uncond_embeddings_hidden_states, text_embeddings_hidden_states
	], dim=0)
	if uncond_embeddings_hidden_states is not None else None,
	class_labels=torch.cat([uncond_embeddings_class_lables, text_embeddings_class_labels], dim=0)
	if uncond_embeddings_class_lables is not None else None,
	encoder_attention_mask=torch.cat([uncond_boolean_prompt_mask, text_embeddings_boolean_prompt_mask
	], dim=0)
	if uncond_boolean_prompt_mask is not None else None,
	)
	out, cond_out = comb_out.sample.chunk(2, dim=0)
	else:
	out = model.unet_forward(xt_inp, timestep=t,
	encoder_hidden_states=uncond_embeddings_hidden_states,
	class_labels=uncond_embeddings_class_lables,
	encoder_attention_mask=uncond_boolean_prompt_mask)[0].sample
	if len(prompts) > 1 or prompts[0] != "":
	cond_out = model.unet_forward(
	xt_inp,
	timestep=t,
	encoder_hidden_states=text_embeddings_hidden_states,
	class_labels=text_embeddings_class_labels,
	encoder_attention_mask=text_embeddings_boolean_prompt_mask)[0].sample

	if len(prompts) > 1 or prompts[0] != "":
	# # classifier free guidance
	noise_pred = out + (cfg_scales[0] * (cond_out - out)).sum(axis=0).unsqueeze(0)
	else:
	noise_pred = out

	# xtm1 = xts[idx+1][None]
	xtm1 = xts[idx][None]
	z, xtm1, extra = model.get_zs_from_xts(xt, xtm1, noise_pred, t,
	eta=etas[idx], numerical_fix=numerical_fix,
	first_order=first_order)
	zs[idx] = z
	# print(f"Fix Xt-1 distance - NORM:{torch.norm(xts[idx] - xtm1):.4g}, MSE:{((xts[idx] - xtm1)**2).mean():.4g}")
	xts[idx] = xtm1
	extra_info[idx] = extra

	if zs is not None:
	# zs[-1] = torch.zeros_like(zs[-1])
	zs[0] = torch.zeros_like(zs[0])
	# zs_cycle[0] = torch.zeros_like(zs[0])

	del app_op.iterables[0]
	return xt, zs, xts, extra_info


	def inversion_reverse_process(model: PipelineWrapper,
	xT: torch.Tensor,
	tstart: torch.Tensor,
	etas: float = 0,
	prompts: List[str] = [""],
	neg_prompts: List[str] = [""],
	cfg_scales: Optional[List[float]] = None,
	zs: Optional[List[torch.Tensor]] = None,
	duration: Optional[float] = None,
	first_order: bool = False,
	extra_info: Optional[List] = None,
	save_compute: bool = True,
	progress=gr.Progress()) -> Tuple[torch.Tensor, torch.Tensor]:

	text_embeddings_hidden_states, text_embeddings_class_labels, \
	text_embeddings_boolean_prompt_mask = model.encode_text(prompts)
	uncond_embeddings_hidden_states, uncond_embeddings_class_lables, \
	uncond_boolean_prompt_mask = model.encode_text(neg_prompts,
	negative=True,
	save_compute=save_compute,
	cond_length=text_embeddings_class_labels.shape[1]
	if text_embeddings_class_labels is not None else None)

	xt = xT[tstart.max()].unsqueeze(0)

	if etas is None:
	etas = 0
	if type(etas) in [int, float]:
	etas = [etas]*model.model.scheduler.num_inference_steps
	assert len(etas) == model.model.scheduler.num_inference_steps
	timesteps = model.model.scheduler.timesteps.to(model.device)

	op = tqdm(timesteps[-zs.shape[0]:], desc="Editing")
	if timesteps[0].dtype == torch.int64:
	t_to_idx = {int(v): k for k, v in enumerate(timesteps[-zs.shape[0]:])}
	elif timesteps[0].dtype == torch.float32:
	t_to_idx = {float(v): k for k, v in enumerate(timesteps[-zs.shape[0]:])}
	model.setup_extra_inputs(xt, extra_info=extra_info, init_timestep=timesteps[-zs.shape[0]],
	audio_end_in_s=duration, save_compute=save_compute)
	app_op = progress.tqdm(timesteps[-zs.shape[0]:], desc="Editing")
	for it, (t, _) in enumerate(zip(op, app_op)):
	idx = model.model.scheduler.num_inference_steps - t_to_idx[
	int(t) if timesteps[0].dtype == torch.int64 else float(t)] - \
	(model.model.scheduler.num_inference_steps - zs.shape[0] + 1)

	xt_inp = model.model.scheduler.scale_model_input(xt, t)

	# # Unconditional embedding
	with torch.no_grad():
	# print(f'xt_inp.shape: {xt_inp.shape}')
	# print(f't.shape: {t.shape}')
	# print(f'uncond_embeddings_hidden_states.shape: {uncond_embeddings_hidden_states.shape}')
	# print(f'uncond_embeddings_class_lables.shape: {uncond_embeddings_class_lables.shape}')
	# print(f'uncond_boolean_prompt_mask.shape: {uncond_boolean_prompt_mask.shape}')
	# print(f'text_embeddings_hidden_states.shape: {text_embeddings_hidden_states.shape}')
	# print(f'text_embeddings_class_labels.shape: {text_embeddings_class_labels.shape}')
	# print(f'text_embeddings_boolean_prompt_mask.shape: {text_embeddings_boolean_prompt_mask.shape}')

	if save_compute:
	comb_out, _, _ = model.unet_forward(
	xt_inp.expand(2, -1, -1, -1) if hasattr(model.model, 'unet') else xt_inp.expand(2, -1, -1),
	timestep=t,
	encoder_hidden_states=torch.cat([uncond_embeddings_hidden_states, text_embeddings_hidden_states
	], dim=0)
	if uncond_embeddings_hidden_states is not None else None,
	class_labels=torch.cat([uncond_embeddings_class_lables, text_embeddings_class_labels], dim=0)
	if uncond_embeddings_class_lables is not None else None,
	encoder_attention_mask=torch.cat([uncond_boolean_prompt_mask, text_embeddings_boolean_prompt_mask
	], dim=0)
	if uncond_boolean_prompt_mask is not None else None,
	)
	uncond_out, cond_out = comb_out.sample.chunk(2, dim=0)
	else:
	uncond_out = model.unet_forward(
	xt_inp, timestep=t,
	encoder_hidden_states=uncond_embeddings_hidden_states,
	class_labels=uncond_embeddings_class_lables,
	encoder_attention_mask=uncond_boolean_prompt_mask,
	)[0].sample

	# Conditional embedding
	cond_out = model.unet_forward(
	xt_inp,
	timestep=t,
	encoder_hidden_states=text_embeddings_hidden_states,
	class_labels=text_embeddings_class_labels,
	encoder_attention_mask=text_embeddings_boolean_prompt_mask,
	)[0].sample

	z = zs[idx] if zs is not None else None
	z = z.unsqueeze(0)
	# classifier free guidance
	noise_pred = uncond_out + (cfg_scales[0] * (cond_out - uncond_out)).sum(axis=0).unsqueeze(0)

	# 2. compute less noisy image and set x_t -> x_t-1
	xt = model.reverse_step_with_custom_noise(noise_pred, t, xt, variance_noise=z,
	eta=etas[idx], first_order=first_order)

	del app_op.iterables[0]
	return xt, zs