# lyrasd_model/lyrasd_txt2img_inpaint_pipeline.py
import inspect
import os
from typing import Any, Callable, Dict, List, Optional, Union

import torch
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import TextualInversionLoaderMixin
from diffusers.models.embeddings import ImageProjection
from diffusers.models.modeling_outputs import AutoencoderKLOutput
from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
from PIL import Image
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
)

from .lyrasd_pipeline_base import LyraSDXLPipelineBase
from .lyrasd_vae_model import LyraSdVaeModel
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
"""
Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
"""
std_text = noise_pred_text.std(
dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
# rescale the results from guidance (fixes overexposure)
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
noise_cfg = guidance_rescale * noise_pred_rescaled + \
(1 - guidance_rescale) * noise_cfg
return noise_cfg
def numpy_to_pil(images):
"""
Convert a numpy image or a batch of images to a PIL image.
"""
if images.ndim == 3:
images = images[None, ...]
images = (images * 255).round().astype("uint8")
if images.shape[-1] == 1:
# special case for grayscale (single channel) images
pil_images = [Image.fromarray(image.squeeze(), mode="L")
for image in images]
else:
pil_images = [Image.fromarray(image) for image in images]
return pil_images
def retrieve_timesteps(
scheduler,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
timesteps: Optional[List[int]] = None,
**kwargs,
):
"""
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
Args:
scheduler (`SchedulerMixin`):
The scheduler to get timesteps from.
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used,
`timesteps` must be `None`.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*):
Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
must be `None`.
Returns:
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
second element is the number of inference steps.
"""
if timesteps is not None:
accepts_timesteps = "timesteps" in set(
inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accepts_timesteps:
raise ValueError(
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
f" timestep schedules. Please check whether you are using the correct scheduler."
)
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
else:
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
timesteps = scheduler.timesteps
return timesteps, num_inference_steps
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
return encoder_output.latent_dist.mode()
elif hasattr(encoder_output, "latents"):
return encoder_output.latents
else:
raise AttributeError(
"Could not access latents of provided encoder_output")
class LyraSdTxt2ImgInpaintPipeline(LyraSDXLPipelineBase):
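    r"""
    Lyra-accelerated Stable Diffusion text-to-image inpainting pipeline.

    It drives a Lyra UNet (9 input channels by default: 4 latent channels plus a 1-channel
    mask and 4 masked-image latent channels) together with a Lyra VAE, and can optionally use
    an IP-Adapter for image-prompt conditioning. Model weights are expected to be loaded
    through the `LyraSDXLPipelineBase` base class.
    """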
def __init__(self, device=torch.device("cuda"), dtype=torch.float16, vae_scale_factor=8, vae_scaling_factor=0.18215, num_channels_unet=9, num_channels_latents=4) -> None:
super().__init__(device, dtype, num_channels_unet=num_channels_unet, num_channels_latents=num_channels_latents,
vae_scale_factor=vae_scale_factor, vae_scaling_factor=vae_scaling_factor)
def _encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt to be encoded
device: (`torch.device`):
torch device
num_images_per_prompt (`int`):
number of images that should be generated per prompt
do_classifier_free_guidance (`bool`):
whether to use classifier free guidance or not
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
"""
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = self.tokenizer(
prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
text_input_ids, untruncated_ids
):
removed_text = self.tokenizer.batch_decode(
untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
)
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
)
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask.to(device)
else:
attention_mask = None
prompt_embeds = self.text_encoder(
text_input_ids.to(device),
attention_mask=attention_mask,
)
prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.to(
dtype=self.text_encoder.dtype, device=device)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(
bs_embed * num_images_per_prompt, seq_len, -1)
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance and negative_prompt_embeds is None:
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
elif type(prompt) is not type(negative_prompt):
raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
)
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
" the batch size of `prompt`."
)
else:
uncond_tokens = negative_prompt
            # textual inversion: process multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
uncond_tokens = self.maybe_convert_prompt(
uncond_tokens, self.tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens,
padding="max_length",
max_length=max_length,
truncation=True,
return_tensors="pt",
)
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = uncond_input.attention_mask.to(device)
else:
attention_mask = None
negative_prompt_embeds = self.text_encoder(
uncond_input.input_ids.to(device),
attention_mask=attention_mask,
)
negative_prompt_embeds = negative_prompt_embeds[0]
if do_classifier_free_guidance:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
negative_prompt_embeds = negative_prompt_embeds.to(
dtype=self.text_encoder.dtype, device=device)
negative_prompt_embeds = negative_prompt_embeds.repeat(
1, num_images_per_prompt, 1)
negative_prompt_embeds = negative_prompt_embeds.view(
batch_size * num_images_per_prompt, seq_len, -1)
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
return prompt_embeds
def load_ip_adapter(self,
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
subfolder: str,
weight_name: str,
**kwargs
):
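        # Loads the CLIP image encoder, the IP-Adapter image projection and the per-layer
        # adapter weights. A minimal usage sketch (the paths and file names below are
        # hypothetical and depend on your checkpoint layout):
        #   pipe.load_ip_adapter("/path/to/ip_adapter_root", subfolder="models",
        #                        weight_name="ip-adapter_sd15.bin")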
        self.feature_extractor = CLIPImageProcessor()
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            pretrained_model_name_or_path_or_dict,
            subfolder=os.path.join(subfolder, "image_encoder"),
        ).to(self.device, dtype=self.dtype)
        # initialize the ImageProjection module from the IP-Adapter checkpoint
model_path = os.path.join(
pretrained_model_name_or_path_or_dict, subfolder, weight_name)
state_dict = torch.load(model_path, map_location="cpu")
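        # The IP-Adapter projection maps a CLIP image embedding to `num_image_text_embeds` (4)
        # extra cross-attention tokens, so proj.weight has shape
        # (4 * cross_attention_dim, clip_embeddings_dim).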
clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1]
cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4
self.encoder_hid_proj = ImageProjection(
cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4
)
image_proj_state_dict = {}
image_proj_state_dict.update(
{
"image_embeds.weight": state_dict["image_proj"]["proj.weight"],
"image_embeds.bias": state_dict["image_proj"]["proj.bias"],
"norm.weight": state_dict["image_proj"]["norm.weight"],
"norm.bias": state_dict["image_proj"]["norm.bias"],
}
)
self.encoder_hid_proj.load_state_dict(image_proj_state_dict)
self.encoder_hid_proj.to(dtype=self.dtype, device=self.device)
dir_ipadapter = os.path.join(
pretrained_model_name_or_path_or_dict, subfolder, '.'.join(weight_name.split(".")[:-1]))
self.unet.load_ip_adapter(dir_ipadapter, "", 1, "fp16")
def encode_image(self, image, device, num_images_per_prompt):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(
image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(
num_images_per_prompt, dim=0)
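        # a zero tensor serves as the unconditional image embedding for classifier-free guidance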
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
def decode_latents(self, latents):
latents = 1 / self.vae.scaling_factor * latents
image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
    def lyra_decode_latents(self, latents):
        latents = 1 / self.vae.scaling_factor * latents
        image = self.vae.decode(latents)
        image = image.permute(0, 2, 3, 1)
        image = (image / 2 + 0.5).clamp(0, 1)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        image = image.cpu().float().numpy()
        return image
def get_timesteps(self, num_inference_steps, strength, device):
# get the original timestep using init_timestep
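        # e.g. with num_inference_steps=50 and strength=0.6: init_timestep=30 and t_start=20,
        # so only the last 30 timesteps of the schedule are used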
init_timestep = min(
int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
return timesteps, num_inference_steps - t_start
def check_inputs(
self,
prompt,
height,
width,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
):
        if height % 64 != 0 or width % 64 != 0:  # the initial version only supports height and width divisible by 64
raise ValueError(
f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(
f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
f" {negative_prompt_embeds.shape}."
)
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
if isinstance(generator, list):
image_latents = [
retrieve_latents(AutoencoderKLOutput(
latent_dist=self.vae.encode(image[i: i + 1])), generator=generator[i])
for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(AutoencoderKLOutput(
latent_dist=self.vae.encode(image)), generator=generator)
image_latents = self.vae_scaling_factor * image_latents
return image_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None,
image=None, timestep=None, is_strength_max=True, return_noise=False, return_image_latents=False):
shape = (batch_size, num_channels_latents, height //
self.vae_scale_factor, width // self.vae_scale_factor)
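        # e.g. a 512x512 image with vae_scale_factor=8 gives latents of shape (batch, 4, 64, 64)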
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if (image is None or timestep is None) and not is_strength_max:
raise ValueError(
"Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
"However, either the image or the noise timestep has not been provided."
)
if return_image_latents or (latents is None and not is_strength_max):
image = image.to(device=device, dtype=dtype)
if image.shape[1] == 4:
image_latents = image
else:
image_latents = self._encode_vae_image(
image=image, generator=generator)
image_latents = image_latents.repeat(
batch_size // image_latents.shape[0], 1, 1, 1)
if latents is None:
noise = randn_tensor(shape, generator=generator,
device=device, dtype=dtype)
# if strength is 1. then initialise the latents to noise, else initial to image + noise
latents = noise if is_strength_max else self.scheduler.add_noise(
image_latents, noise, timestep)
# if pure noise then scale the initial latents by the Scheduler's init sigma
latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
else:
noise = latents.to(device)
latents = noise * self.scheduler.init_noise_sigma
outputs = (latents,)
if return_noise:
outputs += (noise,)
if return_image_latents:
outputs += (image_latents,)
return outputs
def prepare_mask_latents(
self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
):
# resize the mask to latents shape as we concatenate the mask to the latents
# we do that before converting to dtype to avoid breaking in case we're using cpu_offload
# and half precision
mask = torch.nn.functional.interpolate(
mask, size=(height // self.vae_scale_factor,
width // self.vae_scale_factor)
)
mask = mask.to(device=device, dtype=dtype)
masked_image = masked_image.to(device=device, dtype=dtype)
if masked_image.shape[1] == 4:
masked_image_latents = masked_image
else:
masked_image_latents = self._encode_vae_image(
masked_image, generator=generator)
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
if mask.shape[0] < batch_size:
if not batch_size % mask.shape[0] == 0:
raise ValueError(
"The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
" of masks that you pass is divisible by the total requested batch size."
)
mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
if masked_image_latents.shape[0] < batch_size:
if not batch_size % masked_image_latents.shape[0] == 0:
raise ValueError(
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
masked_image_latents = masked_image_latents.repeat(
batch_size // masked_image_latents.shape[0], 1, 1, 1)
mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
masked_image_latents = (
torch.cat([masked_image_latents] *
2) if do_classifier_free_guidance else masked_image_latents
)
# aligning device to prevent device errors when concating it with the latent model input
masked_image_latents = masked_image_latents.to(
device=device, dtype=dtype)
return mask, masked_image_latents
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
accepts_eta = "eta" in set(inspect.signature(
self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
accepts_generator = "generator" in set(
inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
@torch.no_grad()
def __call__(
self,
prompt: Union[str, List[str]] = None,
image: PipelineImageInput = None,
mask_image: PipelineImageInput = None,
masked_image_latents: torch.FloatTensor = None,
height: Optional[int] = None,
width: Optional[int] = None,
strength: float = 1.0,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator,
List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
        param_scale_dict: Optional[dict] = None
):
r"""
Function invoked when calling the pipeline for generation.
Args:
prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
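            image (`PipelineImageInput`):
                The starting image to be inpainted; regions outside the mask are preserved.
            mask_image (`PipelineImageInput`):
                The mask marking the region to repaint. White (values >= 0.5) pixels are repainted and black
                pixels are kept from `image`.
            masked_image_latents (`torch.FloatTensor`, *optional*):
                Pre-computed latents of the masked image. If not provided, they are computed from `image` and
                `mask_image` with the VAE encoder.
            strength (`float`, *optional*, defaults to 1.0):
                How much noise to add to `image`. At 1.0 the initial latents are pure noise and the full
                schedule is used; lower values keep more of the original image and skip the early timesteps.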
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random `generator`.
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
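            ip_adapter_image (`PipelineImageInput`, *optional*):
                An image prompt for the IP-Adapter. Requires `load_ip_adapter` to have been called first so
                that the image encoder and projection weights are available.
            param_scale_dict (`dict`, *optional*):
                Extra scale factors forwarded to the Lyra UNet (for example adapter scales); the exact keys
                depend on the Lyra UNet implementation.
        Returns:
            `List[PIL.Image.Image]`: The generated (inpainted) images.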
"""
        # 0. Default height and width to unet
        height = height or self.unet_config_sample_size * self.vae_scale_factor
        width = width or self.unet_config_sample_size * self.vae_scale_factor
        if param_scale_dict is None:
            param_scale_dict = {}
# 1. Check inputs. Raise error if not correct
# self.check_inputs(
# prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
# )
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
device = self.device
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input prompt
prompt_embeds = self._encode_prompt(
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
)
# 3.5 Encode ipadapter_image
if ip_adapter_image is not None:
image_embeds, negative_image_embeds = self.encode_image(
ip_adapter_image, device, num_images_per_prompt)
if do_classifier_free_guidance:
image_embeds = torch.cat([negative_image_embeds, image_embeds])
image_embeds = self.encoder_hid_proj(image_embeds).to(self.dtype)
        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device)
        timesteps, num_inference_steps = self.get_timesteps(
            num_inference_steps=num_inference_steps, strength=strength, device=device
        )
# check that number of inference steps is not < 1 - as this doesn't make sense
if num_inference_steps < 1:
raise ValueError(
                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline "
f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
)
# at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
latent_timestep = timesteps[:1].repeat(
batch_size * num_images_per_prompt)
# create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
is_strength_max = strength == 1.0
# 5. Preprocess mask and image
init_image = self.image_processor.preprocess(
image, height=height, width=width)
init_image = init_image.to(dtype=torch.float32)
# 5. Prepare latent variables
return_image_latents = self.num_channels_unet == 4
latents_outputs = self.prepare_latents(
batch_size * num_images_per_prompt,
self.num_channels_latents,
height,
width,
prompt_embeds.dtype,
device,
generator,
latents,
image=init_image,
timestep=latent_timestep,
is_strength_max=is_strength_max,
return_noise=True,
return_image_latents=return_image_latents
)
if return_image_latents:
latents, noise, image_latents = latents_outputs
else:
latents, noise = latents_outputs
# 5.5 Prepare mask latent variables
mask_condition = self.mask_processor.preprocess(
mask_image, height=height, width=width)
if masked_image_latents is None:
masked_image = init_image * (mask_condition < 0.5)
else:
masked_image = masked_image_latents
mask, masked_image_latents = self.prepare_mask_latents(
mask_condition,
masked_image,
batch_size * num_images_per_prompt,
height,
width,
prompt_embeds.dtype,
device,
generator,
do_classifier_free_guidance,
)
# Check that sizes of mask, masked image and latents match
if self.num_channels_unet == 9:
# default case for runwayml/stable-diffusion-inpainting
num_channels_mask = mask.shape[1]
num_channels_masked_image = masked_image_latents.shape[1]
if self.num_channels_latents + num_channels_mask + num_channels_masked_image != self.num_channels_unet:
raise ValueError(
                    f"Incorrect configuration settings! `pipeline.unet` expects"
                    f" {self.num_channels_unet} input channels but received `num_channels_latents`: {self.num_channels_latents} +"
                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                    f" = {self.num_channels_latents + num_channels_mask + num_channels_masked_image}. Please verify the config of"
" `pipeline.unet` or your `mask_image` or `image` input."
)
elif self.num_channels_unet != 4:
raise ValueError(
                f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.num_channels_unet}."
)
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
num_warmup_steps = len(timesteps) - \
num_inference_steps * self.scheduler.order
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
latent_model_input = torch.cat(
[latents] * 2) if do_classifier_free_guidance else latents
latent_model_input = self.scheduler.scale_model_input(
latent_model_input, t)
if self.num_channels_unet == 9:
latent_model_input = torch.cat(
[latent_model_input, mask, masked_image_latents], dim=1)
latent_model_input = latent_model_input.permute(
0, 2, 3, 1).contiguous()
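                # the Lyra UNet appears to take channels-last (NHWC) input, hence the permute here;
                # the prediction is permuted back to NCHW right after the forward call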
            # the trailing None arguments in the UNet call below are ControlNet parameters;
            # they are passed as None placeholders for now
if ip_adapter_image is not None:
noise_pred = self.unet.forward(
latent_model_input, prompt_embeds, t, None, None, None, None, {"ip_hidden_states": image_embeds}, param_scale_dict)
else:
noise_pred = self.unet.forward(
latent_model_input, prompt_embeds, t)
noise_pred = noise_pred.permute(0, 3, 1, 2).contiguous()
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * \
(noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(
noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
if self.num_channels_unet == 4:
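                # for a plain 4-channel (text-to-image) UNet, inpainting is emulated by re-noising
                # the known region of the original image latents to the next timestep and blending
                # it back in with the mask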
init_latents_proper = image_latents
                if do_classifier_free_guidance:
init_mask, _ = mask.chunk(2)
else:
init_mask = mask
if i < len(timesteps) - 1:
noise_timestep = timesteps[i + 1]
init_latents_proper = self.scheduler.add_noise(
init_latents_proper, noise, torch.tensor(
[noise_timestep])
)
latents = (1 - init_mask) * init_latents_proper + \
init_mask * latents
# if do_classifier_free_guidance and guidance_rescale > 0.0:
# # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
# noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
# # compute the previous noisy sample x_t -> x_t-1
# latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# image = self.decode_latents(latents)
image = self.lyra_decode_latents(latents)
image = numpy_to_pil(image)
return image
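

# A minimal usage sketch (not part of the original file). It assumes the pipeline weights
# have already been loaded through the LyraSDXLPipelineBase helpers (the exact loading call
# lives in lyrasd_pipeline_base and is omitted here); file names below are hypothetical.
#
#     from PIL import Image
#
#     pipe = LyraSdTxt2ImgInpaintPipeline()
#     # ... load model weights via the base-class loader ...
#     init_image = Image.open("dog.png").convert("RGB")
#     mask = Image.open("dog_mask.png").convert("L")  # white = region to repaint
#     images = pipe(
#         prompt="a corgi wearing sunglasses",
#         image=init_image,
#         mask_image=mask,
#         height=512,
#         width=512,
#         num_inference_steps=30,
#         guidance_scale=7.5,
#         strength=1.0,
#     )
#     images[0].save("inpainted.png")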