from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.utils import logging

logger = logging.get_logger(__name__)

class LatentConsistencyModelPipeline(DiffusionPipeline):
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: "LCMScheduler",
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        # expose requires_safety_checker on the config, mirroring the standard Stable Diffusion pipelines
        self.register_to_config(requires_safety_checker=requires_safety_checker)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def _encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        prompt_embeds: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If
                not provided, text embeddings will be generated from the `prompt` input argument.
        """
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            prompt_embeds = self.text_encoder(
                text_input_ids.to(device),
                attention_mask=attention_mask,
            )
            prompt_embeds = prompt_embeds[0]

        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        # duplicate text embeddings for each generation per prompt
        bs_embed, seq_len, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        return prompt_embeds

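    # Illustrative sketch (not part of the original pipeline): the `prompt_embeds`
    # argument documented above lets callers bypass tokenization, e.g. for prompt
    # weighting. Assuming `pipe` is an instance of this pipeline and `device` is its
    # execution device:
    #
    #     embeds = pipe._encode_prompt("an astronaut riding a horse", device, 1)
    #     images = pipe(prompt=None, prompt_embeds=embeds, num_inference_steps=4).images
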
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
            has_nsfw_concept = None
        else:
            if torch.is_tensor(image):
                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
            else:
                feature_extractor_input = self.image_processor.numpy_to_pil(image)
            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
            )
        return image, has_nsfw_concept

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if latents is None:
            latents = torch.randn(shape, dtype=dtype).to(device)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
            w: torch.Tensor: guidance scale values to embed, one per sample in the batch
            embedding_dim: int: dimension of the embeddings to generate
            dtype: data type of the generated embeddings

        Returns:
            embedding vectors with shape `(len(w), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0

        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero-pad the last channel for odd embedding dimensions
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

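    # Shape sketch (illustrative, values assumed): `get_w_embedding(torch.tensor([8.0]),
    # embedding_dim=256)` returns a (1, 256) tensor of sin/cos features of 1000 * w at
    # log-spaced frequencies, which `__call__` below passes to the UNet as `timestep_cond`;
    # the hard-coded 256 must match the `time_cond_proj_dim` the LCM UNet is expected to
    # be configured with.
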
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = 768,
        width: Optional[int] = 768,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        latents: Optional[torch.FloatTensor] = None,
        num_inference_steps: int = 4,
        lcm_origin_steps: int = 50,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # 0. Default height and width to the UNet's sample size
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 2. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            prompt_embeds=prompt_embeds,
        )

        # 3. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, lcm_origin_steps)
        timesteps = self.scheduler.timesteps

        # 4. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            latents,
        )

        bs = batch_size * num_images_per_prompt

        # 5. Get the guidance-scale embedding (LCM conditions the UNet on w instead of
        # running classifier-free guidance); cast to the latents dtype so half-precision
        # inference does not break
        w = torch.tensor(guidance_scale).repeat(bs)
        w_embedding = self.get_w_embedding(w, embedding_dim=256).to(device=device, dtype=latents.dtype)

        # 6. LCM multi-step sampling loop
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                ts = torch.full((bs,), t, device=device, dtype=torch.long)

                # UNet forward pass conditioned on the guidance embedding
                model_pred = self.unet(
                    latents,
                    ts,
                    timestep_cond=w_embedding,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]

                # compute the previous noisy sample and the denoised estimate
                latents, denoised = self.scheduler.step(model_pred, i, t, latents, return_dict=False)

                progress_bar.update()

        # 7. Decode latents, run the safety checker, and postprocess
        if not output_type == "latent":
            image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
        else:
            image = denoised
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
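
# Usage sketch (illustrative, not part of the original file): the checkpoint name and the
# `custom_pipeline` identifier below are assumptions about how this module is typically
# loaded as a diffusers community pipeline.
#
#     import torch
#     from diffusers import DiffusionPipeline
#
#     pipe = DiffusionPipeline.from_pretrained(
#         "SimianLuo/LCM_Dreamshaper_v7",
#         custom_pipeline="latent_consistency_txt2img",
#     )
#     pipe.to("cuda")
#
#     images = pipe(
#         prompt="Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
#         num_inference_steps=4,
#         guidance_scale=8.0,
#     ).images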