Spaces:

declare-lab
/

tango2

Running on Zero

tango2 / diffusers /examples /community /magic_mix.py

hungchiayu1

initial commit

ffead1e 8 months ago

4.86 kB

	from typing import Union

	import torch
	from PIL import Image
	from torchvision import transforms as tfms
	from tqdm.auto import tqdm
	from transformers import CLIPTextModel, CLIPTokenizer

	from diffusers import (
	AutoencoderKL,
	DDIMScheduler,
	DiffusionPipeline,
	LMSDiscreteScheduler,
	PNDMScheduler,
	UNet2DConditionModel,
	)


	class MagicMixPipeline(DiffusionPipeline):
	def __init__(
	self,
	vae: AutoencoderKL,
	text_encoder: CLIPTextModel,
	tokenizer: CLIPTokenizer,
	unet: UNet2DConditionModel,
	scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler],
	):
	super().__init__()

	self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)

	# convert PIL image to latents
	def encode(self, img):
	with torch.no_grad():
	latent = self.vae.encode(tfms.ToTensor()(img).unsqueeze(0).to(self.device) * 2 - 1)
	latent = 0.18215 * latent.latent_dist.sample()
	return latent

	# convert latents to PIL image
	def decode(self, latent):
	latent = (1 / 0.18215) * latent
	with torch.no_grad():
	img = self.vae.decode(latent).sample
	img = (img / 2 + 0.5).clamp(0, 1)
	img = img.detach().cpu().permute(0, 2, 3, 1).numpy()
	img = (img * 255).round().astype("uint8")
	return Image.fromarray(img[0])

	# convert prompt into text embeddings, also unconditional embeddings
	def prep_text(self, prompt):
	text_input = self.tokenizer(
	prompt,
	padding="max_length",
	max_length=self.tokenizer.model_max_length,
	truncation=True,
	return_tensors="pt",
	)

	text_embedding = self.text_encoder(text_input.input_ids.to(self.device))[0]

	uncond_input = self.tokenizer(
	"",
	padding="max_length",
	max_length=self.tokenizer.model_max_length,
	truncation=True,
	return_tensors="pt",
	)

	uncond_embedding = self.text_encoder(uncond_input.input_ids.to(self.device))[0]

	return torch.cat([uncond_embedding, text_embedding])

	def __call__(
	self,
	img: Image.Image,
	prompt: str,
	kmin: float = 0.3,
	kmax: float = 0.6,
	mix_factor: float = 0.5,
	seed: int = 42,
	steps: int = 50,
	guidance_scale: float = 7.5,
	) -> Image.Image:
	tmin = steps - int(kmin * steps)
	tmax = steps - int(kmax * steps)

	text_embeddings = self.prep_text(prompt)

	self.scheduler.set_timesteps(steps)

	width, height = img.size
	encoded = self.encode(img)

	torch.manual_seed(seed)
	noise = torch.randn(
	(1, self.unet.in_channels, height // 8, width // 8),
	).to(self.device)

	latents = self.scheduler.add_noise(
	encoded,
	noise,
	timesteps=self.scheduler.timesteps[tmax],
	)

	input = torch.cat([latents] * 2)

	input = self.scheduler.scale_model_input(input, self.scheduler.timesteps[tmax])

	with torch.no_grad():
	pred = self.unet(
	input,
	self.scheduler.timesteps[tmax],
	encoder_hidden_states=text_embeddings,
	).sample

	pred_uncond, pred_text = pred.chunk(2)
	pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)

	latents = self.scheduler.step(pred, self.scheduler.timesteps[tmax], latents).prev_sample

	for i, t in enumerate(tqdm(self.scheduler.timesteps)):
	if i > tmax:
	if i < tmin: # layout generation phase
	orig_latents = self.scheduler.add_noise(
	encoded,
	noise,
	timesteps=t,
	)

	input = (mix_factor * latents) + (
	1 - mix_factor
	) * orig_latents # interpolating between layout noise and conditionally generated noise to preserve layout sematics
	input = torch.cat([input] * 2)

	else: # content generation phase
	input = torch.cat([latents] * 2)

	input = self.scheduler.scale_model_input(input, t)

	with torch.no_grad():
	pred = self.unet(
	input,
	t,
	encoder_hidden_states=text_embeddings,
	).sample

	pred_uncond, pred_text = pred.chunk(2)
	pred = pred_uncond + guidance_scale * (pred_text - pred_uncond)

	latents = self.scheduler.step(pred, t, latents).prev_sample

	return self.decode(latents)