Spaces:

smartfeed
/

image2image

Running on Zero

image2image / app_ddim.py

zhiweili

change base model

e2d5eb3 about 2 months ago

9.92 kB

	import spaces
	import gradio as gr
	import time
	import torch
	import numpy as np

	from tqdm.auto import tqdm
	from torchvision import transforms as tfms
	from PIL import Image
	from segment_utils import(
	segment_image,
	restore_result,
	)
	from diffusers import (
	StableDiffusionPipeline,
	DDIMScheduler,
	)

	# Checkpoint loaded into the Stable Diffusion pipeline below.
	# Previous choice, kept for reference:
	# BASE_MODEL = "stable-diffusion-v1-5/stable-diffusion-v1-5"
	BASE_MODEL = "SG161222/Realistic_Vision_V5.1_noVAE"

	# Use the GPU when torch reports one, otherwise fall back to the CPU.
	DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

	# Default prompts pre-filled in the UI text boxes.
	DEFAULT_INPUT_PROMPT = "a woman"
	DEFAULT_EDIT_PROMPT = "a woman with linen-blonde-hair"

	# Default segmentation category handed to segment_image / restore_result.
	DEFAULT_CATEGORY = "hair"

	# Load the pipeline once at import time, with float16 weights.
	basepipeline = StableDiffusionPipeline.from_pretrained(
	BASE_MODEL,
	torch_dtype=torch.float16,
	use_safetensors=True,
	)

	# Swap in a DDIM scheduler built from the checkpoint's scheduler config;
	# invert() and sample() read its timesteps and alphas_cumprod directly.
	basepipeline.scheduler = DDIMScheduler.from_config(basepipeline.scheduler.config)

	basepipeline = basepipeline.to(DEVICE)

	# NOTE(review): the pipeline is first moved to DEVICE and then model CPU
	# offload is enabled; presumably both are intended for the hosting
	# environment -- confirm that the explicit .to(DEVICE) is still wanted.
	basepipeline.enable_model_cpu_offload()

	@spaces.GPU(duration=30)
	def image_to_image(
	    input_image: Image.Image,
	    input_image_prompt: str,
	    edit_prompt: str,
	    num_steps: int,
	    start_step: int,
	    guidance_scale: float,
	):
	    """Edit an image by DDIM-inverting it and re-sampling with a new prompt.

	    The image is encoded with the pipeline VAE, inverted with ``invert`` under
	    ``input_image_prompt``, and then denoised with ``sample`` under
	    ``edit_prompt``, starting from the inverted latent selected by
	    ``start_step``.

	    Args:
	        input_image: Image to edit; converted with torchvision's ``to_tensor``.
	        input_image_prompt: Prompt describing the input image (used by ``invert``).
	        edit_prompt: Prompt describing the desired result (used by ``sample``).
	        num_steps: Number of DDIM steps for both inversion and sampling.
	            Must be at least 3.
	        start_step: Sampling step to start from. Must lie in
	            ``[0, num_steps - 3]`` because ``invert`` records
	            ``num_steps - 2`` latents.
	        guidance_scale: Guidance scale forwarded to ``sample`` only;
	            ``invert`` runs with its own default.

	    Returns:
	        A tuple ``(generated_image, time_cost_str)``: the first image returned
	        by ``sample`` and the timing string built by ``get_time_cost``.

	    Raises:
	        gr.Error: If ``num_steps`` or ``start_step`` is outside the range
	            described above.
	    """
	    # The values come from UI sliders and are used for indexing and range(),
	    # so coerce them to int defensively.
	    num_steps = int(num_steps)
	    start_step = int(start_step)

	    # invert() skips its first and last timestep, so it yields num_steps - 2
	    # latents; the start latent is read at index -(start_step + 1). Reject
	    # combinations that would otherwise fail with a bare IndexError, or with
	    # a RuntimeError from concatenating an empty list.
	    if num_steps < 3:
	        raise gr.Error(f"Num Steps must be at least 3 (got {num_steps}).")
	    max_start_step = num_steps - 3
	    if not 0 <= start_step <= max_start_step:
	        raise gr.Error(
	            f"Start Step must be between 0 and {max_start_step} "
	            f"when Num Steps is {num_steps} (got {start_step})."
	        )

	    # First call (run_task_time == 0) only starts the timer.
	    run_task_time, time_cost_str = get_time_cost(0, '')

	    with torch.no_grad():
	        image_tensor = tfms.functional.to_tensor(input_image).unsqueeze(0).to(DEVICE)
	        image_tensor = image_tensor.to(dtype=torch.float16)
	        # Rescale with x * 2 - 1 before encoding
	        # (assumes to_tensor yields values in [0, 1] -- TODO confirm).
	        encoded = basepipeline.vae.encode(image_tensor * 2 - 1)
	        # NOTE(review): 0.18215 is presumably the VAE latent scaling factor;
	        # verify it matches basepipeline.vae.config for this checkpoint.
	        init_latents = 0.18215 * encoded.latent_dist.sample()
	        inverted_latents = invert(init_latents, input_image_prompt, num_inference_steps=num_steps)
	        generated_image = sample(
	            edit_prompt,
	            # Counting from the end of the inversion trajectory; [None]
	            # restores the leading batch dimension dropped by the indexing.
	            start_latents=inverted_latents[-(start_step + 1)][None],
	            start_step=start_step,
	            num_inference_steps=num_steps,
	            guidance_scale=guidance_scale,
	        )[0]

	    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)

	    return generated_image, time_cost_str

	def make_inpaint_condition(image, image_mask):
	    """Build an inpainting conditioning tensor from an image and a mask.

	    Args:
	        image: Object exposing ``convert("RGB")`` whose result converts to an
	            ``(H, W, 3)`` array (e.g. a PIL image).
	        image_mask: Object exposing ``convert("L")`` whose result converts to
	            an ``(H, W)`` array; values above half of 255 mark masked pixels.

	    Returns:
	        A float32 tensor of shape ``(1, 3, H, W)`` holding the image scaled by
	        ``1 / 255``, with every masked pixel set to ``-1.0`` in all channels.

	    Raises:
	        ValueError: If the image and the mask differ in height or width.
	    """
	    image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
	    image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

	    # Compare both spatial axes (height and width); checking only the first
	    # axis would let a width mismatch through to the boolean indexing below.
	    if image.shape[:2] != image_mask.shape[:2]:
	        raise ValueError("image and image_mask must have the same image size")

	    image[image_mask > 0.5] = -1.0  # set as masked pixel
	    # HWC -> NCHW with a leading batch axis of size 1.
	    image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
	    return torch.from_numpy(image)

	## Inversion
	@torch.no_grad()
	def invert(
	    start_latents,
	    prompt,
	    guidance_scale=3.5,
	    num_inference_steps=80,
	    num_images_per_prompt=1,
	    do_classifier_free_guidance=True,
	    negative_prompt="",
	    device=DEVICE,
	):
	    """Run the DDIM update backwards, from clean latents towards noise.

	    Starting from ``start_latents``, each processed step predicts the noise
	    with the pipeline UNet (optionally with classifier-free guidance) and
	    applies the re-arranged DDIM update that maps the current latents to the
	    next, noisier timestep. The scheduler timesteps are walked in reversed
	    order; index 0 and the final index are not processed.

	    Args:
	        start_latents: Latents to invert; cloned, not modified.
	        prompt: Prompt used to condition the noise prediction.
	        guidance_scale: Classifier-free guidance weight.
	        num_inference_steps: Number of scheduler timesteps to configure.
	        num_images_per_prompt: Forwarded to the prompt encoder.
	        do_classifier_free_guidance: Whether to run the UNet on a doubled
	            batch and combine the two predictions.
	        negative_prompt: Forwarded to the prompt encoder.
	        device: Device used for prompt encoding and the scheduler timesteps.

	    Returns:
	        The latents recorded after every processed step, concatenated with
	        ``torch.cat`` in the order they were produced.
	    """
	    # NOTE(review): _encode_prompt is a private pipeline helper -- check that
	    # it is still available in the pinned diffusers version.
	    prompt_embeds = basepipeline._encode_prompt(
	        prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
	    )

	    current = start_latents.clone()
	    trajectory = []

	    scheduler = basepipeline.scheduler
	    scheduler.set_timesteps(num_inference_steps, device=device)

	    # Walk the scheduler's timesteps in the opposite order.
	    flipped_timesteps = reversed(scheduler.timesteps)

	    for i in tqdm(range(1, num_inference_steps), total=num_inference_steps - 1):
	        # The final index is still iterated (so the progress bar completes)
	        # but no update is performed for it.
	        if i + 1 >= num_inference_steps:
	            continue

	        t = flipped_timesteps[i]

	        # Duplicate the batch so one UNet call yields both the unconditional
	        # and the text-conditioned prediction.
	        if do_classifier_free_guidance:
	            unet_input = torch.cat([current] * 2)
	        else:
	            unet_input = current
	        unet_input = scheduler.scale_model_input(unet_input, t)

	        eps = basepipeline.unet(unet_input, t, encoder_hidden_states=prompt_embeds).sample

	        if do_classifier_free_guidance:
	            eps_uncond, eps_text = eps.chunk(2)
	            eps = eps_uncond + guidance_scale * (eps_text - eps_uncond)

	        # The latents are treated as living one stride below t (clamped at 0)
	        # and are moved up to t by this step.
	        current_t = max(0, t.item() - (1000 // num_inference_steps))
	        alpha_t = scheduler.alphas_cumprod[current_t]
	        alpha_t_next = scheduler.alphas_cumprod[t]

	        # DDIM update solved for the noisier latents: strip the predicted
	        # noise at the current level, rescale, then add it at the next level.
	        rescale = alpha_t_next.sqrt() / alpha_t.sqrt()
	        signal_term = (current - (1 - alpha_t).sqrt() * eps) * rescale
	        noise_term = (1 - alpha_t_next).sqrt() * eps
	        current = signal_term + noise_term

	        trajectory.append(current)

	    return torch.cat(trajectory)

	# Sample function (regular DDIM)
	@torch.no_grad()
	def sample(
	    prompt,
	    start_step=0,
	    start_latents=None,
	    guidance_scale=3.5,
	    num_inference_steps=30,
	    num_images_per_prompt=1,
	    do_classifier_free_guidance=True,
	    negative_prompt="",
	    device=DEVICE,
	):
	    """Denoise latents with a hand-written DDIM loop and decode them to images.

	    Args:
	        prompt: Prompt used to condition the noise prediction.
	        start_step: Index into the scheduler timesteps at which to begin;
	            earlier steps are skipped.
	        start_latents: Latents to start from. When ``None``, random latents of
	            shape ``(1, 4, 64, 64)`` are drawn and scaled by the scheduler's
	            ``init_noise_sigma``.
	        guidance_scale: Classifier-free guidance weight.
	        num_inference_steps: Number of scheduler timesteps to configure.
	        num_images_per_prompt: Forwarded to the prompt encoder.
	        do_classifier_free_guidance: Whether to run the UNet on a doubled
	            batch and combine the two predictions.
	        negative_prompt: Forwarded to the prompt encoder.
	        device: Device used for prompt encoding, the scheduler timesteps and
	            freshly drawn latents.

	    Returns:
	        The decoded images as returned by the pipeline's ``numpy_to_pil``.
	    """
	    # NOTE(review): _encode_prompt and decode_latents are legacy pipeline
	    # helpers -- check that they still exist in the pinned diffusers version.
	    text_embeddings = basepipeline._encode_prompt(
	        prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
	    )

	    # Set num inference steps
	    basepipeline.scheduler.set_timesteps(num_inference_steps, device=device)

	    # Create a random starting point if we don't have one already
	    if start_latents is None:
	        # Draw the noise in the UNet's dtype: torch.randn defaults to float32,
	        # which a UNet holding float16 weights cannot consume.
	        start_latents = torch.randn(
	            1, 4, 64, 64, device=device, dtype=basepipeline.unet.dtype
	        )
	        start_latents *= basepipeline.scheduler.init_noise_sigma

	    latents = start_latents.clone()

	    for i in tqdm(range(start_step, num_inference_steps)):

	        t = basepipeline.scheduler.timesteps[i]

	        # Expand the latents if we are doing classifier free guidance
	        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
	        latent_model_input = basepipeline.scheduler.scale_model_input(latent_model_input, t)

	        # Predict the noise residual
	        noise_pred = basepipeline.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

	        # Perform guidance
	        if do_classifier_free_guidance:
	            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

	        # The update is written out by hand instead of calling
	        # scheduler.step(noise_pred, t, latents).prev_sample.
	        prev_t = max(1, t.item() - (1000 // num_inference_steps))  # t-1, clamped at 1
	        alpha_t = basepipeline.scheduler.alphas_cumprod[t.item()]
	        alpha_t_prev = basepipeline.scheduler.alphas_cumprod[prev_t]
	        predicted_x0 = (latents - (1 - alpha_t).sqrt() * noise_pred) / alpha_t.sqrt()
	        direction_pointing_to_xt = (1 - alpha_t_prev).sqrt() * noise_pred
	        latents = alpha_t_prev.sqrt() * predicted_x0 + direction_pointing_to_xt

	    # Post-processing
	    images = basepipeline.decode_latents(latents)
	    images = basepipeline.numpy_to_pil(images)

	    return images

	def get_time_cost(run_task_time, time_cost_str):
	    """Advance a millisecond stopwatch and extend its textual log.

	    Args:
	        run_task_time: Timestamp in milliseconds returned by the previous
	            call, or ``0`` to start a new measurement.
	        time_cost_str: Log built so far.

	    Returns:
	        A tuple ``(now_ms, log)``. When ``run_task_time`` is ``0`` the log is
	        reset to ``'start'``; otherwise the milliseconds elapsed since
	        ``run_task_time`` are appended, preceded by ``'-->'`` if the log was
	        not empty.
	    """
	    now_ms = int(time.time() * 1000)

	    # A zero timestamp means "no measurement running yet".
	    if run_task_time == 0:
	        return now_ms, 'start'

	    separator = '' if time_cost_str == '' else '-->'
	    elapsed_ms = now_ms - run_task_time
	    return now_ms, time_cost_str + separator + f'{elapsed_ms}'

	def create_demo() -> gr.Blocks:
	    """Build the Gradio UI for the segment -> edit -> restore workflow.

	    Clicking the button runs ``segment_image`` on the uploaded image, then,
	    on success, ``image_to_image`` on the segmented area, then, on success,
	    ``restore_result`` to produce the restored image.

	    Returns:
	        The assembled ``gr.Blocks`` app (not launched).
	    """
	    # NOTE(review): the widget nesting below was reconstructed from a source
	    # whose indentation was lost -- confirm the layout against the running app.
	    with gr.Blocks() as demo:
	        # Carries the second output of segment_image over to restore_result.
	        cropper_state = gr.State()

	        # Controls row: prompts | diffusion settings | segmentation settings.
	        with gr.Row():
	            with gr.Column():
	                input_image_prompt = gr.Textbox(
	                    lines=1,
	                    label="Input Image Prompt",
	                    value=DEFAULT_INPUT_PROMPT,
	                )
	                edit_prompt = gr.Textbox(
	                    lines=1,
	                    label="Edit Prompt",
	                    value=DEFAULT_EDIT_PROMPT,
	                )
	            with gr.Column():
	                num_steps = gr.Slider(
	                    minimum=1, maximum=100, value=20, step=1, label="Num Steps"
	                )
	                start_step = gr.Slider(
	                    minimum=0, maximum=100, value=15, step=1, label="Start Step"
	                )
	                guidance_scale = gr.Slider(
	                    minimum=0, maximum=30, value=5, step=0.5, label="Guidance Scale"
	                )
	            with gr.Column():
	                generate_size = gr.Number(label="Generate Size", value=512)
	                with gr.Accordion("Advanced Options", open=False):
	                    mask_expansion = gr.Number(
	                        label="Mask Expansion", value=50, visible=True
	                    )
	                    mask_dilation = gr.Slider(
	                        minimum=0, maximum=10, value=2, step=1, label="Mask Dilation"
	                    )
	                    category = gr.Textbox(
	                        label="Category", value=DEFAULT_CATEGORY, visible=False
	                    )
	                edit_button = gr.Button("Edit Image")

	        # Images row: input | restored result | intermediate outputs.
	        with gr.Row():
	            with gr.Column():
	                input_image = gr.Image(label="Input Image", type="pil")
	            with gr.Column():
	                restored_image = gr.Image(
	                    label="Restored Image", type="pil", interactive=False
	                )
	            with gr.Column():
	                origin_area_image = gr.Image(
	                    label="Origin Area Image", type="pil", interactive=False
	                )
	                generated_image = gr.Image(
	                    label="Generated Image", type="pil", interactive=False
	                )
	                generated_cost = gr.Textbox(
	                    label="Time cost by step (ms):", visible=True, interactive=False
	                )

	        # Each stage only fires when the previous one finished without error.
	        segment_inputs = [input_image, category, generate_size, mask_expansion, mask_dilation]
	        edit_inputs = [
	            origin_area_image,
	            input_image_prompt,
	            edit_prompt,
	            num_steps,
	            start_step,
	            guidance_scale,
	        ]
	        restore_inputs = [cropper_state, category, generated_image]

	        segmented = edit_button.click(
	            fn=segment_image,
	            inputs=segment_inputs,
	            outputs=[origin_area_image, cropper_state],
	        )
	        edited = segmented.success(
	            fn=image_to_image,
	            inputs=edit_inputs,
	            outputs=[generated_image, generated_cost],
	        )
	        edited.success(
	            fn=restore_result,
	            inputs=restore_inputs,
	            outputs=[restored_image],
	        )

	    return demo