import cv2
import numpy as np
import torch
import torchvision.transforms.functional as TvF
from PIL import Image

from lib.smplfusion import router, attentionpatch, transformerpatch
from lib.utils.iimage import IImage
from lib.utils import poisson_blend
from lib.models.sd2_sr import predict_eps_from_z_and_v, predict_start_from_z_and_v
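
# High-resolution refinement stage: a 512px inpainting result (lr_image) is
# upscaled back to the original resolution with an SD2-style latent upscaler
# (noise-augmented low-resolution conditioning), optionally restricted by a
# SAM-refined mask, and blended into the untouched high-resolution input.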


def refine_mask(hr_image, hr_mask, lr_image, sam_predictor):
    """Tighten the inpainting mask with SAM.

    SAM is prompted with the bounding box of the low-resolution mask on both
    the original and the inpainted image; the union of the two predictions
    crops the mask to the actual object region.
    """
    lr_mask = hr_mask.resize(512)

    # Bounding box of the low-resolution mask, expanded by one pixel.
    x_min, y_min, rect_w, rect_h = cv2.boundingRect(lr_mask.data[0][:, :, 0])
    x_min = max(x_min - 1, 0)
    y_min = max(y_min - 1, 0)
    x_max = x_min + rect_w + 1
    y_max = y_min + rect_h + 1
    input_box = np.array([x_min, y_min, x_max, y_max])

    # Segment the object in the original high-resolution image.
    sam_predictor.set_image(hr_image.resize(512).data[0])
    masks, _, _ = sam_predictor.predict(
        point_coords=None,
        point_labels=None,
        box=input_box[None, :],
        multimask_output=True,
    )
    dilation_kernel = np.ones((13, 13), np.uint8)
    original_object_mask = (np.sum(masks, axis=0) > 0).astype(np.uint8)
    original_object_mask = cv2.dilate(original_object_mask, dilation_kernel)

    # Segment the object in the inpainted low-resolution image.
    sam_predictor.set_image(lr_image.resize(512).data[0])
    masks, _, _ = sam_predictor.predict(
        point_coords=None,
        point_labels=None,
        box=input_box[None, :],
        multimask_output=True,
    )
    dilation_kernel = np.ones((3, 3), np.uint8)
    inpainted_object_mask = (np.sum(masks, axis=0) > 0).astype(np.uint8)
    inpainted_object_mask = cv2.dilate(inpainted_object_mask, dilation_kernel)

    # Keep the mask only where either segmentation fired.
    lr_mask_masking = ((original_object_mask + inpainted_object_mask) > 0).astype(np.uint8)
    new_mask = lr_mask.data[0] * lr_mask_masking[:, :, np.newaxis]
    new_mask = IImage(new_mask).resize(2048, resample=Image.BICUBIC)
    return new_mask
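

# A minimal sketch, assuming the segment-anything package, of how a
# `sam_predictor` compatible with `refine_mask` could be built; the checkpoint
# path refers to the published SAM ViT-H weights, downloaded separately:
#
#     from segment_anything import sam_model_registry, SamPredictor
#
#     sam = sam_model_registry['vit_h'](checkpoint='sam_vit_h_4b8939.pth')
#     sam_predictor = SamPredictor(sam.to('cuda'))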


def run(ddim, sam_predictor, lr_image, hr_image, hr_mask,
        prompt='high resolution professional photo', noise_level=20,
        blend_output=True, blend_trick=True, no_superres=False,
        dt=50, seed=1, guidance_scale=7.5, negative_prompt='', use_sam_mask=False):
    torch.manual_seed(seed)
    dtype = ddim.vae.encoder.conv_in.weight.dtype
    device = ddim.vae.encoder.conv_in.weight.device

    # Use the default attention implementations for this stage.
    router.attention_forward = attentionpatch.default.forward_xformers
    router.basic_transformer_forward = transformerpatch.default.forward

    if use_sam_mask:
        with torch.no_grad():
            hr_mask = refine_mask(hr_image, hr_mask, lr_image, sam_predictor)

    orig_h, orig_w = hr_image.torch().shape[2], hr_image.torch().shape[3]

    # Pad to sizes the model can handle; keep the dilated mask for blending.
    hr_image = hr_image.padx(256, padding_mode='reflect')
    hr_mask = hr_mask.padx(256, padding_mode='reflect').dilate(19)
    hr_mask_orig = hr_mask
    lr_image = lr_image.padx(64, padding_mode='reflect')

    # Soft low-resolution mask used for latent blending.
    lr_mask = hr_mask.resize((lr_image.torch().shape[2], lr_image.torch().shape[3]),
                             resample=Image.BICUBIC).alpha().torch(vmin=0).to(device)
    lr_mask = TvF.gaussian_blur(lr_mask, kernel_size=19)

    if no_superres:
        # Skip diffusion entirely: bicubically upscale the inpainted image
        # and Poisson-blend it into the original high-resolution image.
        output_tensor = lr_image.resize((hr_image.torch().shape[2], hr_image.torch().shape[3]),
                                        resample=Image.BICUBIC).torch().cuda()
        output_tensor = (255 * ((output_tensor.clip(-1, 1) + 1) / 2)).to(torch.uint8)
        output_tensor = poisson_blend(
            orig_img=hr_image.data[0][:orig_h, :orig_w, :],
            fake_img=output_tensor.cpu().permute(0, 2, 3, 1)[0].numpy()[:orig_h, :orig_w, :],
            mask=hr_mask_orig.alpha().data[0][:orig_h, :orig_w, :]
        )
        return IImage(output_tensor[:orig_h, :orig_w, :])

    # Encode the high-resolution image; its latent is reused to keep
    # unmasked regions fixed during sampling.
    with torch.no_grad():
        hr_z0 = ddim.vae.encode(hr_image.torch().to(dtype=dtype, device=device)).mean * ddim.config.scale_factor
    assert hr_z0.shape[2] == lr_image.torch().shape[2]
    assert hr_z0.shape[3] == lr_image.torch().shape[3]

    # The low-resolution image conditions the UNet; as in the SD2 upscaler,
    # it is noise-augmented by the low-scale model at the given noise_level.
    unet_condition = lr_image.torch().to(memory_format=torch.contiguous_format).to(dtype=dtype, device=device)
    zT = torch.randn((1, 4, unet_condition.shape[2], unet_condition.shape[3])).to(dtype=dtype, device=device)
    with torch.no_grad():
        context = ddim.encoder.encode([negative_prompt, prompt])
        noise_level = torch.tensor([noise_level], device=device).long()
        unet_condition, noise_level = ddim.low_scale_model(unet_condition, noise_level=noise_level)

    with torch.autocast('cuda'), torch.no_grad():
        zt = zT
        for t in range(999, 0, -dt):
            # Concatenate the noise-augmented low-resolution condition
            # channel-wise, then run the conditional and unconditional passes
            # in one batch for classifier-free guidance.
            _zt = zt if unet_condition is None else torch.cat([zt, unet_condition], 1)
            eps_uncond, eps = ddim.unet(
                torch.cat([_zt, _zt]).to(dtype=dtype, device=device),
                timesteps=torch.tensor([t, t]).to(device=device),
                context=context,
                y=torch.cat([noise_level] * 2)
            ).chunk(2)

            ts = torch.full((zt.shape[0],), t, device=device, dtype=torch.long)
            model_output = eps_uncond + guidance_scale * (eps - eps_uncond)

            # The model is v-parameterized: recover the eps and x0 predictions.
            eps = predict_eps_from_z_and_v(ddim.schedule, zt, ts, model_output).to(dtype)
            z0 = predict_start_from_z_and_v(ddim.schedule, zt, ts, model_output).to(dtype)

            if blend_trick:
                # Pin unmasked latent regions to the encoded HR image.
                z0 = z0 * lr_mask + hr_z0 * (1 - lr_mask)

            zt = ddim.schedule.sqrt_alphas[t - dt] * z0 + ddim.schedule.sqrt_one_minus_alphas[t - dt] * eps

    with torch.no_grad():
        output_tensor = ddim.vae.decode(z0.to(dtype) / ddim.config.scale_factor)

    if blend_output:
        # Poisson-blend the decoded result into the original image so that
        # pixels outside the mask stay untouched.
        output_tensor = (255 * ((output_tensor + 1) / 2).clip(0, 1)).to(torch.uint8)
        output_tensor = poisson_blend(
            orig_img=hr_image.data[0][:orig_h, :orig_w, :],
            fake_img=output_tensor.cpu().permute(0, 2, 3, 1)[0].numpy()[:orig_h, :orig_w, :],
            mask=hr_mask_orig.alpha().data[0][:orig_h, :orig_w, :]
        )
        return IImage(output_tensor[:orig_h, :orig_w, :])
    else:
        return IImage(output_tensor[:, :, :orig_h, :orig_w])
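

# A hedged usage sketch; `load_sd2_sr_model` and the way the IImage inputs are
# obtained are placeholder names for illustration, not APIs confirmed here:
#
#     ddim = load_sd2_sr_model()          # hypothetical loader for the upscaler
#     result = run(
#         ddim, sam_predictor,
#         lr_image=lr_image,              # 512px inpainted result (IImage)
#         hr_image=hr_image,              # original high-resolution image (IImage)
#         hr_mask=hr_mask,                # inpainting mask (IImage)
#         use_sam_mask=True,
#         seed=1,
#     )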