# remove_background/DIS/IsNetPipeLine.py
"""
reference: https://github.com/xuebinqin/DIS
"""
import PIL.Image
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from torch import nn
from torchvision import transforms
from torchvision.transforms.functional import normalize
from .models import ISNetDIS
# Helpers
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class GOSNormalize(object):
"""
Normalize the Image using torch.transforms
"""
def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
self.mean = mean
self.std = std
def __call__(self, image):
image = normalize(image, self.mean, self.std)
return image
def im_preprocess(im, size):
    # promote grayscale / single-channel inputs to 3-channel images
    if len(im.shape) < 3:
        im = im[:, :, np.newaxis]
    if im.shape[2] == 1:
        im = np.repeat(im, 3, axis=2)
im_tensor = torch.tensor(im.copy(), dtype=torch.float32)
    im_tensor = torch.transpose(torch.transpose(im_tensor, 1, 2), 0, 1)  # HWC -> CHW
if len(size) < 2:
return im_tensor, im.shape[0:2]
else:
im_tensor = torch.unsqueeze(im_tensor, 0)
        im_tensor = F.interpolate(im_tensor, size, mode="bilinear")
im_tensor = torch.squeeze(im_tensor, 0)
return im_tensor.type(torch.uint8), im.shape[0:2]
class IsNetPipeLine:
def __init__(self, model_path=None, model_digit="full"):
self.model_digit = model_digit
self.model = ISNetDIS()
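        # resize inputs to 1024x1024, the inference resolution used by the DIS reference code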
self.cache_size = [1024, 1024]
self.transform = transforms.Compose([
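            # shift [0, 1] inputs to roughly [-0.5, 0.5] (mean 0.5, std 1.0)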
GOSNormalize([0.5, 0.5, 0.5], [1.0, 1.0, 1.0])
])
# Build Model
self.build_model(model_path)
def load_image(self, image: PIL.Image.Image):
im = np.array(image.convert("RGB"))
im, im_shp = im_preprocess(im, self.cache_size)
im = torch.divide(im, 255.0)
shape = torch.from_numpy(np.array(im_shp))
return self.transform(im).unsqueeze(0), shape.unsqueeze(0) # make a batch of image, shape
def build_model(self, model_path=None):
if model_path is not None:
self.model.load_state_dict(torch.load(model_path, map_location=device))
# convert to half precision
if self.model_digit == "half":
self.model.half()
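            # keep BatchNorm layers in float32 for numerical stability under half precision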
for layer in self.model.modules():
if isinstance(layer, nn.BatchNorm2d):
layer.float()
self.model.to(device)
self.model.eval()
def __call__(self, image: PIL.Image.Image):
image_tensor, orig_size = self.load_image(image)
mask = self.predict(image_tensor, orig_size)
pil_mask = Image.fromarray(mask).convert('L')
im_rgb = image.convert("RGB")
im_rgba = im_rgb.copy()
im_rgba.putalpha(pil_mask)
return [im_rgba, pil_mask]
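    @torch.no_grad()  # inference only; no autograd graph is needed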
def predict(self, inputs_val: torch.Tensor, shapes_val):
"""
Given an Image, predict the mask
"""
        # match the input dtype to the model precision
        if self.model_digit == "full":
            inputs_val = inputs_val.float()
        else:
            inputs_val = inputs_val.half()
        inputs_val_v = inputs_val.to(device)  # move the batch to the inference device
ds_val = self.model(inputs_val_v)[0] # list of 6 results
        # B x 1 x H x W; the first side output is the most accurate prediction
pred_val = ds_val[0][0, :, :, :]
        # recover the prediction to the original image size
        pred_val = torch.squeeze(
            F.interpolate(torch.unsqueeze(pred_val, 0), (int(shapes_val[0][0]), int(shapes_val[0][1])), mode='bilinear'))
ma = torch.max(pred_val)
mi = torch.min(pred_val)
        pred_val = (pred_val - mi) / (ma - mi)  # min-max normalize to [0, 1]
if device == 'cuda':
torch.cuda.empty_cache()
        return (pred_val.detach().cpu().numpy() * 255).astype(np.uint8)  # uint8 mask at the original resolution
if __name__ == "__main__":
    # example usage; adjust the weight and image paths to your setup
    pipeline = IsNetPipeLine(model_path="save_models/isnet.pth")
    input_image = Image.open("image_0mx.png")
    rgb, mask = pipeline(input_image)

    rgb.save("rgb.png")
    mask.save("mask.png")