Spaces:

JackAILab
/

ConsistentID

Running on Zero

File size: 21,587 Bytes

9669aec

import numpy as np
import math
import types
import torch
import torch.nn as nn
import numpy as np
import cv2
import re
import torch.nn.functional as F
from einops import rearrange
from einops.layers.torch import Rearrange
from PIL import Image

def extract_first_sentence(text):
    """Return the leading sentence of ``text``.

    The sentence is everything up to and including the first ``'.'``. When
    the text contains no period, the whole text is returned. Surrounding
    whitespace is stripped from the result in both cases.
    """
    head, period, _ = text.partition('.')
    sentence = head + period if period else text
    return sentence.strip()
    
import re
def remove_duplicate_keywords(text, keywords):
    """Blank out every repeated occurrence of each keyword in ``text``.

    The text is split into word tokens and the punctuation marks
    ``. , ; ! ?``. For each keyword (compared case-insensitively) the first
    matching token is kept and later matches are replaced by an empty
    string. Tokens are then re-joined with single spaces, so a removed word
    leaves a doubled space and punctuation ends up space-separated.

    Args:
        text: Input string.
        keywords: Iterable of keyword strings to de-duplicate.

    Returns:
        The re-joined string.
    """
    tokens = re.findall(r'\b\w+\b|[.,;!?]', text)

    for keyword in keywords:
        target = keyword.lower()
        already_seen = False
        for pos, token in enumerate(tokens):
            if token.lower() != target:
                continue
            if already_seen:
                tokens[pos] = ""
            already_seen = True

    return " ".join(tokens)

def process_text_with_markers(text, parsing_mask_list):
    """Tag facial-feature words in ``text`` and collect the tagged clauses.

    Steps:
      1. Repeated occurrences of the keywords face/ears/eyes/nose/mouth are
         removed via ``remove_duplicate_keywords``.
      2. For every facial part present in ``parsing_mask_list`` the first
         occurrence of the matching word gets a ``<|word|>`` marker appended.
         When the word does not occur in the text, the corresponding part
         keys are deleted from ``parsing_mask_list``.
      3. For each marker, the surrounding clause (delimited by ``,``, ``.``
         or ``;``) is cut out of the text and collected.
      4. The collected clauses are joined and every specific marker is
         replaced by the generic ``<|facial|>`` marker.

    Args:
        text: Caption / prompt string.
        parsing_mask_list: Container keyed by part name ("Face", "Nose", ...).
            It must support ``in`` and ``del obj[key]`` with string keys, so
            despite the name it is presumably a dict - verify against the
            caller. It is modified in place.

    Returns:
        ``(align_marked_text, align_parsing_mask_list)`` where the second
        element is the same object that was passed in.
    """
    keywords = ["face", "ears", "eyes", "nose", "mouth"]
    text = remove_duplicate_keywords(text, keywords)
    key_parsing_mask_markers = ["Face", "Left_Ear", "Right_Ear", "Left_Eye", "Right_Eye", "Nose", "Upper_Lip", "Lower_Lip"]
    mapping = {
        "Face": "face",
        "Left_Ear": "ears",
        "Right_Ear": "ears",
        "Left_Eye": "eyes",
        "Right_Eye": "eyes",
        "Nose": "nose",
        "Upper_Lip": "mouth",
        "Lower_Lip": "mouth",
    }

    # One feature word / marker per facial part that has a mask, in the fixed
    # order of key_parsing_mask_markers and without duplicates.
    facial_features_align = []
    markers_align = []
    for key in key_parsing_mask_markers:
        if key in parsing_mask_list:
            mapped_key = mapping.get(key, key.lower())
            if mapped_key not in facial_features_align:
                facial_features_align.append(mapped_key)
                markers_align.append("<|" + mapped_key + "|>")

    text_marked = text
    # Alias, not a copy: deletions below are visible to the caller.
    align_parsing_mask_list = parsing_mask_list
    for feature, marker in zip(facial_features_align[::-1], markers_align[::-1]):
        pattern = rf'\b{feature}\b'
        text_marked_new = re.sub(pattern, f'{feature} {marker}', text_marked, count=1)
        if text_marked == text_marked_new:
            # The word never appears in the text: drop every mask that maps
            # to this feature so masks and markers stay aligned.
            for key, value in mapping.items():
                if value == feature and key in align_parsing_mask_list:
                    del align_parsing_mask_list[key]

        text_marked = text_marked_new

    text_marked = text_marked.replace('\n', '')

    delimiters = (",", ".", ";")
    ordered_text = []
    for marker in markers_align:
        start_idx = text_marked.find(marker)
        if start_idx == -1:
            # The marker was never inserted, or it was already removed with
            # an earlier clause. Slicing from -1 would otherwise cut off the
            # last character of the text and emit a junk clause.
            continue
        end_idx = start_idx + len(marker)

        # Grow the span to the enclosing clause boundaries.
        while start_idx > 0 and text_marked[start_idx - 1] not in delimiters:
            start_idx -= 1

        while end_idx < len(text_marked) and text_marked[end_idx] not in delimiters:
            end_idx += 1

        context = text_marked[start_idx:end_idx].strip()
        ordered_text.append(context + ",")
        # Remove the clause so a later marker cannot pick it up again.
        text_marked = text_marked[:start_idx] + text_marked[end_idx:]

    align_marked_text = " ".join(ordered_text)
    replace_list = ["<|face|>", "<|ears|>", "<|nose|>", "<|eyes|>", "<|mouth|>"]
    for item in replace_list:
        align_marked_text = align_marked_text.replace(item, "<|facial|>")

    return align_marked_text, align_parsing_mask_list

def tokenize_and_mask_noun_phrases_ends(text, image_token_id, facial_token_id, tokenizer):
    """Tokenize ``text``, strip trigger tokens and build their position masks.

    Every ``image_token_id`` / ``facial_token_id`` in the encoded sequence is
    removed from the returned ids. For each removed token, a ``True`` is
    written into the matching mask at the position of the last kept token
    (the image mask is additionally shifted by the number of image tokens
    already seen). Ids and masks are truncated or padded to
    ``tokenizer.model_max_length``; ids are padded with
    ``tokenizer.pad_token_id`` and masks with ``False``.

    Args:
        text: Prompt string passed to ``tokenizer.encode``.
        image_token_id: Id of the image trigger token.
        facial_token_id: Id of the facial trigger token.
        tokenizer: Object exposing ``encode``, ``model_max_length`` and
            ``pad_token_id``.

    Returns:
        ``(clean_input_ids, image_mask, facial_mask)``, each with a leading
        dimension of size 1 added by ``unsqueeze(0)``; ids are ``torch.long``
        and masks are ``torch.bool``.
    """
    raw_ids = tokenizer.encode(text)
    image_flags = [False] * len(raw_ids)
    facial_flags = [False] * len(raw_ids)
    kept_ids = []
    images_seen = 0

    for token_id in raw_ids:
        if token_id == image_token_id:
            image_flags[len(kept_ids) + images_seen - 1] = True
            images_seen += 1
        elif token_id == facial_token_id:
            facial_flags[len(kept_ids) - 1] = True
        else:
            kept_ids.append(token_id)

    limit = tokenizer.model_max_length

    def fit_flags(flags):
        # Truncate or right-pad a boolean flag list to exactly `limit` items.
        if len(flags) > limit:
            return flags[:limit]
        return flags + [False] * (limit - len(flags))

    if len(kept_ids) > limit:
        kept_ids = kept_ids[:limit]
    else:
        kept_ids = kept_ids + [tokenizer.pad_token_id] * (limit - len(kept_ids))

    ids_tensor = torch.tensor(kept_ids, dtype=torch.long)
    image_tensor = torch.tensor(fit_flags(image_flags), dtype=torch.bool)
    facial_tensor = torch.tensor(fit_flags(facial_flags), dtype=torch.bool)

    return ids_tensor.unsqueeze(0), image_tensor.unsqueeze(0), facial_tensor.unsqueeze(0)

def _pad_token_indices(token_mask, target_len):
    """Return the column indices of the True entries of ``token_mask``, padded to ``target_len``."""
    idx = torch.nonzero(token_mask, as_tuple=True)[1]
    valid = torch.ones_like(idx, dtype=torch.bool)
    missing = target_len - len(idx)
    if missing > 0:
        # Create the padding on the same device as the indices so torch.cat
        # also works when the mask does not live on the default device.
        idx = torch.cat(
            [idx, torch.zeros(missing, dtype=torch.long, device=idx.device)]
        )
        valid = torch.cat(
            [valid, torch.zeros(missing, dtype=torch.bool, device=valid.device)]
        )
    return idx.unsqueeze(0), valid.unsqueeze(0)


def prepare_image_token_idx(image_token_mask, facial_token_mask, max_num_objects=2, max_num_facials=5):
    """Convert boolean token masks into padded index tensors.

    For each mask, the second-dimension indices of its ``True`` entries are
    collected. When fewer than the requested number are found, the index
    tensor is right-padded with zeros and a companion validity mask marks the
    padded slots as ``False``. Index tensors that already have at least the
    requested length are returned unpadded (no truncation is performed).

    Args:
        image_token_mask: Boolean tensor with at least two dimensions.
        facial_token_mask: Boolean tensor with at least two dimensions.
        max_num_objects: Minimum length of the image index tensor.
        max_num_facials: Minimum length of the facial index tensor.

    Returns:
        ``(image_token_idx, image_token_idx_mask, facial_token_idx,
        facial_token_idx_mask)``, each with a leading dimension of size 1.
    """
    image_token_idx, image_token_idx_mask = _pad_token_indices(image_token_mask, max_num_objects)
    facial_token_idx, facial_token_idx_mask = _pad_token_indices(facial_token_mask, max_num_facials)

    return image_token_idx, image_token_idx_mask, facial_token_idx, facial_token_idx_mask

def get_object_localization_loss_for_one_layer(
    cross_attention_scores,
    object_segmaps,
    object_token_idx,
    object_token_idx_mask,
    loss_fn,
):
    """Compute the localization loss of one attention layer.

    The segmentation maps are resized (bilinear, antialiased) to the square
    latent grid implied by the number of noise latents and flattened. The
    attention values of the object tokens are gathered along the text-token
    axis and compared with the maps through ``loss_fn``. Padded object slots
    are zeroed out with ``object_token_idx_mask`` before averaging.

    Args:
        cross_attention_scores: Tensor unpacked as
            ``(batch * heads, num_noise_latents, num_text_tokens)``.
        object_segmaps: 4-D tensor whose first two dimensions are
            ``(batch, max_num_objects)``.
        object_token_idx: Indices viewable as ``(batch, 1, 1, max_num_objects)``.
        object_token_idx_mask: Mask viewable as ``(batch, 1, max_num_objects)``.
        loss_fn: Callable receiving the gathered attention and the expanded
            maps, both ``(batch, heads, num_noise_latents, max_num_objects)``.

    Returns:
        Scalar tensor with the mean masked loss.
    """
    total_heads, n_latents, n_tokens = cross_attention_scores.shape
    batch, n_objects, _, _ = object_segmaps.shape
    side = int(n_latents**0.5)

    resized_maps = F.interpolate(
        object_segmaps, size=(side, side), mode="bilinear", antialias=True
    )
    flat_maps = resized_maps.view(batch, n_objects, -1)

    heads = total_heads // batch
    scores = cross_attention_scores.view(batch, heads, n_latents, n_tokens)

    # Select, for every latent, the attention paid to each object token.
    gather_index = object_token_idx.view(batch, 1, 1, n_objects).expand(
        batch, heads, n_latents, n_objects
    )
    picked_attention = torch.gather(scores, dim=3, index=gather_index)

    target_maps = flat_maps.permute(0, 2, 1).unsqueeze(1)
    target_maps = target_maps.expand(batch, heads, n_latents, n_objects)

    per_object = loss_fn(picked_attention, target_maps)
    per_object = per_object * object_token_idx_mask.view(batch, 1, n_objects)

    # The epsilon keeps the division finite when a sample has no valid object.
    valid_count = object_token_idx_mask.sum(dim=1).view(batch, 1) + 1e-5
    return (per_object.sum(dim=2) / valid_count).mean()


def get_object_localization_loss(
    cross_attention_scores,
    object_segmaps,
    image_token_idx,
    image_token_idx_mask,
    loss_fn,
):
    """Average the per-layer localization loss over all stored layers.

    Args:
        cross_attention_scores: Mapping from layer name to that layer's
            attention tensor.
        object_segmaps: Segmentation maps forwarded to the per-layer loss.
        image_token_idx: Object token indices forwarded to the per-layer loss.
        image_token_idx_mask: Validity mask forwarded to the per-layer loss.
        loss_fn: Loss callable forwarded to the per-layer loss.

    Returns:
        Sum of the per-layer losses divided by the number of layers.
    """
    layer_count = len(cross_attention_scores)
    total = sum(
        get_object_localization_loss_for_one_layer(
            layer_scores, object_segmaps, image_token_idx, image_token_idx_mask, loss_fn
        )
        for layer_scores in cross_attention_scores.values()
    )
    return total / layer_count

def unet_store_cross_attention_scores(unet, attention_scores, layers=5):
    """Patch selected attention modules so they record their attention probs.

    Each matching ``Attention`` module gets its ``get_attention_scores``
    replaced by a wrapper that calls the original method, stores the result
    in ``attention_scores`` under the module's qualified name and returns it
    unchanged. The original method is kept on the module as
    ``old_get_attention_scores``.

    A module matches when its name contains ``"attn1"`` and one of the
    selected block names. The selection is the centred window of ``layers``
    entries of ``UNET_LAYER_NAMES`` (for the default 5: ``down_blocks.1``
    through ``up_blocks.2``).

    NOTE(review): in diffusers transformer blocks ``attn1`` is conventionally
    the self-attention layer and ``attn2`` the cross-attention layer; confirm
    that ``attn1`` is the intended filter for "cross attention scores".

    Args:
        unet: Module exposing ``named_modules()``.
        attention_scores: Mutable mapping that receives the recorded tensors.
        layers: Number of UNet blocks (taken from the middle of the list) to
            patch.

    Returns:
        The same ``unet`` object, patched in place.
    """
    from diffusers.models.attention_processor import Attention

    UNET_LAYER_NAMES = [
        "down_blocks.0",
        "down_blocks.1",
        "down_blocks.2",
        "mid_block",
        "up_blocks.1",
        "up_blocks.2",
        "up_blocks.3",
    ]

    start_layer = (len(UNET_LAYER_NAMES) - layers) // 2
    end_layer = start_layer + layers
    applicable_layers = UNET_LAYER_NAMES[start_layer:end_layer]

    def make_new_get_attention_scores_fn(name):
        # A factory is used so that each wrapper closes over its own `name`.
        def new_get_attention_scores(module, query, key, attention_mask=None):
            attention_probs = module.old_get_attention_scores(
                query, key, attention_mask
            )
            attention_scores[name] = attention_probs
            return attention_probs

        return new_get_attention_scores

    for name, module in unet.named_modules():
        if not (isinstance(module, Attention) and "attn1" in name):
            continue
        if not any(layer in name for layer in applicable_layers):
            continue

        # Save the pristine method only once. Re-patching an already patched
        # module would otherwise store the wrapper as "old" and make the
        # wrapper call itself forever.
        if not hasattr(module, "old_get_attention_scores"):
            module.old_get_attention_scores = module.get_attention_scores
        module.get_attention_scores = types.MethodType(
            make_new_get_attention_scores_fn(name), module
        )
    return unet
    
class BalancedL1Loss(nn.Module):
    """Background-minus-object attention loss, balanced by region size.

    The forward pass returns the mean attention over the background region
    minus the mean attention over the object region, both reduced along
    dimension 2.
    """

    def __init__(self, threshold=1.0, normalize=False):
        """Store the configuration.

        Args:
            threshold: Stored on the instance; not read by ``forward``.
            normalize: When true, attention is divided by its maximum along
                dimension 2 before the loss is computed.
        """
        super().__init__()
        self.threshold = threshold
        self.normalize = normalize

    def forward(self, object_token_attn_prob, object_segmaps):
        """Return background mean attention minus object mean attention.

        Args:
            object_token_attn_prob: Attention values; reduced along dim 2.
            object_segmaps: Object maps broadcastable against the attention;
                ``1 - object_segmaps`` is used as the background map.

        Returns:
            Tensor with dimension 2 reduced away.
        """
        attention = object_token_attn_prob
        if self.normalize:
            peak = attention.max(dim=2, keepdim=True)[0]
            attention = attention / (peak + 1e-5)

        def region_mean(region):
            # Attention averaged over one region; the epsilon guards against
            # an empty region.
            return (attention * region).sum(dim=2) / (region.sum(dim=2) + 1e-5)

        background_loss = region_mean(1 - object_segmaps)
        object_loss = region_mean(object_segmaps)
        return background_loss - object_loss

def fetch_mask_raw_image(raw_image, mask_image):
    """Return ``raw_image`` with everything outside the mask painted black.

    The mask is first resized to the size of ``raw_image``; the result is a
    composite of the raw image over an all-black RGB canvas using that mask.
    """
    target_size = raw_image.size
    resized_mask = mask_image.resize(target_size)
    black_canvas = Image.new('RGB', target_size, (0, 0, 0))
    return Image.composite(raw_image, black_canvas, resized_mask)

# Lookup table for parsing-mask labels: each entry ties an integer
# "Mask Value" to a "Body Part" name and an "RGB Color".
# masks_for_unique_values() reads "Mask Value" and "Body Part" from it.
# NOTE(review): "Hat" (17) and "Hand" (21) share the colour [255, 0, 255];
# confirm whether that is intentional before relying on colours as unique keys.
mapping_table = [
    {"Mask Value": 0, "Body Part": "Background", "RGB Color": [0, 0, 0]},
    {"Mask Value": 1, "Body Part": "Face", "RGB Color": [255, 0, 0]},
    {"Mask Value": 2, "Body Part": "Left_Eyebrow", "RGB Color": [255, 85, 0]},
    {"Mask Value": 3, "Body Part": "Right_Eyebrow", "RGB Color": [255, 170, 0]},
    {"Mask Value": 4, "Body Part": "Left_Eye", "RGB Color": [255, 0, 85]},
    {"Mask Value": 5, "Body Part": "Right_Eye", "RGB Color": [255, 0, 170]},
    {"Mask Value": 6, "Body Part": "Hair", "RGB Color": [0, 0, 255]},
    {"Mask Value": 7, "Body Part": "Left_Ear", "RGB Color": [85, 0, 255]},
    {"Mask Value": 8, "Body Part": "Right_Ear", "RGB Color": [170, 0, 255]},
    {"Mask Value": 9, "Body Part": "Mouth_External Contour", "RGB Color": [0, 255, 85]},
    {"Mask Value": 10, "Body Part": "Nose", "RGB Color": [0, 255, 0]},
    {"Mask Value": 11, "Body Part": "Mouth_Inner_Contour", "RGB Color": [0, 255, 170]},
    {"Mask Value": 12, "Body Part": "Upper_Lip", "RGB Color": [85, 255, 0]}, 
    {"Mask Value": 13, "Body Part": "Lower_Lip", "RGB Color": [170, 255, 0]},
    {"Mask Value": 14, "Body Part": "Neck", "RGB Color": [0, 85, 255]},
    {"Mask Value": 15, "Body Part": "Neck_Inner Contour", "RGB Color": [0, 170, 255]},
    {"Mask Value": 16, "Body Part": "Cloth", "RGB Color": [255, 255, 0]},
    {"Mask Value": 17, "Body Part": "Hat", "RGB Color": [255, 0, 255]},
    {"Mask Value": 18, "Body Part": "Earring", "RGB Color": [255, 85, 255]},
    {"Mask Value": 19, "Body Part": "Necklace", "RGB Color": [255, 255, 85]},
    {"Mask Value": 20, "Body Part": "Glasses", "RGB Color": [255, 170, 255]},
    {"Mask Value": 21, "Body Part": "Hand", "RGB Color": [255, 0, 255]},
    {"Mask Value": 22, "Body Part": "Wristband", "RGB Color": [0, 255, 255]},
    {"Mask Value": 23, "Body Part": "Clothes_Upper", "RGB Color": [85, 255, 255]},
    {"Mask Value": 24, "Body Part": "Clothes_Lower", "RGB Color": [170, 255, 255]}
]


def masks_for_unique_values(image_raw_mask):
    """Split a label image into one filled binary mask per known body part.

    For every distinct value in the image, the external contours of the
    pixels equal to that value are drawn filled (255) onto a zero canvas.
    The canvas is stored under the body-part name that ``mapping_table``
    assigns to the value; values without an entry are skipped. For value 0
    an additional inverted mask is stored under ``"WithoutBackground"``.

    Args:
        image_raw_mask: Label image convertible with ``np.array``.

    Returns:
        Dict mapping body-part name to a PIL image of the mask.
    """
    label_map = np.array(image_raw_mask)
    part_masks = {}

    for label in np.unique(label_map):
        region = np.uint8(label_map == label) * 255
        outlines, _ = cv2.findContours(region, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        filled = np.zeros_like(label_map)
        for outline in outlines:
            cv2.drawContours(filled, [outline], -1, (255), thickness=cv2.FILLED)

        if label == 0:
            # Everything that is not background: invert the background mask.
            inverted = np.where(filled == 255, 0, 255).astype(filled.dtype)
            part_masks["WithoutBackground"] = Image.fromarray(inverted)

        part_name = next(
            (row["Body Part"] for row in mapping_table if row["Mask Value"] == label),
            f"Unknown_{label}",
        )
        if not part_name.startswith("Unknown_"):
            part_masks[part_name] = Image.fromarray(filled)

    return part_masks
# FFN
def FeedForward(dim, mult=4):
    """Build a LayerNorm -> Linear -> GELU -> Linear feed-forward stack.

    Args:
        dim: Input and output feature size.
        mult: Expansion factor; the hidden size is ``int(dim * mult)``.

    Returns:
        An ``nn.Sequential`` whose two linear layers have no bias.
    """
    hidden_dim = int(dim * mult)
    stages = [
        nn.LayerNorm(dim),
        nn.Linear(dim, hidden_dim, bias=False),
        nn.GELU(),
        nn.Linear(hidden_dim, dim, bias=False),
    ]
    return nn.Sequential(*stages)


def reshape_tensor(x, heads):
    """Split the last dimension of ``x`` into attention heads.

    Args:
        x: Tensor of shape ``(batch, length, width)``.
        heads: Number of heads; ``width`` must be divisible by it.

    Returns:
        Tensor of shape ``(batch, heads, length, width // heads)``.
    """
    batch, seq_len, _ = x.shape
    per_head = x.view(batch, seq_len, heads, -1).transpose(1, 2)
    return per_head.reshape(batch, heads, seq_len, -1)

class PerceiverAttention(nn.Module):
    """Attention in which latents query the concatenation of features and latents."""

    def __init__(self, *, dim, dim_head=64, heads=8):
        """Create the normalization and projection layers.

        Args:
            dim: Feature size of both inputs and of the output.
            dim_head: Size of each attention head.
            heads: Number of attention heads.
        """
        super().__init__()
        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.heads = heads
        projected_dim = dim_head * heads

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, projected_dim, bias=False)
        self.to_kv = nn.Linear(dim, projected_dim * 2, bias=False)
        self.to_out = nn.Linear(projected_dim, dim, bias=False)

    def forward(self, x, latents):
        """Attend from ``latents`` to ``x`` concatenated with ``latents``.

        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latents (torch.Tensor): latent features
                shape (b, n2, D)

        Returns:
            torch.Tensor: updated latents, shape (b, n2, D)
        """
        x = self.norm1(x)
        latents = self.norm2(latents)

        batch, num_latents, _ = latents.shape

        queries = reshape_tensor(self.to_q(latents), self.heads)
        # Keys and values come from the image features and the latents together.
        joint = torch.cat((x, latents), dim=-2)
        keys, values = (
            reshape_tensor(part, self.heads)
            for part in self.to_kv(joint).chunk(2, dim=-1)
        )

        # Scaling q and k separately by dim_head ** -0.25 gives the usual
        # dim_head ** -0.5 on their product.
        factor = 1 / math.sqrt(math.sqrt(self.dim_head))
        logits = (queries * factor) @ (keys * factor).transpose(-2, -1)
        # Softmax runs in float32 and is cast back to the logits' dtype.
        probs = torch.softmax(logits.float(), dim=-1).type(logits.dtype)
        attended = probs @ values

        merged = attended.permute(0, 2, 1, 3).reshape(batch, num_latents, -1)
        return self.to_out(merged)

class FacePerceiverResampler(torch.nn.Module):
    """Stack of perceiver attention / feed-forward layers that refines latents."""

    def __init__(
        self,
        *,
        dim=768,
        depth=4,
        dim_head=64,
        heads=16,
        embedding_dim=1280,
        output_dim=768,
        ff_mult=4,
    ):
        """Create the projections and ``depth`` attention / feed-forward pairs.

        Args:
            dim: Working feature size of the latents.
            depth: Number of (attention, feed-forward) pairs.
            dim_head: Size of each attention head.
            heads: Number of attention heads.
            embedding_dim: Feature size of the incoming ``x``.
            output_dim: Feature size of the returned latents.
            ff_mult: Hidden-size multiplier of the feed-forward blocks.
        """
        super().__init__()

        self.proj_in = torch.nn.Linear(embedding_dim, dim)
        self.proj_out = torch.nn.Linear(dim, output_dim)
        self.norm_out = torch.nn.LayerNorm(output_dim)

        stages = []
        for _ in range(depth):
            attention = PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads)
            feed_forward = FeedForward(dim=dim, mult=ff_mult)
            stages.append(torch.nn.ModuleList([attention, feed_forward]))
        self.layers = torch.nn.ModuleList(stages)

    def forward(self, latents, x):
        """Refine ``latents`` by repeatedly attending to the projected ``x``.

        Args:
            latents: Latent tokens, e.g. torch.Size([2, 4, 768]).
            x: Features to attend to, e.g. torch.Size([2, 257, 1280]).

        Returns:
            Projected and layer-normalized latents.
        """
        features = self.proj_in(x)  # e.g. torch.Size([2, 257, 768])
        for attention, feed_forward in self.layers:
            # Both sub-layers are residual.
            latents = attention(features, latents) + latents
            latents = feed_forward(latents) + latents
        return self.norm_out(self.proj_out(latents))
  
class ProjPlusModel(torch.nn.Module):
    """Project ID embeddings to tokens and refine them against CLIP embeddings."""

    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
        """Create the projection MLP, the token norm and the resampler.

        Args:
            cross_attention_dim: Feature size of each produced token.
            id_embeddings_dim: Feature size of the incoming ID embedding.
            clip_embeddings_dim: Feature size of the incoming CLIP embeddings.
            num_tokens: Number of tokens produced per ID embedding.
        """
        super().__init__()

        self.cross_attention_dim = cross_attention_dim
        self.num_tokens = num_tokens

        hidden_dim = id_embeddings_dim * 2
        self.proj = torch.nn.Sequential(
            torch.nn.Linear(id_embeddings_dim, hidden_dim),
            torch.nn.GELU(),
            torch.nn.Linear(hidden_dim, cross_attention_dim * num_tokens),
        )
        self.norm = torch.nn.LayerNorm(cross_attention_dim)

        self.perceiver_resampler = FacePerceiverResampler(
            dim=cross_attention_dim,
            depth=4,
            dim_head=64,
            heads=cross_attention_dim // 64,
            embedding_dim=clip_embeddings_dim,
            output_dim=cross_attention_dim,
            ff_mult=4,
        )

    def forward(self, id_embeds, clip_embeds, shortcut=False, scale=1.0):
        """Turn ID embeddings into tokens conditioned on CLIP embeddings.

        Args:
            id_embeds: ID embeddings fed to the projection MLP.
            clip_embeds: CLIP embeddings the tokens attend to.
            shortcut: When true, the normalized tokens scaled by ``scale``
                are added to the resampler output.
            scale: Weight of the shortcut term.

        Returns:
            Tensor of shape ``(-1, num_tokens, cross_attention_dim)``.
        """
        tokens = self.proj(id_embeds).reshape(-1, self.num_tokens, self.cross_attention_dim)
        tokens = self.norm(tokens)
        refined = self.perceiver_resampler(tokens, clip_embeds)
        if not shortcut:
            return refined
        return scale * tokens + refined
    
class AttentionMLP(nn.Module):
    """Perceiver-style resampler with learned latent tokens."""

    def __init__(
        self,
        dtype=torch.float16,
        dim=1024,
        depth=8,
        dim_head=64,
        heads=16,
        single_num_tokens=1,
        embedding_dim=1280,
        output_dim=768,
        ff_mult=4,
        max_seq_len: int = 257*2,
        apply_pos_emb: bool = False,
        num_latents_mean_pooled: int = 0,
    ):
        """Create the latents, projections and attention stack.

        Args:
            dtype: Accepted but not used by this constructor.
            dim: Working feature size of the latents.
            depth: Number of (attention, feed-forward) pairs.
            dim_head: Size of each attention head.
            heads: Number of attention heads.
            single_num_tokens: Number of learned latent tokens.
            embedding_dim: Feature size of the incoming sequence.
            output_dim: Feature size of the returned latents.
            ff_mult: Hidden-size multiplier of the feed-forward blocks.
            max_seq_len: Size of the positional embedding table.
            apply_pos_emb: Whether to add learned positional embeddings.
            num_latents_mean_pooled: Number of extra latents derived from the
                mean-pooled sequence; 0 disables them.
        """
        super().__init__()
        if apply_pos_emb:
            self.pos_emb = nn.Embedding(max_seq_len, embedding_dim)
        else:
            self.pos_emb = None

        self.single_num_tokens = single_num_tokens
        self.latents = nn.Parameter(torch.randn(1, self.single_num_tokens, dim) / dim**0.5)

        self.proj_in = nn.Linear(embedding_dim, dim)

        self.proj_out = nn.Linear(dim, output_dim)
        self.norm_out = nn.LayerNorm(output_dim)

        if num_latents_mean_pooled > 0:
            self.to_latents_from_mean_pooled_seq = nn.Sequential(
                nn.LayerNorm(dim),
                nn.Linear(dim, dim * num_latents_mean_pooled),
                Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled),
            )
        else:
            self.to_latents_from_mean_pooled_seq = None

        stages = []
        for _ in range(depth):
            attention = PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads)
            feed_forward = FeedForward(dim=dim, mult=ff_mult)
            stages.append(nn.ModuleList([attention, feed_forward]))
        self.layers = nn.ModuleList(stages)

    def forward(self, x):
        """Resample ``x`` into the learned latent tokens.

        Args:
            x: Input sequence, e.g. torch.Size([5, 257, 1280]).

        Returns:
            Projected and layer-normalized latents.
        """
        if self.pos_emb is not None:
            positions = torch.arange(x.shape[1], device=x.device)
            x = x + self.pos_emb(positions)

        # One copy of the learned latents per batch element.
        latents = self.latents.repeat(x.size(0), 1, 1)

        x = self.proj_in(x)  # e.g. torch.Size([5, 257, 1024])

        if self.to_latents_from_mean_pooled_seq is not None:
            all_valid = torch.ones(x.shape[:2], device=x.device, dtype=torch.bool)
            pooled = masked_mean(x, dim=1, mask=all_valid)
            pooled_latents = self.to_latents_from_mean_pooled_seq(pooled)
            latents = torch.cat((pooled_latents, latents), dim=-2)

        for attention, feed_forward in self.layers:
            # Both sub-layers are residual.
            latents = attention(x, latents) + latents
            latents = feed_forward(latents) + latents

        return self.norm_out(self.proj_out(latents))


def masked_mean(t, *, dim, mask=None):
    """Mean of ``t`` along ``dim``, optionally restricted to masked positions.

    Args:
        t: Input tensor.
        dim: Dimension to reduce.
        mask: Optional boolean tensor matching the ``"b n"`` pattern. Positions
            that are ``False`` are zeroed before summing, and the sum is
            divided by the number of ``True`` positions (clamped to at least
            1e-5).

    Returns:
        ``t.mean(dim=dim)`` when no mask is given, otherwise the masked mean.
    """
    if mask is not None:
        kept_count = mask.sum(dim=dim, keepdim=True)
        keep = rearrange(mask, "b n -> b n 1")
        total = t.masked_fill(~keep, 0.0).sum(dim=dim)
        return total / kept_count.clamp(min=1e-5)

    return t.mean(dim=dim)