JackAILab committed on
Commit
9669aec
1 Parent(s): 0cf6544

Upload 292 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +3 -0
  2. app.py +198 -9
  3. attention.py +288 -0
  4. functions.py +599 -0
  5. images/templates/3f8d901770014c1b8f7f261971f0e92.png +3 -0
  6. images/templates/6577b962b6346df03fea83211daaf48.png +0 -0
  7. images/templates/75583964a834abe33b72f52b1a98e84.png +3 -0
  8. images/templates/c9fe4c2d5ddbc5670dde47fc465c48b.jpg +0 -0
  9. models/BiSeNet/6.jpg +0 -0
  10. models/BiSeNet/__init__.py +2 -0
  11. models/BiSeNet/__pycache__/__init__.cpython-38.pyc +0 -0
  12. models/BiSeNet/__pycache__/model.cpython-38.pyc +0 -0
  13. models/BiSeNet/__pycache__/resnet.cpython-38.pyc +0 -0
  14. models/BiSeNet/evaluate.py +95 -0
  15. models/BiSeNet/face_dataset.py +106 -0
  16. models/BiSeNet/hair.png +0 -0
  17. models/BiSeNet/logger.py +23 -0
  18. models/BiSeNet/loss.py +75 -0
  19. models/BiSeNet/makeup.py +130 -0
  20. models/BiSeNet/makeup/116_1.png +0 -0
  21. models/BiSeNet/makeup/116_3.png +0 -0
  22. models/BiSeNet/makeup/116_lip_ori.png +0 -0
  23. models/BiSeNet/makeup/116_ori.png +0 -0
  24. models/BiSeNet/model.py +283 -0
  25. models/BiSeNet/modules/__init__.py +5 -0
  26. models/BiSeNet/modules/bn.py +130 -0
  27. models/BiSeNet/modules/deeplab.py +84 -0
  28. models/BiSeNet/modules/dense.py +42 -0
  29. models/BiSeNet/modules/functions.py +234 -0
  30. models/BiSeNet/modules/misc.py +21 -0
  31. models/BiSeNet/modules/residual.py +88 -0
  32. models/BiSeNet/modules/src/checks.h +15 -0
  33. models/BiSeNet/modules/src/inplace_abn.cpp +95 -0
  34. models/BiSeNet/modules/src/inplace_abn.h +88 -0
  35. models/BiSeNet/modules/src/inplace_abn_cpu.cpp +119 -0
  36. models/BiSeNet/modules/src/inplace_abn_cuda.cu +333 -0
  37. models/BiSeNet/modules/src/inplace_abn_cuda_half.cu +275 -0
  38. models/BiSeNet/modules/src/utils/checks.h +15 -0
  39. models/BiSeNet/modules/src/utils/common.h +49 -0
  40. models/BiSeNet/modules/src/utils/cuda.cuh +71 -0
  41. models/BiSeNet/optimizer.py +69 -0
  42. models/BiSeNet/prepropess_data.py +38 -0
  43. models/BiSeNet/resnet.py +109 -0
  44. models/BiSeNet/test.py +90 -0
  45. models/BiSeNet/train.py +179 -0
  46. models/BiSeNet/transform.py +129 -0
  47. models/BiSeNet_pretrained_for_ConsistentID.pth +3 -0
  48. models/LLaVA/.devcontainer/Dockerfile +53 -0
  49. models/LLaVA/.devcontainer/devcontainer.env +2 -0
  50. models/LLaVA/.devcontainer/devcontainer.json +71 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/templates/3f8d901770014c1b8f7f261971f0e92.png filter=lfs diff=lfs merge=lfs -text
37
+ images/templates/75583964a834abe33b72f52b1a98e84.png filter=lfs diff=lfs merge=lfs -text
38
+ models/LLaVA/images/demo_cli.gif filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,14 +1,203 @@
1
  import gradio as gr
2
- import spaces
3
  import torch
4
 
5
- zero = torch.Tensor([0]).cuda()
6
- print(zero.device) # <-- 'cpu' 🤔
7
 
8
- @spaces.GPU
9
- def greet(n):
10
- print(zero.device) # <-- 'cuda:0' 🤗
11
- return f"Hello {zero + n} Tensor"
12
 
13
- demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
14
- demo.launch()
1
  import gradio as gr
 
2
  import torch
3
+ import os
4
+ import glob
5
+ import numpy as np
6
+ from datetime import datetime
7
+ from PIL import Image
8
+ from diffusers.utils import load_image
9
+ from diffusers import EulerDiscreteScheduler
10
+ from pipline_StableDiffusion_ConsistentID import ConsistentIDStableDiffusionPipeline
11
+ import sys
12
+ sys.path.append("./models/LLaVA")
13
+ from llava.model.builder import load_pretrained_model
14
+ from llava.mm_utils import get_model_name_from_path
15
+ from llava.eval.run_llava import eval_model
16
 
17
+ # Load LLaVA for prompt enhancement
18
+ llva_model_path = "/data6/huangjiehui_m22/pretrained_model/llava-v1.5-7b" #TODO
19
+ llva_tokenizer, llva_model, llva_image_processor, llva_context_len = load_pretrained_model(
20
+ model_path=llva_model_path,
21
+ model_base=None,
22
+ model_name=get_model_name_from_path(llva_model_path),)
23
 
24
 
25
+ @torch.inference_mode()
26
+ def Enhance_prompt(prompt,select_images):
27
+
28
+ llva_prompt = f'Please ignore the image. Enhance the following text prompt for me. You can associate more details with the character\'s gesture, environment, and decent clothing:"{prompt}".'
29
+ args = type('Args', (), {
30
+ "model_path": llva_model_path,
31
+ "model_base": None,
32
+ "model_name": get_model_name_from_path(llva_model_path),
33
+ "query": llva_prompt,
34
+ "conv_mode": None,
35
+ "image_file": select_images,
36
+ "sep": ",",
37
+ "temperature": 0,
38
+ "top_p": None,
39
+ "num_beams": 1,
40
+ "max_new_tokens": 512
41
+ })()
42
+ Enhanced_prompt = eval_model(args, llva_tokenizer, llva_model, llva_image_processor)
43
+
44
+ return Enhanced_prompt
45
+
46
+ # print(gr.__version__)
47
+ # 4.16.0
48
+ os.environ['GRADIO_TEMP_DIR'] = "/data6/huangjiehui_m22/z_benke/liaost/ConsistentID/images/gradio_tmp" #TODO
49
+
50
+ script_directory = os.path.dirname(os.path.realpath(__file__))
51
+ device = "cuda"
52
+ # TODO
53
+ base_model_path = "/data6/huangjiehui_m22/pretrained_model/Realistic_Vision_V6.0_B1_noVAE" # TODO
54
+ consistentID_path = "/data6/huangjiehui_m22/z_benke/liaost/ConsistentID/models/ConsistentID_model_facemask_pretrain_50w.bin" # TODO
55
+
56
+ ### Load base model
57
+ pipe = ConsistentIDStableDiffusionPipeline.from_pretrained(
58
+ base_model_path,
59
+ torch_dtype=torch.float16,
60
+ use_safetensors=True,
61
+ variant="fp16"
62
+ ).to(device)
63
+
64
+ ### Load consistentID_model checkpoint
65
+ pipe.load_ConsistentID_model(
66
+ os.path.dirname(consistentID_path),
67
+ subfolder="",
68
+ weight_name=os.path.basename(consistentID_path),
69
+ trigger_word="img",
70
+ )
71
+ pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
72
+
73
+ def process(selected_template_images,costum_image,prompt
74
+ ,negative_prompt,prompt_selected,retouching,model_selected_tab,prompt_selected_tab,width,height,merge_steps):
75
+
76
+ if model_selected_tab==0:
77
+ select_images = load_image(Image.open(selected_template_images))
78
+ else:
79
+ select_images = load_image(Image.fromarray(costum_image))
80
+
81
+ if prompt_selected_tab==0:
82
+ prompt = prompt_selected
83
+ negative_prompt = ""
84
+ need_safetycheck = False
85
+ else:
86
+ need_safetycheck = True
87
+
88
+
89
+ # hyper-parameter
90
+ num_steps = 50
91
+ # merge_steps = 30
92
+
93
+
94
+ if prompt == "":
95
+ prompt = "A man, in a forest"
96
+ prompt = "A man, with backpack, in a raining tropical forest, adventuring, holding a flashlight, in mist, seeking animals"
97
+ prompt = "A person, in a sowm, wearing santa hat and a scarf, with a cottage behind"
98
+ else:
99
+ prompt=Enhance_prompt(prompt,Image.new('RGB', (200, 200), color = 'white'))
100
+ print(prompt)
101
+ pass
102
+
103
+ if negative_prompt == "":
104
+ negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
105
+
106
+ #Extend Prompt
107
+ prompt = "cinematic photo," + prompt + ", 50mm photograph, half-length portrait, film, bokeh, professional, 4k, highly detailed"
108
+
109
+ negtive_prompt_group="((cross-eye)),((cross-eyed)),(((NFSW))),(nipple),((((ugly)))), (((duplicate))), ((morbid)), ((mutilated)), [out of frame], extra fingers, mutated hands, ((poorly drawn hands)), ((poorly drawn face)), (((mutation))), (((deformed))), ((ugly)), blurry, ((bad anatomy)), (((bad proportions))), ((extra limbs)), cloned face, (((disfigured))). out of frame, ugly, extra limbs, (bad anatomy), gross proportions, (malformed limbs), ((missing arms)), ((missing legs)), (((extra arms))), (((extra legs))), mutated hands, (fused fingers), (too many fingers), (((long neck)))"
110
+ negative_prompt = negative_prompt + negtive_prompt_group
111
+
112
+ seed = torch.randint(0, 1000, (1,)).item()
113
+ generator = torch.Generator(device=device).manual_seed(seed)
114
+
115
+ images = pipe(
116
+ prompt=prompt,
117
+ width=width,
118
+ height=height,
119
+ input_id_images=select_images,
120
+ negative_prompt=negative_prompt,
121
+ num_images_per_prompt=1,
122
+ num_inference_steps=num_steps,
123
+ start_merge_step=merge_steps,
124
+ generator=generator,
125
+ retouching=retouching,
126
+ need_safetycheck=need_safetycheck,
127
+ ).images[0]
128
+
129
+ current_date = datetime.today()
130
+ return np.array(images)
131
+
132
+ # Gets the templates
133
+ script_directory = os.path.dirname(os.path.realpath(__file__))
134
+ preset_template = glob.glob("./images/templates/*.png")
135
+ preset_template = preset_template + glob.glob("./images/templates/*.jpg")
136
+
137
+
138
+ with gr.Blocks(title="ConsistentID Demo") as demo:
139
+ gr.Markdown("# ConsistentID Demo")
140
+ gr.Markdown("\
141
+ Put the reference image to be redrawn into the box below (there is a small chance the reference fails; simply submit again).")
142
+ gr.Markdown("\
143
+ If you find our work interesting, please give us a star on GitHub!<br>\
144
+ https://github.com/JackAILab/ConsistentID")
145
+ with gr.Row():
146
+ with gr.Column():
147
+ model_selected_tab = gr.State(0)
148
+ with gr.TabItem("template images") as template_images_tab:
149
+ template_gallery_list = [(i, i) for i in preset_template]
150
+ gallery = gr.Gallery(template_gallery_list,columns=[4], rows=[2], object_fit="contain", height="auto",show_label=False)
151
+
152
+ def select_function(evt: gr.SelectData):
153
+ return preset_template[evt.index]
154
+
155
+ selected_template_images = gr.Text(show_label=False, visible=False, placeholder="Selected")
156
+ gallery.select(select_function, None, selected_template_images)
157
+ with gr.TabItem("Upload Image") as upload_image_tab:
158
+ costum_image = gr.Image(label="Upload Image")
159
+
160
+ model_selected_tabs = [template_images_tab, upload_image_tab]
161
+ for i, tab in enumerate(model_selected_tabs):
162
+ tab.select(fn=lambda tabnum=i: tabnum, inputs=[], outputs=[model_selected_tab])
163
+
164
+ with gr.Column():
165
+ prompt_selected_tab = gr.State(0)
166
+ with gr.TabItem("template prompts") as template_prompts_tab:
167
+ prompt_selected = gr.Dropdown(value="A person, police officer, half body shot", elem_id='dropdown', choices=[
168
+ "A woman in a wedding dress",
169
+ "A woman, queen, in a gorgeous palace",
170
+ "A man sitting at the beach with sunset",
171
+ "A person, police officer, half body shot",
172
+ "A man, sailor, in a boat above ocean",
173
+ "A women wearing headphone, listening music",
174
+ "A man, firefighter, half body shot"], label=f"prepared prompts")
175
+
176
+ with gr.TabItem("custom prompt") as custom_prompt_tab:
177
+ prompt = gr.Textbox(label="prompt",placeholder="A man/woman wearing a santa hat")
178
+ nagetive_prompt = gr.Textbox(label="negative prompt",placeholder="monochrome, lowres, bad anatomy, worst quality, low quality, blurry")
179
+
180
+ prompt_selected_tabs = [template_prompts_tab, custom_prompt_tab]
181
+ for i, tab in enumerate(prompt_selected_tabs):
182
+ tab.select(fn=lambda tabnum=i: tabnum, inputs=[], outputs=[prompt_selected_tab])
183
+
184
+ retouching = gr.Checkbox(label="face retouching",value=False)
185
+ width = gr.Slider(label="image width",minimum=256,maximum=768,value=512,step=8)
186
+ height = gr.Slider(label="image height",minimum=256,maximum=768,value=768,step=8)
187
+ width.release(lambda x,y: min(1280-x,y), inputs=[width,height], outputs=[height])
188
+ height.release(lambda x,y: min(1280-y,x), inputs=[width,height], outputs=[width])
189
+ merge_steps = gr.Slider(label="step at which to start merging facial details (30 is recommended)",minimum=10,maximum=50,value=30,step=1)
190
+
191
+ btn = gr.Button("Run")
192
+ with gr.Column():
193
+ out = gr.Image(label="Output")
194
+ gr.Markdown('''
195
+ N.B.:<br/>
196
+ - If the face occupies too small a portion of the image, errors become slightly more likely and identity similarity drops noticeably.
197
+ - Prefer prompts that say \"man\" or \"woman\" rather than \"person\"; otherwise the model may be unsure whether the subject is male or female.
198
+ - Due to limited GPU memory on the demo server, there is an upper limit on the output resolution. SDXL generation will be supported as soon as possible.<br/><br/>
199
+ ''')
200
+ btn.click(fn=process, inputs=[selected_template_images,costum_image,prompt,nagetive_prompt,prompt_selected,retouching
201
+ ,model_selected_tab,prompt_selected_tab,width,height,merge_steps], outputs=out)
202
+
203
+ demo.launch()
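
A note on the tab callbacks registered above with tab.select(fn=lambda tabnum=i: tabnum, ...): the loop index is bound through a default argument on purpose. The short, self-contained sketch below (plain Python, independent of this repository) shows why; a plain closure would make every tab report the index of the last one.

    # Late binding: a plain closure reads the loop variable when it is called,
    # while a default argument snapshots its value when the lambda is defined.
    plain = [lambda: i for i in range(3)]
    bound = [lambda tabnum=i: tabnum for i in range(3)]
    print([f() for f in plain])   # [2, 2, 2] -- every callback sees the final i
    print([f() for f in bound])   # [0, 1, 2] -- each callback keeps its own index
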
attention.py ADDED
@@ -0,0 +1,288 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from diffusers.models.lora import LoRALinearLayer
5
+ from functions import AttentionMLP
6
+
7
+
8
+ class FuseModule(nn.Module):
9
+ def __init__(self, embed_dim):
10
+ super().__init__()
11
+ self.mlp1 = MLP(embed_dim * 2, embed_dim, embed_dim, use_residual=False)
12
+ self.mlp2 = MLP(embed_dim, embed_dim, embed_dim, use_residual=True)
13
+ self.layer_norm = nn.LayerNorm(embed_dim)
14
+
15
+ def fuse_fn(self, prompt_embeds, id_embeds):
16
+ stacked_id_embeds = torch.cat([prompt_embeds, id_embeds], dim=-1)
17
+ stacked_id_embeds = self.mlp1(stacked_id_embeds) + prompt_embeds
18
+ stacked_id_embeds = self.mlp2(stacked_id_embeds)
19
+ stacked_id_embeds = self.layer_norm(stacked_id_embeds)
20
+ return stacked_id_embeds
21
+
22
+ def forward(
23
+ self,
24
+ prompt_embeds,
25
+ id_embeds,
26
+ class_tokens_mask,
27
+ valid_id_mask,
28
+ ) -> torch.Tensor:
29
+ id_embeds = id_embeds.to(prompt_embeds.dtype)
30
+ batch_size, max_num_inputs = id_embeds.shape[:2] # 1,5
31
+ seq_length = prompt_embeds.shape[1] # 77
32
+ flat_id_embeds = id_embeds.view(-1, id_embeds.shape[-2], id_embeds.shape[-1])
33
+ # flat_id_embeds torch.Size([5, 1, 768])
34
+ valid_id_embeds = flat_id_embeds[valid_id_mask.flatten()]
35
+ # valid_id_embeds torch.Size([4, 1, 768])
36
+ prompt_embeds = prompt_embeds.view(-1, prompt_embeds.shape[-1]) # torch.Size([77, 768])
37
+ class_tokens_mask = class_tokens_mask.view(-1) # torch.Size([77])
38
+ valid_id_embeds = valid_id_embeds.view(-1, valid_id_embeds.shape[-1]) # torch.Size([4, 768])
39
+ image_token_embeds = prompt_embeds[class_tokens_mask] # torch.Size([4, 768])
40
+ stacked_id_embeds = self.fuse_fn(image_token_embeds, valid_id_embeds) # torch.Size([4, 768])
41
+ assert class_tokens_mask.sum() == stacked_id_embeds.shape[0], f"{class_tokens_mask.sum()} != {stacked_id_embeds.shape[0]}"
42
+ prompt_embeds.masked_scatter_(class_tokens_mask[:, None], stacked_id_embeds.to(prompt_embeds.dtype))
43
+ updated_prompt_embeds = prompt_embeds.view(batch_size, seq_length, -1)
44
+
45
+ return updated_prompt_embeds
46
+
47
+ class MLP(nn.Module):
48
+ def __init__(self, in_dim, out_dim, hidden_dim, use_residual=True):
49
+ super().__init__()
50
+ if use_residual:
51
+ assert in_dim == out_dim
52
+ self.layernorm = nn.LayerNorm(in_dim)
53
+ self.fc1 = nn.Linear(in_dim, hidden_dim)
54
+ self.fc2 = nn.Linear(hidden_dim, out_dim)
55
+ self.use_residual = use_residual
56
+ self.act_fn = nn.GELU()
57
+
58
+ def forward(self, x):
59
+
60
+ residual = x
61
+ x = self.layernorm(x)
62
+ x = self.fc1(x)
63
+ x = self.act_fn(x)
64
+ x = self.fc2(x)
65
+ if self.use_residual:
66
+ x = x + residual
67
+ return x
68
+
69
+ class FacialEncoder(nn.Module):
70
+ def __init__(self,image_CLIPModel_encoder=None):
71
+ super().__init__()
72
+ self.visual_projection = AttentionMLP()
73
+ self.fuse_module = FuseModule(768)
74
+
75
+ def forward(self, prompt_embeds, multi_image_embeds, class_tokens_mask, valid_id_mask):
76
+
77
+ bs, num_inputs, token_length, image_dim = multi_image_embeds.shape
78
+ multi_image_embeds_view = multi_image_embeds.view(bs * num_inputs, token_length, image_dim)
79
+ id_embeds = self.visual_projection(multi_image_embeds_view) # torch.Size([5, 1, 768])
80
+ id_embeds = id_embeds.view(bs, num_inputs, 1, -1)
81
+ updated_prompt_embeds = self.fuse_module(prompt_embeds, id_embeds, class_tokens_mask, valid_id_mask)
82
+
83
+ return updated_prompt_embeds
84
+
85
+ class Consistent_AttProcessor(nn.Module):
86
+
87
+ def __init__(
88
+ self,
89
+ hidden_size=None,
90
+ cross_attention_dim=None,
91
+ rank=4,
92
+ network_alpha=None,
93
+ lora_scale=1.0,
94
+ ):
95
+ super().__init__()
96
+
97
+ self.rank = rank
98
+ self.lora_scale = lora_scale
99
+
100
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
101
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
102
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
103
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
104
+
105
+ def __call__(
106
+ self,
107
+ attn,
108
+ hidden_states,
109
+ encoder_hidden_states=None,
110
+ attention_mask=None,
111
+ temb=None,
112
+ ):
113
+ residual = hidden_states
114
+
115
+ if attn.spatial_norm is not None:
116
+ hidden_states = attn.spatial_norm(hidden_states, temb)
117
+
118
+ input_ndim = hidden_states.ndim
119
+
120
+ if input_ndim == 4:
121
+ batch_size, channel, height, width = hidden_states.shape
122
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
123
+
124
+ batch_size, sequence_length, _ = (
125
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
126
+ )
127
+
128
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
129
+
130
+ if attn.group_norm is not None:
131
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
132
+
133
+ query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
134
+
135
+ if encoder_hidden_states is None:
136
+ encoder_hidden_states = hidden_states
137
+ elif attn.norm_cross:
138
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
139
+
140
+ key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
141
+ value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
142
+
143
+ query = attn.head_to_batch_dim(query)
144
+ key = attn.head_to_batch_dim(key)
145
+ value = attn.head_to_batch_dim(value)
146
+
147
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
148
+ hidden_states = torch.bmm(attention_probs, value)
149
+ hidden_states = attn.batch_to_head_dim(hidden_states)
150
+
151
+ # linear proj
152
+ hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
153
+ # dropout
154
+ hidden_states = attn.to_out[1](hidden_states)
155
+
156
+ if input_ndim == 4:
157
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
158
+
159
+ if attn.residual_connection:
160
+ hidden_states = hidden_states + residual
161
+
162
+ hidden_states = hidden_states / attn.rescale_output_factor
163
+
164
+ return hidden_states
165
+
166
+
167
+ class Consistent_IPAttProcessor(nn.Module):
168
+
169
+ def __init__(
170
+ self,
171
+ hidden_size,
172
+ cross_attention_dim=None,
173
+ rank=4,
174
+ network_alpha=None,
175
+ lora_scale=1.0,
176
+ scale=1.0,
177
+ num_tokens=4):
178
+ super().__init__()
179
+
180
+ self.rank = rank
181
+ self.lora_scale = lora_scale
182
+ self.num_tokens = num_tokens
183
+
184
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
185
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
186
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
187
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
188
+
189
+
190
+ self.hidden_size = hidden_size
191
+ self.cross_attention_dim = cross_attention_dim
192
+ self.scale = scale
193
+
194
+ self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
195
+ self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
196
+
197
+ for module in [self.to_q_lora, self.to_k_lora, self.to_v_lora, self.to_out_lora, self.to_k_ip, self.to_v_ip]:
198
+ for param in module.parameters():
199
+ param.requires_grad = False
200
+
201
+ def __call__(
202
+ self,
203
+ attn,
204
+ hidden_states,
205
+ encoder_hidden_states=None,
206
+ attention_mask=None,
207
+ scale=1.0,
208
+ temb=None,
209
+ ):
210
+ residual = hidden_states
211
+
212
+ if attn.spatial_norm is not None:
213
+ hidden_states = attn.spatial_norm(hidden_states, temb)
214
+
215
+ input_ndim = hidden_states.ndim
216
+
217
+ if input_ndim == 4:
218
+ batch_size, channel, height, width = hidden_states.shape
219
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
220
+
221
+ batch_size, sequence_length, _ = (
222
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
223
+ )
224
+
225
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
226
+
227
+ if attn.group_norm is not None:
228
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
229
+
230
+ query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
231
+
232
+ if encoder_hidden_states is None:
233
+ encoder_hidden_states = hidden_states
234
+ else:
235
+ end_pos = encoder_hidden_states.shape[1] - self.num_tokens
236
+ encoder_hidden_states, ip_hidden_states = (
237
+ encoder_hidden_states[:, :end_pos, :],
238
+ encoder_hidden_states[:, end_pos:, :],
239
+ )
240
+ if attn.norm_cross:
241
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
242
+
243
+ key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
244
+ value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
245
+
246
+ inner_dim = key.shape[-1]
247
+ head_dim = inner_dim // attn.heads
248
+
249
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
250
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
251
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
252
+
253
+ hidden_states = F.scaled_dot_product_attention(
254
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
255
+ )
256
+
257
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
258
+ hidden_states = hidden_states.to(query.dtype)
259
+
260
+ ip_key = self.to_k_ip(ip_hidden_states)
261
+ ip_value = self.to_v_ip(ip_hidden_states)
262
+ ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
263
+ ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
264
+
265
+
266
+ ip_hidden_states = F.scaled_dot_product_attention(
267
+ query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
268
+ )
269
+
270
+ ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
271
+ ip_hidden_states = ip_hidden_states.to(query.dtype)
272
+
273
+ hidden_states = hidden_states + self.scale * ip_hidden_states
274
+
275
+ # linear proj
276
+ hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
277
+ # dropout
278
+ hidden_states = attn.to_out[1](hidden_states)
279
+
280
+ if input_ndim == 4:
281
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
282
+
283
+ if attn.residual_connection:
284
+ hidden_states = hidden_states + residual
285
+
286
+ hidden_states = hidden_states / attn.rescale_output_factor
287
+
288
+ return hidden_states
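
This file defines the attention processors, but the commit does not show how they are attached to the UNet; that presumably happens inside pipline_StableDiffusion_ConsistentID.load_ConsistentID_model. The sketch below is only an assumption about that wiring, following the standard diffusers IP-Adapter pattern; the checkpoint path is a placeholder and not part of this repository.

    from diffusers import UNet2DConditionModel
    from attention import Consistent_AttProcessor, Consistent_IPAttProcessor

    # Placeholder path: any Stable Diffusion 1.5-style checkpoint directory.
    unet = UNet2DConditionModel.from_pretrained("path/to/sd15_base", subfolder="unet")

    attn_procs = {}
    for name in unet.attn_processors.keys():
        # Self-attention layers ("attn1") have no cross-attention dimension.
        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
        if name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif name.startswith("up_blocks"):
            block_id = int(name[len("up_blocks.")])
            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
        else:  # down_blocks
            block_id = int(name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

        if cross_attention_dim is None:
            # Plain LoRA-augmented self-attention.
            attn_procs[name] = Consistent_AttProcessor(hidden_size=hidden_size, cross_attention_dim=None)
        else:
            # Cross-attention that also attends to the appended identity tokens.
            attn_procs[name] = Consistent_IPAttProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)

    unet.set_attn_processor(attn_procs)
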
functions.py ADDED
@@ -0,0 +1,599 @@
1
+ import numpy as np
2
+ import math
3
+ import types
4
+ import torch
5
+ import torch.nn as nn
6
+ import numpy as np
7
+ import cv2
8
+ import re
9
+ import torch.nn.functional as F
10
+ from einops import rearrange
11
+ from einops.layers.torch import Rearrange
12
+ from PIL import Image
13
+
14
+ def extract_first_sentence(text):
15
+ end_index = text.find('.')
16
+ if end_index != -1:
17
+ first_sentence = text[:end_index + 1]
18
+ return first_sentence.strip()
19
+ else:
20
+ return text.strip()
21
+
22
+ import re
23
+ def remove_duplicate_keywords(text, keywords):
24
+ keyword_counts = {}
25
+
26
+ words = re.findall(r'\b\w+\b|[.,;!?]', text)
27
+
28
+ for keyword in keywords:
29
+ keyword_counts[keyword] = 0
30
+ for i, word in enumerate(words):
31
+ if word.lower() == keyword.lower():
32
+ keyword_counts[keyword] += 1
33
+ if keyword_counts[keyword] > 1:
34
+ words[i] = ""
35
+ processed_text = " ".join(words)
36
+
37
+ return processed_text
38
+
39
+ def process_text_with_markers(text, parsing_mask_list):
40
+ keywords = ["face", "ears", "eyes", "nose", "mouth"]
41
+ text = remove_duplicate_keywords(text, keywords)
42
+ key_parsing_mask_markers = ["Face", "Left_Ear", "Right_Ear", "Left_Eye", "Right_Eye", "Nose", "Upper_Lip", "Lower_Lip"]
43
+ mapping = {
44
+ "Face": "face",
45
+ "Left_Ear": "ears",
46
+ "Right_Ear": "ears",
47
+ "Left_Eye": "eyes",
48
+ "Right_Eye": "eyes",
49
+ "Nose": "nose",
50
+ "Upper_Lip": "mouth",
51
+ "Lower_Lip": "mouth",
52
+ }
53
+ facial_features_align = []
54
+ markers_align = []
55
+ for key in key_parsing_mask_markers:
56
+ if key in parsing_mask_list:
57
+ mapped_key = mapping.get(key, key.lower())
58
+ if mapped_key not in facial_features_align:
59
+ facial_features_align.append(mapped_key)
60
+ markers_align.append("<|"+mapped_key+"|>")
61
+
62
+ text_marked = text
63
+ align_parsing_mask_list = parsing_mask_list
64
+ for feature, marker in zip(facial_features_align[::-1], markers_align[::-1]):
65
+ pattern = rf'\b{feature}\b'
66
+ text_marked_new = re.sub(pattern, f'{feature} {marker}', text_marked, count=1)
67
+ if text_marked == text_marked_new:
68
+ for key, value in mapping.items():
69
+ if value == feature:
70
+ if key in align_parsing_mask_list:
71
+ del align_parsing_mask_list[key]
72
+
73
+ text_marked = text_marked_new
74
+
75
+ text_marked = text_marked.replace('\n', '')
76
+
77
+ ordered_text = []
78
+ text_none_makers = []
79
+ facial_marked_count = 0
80
+ skip_count = 0
81
+ for marker in markers_align:
82
+ start_idx = text_marked.find(marker)
83
+ end_idx = start_idx + len(marker)
84
+
85
+ while start_idx > 0 and text_marked[start_idx - 1] not in [",", ".", ";"]:
86
+ start_idx -= 1
87
+
88
+ while end_idx < len(text_marked) and text_marked[end_idx] not in [",", ".", ";"]:
89
+ end_idx += 1
90
+
91
+ context = text_marked[start_idx:end_idx].strip()
92
+ if context == "":
93
+ text_none_makers.append(text_marked[:end_idx])
94
+ else:
95
+ if skip_count!=0:
96
+ skip_count -= 1
97
+ continue
98
+ else:
99
+ ordered_text.append(context + ",")
100
+ text_delete_makers = text_marked[:start_idx] + text_marked[end_idx:]
101
+ text_marked = text_delete_makers
102
+ facial_marked_count += 1
103
+
104
+ align_marked_text = " ".join(ordered_text)
105
+ replace_list = ["<|face|>", "<|ears|>", "<|nose|>", "<|eyes|>", "<|mouth|>"]
106
+ for item in replace_list:
107
+ align_marked_text = align_marked_text.replace(item, "<|facial|>")
108
+
109
+ return align_marked_text, align_parsing_mask_list
110
+
111
+ def tokenize_and_mask_noun_phrases_ends(text, image_token_id, facial_token_id, tokenizer):
112
+ input_ids = tokenizer.encode(text)
113
+ image_noun_phrase_end_mask = [False for _ in input_ids]
114
+ facial_noun_phrase_end_mask = [False for _ in input_ids]
115
+ clean_input_ids = []
116
+ clean_index = 0
117
+ image_num = 0
118
+
119
+ for i, id in enumerate(input_ids):
120
+ if id == image_token_id:
121
+ image_noun_phrase_end_mask[clean_index + image_num - 1] = True
122
+ image_num += 1
123
+ elif id == facial_token_id:
124
+ facial_noun_phrase_end_mask[clean_index - 1] = True
125
+ else:
126
+ clean_input_ids.append(id)
127
+ clean_index += 1
128
+
129
+ max_len = tokenizer.model_max_length
130
+
131
+ if len(clean_input_ids) > max_len:
132
+ clean_input_ids = clean_input_ids[:max_len]
133
+ else:
134
+ clean_input_ids = clean_input_ids + [tokenizer.pad_token_id] * (
135
+ max_len - len(clean_input_ids)
136
+ )
137
+
138
+ if len(image_noun_phrase_end_mask) > max_len:
139
+ image_noun_phrase_end_mask = image_noun_phrase_end_mask[:max_len]
140
+ else:
141
+ image_noun_phrase_end_mask = image_noun_phrase_end_mask + [False] * (
142
+ max_len - len(image_noun_phrase_end_mask)
143
+ )
144
+
145
+ if len(facial_noun_phrase_end_mask) > max_len:
146
+ facial_noun_phrase_end_mask = facial_noun_phrase_end_mask[:max_len]
147
+ else:
148
+ facial_noun_phrase_end_mask = facial_noun_phrase_end_mask + [False] * (
149
+ max_len - len(facial_noun_phrase_end_mask)
150
+ )
151
+ clean_input_ids = torch.tensor(clean_input_ids, dtype=torch.long)
152
+ image_noun_phrase_end_mask = torch.tensor(image_noun_phrase_end_mask, dtype=torch.bool)
153
+ facial_noun_phrase_end_mask = torch.tensor(facial_noun_phrase_end_mask, dtype=torch.bool)
154
+
155
+ return clean_input_ids.unsqueeze(0), image_noun_phrase_end_mask.unsqueeze(0), facial_noun_phrase_end_mask.unsqueeze(0)
156
+
157
+ def prepare_image_token_idx(image_token_mask, facial_token_mask, max_num_objects=2, max_num_facials=5):
158
+ image_token_idx = torch.nonzero(image_token_mask, as_tuple=True)[1]
159
+ image_token_idx_mask = torch.ones_like(image_token_idx, dtype=torch.bool)
160
+ if len(image_token_idx) < max_num_objects:
161
+ image_token_idx = torch.cat(
162
+ [
163
+ image_token_idx,
164
+ torch.zeros(max_num_objects - len(image_token_idx), dtype=torch.long),
165
+ ]
166
+ )
167
+ image_token_idx_mask = torch.cat(
168
+ [
169
+ image_token_idx_mask,
170
+ torch.zeros(
171
+ max_num_objects - len(image_token_idx_mask),
172
+ dtype=torch.bool,
173
+ ),
174
+ ]
175
+ )
176
+ facial_token_idx = torch.nonzero(facial_token_mask, as_tuple=True)[1]
177
+ facial_token_idx_mask = torch.ones_like(facial_token_idx, dtype=torch.bool)
178
+ if len(facial_token_idx) < max_num_facials:
179
+ facial_token_idx = torch.cat(
180
+ [
181
+ facial_token_idx,
182
+ torch.zeros(max_num_facials - len(facial_token_idx), dtype=torch.long),
183
+ ]
184
+ )
185
+ facial_token_idx_mask = torch.cat(
186
+ [
187
+ facial_token_idx_mask,
188
+ torch.zeros(
189
+ max_num_facials - len(facial_token_idx_mask),
190
+ dtype=torch.bool,
191
+ ),
192
+ ]
193
+ )
194
+ image_token_idx = image_token_idx.unsqueeze(0)
195
+ image_token_idx_mask = image_token_idx_mask.unsqueeze(0)
196
+
197
+ facial_token_idx = facial_token_idx.unsqueeze(0)
198
+ facial_token_idx_mask = facial_token_idx_mask.unsqueeze(0)
199
+
200
+ return image_token_idx, image_token_idx_mask, facial_token_idx, facial_token_idx_mask
201
+
202
+ def get_object_localization_loss_for_one_layer(
203
+ cross_attention_scores,
204
+ object_segmaps,
205
+ object_token_idx,
206
+ object_token_idx_mask,
207
+ loss_fn,
208
+ ):
209
+ bxh, num_noise_latents, num_text_tokens = cross_attention_scores.shape
210
+ b, max_num_objects, _, _ = object_segmaps.shape
211
+ size = int(num_noise_latents**0.5)
212
+
213
+ object_segmaps = F.interpolate(object_segmaps, size=(size, size), mode="bilinear", antialias=True)
214
+
215
+ object_segmaps = object_segmaps.view(
216
+ b, max_num_objects, -1
217
+ )
218
+
219
+ num_heads = bxh // b
220
+ cross_attention_scores = cross_attention_scores.view(b, num_heads, num_noise_latents, num_text_tokens)
221
+
222
+
223
+ object_token_attn_prob = torch.gather(
224
+ cross_attention_scores,
225
+ dim=3,
226
+ index=object_token_idx.view(b, 1, 1, max_num_objects).expand(
227
+ b, num_heads, num_noise_latents, max_num_objects
228
+ ),
229
+ )
230
+ object_segmaps = (
231
+ object_segmaps.permute(0, 2, 1)
232
+ .unsqueeze(1)
233
+ .expand(b, num_heads, num_noise_latents, max_num_objects)
234
+ )
235
+ loss = loss_fn(object_token_attn_prob, object_segmaps)
236
+
237
+ loss = loss * object_token_idx_mask.view(b, 1, max_num_objects)
238
+ object_token_cnt = object_token_idx_mask.sum(dim=1).view(b, 1) + 1e-5
239
+ loss = (loss.sum(dim=2) / object_token_cnt).mean()
240
+
241
+ return loss
242
+
243
+
244
+ def get_object_localization_loss(
245
+ cross_attention_scores,
246
+ object_segmaps,
247
+ image_token_idx,
248
+ image_token_idx_mask,
249
+ loss_fn,
250
+ ):
251
+ num_layers = len(cross_attention_scores)
252
+ loss = 0
253
+ for k, v in cross_attention_scores.items():
254
+ layer_loss = get_object_localization_loss_for_one_layer(
255
+ v, object_segmaps, image_token_idx, image_token_idx_mask, loss_fn
256
+ )
257
+ loss += layer_loss
258
+ return loss / num_layers
259
+
260
+ def unet_store_cross_attention_scores(unet, attention_scores, layers=5):
261
+ from diffusers.models.attention_processor import Attention
262
+
263
+ UNET_LAYER_NAMES = [
264
+ "down_blocks.0",
265
+ "down_blocks.1",
266
+ "down_blocks.2",
267
+ "mid_block",
268
+ "up_blocks.1",
269
+ "up_blocks.2",
270
+ "up_blocks.3",
271
+ ]
272
+
273
+ start_layer = (len(UNET_LAYER_NAMES) - layers) // 2
274
+ end_layer = start_layer + layers
275
+ applicable_layers = UNET_LAYER_NAMES[start_layer:end_layer]
276
+
277
+ def make_new_get_attention_scores_fn(name):
278
+ def new_get_attention_scores(module, query, key, attention_mask=None):
279
+ attention_probs = module.old_get_attention_scores(
280
+ query, key, attention_mask
281
+ )
282
+ attention_scores[name] = attention_probs
283
+ return attention_probs
284
+
285
+ return new_get_attention_scores
286
+
287
+ for name, module in unet.named_modules():
288
+ if isinstance(module, Attention) and "attn1" in name:
289
+ if not any(layer in name for layer in applicable_layers):
290
+ continue
291
+
292
+ module.old_get_attention_scores = module.get_attention_scores
293
+ module.get_attention_scores = types.MethodType(
294
+ make_new_get_attention_scores_fn(name), module
295
+ )
296
+ return unet
297
+
298
+ class BalancedL1Loss(nn.Module):
299
+ def __init__(self, threshold=1.0, normalize=False):
300
+ super().__init__()
301
+ self.threshold = threshold
302
+ self.normalize = normalize
303
+
304
+ def forward(self, object_token_attn_prob, object_segmaps):
305
+ if self.normalize:
306
+ object_token_attn_prob = object_token_attn_prob / (
307
+ object_token_attn_prob.max(dim=2, keepdim=True)[0] + 1e-5
308
+ )
309
+ background_segmaps = 1 - object_segmaps
310
+ background_segmaps_sum = background_segmaps.sum(dim=2) + 1e-5
311
+ object_segmaps_sum = object_segmaps.sum(dim=2) + 1e-5
312
+
313
+ background_loss = (object_token_attn_prob * background_segmaps).sum(
314
+ dim=2
315
+ ) / background_segmaps_sum
316
+
317
+ object_loss = (object_token_attn_prob * object_segmaps).sum(
318
+ dim=2
319
+ ) / object_segmaps_sum
320
+
321
+ return background_loss - object_loss
322
+
323
+ def fetch_mask_raw_image(raw_image, mask_image):
324
+
325
+ mask_image = mask_image.resize(raw_image.size)
326
+ mask_raw_image = Image.composite(raw_image, Image.new('RGB', raw_image.size, (0, 0, 0)), mask_image)
327
+
328
+ return mask_raw_image
329
+
330
+ mapping_table = [
331
+ {"Mask Value": 0, "Body Part": "Background", "RGB Color": [0, 0, 0]},
332
+ {"Mask Value": 1, "Body Part": "Face", "RGB Color": [255, 0, 0]},
333
+ {"Mask Value": 2, "Body Part": "Left_Eyebrow", "RGB Color": [255, 85, 0]},
334
+ {"Mask Value": 3, "Body Part": "Right_Eyebrow", "RGB Color": [255, 170, 0]},
335
+ {"Mask Value": 4, "Body Part": "Left_Eye", "RGB Color": [255, 0, 85]},
336
+ {"Mask Value": 5, "Body Part": "Right_Eye", "RGB Color": [255, 0, 170]},
337
+ {"Mask Value": 6, "Body Part": "Hair", "RGB Color": [0, 0, 255]},
338
+ {"Mask Value": 7, "Body Part": "Left_Ear", "RGB Color": [85, 0, 255]},
339
+ {"Mask Value": 8, "Body Part": "Right_Ear", "RGB Color": [170, 0, 255]},
340
+ {"Mask Value": 9, "Body Part": "Mouth_External Contour", "RGB Color": [0, 255, 85]},
341
+ {"Mask Value": 10, "Body Part": "Nose", "RGB Color": [0, 255, 0]},
342
+ {"Mask Value": 11, "Body Part": "Mouth_Inner_Contour", "RGB Color": [0, 255, 170]},
343
+ {"Mask Value": 12, "Body Part": "Upper_Lip", "RGB Color": [85, 255, 0]},
344
+ {"Mask Value": 13, "Body Part": "Lower_Lip", "RGB Color": [170, 255, 0]},
345
+ {"Mask Value": 14, "Body Part": "Neck", "RGB Color": [0, 85, 255]},
346
+ {"Mask Value": 15, "Body Part": "Neck_Inner Contour", "RGB Color": [0, 170, 255]},
347
+ {"Mask Value": 16, "Body Part": "Cloth", "RGB Color": [255, 255, 0]},
348
+ {"Mask Value": 17, "Body Part": "Hat", "RGB Color": [255, 0, 255]},
349
+ {"Mask Value": 18, "Body Part": "Earring", "RGB Color": [255, 85, 255]},
350
+ {"Mask Value": 19, "Body Part": "Necklace", "RGB Color": [255, 255, 85]},
351
+ {"Mask Value": 20, "Body Part": "Glasses", "RGB Color": [255, 170, 255]},
352
+ {"Mask Value": 21, "Body Part": "Hand", "RGB Color": [255, 0, 255]},
353
+ {"Mask Value": 22, "Body Part": "Wristband", "RGB Color": [0, 255, 255]},
354
+ {"Mask Value": 23, "Body Part": "Clothes_Upper", "RGB Color": [85, 255, 255]},
355
+ {"Mask Value": 24, "Body Part": "Clothes_Lower", "RGB Color": [170, 255, 255]}
356
+ ]
357
+
358
+
359
+ def masks_for_unique_values(image_raw_mask):
360
+
361
+ image_array = np.array(image_raw_mask)
362
+ unique_values, counts = np.unique(image_array, return_counts=True)
363
+ masks_dict = {}
364
+ for value in unique_values:
365
+ binary_image = np.uint8(image_array == value) * 255
366
+ contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
367
+
368
+ mask = np.zeros_like(image_array)
369
+ for contour in contours:
370
+ cv2.drawContours(mask, [contour], -1, (255), thickness=cv2.FILLED)
371
+
372
+ if value == 0:
373
+ body_part="WithoutBackground"
374
+ mask2 = np.where(mask == 255, 0, 255).astype(mask.dtype)
375
+ masks_dict[body_part] = Image.fromarray(mask2)
376
+
377
+ body_part = next((entry["Body Part"] for entry in mapping_table if entry["Mask Value"] == value), f"Unknown_{value}")
378
+ if body_part.startswith("Unknown_"):
379
+ continue
380
+
381
+ masks_dict[body_part] = Image.fromarray(mask)
382
+
383
+ return masks_dict
384
+ # FFN
385
+ def FeedForward(dim, mult=4):
386
+ inner_dim = int(dim * mult)
387
+ return nn.Sequential(
388
+ nn.LayerNorm(dim),
389
+ nn.Linear(dim, inner_dim, bias=False),
390
+ nn.GELU(),
391
+ nn.Linear(inner_dim, dim, bias=False),
392
+ )
393
+
394
+
395
+ def reshape_tensor(x, heads):
396
+ bs, length, width = x.shape
397
+ x = x.view(bs, length, heads, -1)
398
+ x = x.transpose(1, 2)
399
+ x = x.reshape(bs, heads, length, -1)
400
+ return x
401
+
402
+ class PerceiverAttention(nn.Module):
403
+ def __init__(self, *, dim, dim_head=64, heads=8):
404
+ super().__init__()
405
+ self.scale = dim_head**-0.5
406
+ self.dim_head = dim_head
407
+ self.heads = heads
408
+ inner_dim = dim_head * heads
409
+
410
+ self.norm1 = nn.LayerNorm(dim)
411
+ self.norm2 = nn.LayerNorm(dim)
412
+
413
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
414
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
415
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
416
+
417
+ def forward(self, x, latents):
418
+ """
419
+ Args:
420
+ x (torch.Tensor): image features
421
+ shape (b, n1, D)
422
+ latent (torch.Tensor): latent features
423
+ shape (b, n2, D)
424
+ """
425
+
426
+ x = self.norm1(x)
427
+ latents = self.norm2(latents)
428
+
429
+ b, l, _ = latents.shape
430
+
431
+ q = self.to_q(latents)
432
+ kv_input = torch.cat((x, latents), dim=-2)
433
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
434
+
435
+ q = reshape_tensor(q, self.heads)
436
+ k = reshape_tensor(k, self.heads)
437
+ v = reshape_tensor(v, self.heads)
438
+
439
+ # attention
440
+ scale = 1 / math.sqrt(math.sqrt(self.dim_head))
441
+ weight = (q * scale) @ (k * scale).transpose(-2, -1)
442
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
443
+ out = weight @ v
444
+
445
+ out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
446
+
447
+ return self.to_out(out)
448
+
449
+ class FacePerceiverResampler(torch.nn.Module):
450
+ def __init__(
451
+ self,
452
+ *,
453
+ dim=768,
454
+ depth=4,
455
+ dim_head=64,
456
+ heads=16,
457
+ embedding_dim=1280,
458
+ output_dim=768,
459
+ ff_mult=4,
460
+ ):
461
+ super().__init__()
462
+
463
+ self.proj_in = torch.nn.Linear(embedding_dim, dim)
464
+ self.proj_out = torch.nn.Linear(dim, output_dim)
465
+ self.norm_out = torch.nn.LayerNorm(output_dim)
466
+ self.layers = torch.nn.ModuleList([])
467
+ for _ in range(depth):
468
+ self.layers.append(
469
+ torch.nn.ModuleList(
470
+ [
471
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
472
+ FeedForward(dim=dim, mult=ff_mult),
473
+ ]
474
+ )
475
+ )
476
+ def forward(self, latents, x): # latents.torch.Size([2, 4, 768]) x.torch.Size([2, 257, 1280])
477
+ x = self.proj_in(x) # x.torch.Size([2, 257, 768])
478
+ for attn, ff in self.layers:
479
+ latents = attn(x, latents) + latents # latents.torch.Size([2, 4, 768])
480
+ latents = ff(latents) + latents # latents.torch.Size([2, 4, 768])
481
+ latents = self.proj_out(latents)
482
+ return self.norm_out(latents)
483
+
484
+ class ProjPlusModel(torch.nn.Module):
485
+ def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
486
+ super().__init__()
487
+
488
+ self.cross_attention_dim = cross_attention_dim
489
+ self.num_tokens = num_tokens
490
+
491
+ self.proj = torch.nn.Sequential(
492
+ torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
493
+ torch.nn.GELU(),
494
+ torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
495
+ )
496
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
497
+
498
+ self.perceiver_resampler = FacePerceiverResampler(
499
+ dim=cross_attention_dim,
500
+ depth=4,
501
+ dim_head=64,
502
+ heads=cross_attention_dim // 64,
503
+ embedding_dim=clip_embeddings_dim,
504
+ output_dim=cross_attention_dim,
505
+ ff_mult=4,
506
+ )
507
+
508
+ def forward(self, id_embeds, clip_embeds, shortcut=False, scale=1.0):
509
+
510
+ x = self.proj(id_embeds)
511
+ x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
512
+ x = self.norm(x)
513
+ out = self.perceiver_resampler(x, clip_embeds)
514
+ if shortcut:
515
+ out = scale * x + out
516
+ return out
517
+
518
+ class AttentionMLP(nn.Module):
519
+ def __init__(
520
+ self,
521
+ dtype=torch.float16,
522
+ dim=1024,
523
+ depth=8,
524
+ dim_head=64,
525
+ heads=16,
526
+ single_num_tokens=1,
527
+ embedding_dim=1280,
528
+ output_dim=768,
529
+ ff_mult=4,
530
+ max_seq_len: int = 257*2,
531
+ apply_pos_emb: bool = False,
532
+ num_latents_mean_pooled: int = 0,
533
+ ):
534
+ super().__init__()
535
+ self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None
536
+
537
+ self.single_num_tokens = single_num_tokens
538
+ self.latents = nn.Parameter(torch.randn(1, self.single_num_tokens, dim) / dim**0.5)
539
+
540
+ self.proj_in = nn.Linear(embedding_dim, dim)
541
+
542
+ self.proj_out = nn.Linear(dim, output_dim)
543
+ self.norm_out = nn.LayerNorm(output_dim)
544
+
545
+ self.to_latents_from_mean_pooled_seq = (
546
+ nn.Sequential(
547
+ nn.LayerNorm(dim),
548
+ nn.Linear(dim, dim * num_latents_mean_pooled),
549
+ Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled),
550
+ )
551
+ if num_latents_mean_pooled > 0
552
+ else None
553
+ )
554
+
555
+ self.layers = nn.ModuleList([])
556
+ for _ in range(depth):
557
+ self.layers.append(
558
+ nn.ModuleList(
559
+ [
560
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
561
+ FeedForward(dim=dim, mult=ff_mult),
562
+ ]
563
+ )
564
+ )
565
+
566
+ def forward(self, x):
567
+ if self.pos_emb is not None:
568
+ n, device = x.shape[1], x.device
569
+ pos_emb = self.pos_emb(torch.arange(n, device=device))
570
+ x = x + pos_emb
571
+ # x torch.Size([5, 257, 1280])
572
+ latents = self.latents.repeat(x.size(0), 1, 1)
573
+
574
+ x = self.proj_in(x) # torch.Size([5, 257, 1024])
575
+
576
+ if self.to_latents_from_mean_pooled_seq:
577
+ meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool))
578
+ meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq)
579
+ latents = torch.cat((meanpooled_latents, latents), dim=-2)
580
+
581
+ for attn, ff in self.layers:
582
+ latents = attn(x, latents) + latents
583
+ latents = ff(latents) + latents
584
+
585
+ latents = self.proj_out(latents)
586
+ return self.norm_out(latents)
587
+
588
+
589
+ def masked_mean(t, *, dim, mask=None):
590
+ if mask is None:
591
+ return t.mean(dim=dim)
592
+
593
+ denom = mask.sum(dim=dim, keepdim=True)
594
+ mask = rearrange(mask, "b n -> b n 1")
595
+ masked_t = t.masked_fill(~mask, 0.0)
596
+
597
+ return masked_t.sum(dim=dim) / denom.clamp(min=1e-5)
598
+
599
+
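
As a quick orientation for the mask utilities above, the fragment below builds a tiny synthetic parsing map, splits it into per-part masks with masks_for_unique_values, and applies one of them with fetch_mask_raw_image. It is an illustrative sketch only: the toy array stands in for a real BiSeNet parsing result, and the values used (1 for Face, 10 for Nose) follow the mapping_table defined above.

    import numpy as np
    from PIL import Image
    from functions import masks_for_unique_values, fetch_mask_raw_image

    # Toy parsing map standing in for a BiSeNet output: background (0),
    # a square labelled "Face" (1) and a smaller square labelled "Nose" (10).
    parsing = np.zeros((64, 64), dtype=np.uint8)
    parsing[8:56, 8:56] = 1
    parsing[28:36, 28:36] = 10
    parsing_img = Image.fromarray(parsing)

    masks = masks_for_unique_values(parsing_img)
    print(list(masks.keys()))  # includes 'Face', 'Nose' and 'WithoutBackground'

    # Keep only the face region of an image; everything else is blacked out.
    raw = Image.new("RGB", (64, 64), (200, 180, 160))
    face_only = fetch_mask_raw_image(raw, masks["Face"])
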
images/templates/3f8d901770014c1b8f7f261971f0e92.png ADDED

Git LFS Details

  • SHA256: 4fa9319750b9927075934c40a180766e75ff539711293581dae6bac5963b9d05
  • Pointer size: 132 Bytes
  • Size of remote file: 2.06 MB
images/templates/6577b962b6346df03fea83211daaf48.png ADDED
images/templates/75583964a834abe33b72f52b1a98e84.png ADDED

Git LFS Details

  • SHA256: 318c942eb3cc8a1f9320b2ea84a88cd95067785c07f8ae1dd18fe6c4cf8e8282
  • Pointer size: 132 Bytes
  • Size of remote file: 7.54 MB
images/templates/c9fe4c2d5ddbc5670dde47fc465c48b.jpg ADDED
models/BiSeNet/6.jpg ADDED
models/BiSeNet/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ #__init__.py
2
+ # from BiSeNet.model import *
models/BiSeNet/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (198 Bytes).
 
models/BiSeNet/__pycache__/model.cpython-38.pyc ADDED
Binary file (9.18 kB).
 
models/BiSeNet/__pycache__/resnet.cpython-38.pyc ADDED
Binary file (3.62 kB).
 
models/BiSeNet/evaluate.py ADDED
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ from logger import setup_logger
5
+ from model import BiSeNet
6
+ from face_dataset import FaceMask
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.utils.data import DataLoader
11
+ import torch.nn.functional as F
12
+ import torch.distributed as dist
13
+
14
+ import os
15
+ import os.path as osp
16
+ import logging
17
+ import time
18
+ import numpy as np
19
+ from tqdm import tqdm
20
+ import math
21
+ from PIL import Image
22
+ import torchvision.transforms as transforms
23
+ import cv2
24
+
25
+ def vis_parsing_maps(im, parsing_anno, stride, save_im=False, save_path='vis_results/parsing_map_on_im.jpg'):
26
+ # Colors for all 20 parts
27
+ part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
28
+ [255, 0, 85], [255, 0, 170],
29
+ [0, 255, 0], [85, 255, 0], [170, 255, 0],
30
+ [0, 255, 85], [0, 255, 170],
31
+ [0, 0, 255], [85, 0, 255], [170, 0, 255],
32
+ [0, 85, 255], [0, 170, 255],
33
+ [255, 255, 0], [255, 255, 85], [255, 255, 170],
34
+ [255, 0, 255], [255, 85, 255], [255, 170, 255],
35
+ [0, 255, 255], [85, 255, 255], [170, 255, 255]]
36
+
37
+ im = np.array(im)
38
+ vis_im = im.copy().astype(np.uint8)
39
+ vis_parsing_anno = parsing_anno.copy().astype(np.uint8)
40
+ vis_parsing_anno = cv2.resize(vis_parsing_anno, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)
41
+ vis_parsing_anno_color = np.zeros((vis_parsing_anno.shape[0], vis_parsing_anno.shape[1], 3)) + 255
42
+
43
+ num_of_class = np.max(vis_parsing_anno)
44
+
45
+ for pi in range(1, num_of_class + 1):
46
+ index = np.where(vis_parsing_anno == pi)
47
+ vis_parsing_anno_color[index[0], index[1], :] = part_colors[pi]
48
+
49
+ vis_parsing_anno_color = vis_parsing_anno_color.astype(np.uint8)
50
+ # print(vis_parsing_anno_color.shape, vis_im.shape)
51
+ vis_im = cv2.addWeighted(cv2.cvtColor(vis_im, cv2.COLOR_RGB2BGR), 0.4, vis_parsing_anno_color, 0.6, 0)
52
+
53
+ # Save result or not
54
+ if save_im:
55
+ cv2.imwrite(save_path, vis_im, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
56
+
57
+ # return vis_im
58
+
59
+ def evaluate(respth='./res/test_res', dspth='./data', cp='model_final_diss.pth'):
60
+
61
+ if not os.path.exists(respth):
62
+ os.makedirs(respth)
63
+
64
+ n_classes = 19
65
+ net = BiSeNet(n_classes=n_classes)
66
+ net.cuda()
67
+ save_pth = osp.join('res/cp', cp)
68
+ net.load_state_dict(torch.load(save_pth))
69
+ net.eval()
70
+
71
+ to_tensor = transforms.Compose([
72
+ transforms.ToTensor(),
73
+ transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
74
+ ])
75
+ with torch.no_grad():
76
+ for image_path in os.listdir(dspth):
77
+ img = Image.open(osp.join(dspth, image_path))
78
+ image = img.resize((512, 512), Image.BILINEAR)
79
+ img = to_tensor(image)
80
+ img = torch.unsqueeze(img, 0)
81
+ img = img.cuda()
82
+ out = net(img)[0]
83
+ parsing = out.squeeze(0).cpu().numpy().argmax(0)
84
+
85
+ vis_parsing_maps(image, parsing, stride=1, save_im=True, save_path=osp.join(respth, image_path))
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ if __name__ == "__main__":
94
+ setup_logger('./res')
95
+ evaluate()
models/BiSeNet/face_dataset.py ADDED
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ import torchvision.transforms as transforms
7
+
8
+ import os.path as osp
9
+ import os
10
+ from PIL import Image
11
+ import numpy as np
12
+ import json
13
+ import cv2
14
+
15
+ from transform import *
16
+
17
+
18
+
19
+ class FaceMask(Dataset):
20
+ def __init__(self, rootpth, cropsize=(640, 480), mode='train', *args, **kwargs):
21
+ super(FaceMask, self).__init__(*args, **kwargs)
22
+ assert mode in ('train', 'val', 'test')
23
+ self.mode = mode
24
+ self.ignore_lb = 255
25
+ self.rootpth = rootpth
26
+
27
+ self.imgs = os.listdir(os.path.join(self.rootpth, 'CelebA-HQ-img'))
28
+
29
+ # pre-processing
30
+ self.to_tensor = transforms.Compose([
31
+ transforms.ToTensor(),
32
+ transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
33
+ ])
34
+ self.trans_train = Compose([
35
+ ColorJitter(
36
+ brightness=0.5,
37
+ contrast=0.5,
38
+ saturation=0.5),
39
+ HorizontalFlip(),
40
+ RandomScale((0.75, 1.0, 1.25, 1.5, 1.75, 2.0)),
41
+ RandomCrop(cropsize)
42
+ ])
43
+
44
+ def __getitem__(self, idx):
45
+ impth = self.imgs[idx]
46
+ img = Image.open(osp.join(self.rootpth, 'CelebA-HQ-img', impth))
47
+ img = img.resize((512, 512), Image.BILINEAR)
48
+ label = Image.open(osp.join(self.rootpth, 'mask', impth[:-3]+'png')).convert('P')
49
+ # print(np.unique(np.array(label)))
50
+ if self.mode == 'train':
51
+ im_lb = dict(im=img, lb=label)
52
+ im_lb = self.trans_train(im_lb)
53
+ img, label = im_lb['im'], im_lb['lb']
54
+ img = self.to_tensor(img)
55
+ label = np.array(label).astype(np.int64)[np.newaxis, :]
56
+ return img, label
57
+
58
+ def __len__(self):
59
+ return len(self.imgs)
60
+
61
+
62
+ if __name__ == "__main__":
63
+ face_data = '/home/zll/data/CelebAMask-HQ/CelebA-HQ-img'
64
+ face_sep_mask = '/home/zll/data/CelebAMask-HQ/CelebAMask-HQ-mask-anno'
65
+ mask_path = '/home/zll/data/CelebAMask-HQ/mask'
66
+ counter = 0
67
+ total = 0
68
+ for i in range(15):
69
+ # files = os.listdir(osp.join(face_sep_mask, str(i)))
70
+
71
+ atts = ['skin', 'l_brow', 'r_brow', 'l_eye', 'r_eye', 'eye_g', 'l_ear', 'r_ear', 'ear_r',
72
+ 'nose', 'mouth', 'u_lip', 'l_lip', 'neck', 'neck_l', 'cloth', 'hair', 'hat']
73
+
74
+ for j in range(i*2000, (i+1)*2000):
75
+
76
+ mask = np.zeros((512, 512))
77
+
78
+ for l, att in enumerate(atts, 1):
79
+ total += 1
80
+ file_name = ''.join([str(j).rjust(5, '0'), '_', att, '.png'])
81
+ path = osp.join(face_sep_mask, str(i), file_name)
82
+
83
+ if os.path.exists(path):
84
+ counter += 1
85
+ sep_mask = np.array(Image.open(path).convert('P'))
86
+ # print(np.unique(sep_mask))
87
+
88
+ mask[sep_mask == 225] = l
89
+ cv2.imwrite('{}/{}.png'.format(mask_path, j), mask)
90
+ print(j)
91
+
92
+ print(counter, total)
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
models/BiSeNet/hair.png ADDED
models/BiSeNet/logger.py ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+
5
+ import os.path as osp
6
+ import time
7
+ import sys
8
+ import logging
9
+
10
+ import torch.distributed as dist
11
+
12
+
13
+ def setup_logger(logpth):
14
+ logfile = 'BiSeNet-{}.log'.format(time.strftime('%Y-%m-%d-%H-%M-%S'))
15
+ logfile = osp.join(logpth, logfile)
16
+ FORMAT = '%(levelname)s %(filename)s(%(lineno)d): %(message)s'
17
+ log_level = logging.INFO
18
+ if dist.is_initialized() and not dist.get_rank()==0:
19
+ log_level = logging.ERROR
20
+ logging.basicConfig(level=log_level, format=FORMAT, filename=logfile)
21
+ logging.root.addHandler(logging.StreamHandler())
22
+
23
+
models/BiSeNet/loss.py ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ import numpy as np
10
+
11
+
12
+ class OhemCELoss(nn.Module):
13
+ def __init__(self, thresh, n_min, ignore_lb=255, *args, **kwargs):
14
+ super(OhemCELoss, self).__init__()
15
+ self.thresh = -torch.log(torch.tensor(thresh, dtype=torch.float)).cuda()
16
+ self.n_min = n_min
17
+ self.ignore_lb = ignore_lb
18
+ self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='none')
19
+
20
+ def forward(self, logits, labels):
21
+ N, C, H, W = logits.size()
22
+ loss = self.criteria(logits, labels).view(-1)
23
+ loss, _ = torch.sort(loss, descending=True)
24
+ if loss[self.n_min] > self.thresh:
25
+ loss = loss[loss>self.thresh]
26
+ else:
27
+ loss = loss[:self.n_min]
28
+ return torch.mean(loss)
29
+
30
+
31
+ class SoftmaxFocalLoss(nn.Module):
32
+ def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
33
+ super(SoftmaxFocalLoss, self).__init__()
34
+ self.gamma = gamma
35
+ self.nll = nn.NLLLoss(ignore_index=ignore_lb)
36
+
37
+ def forward(self, logits, labels):
38
+ scores = F.softmax(logits, dim=1)
39
+ factor = torch.pow(1.-scores, self.gamma)
40
+ log_score = F.log_softmax(logits, dim=1)
41
+ log_score = factor * log_score
42
+ loss = self.nll(log_score, labels)
43
+ return loss
44
+
45
+
46
+ if __name__ == '__main__':
47
+ torch.manual_seed(15)
48
+ criteria1 = OhemCELoss(thresh=0.7, n_min=16*20*20//16).cuda()
49
+ criteria2 = OhemCELoss(thresh=0.7, n_min=16*20*20//16).cuda()
50
+ net1 = nn.Sequential(
51
+ nn.Conv2d(3, 19, kernel_size=3, stride=2, padding=1),
52
+ )
53
+ net1.cuda()
54
+ net1.train()
55
+ net2 = nn.Sequential(
56
+ nn.Conv2d(3, 19, kernel_size=3, stride=2, padding=1),
57
+ )
58
+ net2.cuda()
59
+ net2.train()
60
+
61
+ with torch.no_grad():
62
+ inten = torch.randn(16, 3, 20, 20).cuda()
63
+ lbs = torch.randint(0, 19, [16, 20, 20]).cuda()
64
+ lbs[1, :, :] = 255
65
+
66
+ logits1 = net1(inten)
67
+ logits1 = F.interpolate(logits1, inten.size()[2:], mode='bilinear')
68
+ logits2 = net2(inten)
69
+ logits2 = F.interpolate(logits2, inten.size()[2:], mode='bilinear')
70
+
71
+ loss1 = criteria1(logits1, lbs)
72
+ loss2 = criteria2(logits2, lbs)
73
+ loss = loss1 + loss2
74
+ print(loss.detach().cpu())
75
+ loss.backward()
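
For training, `n_min` is usually tied to the crop size and batch size so that the OHEM selection keeps at least a fixed fraction of pixels; a hedged sketch with illustrative values (the numbers are not taken from this diff).

```python
# Hedged sketch: typical OhemCELoss setup for 19-class parsing on 448x448 crops.
# Values are illustrative; OhemCELoss moves its threshold to CUDA in __init__,
# so a GPU is required here.
from loss import OhemCELoss

n_img, cropsize = 16, 448
n_min = n_img * cropsize * cropsize // 16   # keep at least 1/16 of all pixels in the batch
criteria = OhemCELoss(thresh=0.7, n_min=n_min, ignore_lb=255)
```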
models/BiSeNet/makeup.py ADDED
@@ -0,0 +1,130 @@
1
+ import cv2
2
+ import os
3
+ import numpy as np
4
+ from skimage.filters import gaussian
5
+
6
+
7
+ def sharpen(img):
8
+ img = img * 1.0
9
+ gauss_out = gaussian(img, sigma=5, multichannel=True)
10
+
11
+ alpha = 1.5
12
+ img_out = (img - gauss_out) * alpha + img
13
+
14
+ img_out = img_out / 255.0
15
+
16
+ mask_1 = img_out < 0
17
+ mask_2 = img_out > 1
18
+
19
+ img_out = img_out * (1 - mask_1)
20
+ img_out = img_out * (1 - mask_2) + mask_2
21
+ img_out = np.clip(img_out, 0, 1)
22
+ img_out = img_out * 255
23
+ return np.array(img_out, dtype=np.uint8)
24
+
25
+
26
+ def hair(image, parsing, part=17, color=[230, 50, 20]):
27
+ b, g, r = color #[10, 50, 250] # [10, 250, 10]
28
+ tar_color = np.zeros_like(image)
29
+ tar_color[:, :, 0] = b
30
+ tar_color[:, :, 1] = g
31
+ tar_color[:, :, 2] = r
32
+
33
+ image_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
34
+ tar_hsv = cv2.cvtColor(tar_color, cv2.COLOR_BGR2HSV)
35
+
36
+ if part == 12 or part == 13:
37
+ image_hsv[:, :, 0:2] = tar_hsv[:, :, 0:2]
38
+ else:
39
+ image_hsv[:, :, 0:1] = tar_hsv[:, :, 0:1]
40
+
41
+ changed = cv2.cvtColor(image_hsv, cv2.COLOR_HSV2BGR)
42
+
43
+ if part == 17:
44
+ changed = sharpen(changed)
45
+
46
+ changed[parsing != part] = image[parsing != part]
47
+ # changed = cv2.resize(changed, (512, 512))
48
+ return changed
49
+
50
+ #
51
+ # def lip(image, parsing, part=17, color=[230, 50, 20]):
52
+ # b, g, r = color #[10, 50, 250] # [10, 250, 10]
53
+ # tar_color = np.zeros_like(image)
54
+ # tar_color[:, :, 0] = b
55
+ # tar_color[:, :, 1] = g
56
+ # tar_color[:, :, 2] = r
57
+ #
58
+ # image_lab = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
59
+ # il, ia, ib = cv2.split(image_lab)
60
+ #
61
+ # tar_lab = cv2.cvtColor(tar_color, cv2.COLOR_BGR2Lab)
62
+ # tl, ta, tb = cv2.split(tar_lab)
63
+ #
64
+ # image_lab[:, :, 0] = np.clip(il - np.mean(il) + tl, 0, 100)
65
+ # image_lab[:, :, 1] = np.clip(ia - np.mean(ia) + ta, -127, 128)
66
+ # image_lab[:, :, 2] = np.clip(ib - np.mean(ib) + tb, -127, 128)
67
+ #
68
+ #
69
+ # changed = cv2.cvtColor(image_lab, cv2.COLOR_Lab2BGR)
70
+ #
71
+ # if part == 17:
72
+ # changed = sharpen(changed)
73
+ #
74
+ # changed[parsing != part] = image[parsing != part]
75
+ # # changed = cv2.resize(changed, (512, 512))
76
+ # return changed
77
+
78
+
79
+ if __name__ == '__main__':
80
+ # 1 face
81
+ # 10 nose
82
+ # 11 teeth
83
+ # 12 upper lip
84
+ # 13 lower lip
85
+ # 17 hair
86
+ num = 116
87
+ table = {
88
+ 'hair': 17,
89
+ 'upper_lip': 12,
90
+ 'lower_lip': 13
91
+ }
92
+ image_path = '/home/zll/data/CelebAMask-HQ/test-img/{}.jpg'.format(num)
93
+ parsing_path = 'res/test_res/{}.png'.format(num)
94
+
95
+ image = cv2.imread(image_path)
96
+ ori = image.copy()
97
+ parsing = np.array(cv2.imread(parsing_path, 0))
98
+ parsing = cv2.resize(parsing, image.shape[0:2], interpolation=cv2.INTER_NEAREST)
99
+
100
+ parts = [table['hair'], table['upper_lip'], table['lower_lip']]
101
+ # colors = [[20, 20, 200], [100, 100, 230], [100, 100, 230]]
102
+ colors = [[100, 200, 100]]
103
+ for part, color in zip(parts, colors):
104
+ image = hair(image, parsing, part, color)
105
+ cv2.imwrite('res/makeup/116_ori.png', cv2.resize(ori, (512, 512)))
106
+ cv2.imwrite('res/makeup/116_2.png', cv2.resize(image, (512, 512)))
107
+
108
+ cv2.imshow('image', cv2.resize(ori, (512, 512)))
109
+ cv2.imshow('color', cv2.resize(image, (512, 512)))
110
+
111
+ # cv2.imshow('image', ori)
112
+ # cv2.imshow('color', image)
113
+
114
+ cv2.waitKey(0)
115
+ cv2.destroyAllWindows()
models/BiSeNet/makeup/116_1.png ADDED
models/BiSeNet/makeup/116_3.png ADDED
models/BiSeNet/makeup/116_lip_ori.png ADDED
models/BiSeNet/makeup/116_ori.png ADDED
models/BiSeNet/model.py ADDED
@@ -0,0 +1,283 @@
1
+ #!/usr/bin/python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import torchvision
9
+
10
+ from resnet import Resnet18
11
+ # from modules.bn import InPlaceABNSync as BatchNorm2d
12
+
13
+
14
+ class ConvBNReLU(nn.Module):
15
+ def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1, *args, **kwargs):
16
+ super(ConvBNReLU, self).__init__()
17
+ self.conv = nn.Conv2d(in_chan,
18
+ out_chan,
19
+ kernel_size = ks,
20
+ stride = stride,
21
+ padding = padding,
22
+ bias = False)
23
+ self.bn = nn.BatchNorm2d(out_chan)
24
+ self.init_weight()
25
+
26
+ def forward(self, x):
27
+ x = self.conv(x)
28
+ x = F.relu(self.bn(x))
29
+ return x
30
+
31
+ def init_weight(self):
32
+ for ly in self.children():
33
+ if isinstance(ly, nn.Conv2d):
34
+ nn.init.kaiming_normal_(ly.weight, a=1)
35
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
36
+
37
+ class BiSeNetOutput(nn.Module):
38
+ def __init__(self, in_chan, mid_chan, n_classes, *args, **kwargs):
39
+ super(BiSeNetOutput, self).__init__()
40
+ self.conv = ConvBNReLU(in_chan, mid_chan, ks=3, stride=1, padding=1)
41
+ self.conv_out = nn.Conv2d(mid_chan, n_classes, kernel_size=1, bias=False)
42
+ self.init_weight()
43
+
44
+ def forward(self, x):
45
+ x = self.conv(x)
46
+ x = self.conv_out(x)
47
+ return x
48
+
49
+ def init_weight(self):
50
+ for ly in self.children():
51
+ if isinstance(ly, nn.Conv2d):
52
+ nn.init.kaiming_normal_(ly.weight, a=1)
53
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
54
+
55
+ def get_params(self):
56
+ wd_params, nowd_params = [], []
57
+ for name, module in self.named_modules():
58
+ if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
59
+ wd_params.append(module.weight)
60
+ if not module.bias is None:
61
+ nowd_params.append(module.bias)
62
+ elif isinstance(module, nn.BatchNorm2d):
63
+ nowd_params += list(module.parameters())
64
+ return wd_params, nowd_params
65
+
66
+
67
+ class AttentionRefinementModule(nn.Module):
68
+ def __init__(self, in_chan, out_chan, *args, **kwargs):
69
+ super(AttentionRefinementModule, self).__init__()
70
+ self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1)
71
+ self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size= 1, bias=False)
72
+ self.bn_atten = nn.BatchNorm2d(out_chan)
73
+ self.sigmoid_atten = nn.Sigmoid()
74
+ self.init_weight()
75
+
76
+ def forward(self, x):
77
+ feat = self.conv(x)
78
+ atten = F.avg_pool2d(feat, feat.size()[2:])
79
+ atten = self.conv_atten(atten)
80
+ atten = self.bn_atten(atten)
81
+ atten = self.sigmoid_atten(atten)
82
+ out = torch.mul(feat, atten)
83
+ return out
84
+
85
+ def init_weight(self):
86
+ for ly in self.children():
87
+ if isinstance(ly, nn.Conv2d):
88
+ nn.init.kaiming_normal_(ly.weight, a=1)
89
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
90
+
91
+
92
+ class ContextPath(nn.Module):
93
+ def __init__(self, *args, **kwargs):
94
+ super(ContextPath, self).__init__()
95
+ self.resnet = Resnet18()
96
+ self.arm16 = AttentionRefinementModule(256, 128)
97
+ self.arm32 = AttentionRefinementModule(512, 128)
98
+ self.conv_head32 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
99
+ self.conv_head16 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
100
+ self.conv_avg = ConvBNReLU(512, 128, ks=1, stride=1, padding=0)
101
+
102
+ self.init_weight()
103
+
104
+ def forward(self, x):
105
+ H0, W0 = x.size()[2:]
106
+ feat8, feat16, feat32 = self.resnet(x)
107
+ H8, W8 = feat8.size()[2:]
108
+ H16, W16 = feat16.size()[2:]
109
+ H32, W32 = feat32.size()[2:]
110
+
111
+ avg = F.avg_pool2d(feat32, feat32.size()[2:])
112
+ avg = self.conv_avg(avg)
113
+ avg_up = F.interpolate(avg, (H32, W32), mode='nearest')
114
+
115
+ feat32_arm = self.arm32(feat32)
116
+ feat32_sum = feat32_arm + avg_up
117
+ feat32_up = F.interpolate(feat32_sum, (H16, W16), mode='nearest')
118
+ feat32_up = self.conv_head32(feat32_up)
119
+
120
+ feat16_arm = self.arm16(feat16)
121
+ feat16_sum = feat16_arm + feat32_up
122
+ feat16_up = F.interpolate(feat16_sum, (H8, W8), mode='nearest')
123
+ feat16_up = self.conv_head16(feat16_up)
124
+
125
+ return feat8, feat16_up, feat32_up # x8, x8, x16
126
+
127
+ def init_weight(self):
128
+ for ly in self.children():
129
+ if isinstance(ly, nn.Conv2d):
130
+ nn.init.kaiming_normal_(ly.weight, a=1)
131
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
132
+
133
+ def get_params(self):
134
+ wd_params, nowd_params = [], []
135
+ for name, module in self.named_modules():
136
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
137
+ wd_params.append(module.weight)
138
+ if not module.bias is None:
139
+ nowd_params.append(module.bias)
140
+ elif isinstance(module, nn.BatchNorm2d):
141
+ nowd_params += list(module.parameters())
142
+ return wd_params, nowd_params
143
+
144
+
145
+ ### This is not used, since I replace this with the resnet feature with the same size
146
+ class SpatialPath(nn.Module):
147
+ def __init__(self, *args, **kwargs):
148
+ super(SpatialPath, self).__init__()
149
+ self.conv1 = ConvBNReLU(3, 64, ks=7, stride=2, padding=3)
150
+ self.conv2 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
151
+ self.conv3 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
152
+ self.conv_out = ConvBNReLU(64, 128, ks=1, stride=1, padding=0)
153
+ self.init_weight()
154
+
155
+ def forward(self, x):
156
+ feat = self.conv1(x)
157
+ feat = self.conv2(feat)
158
+ feat = self.conv3(feat)
159
+ feat = self.conv_out(feat)
160
+ return feat
161
+
162
+ def init_weight(self):
163
+ for ly in self.children():
164
+ if isinstance(ly, nn.Conv2d):
165
+ nn.init.kaiming_normal_(ly.weight, a=1)
166
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
167
+
168
+ def get_params(self):
169
+ wd_params, nowd_params = [], []
170
+ for name, module in self.named_modules():
171
+ if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
172
+ wd_params.append(module.weight)
173
+ if not module.bias is None:
174
+ nowd_params.append(module.bias)
175
+ elif isinstance(module, nn.BatchNorm2d):
176
+ nowd_params += list(module.parameters())
177
+ return wd_params, nowd_params
178
+
179
+
180
+ class FeatureFusionModule(nn.Module):
181
+ def __init__(self, in_chan, out_chan, *args, **kwargs):
182
+ super(FeatureFusionModule, self).__init__()
183
+ self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0)
184
+ self.conv1 = nn.Conv2d(out_chan,
185
+ out_chan//4,
186
+ kernel_size = 1,
187
+ stride = 1,
188
+ padding = 0,
189
+ bias = False)
190
+ self.conv2 = nn.Conv2d(out_chan//4,
191
+ out_chan,
192
+ kernel_size = 1,
193
+ stride = 1,
194
+ padding = 0,
195
+ bias = False)
196
+ self.relu = nn.ReLU(inplace=True)
197
+ self.sigmoid = nn.Sigmoid()
198
+ self.init_weight()
199
+
200
+ def forward(self, fsp, fcp):
201
+ fcat = torch.cat([fsp, fcp], dim=1)
202
+ feat = self.convblk(fcat)
203
+ atten = F.avg_pool2d(feat, feat.size()[2:])
204
+ atten = self.conv1(atten)
205
+ atten = self.relu(atten)
206
+ atten = self.conv2(atten)
207
+ atten = self.sigmoid(atten)
208
+ feat_atten = torch.mul(feat, atten)
209
+ feat_out = feat_atten + feat
210
+ return feat_out
211
+
212
+ def init_weight(self):
213
+ for ly in self.children():
214
+ if isinstance(ly, nn.Conv2d):
215
+ nn.init.kaiming_normal_(ly.weight, a=1)
216
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
217
+
218
+ def get_params(self):
219
+ wd_params, nowd_params = [], []
220
+ for name, module in self.named_modules():
221
+ if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
222
+ wd_params.append(module.weight)
223
+ if not module.bias is None:
224
+ nowd_params.append(module.bias)
225
+ elif isinstance(module, nn.BatchNorm2d):
226
+ nowd_params += list(module.parameters())
227
+ return wd_params, nowd_params
228
+
229
+
230
+ class BiSeNet(nn.Module):
231
+ def __init__(self, n_classes, *args, **kwargs):
232
+ super(BiSeNet, self).__init__()
233
+ self.cp = ContextPath()
234
+ ## here self.sp is deleted
235
+ self.ffm = FeatureFusionModule(256, 256)
236
+ self.conv_out = BiSeNetOutput(256, 256, n_classes)
237
+ self.conv_out16 = BiSeNetOutput(128, 64, n_classes)
238
+ self.conv_out32 = BiSeNetOutput(128, 64, n_classes)
239
+ self.init_weight()
240
+
241
+ def forward(self, x):
242
+ H, W = x.size()[2:]
243
+ feat_res8, feat_cp8, feat_cp16 = self.cp(x) # here return res3b1 feature
244
+ feat_sp = feat_res8 # use res3b1 feature to replace spatial path feature
245
+ feat_fuse = self.ffm(feat_sp, feat_cp8)
246
+
247
+ feat_out = self.conv_out(feat_fuse)
248
+ feat_out16 = self.conv_out16(feat_cp8)
249
+ feat_out32 = self.conv_out32(feat_cp16)
250
+
251
+ feat_out = F.interpolate(feat_out, (H, W), mode='bilinear', align_corners=True)
252
+ feat_out16 = F.interpolate(feat_out16, (H, W), mode='bilinear', align_corners=True)
253
+ feat_out32 = F.interpolate(feat_out32, (H, W), mode='bilinear', align_corners=True)
254
+ return feat_out, feat_out16, feat_out32
255
+
256
+ def init_weight(self):
257
+ for ly in self.children():
258
+ if isinstance(ly, nn.Conv2d):
259
+ nn.init.kaiming_normal_(ly.weight, a=1)
260
+ if not ly.bias is None: nn.init.constant_(ly.bias, 0)
261
+
262
+ def get_params(self):
263
+ wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params = [], [], [], []
264
+ for name, child in self.named_children():
265
+ child_wd_params, child_nowd_params = child.get_params()
266
+ if isinstance(child, FeatureFusionModule) or isinstance(child, BiSeNetOutput):
267
+ lr_mul_wd_params += child_wd_params
268
+ lr_mul_nowd_params += child_nowd_params
269
+ else:
270
+ wd_params += child_wd_params
271
+ nowd_params += child_nowd_params
272
+ return wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params
273
+
274
+
275
+ if __name__ == "__main__":
276
+ net = BiSeNet(19)
277
+ net.cuda()
278
+ net.eval()
279
+ in_ten = torch.randn(16, 3, 640, 480).cuda()
280
+ out, out16, out32 = net(in_ten)
281
+ print(out.shape)
282
+
283
+ net.get_params()
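
A hedged inference sketch for the parser defined above, assuming it is run from the models/BiSeNet directory; the checkpoint path, the 512x512 input size and the ImageNet normalization constants are assumptions, not values taken from this diff.

```python
# Hedged sketch: single-image face parsing with the BiSeNet defined above.
# Checkpoint path, input size and normalization constants are assumptions.
import torch
import torchvision.transforms as T
from PIL import Image
from model import BiSeNet

net = BiSeNet(n_classes=19)
net.load_state_dict(torch.load('path/to/bisenet_checkpoint.pth', map_location='cpu'))
net.eval()

preprocess = T.Compose([
    T.Resize((512, 512)),
    T.ToTensor(),
    T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])
img = preprocess(Image.open('face.jpg').convert('RGB')).unsqueeze(0)

with torch.no_grad():
    out = net(img)[0]                        # main output head: (1, 19, 512, 512)
parsing = out.squeeze(0).argmax(0).numpy()   # per-pixel class ids in [0, 18]
```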
models/BiSeNet/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .bn import ABN, InPlaceABN, InPlaceABNSync
+ from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
+ from .misc import GlobalAvgPool2d, SingleGPU
+ from .residual import IdentityResidualBlock
+ from .dense import DenseModule
models/BiSeNet/modules/bn.py ADDED
@@ -0,0 +1,130 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ try:
6
+ from queue import Queue
7
+ except ImportError:
8
+ from Queue import Queue
9
+
10
+ from .functions import *
11
+
12
+
13
+ class ABN(nn.Module):
14
+ """Activated Batch Normalization
15
+
16
+ This gathers a `BatchNorm2d` and an activation function in a single module
17
+ """
18
+
19
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
20
+ """Creates an Activated Batch Normalization module
21
+
22
+ Parameters
23
+ ----------
24
+ num_features : int
25
+ Number of feature channels in the input and output.
26
+ eps : float
27
+ Small constant to prevent numerical issues.
28
+ momentum : float
29
+ Momentum factor applied to compute running statistics.
30
+ affine : bool
31
+ If `True` apply learned scale and shift transformation after normalization.
32
+ activation : str
33
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
34
+ slope : float
35
+ Negative slope for the `leaky_relu` activation.
36
+ """
37
+ super(ABN, self).__init__()
38
+ self.num_features = num_features
39
+ self.affine = affine
40
+ self.eps = eps
41
+ self.momentum = momentum
42
+ self.activation = activation
43
+ self.slope = slope
44
+ if self.affine:
45
+ self.weight = nn.Parameter(torch.ones(num_features))
46
+ self.bias = nn.Parameter(torch.zeros(num_features))
47
+ else:
48
+ self.register_parameter('weight', None)
49
+ self.register_parameter('bias', None)
50
+ self.register_buffer('running_mean', torch.zeros(num_features))
51
+ self.register_buffer('running_var', torch.ones(num_features))
52
+ self.reset_parameters()
53
+
54
+ def reset_parameters(self):
55
+ nn.init.constant_(self.running_mean, 0)
56
+ nn.init.constant_(self.running_var, 1)
57
+ if self.affine:
58
+ nn.init.constant_(self.weight, 1)
59
+ nn.init.constant_(self.bias, 0)
60
+
61
+ def forward(self, x):
62
+ x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
63
+ self.training, self.momentum, self.eps)
64
+
65
+ if self.activation == ACT_RELU:
66
+ return functional.relu(x, inplace=True)
67
+ elif self.activation == ACT_LEAKY_RELU:
68
+ return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
69
+ elif self.activation == ACT_ELU:
70
+ return functional.elu(x, inplace=True)
71
+ else:
72
+ return x
73
+
74
+ def __repr__(self):
75
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
76
+ ' affine={affine}, activation={activation}'
77
+ if self.activation == "leaky_relu":
78
+ rep += ', slope={slope})'
79
+ else:
80
+ rep += ')'
81
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
82
+
83
+
84
+ class InPlaceABN(ABN):
85
+ """InPlace Activated Batch Normalization"""
86
+
87
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
88
+ """Creates an InPlace Activated Batch Normalization module
89
+
90
+ Parameters
91
+ ----------
92
+ num_features : int
93
+ Number of feature channels in the input and output.
94
+ eps : float
95
+ Small constant to prevent numerical issues.
96
+ momentum : float
97
+ Momentum factor applied to compute running statistics.
98
+ affine : bool
99
+ If `True` apply learned scale and shift transformation after normalization.
100
+ activation : str
101
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
102
+ slope : float
103
+ Negative slope for the `leaky_relu` activation.
104
+ """
105
+ super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
106
+
107
+ def forward(self, x):
108
+ return inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
109
+ self.training, self.momentum, self.eps, self.activation, self.slope)
110
+
111
+
112
+ class InPlaceABNSync(ABN):
113
+ """InPlace Activated Batch Normalization with cross-GPU synchronization
114
+ This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
115
+ """
116
+
117
+ def forward(self, x):
118
+ return inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
119
+ self.training, self.momentum, self.eps, self.activation, self.slope)
120
+
121
+ def __repr__(self):
122
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
123
+ ' affine={affine}, activation={activation}'
124
+ if self.activation == "leaky_relu":
125
+ rep += ', slope={slope})'
126
+ else:
127
+ rep += ')'
128
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
129
+
130
+
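
The plain `ABN` above is pure PyTorch, while `InPlaceABN`/`InPlaceABNSync` rely on the JIT-compiled extension under `modules/src` (loaded by `functions.py` below); a hedged sketch of `ABN` as a fused BN-plus-activation layer. Note that importing `modules.bn` already triggers that extension build, which is an environment assumption here.

```python
# Hedged sketch: ABN as a drop-in replacement for BatchNorm2d + LeakyReLU.
# Importing modules.bn pulls in the JIT-compiled in-place ABN extension, so this
# assumes a working CUDA toolchain even though ABN itself also runs on CPU.
import torch
import torch.nn as nn
from modules.bn import ABN

block = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
    ABN(64, activation="leaky_relu", slope=0.01),   # BN and activation in one module
)
x = torch.randn(2, 3, 32, 32)
print(block(x).shape)   # torch.Size([2, 64, 32, 32])
```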
models/BiSeNet/modules/deeplab.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ from models._util import try_index
6
+ from .bn import ABN
7
+
8
+
9
+ class DeeplabV3(nn.Module):
10
+ def __init__(self,
11
+ in_channels,
12
+ out_channels,
13
+ hidden_channels=256,
14
+ dilations=(12, 24, 36),
15
+ norm_act=ABN,
16
+ pooling_size=None):
17
+ super(DeeplabV3, self).__init__()
18
+ self.pooling_size = pooling_size
19
+
20
+ self.map_convs = nn.ModuleList([
21
+ nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
22
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
23
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
24
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
25
+ ])
26
+ self.map_bn = norm_act(hidden_channels * 4)
27
+
28
+ self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
29
+ self.global_pooling_bn = norm_act(hidden_channels)
30
+
31
+ self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
32
+ self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
33
+ self.red_bn = norm_act(out_channels)
34
+
35
+ self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
36
+
37
+ def reset_parameters(self, activation, slope):
38
+ gain = nn.init.calculate_gain(activation, slope)
39
+ for m in self.modules():
40
+ if isinstance(m, nn.Conv2d):
41
+ nn.init.xavier_normal_(m.weight.data, gain)
42
+ if hasattr(m, "bias") and m.bias is not None:
43
+ nn.init.constant_(m.bias, 0)
44
+ elif isinstance(m, ABN):
45
+ if hasattr(m, "weight") and m.weight is not None:
46
+ nn.init.constant_(m.weight, 1)
47
+ if hasattr(m, "bias") and m.bias is not None:
48
+ nn.init.constant_(m.bias, 0)
49
+
50
+ def forward(self, x):
51
+ # Map convolutions
52
+ out = torch.cat([m(x) for m in self.map_convs], dim=1)
53
+ out = self.map_bn(out)
54
+ out = self.red_conv(out)
55
+
56
+ # Global pooling
57
+ pool = self._global_pooling(x)
58
+ pool = self.global_pooling_conv(pool)
59
+ pool = self.global_pooling_bn(pool)
60
+ pool = self.pool_red_conv(pool)
61
+ if self.training or self.pooling_size is None:
62
+ pool = pool.repeat(1, 1, x.size(2), x.size(3))
63
+
64
+ out += pool
65
+ out = self.red_bn(out)
66
+ return out
67
+
68
+ def _global_pooling(self, x):
69
+ if self.training or self.pooling_size is None:
70
+ pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
71
+ pool = pool.view(x.size(0), x.size(1), 1, 1)
72
+ else:
73
+ pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
74
+ min(try_index(self.pooling_size, 1), x.shape[3]))
75
+ padding = (
76
+ (pooling_size[1] - 1) // 2,
77
+ (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
78
+ (pooling_size[0] - 1) // 2,
79
+ (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
80
+ )
81
+
82
+ pool = functional.avg_pool2d(x, pooling_size, stride=1)
83
+ pool = functional.pad(pool, pad=padding, mode="replicate")
84
+ return pool
models/BiSeNet/modules/dense.py ADDED
@@ -0,0 +1,42 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .bn import ABN
7
+
8
+
9
+ class DenseModule(nn.Module):
10
+ def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
11
+ super(DenseModule, self).__init__()
12
+ self.in_channels = in_channels
13
+ self.growth = growth
14
+ self.layers = layers
15
+
16
+ self.convs1 = nn.ModuleList()
17
+ self.convs3 = nn.ModuleList()
18
+ for i in range(self.layers):
19
+ self.convs1.append(nn.Sequential(OrderedDict([
20
+ ("bn", norm_act(in_channels)),
21
+ ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
22
+ ])))
23
+ self.convs3.append(nn.Sequential(OrderedDict([
24
+ ("bn", norm_act(self.growth * bottleneck_factor)),
25
+ ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
26
+ dilation=dilation))
27
+ ])))
28
+ in_channels += self.growth
29
+
30
+ @property
31
+ def out_channels(self):
32
+ return self.in_channels + self.growth * self.layers
33
+
34
+ def forward(self, x):
35
+ inputs = [x]
36
+ for i in range(self.layers):
37
+ x = torch.cat(inputs, dim=1)
38
+ x = self.convs1[i](x)
39
+ x = self.convs3[i](x)
40
+ inputs += [x]
41
+
42
+ return torch.cat(inputs, dim=1)
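
The channel bookkeeping implied by the `out_channels` property above, shown with illustrative numbers (a worked note, not code from this diff).

```python
# Hedged sketch: each dense layer appends `growth` channels to the running input,
# so the concatenated output has in_channels + growth * layers channels.
in_channels, growth, layers = 64, 32, 4
out_channels = in_channels + growth * layers   # mirrors DenseModule.out_channels
print(out_channels)                            # 192
```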
models/BiSeNet/modules/functions.py ADDED
@@ -0,0 +1,234 @@
1
+ from os import path
2
+ import torch
3
+ import torch.distributed as dist
4
+ import torch.autograd as autograd
5
+ import torch.cuda.comm as comm
6
+ from torch.autograd.function import once_differentiable
7
+ from torch.utils.cpp_extension import load
8
+
9
+ _src_path = path.join(path.dirname(path.abspath(__file__)), "src")
10
+ _backend = load(name="inplace_abn",
11
+ extra_cflags=["-O3"],
12
+ sources=[path.join(_src_path, f) for f in [
13
+ "inplace_abn.cpp",
14
+ "inplace_abn_cpu.cpp",
15
+ "inplace_abn_cuda.cu",
16
+ "inplace_abn_cuda_half.cu"
17
+ ]],
18
+ extra_cuda_cflags=["--expt-extended-lambda"])
19
+
20
+ # Activation names
21
+ ACT_RELU = "relu"
22
+ ACT_LEAKY_RELU = "leaky_relu"
23
+ ACT_ELU = "elu"
24
+ ACT_NONE = "none"
25
+
26
+
27
+ def _check(fn, *args, **kwargs):
28
+ success = fn(*args, **kwargs)
29
+ if not success:
30
+ raise RuntimeError("CUDA Error encountered in {}".format(fn))
31
+
32
+
33
+ def _broadcast_shape(x):
34
+ out_size = []
35
+ for i, s in enumerate(x.size()):
36
+ if i != 1:
37
+ out_size.append(1)
38
+ else:
39
+ out_size.append(s)
40
+ return out_size
41
+
42
+
43
+ def _reduce(x):
44
+ if len(x.size()) == 2:
45
+ return x.sum(dim=0)
46
+ else:
47
+ n, c = x.size()[0:2]
48
+ return x.contiguous().view((n, c, -1)).sum(2).sum(0)
49
+
50
+
51
+ def _count_samples(x):
52
+ count = 1
53
+ for i, s in enumerate(x.size()):
54
+ if i != 1:
55
+ count *= s
56
+ return count
57
+
58
+
59
+ def _act_forward(ctx, x):
60
+ if ctx.activation == ACT_LEAKY_RELU:
61
+ _backend.leaky_relu_forward(x, ctx.slope)
62
+ elif ctx.activation == ACT_ELU:
63
+ _backend.elu_forward(x)
64
+ elif ctx.activation == ACT_NONE:
65
+ pass
66
+
67
+
68
+ def _act_backward(ctx, x, dx):
69
+ if ctx.activation == ACT_LEAKY_RELU:
70
+ _backend.leaky_relu_backward(x, dx, ctx.slope)
71
+ elif ctx.activation == ACT_ELU:
72
+ _backend.elu_backward(x, dx)
73
+ elif ctx.activation == ACT_NONE:
74
+ pass
75
+
76
+
77
+ class InPlaceABN(autograd.Function):
78
+ @staticmethod
79
+ def forward(ctx, x, weight, bias, running_mean, running_var,
80
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
81
+ # Save context
82
+ ctx.training = training
83
+ ctx.momentum = momentum
84
+ ctx.eps = eps
85
+ ctx.activation = activation
86
+ ctx.slope = slope
87
+ ctx.affine = weight is not None and bias is not None
88
+
89
+ # Prepare inputs
90
+ count = _count_samples(x)
91
+ x = x.contiguous()
92
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
93
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
94
+
95
+ if ctx.training:
96
+ mean, var = _backend.mean_var(x)
97
+
98
+ # Update running stats
99
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
100
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
101
+
102
+ # Mark in-place modified tensors
103
+ ctx.mark_dirty(x, running_mean, running_var)
104
+ else:
105
+ mean, var = running_mean.contiguous(), running_var.contiguous()
106
+ ctx.mark_dirty(x)
107
+
108
+ # BN forward + activation
109
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
110
+ _act_forward(ctx, x)
111
+
112
+ # Output
113
+ ctx.var = var
114
+ ctx.save_for_backward(x, var, weight, bias)
115
+ return x
116
+
117
+ @staticmethod
118
+ @once_differentiable
119
+ def backward(ctx, dz):
120
+ z, var, weight, bias = ctx.saved_tensors
121
+ dz = dz.contiguous()
122
+
123
+ # Undo activation
124
+ _act_backward(ctx, z, dz)
125
+
126
+ if ctx.training:
127
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
128
+ else:
129
+ # TODO: implement simplified CUDA backward for inference mode
130
+ edz = dz.new_zeros(dz.size(1))
131
+ eydz = dz.new_zeros(dz.size(1))
132
+
133
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
134
+ dweight = eydz * weight.sign() if ctx.affine else None
135
+ dbias = edz if ctx.affine else None
136
+
137
+ return dx, dweight, dbias, None, None, None, None, None, None, None
138
+
139
+ class InPlaceABNSync(autograd.Function):
140
+ @classmethod
141
+ def forward(cls, ctx, x, weight, bias, running_mean, running_var,
142
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
143
+ # Save context
144
+ ctx.training = training
145
+ ctx.momentum = momentum
146
+ ctx.eps = eps
147
+ ctx.activation = activation
148
+ ctx.slope = slope
149
+ ctx.affine = weight is not None and bias is not None
150
+
151
+ # Prepare inputs
152
+ ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
153
+
154
+ #count = _count_samples(x)
155
+ batch_size = x.new_tensor([x.shape[0]],dtype=torch.long)
156
+
157
+ x = x.contiguous()
158
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
159
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
160
+
161
+ if ctx.training:
162
+ mean, var = _backend.mean_var(x)
163
+ if ctx.world_size>1:
164
+ # get global batch size
165
+ if equal_batches:
166
+ batch_size *= ctx.world_size
167
+ else:
168
+ dist.all_reduce(batch_size, dist.ReduceOp.SUM)
169
+
170
+ ctx.factor = x.shape[0]/float(batch_size.item())
171
+
172
+ mean_all = mean.clone() * ctx.factor
173
+ dist.all_reduce(mean_all, dist.ReduceOp.SUM)
174
+
175
+ var_all = (var + (mean - mean_all) ** 2) * ctx.factor
176
+ dist.all_reduce(var_all, dist.ReduceOp.SUM)
177
+
178
+ mean = mean_all
179
+ var = var_all
180
+
181
+ # Update running stats
182
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
183
+ count = batch_size.item() * x.view(x.shape[0],x.shape[1],-1).shape[-1]
184
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
185
+
186
+ # Mark in-place modified tensors
187
+ ctx.mark_dirty(x, running_mean, running_var)
188
+ else:
189
+ mean, var = running_mean.contiguous(), running_var.contiguous()
190
+ ctx.mark_dirty(x)
191
+
192
+ # BN forward + activation
193
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
194
+ _act_forward(ctx, x)
195
+
196
+ # Output
197
+ ctx.var = var
198
+ ctx.save_for_backward(x, var, weight, bias)
199
+ return x
200
+
201
+ @staticmethod
202
+ @once_differentiable
203
+ def backward(ctx, dz):
204
+ z, var, weight, bias = ctx.saved_tensors
205
+ dz = dz.contiguous()
206
+
207
+ # Undo activation
208
+ _act_backward(ctx, z, dz)
209
+
210
+ if ctx.training:
211
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
212
+ edz_local = edz.clone()
213
+ eydz_local = eydz.clone()
214
+
215
+ if ctx.world_size>1:
216
+ edz *= ctx.factor
217
+ dist.all_reduce(edz, dist.ReduceOp.SUM)
218
+
219
+ eydz *= ctx.factor
220
+ dist.all_reduce(eydz, dist.ReduceOp.SUM)
221
+ else:
222
+ edz_local = edz = dz.new_zeros(dz.size(1))
223
+ eydz_local = eydz = dz.new_zeros(dz.size(1))
224
+
225
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
226
+ dweight = eydz_local * weight.sign() if ctx.affine else None
227
+ dbias = edz_local if ctx.affine else None
228
+
229
+ return dx, dweight, dbias, None, None, None, None, None, None, None
230
+
231
+ inplace_abn = InPlaceABN.apply
232
+ inplace_abn_sync = InPlaceABNSync.apply
233
+
234
+ __all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
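
Since the extension is compiled on first import via `torch.utils.cpp_extension.load`, a quick smoke test of the in-place path can be run once the build succeeds; a hedged sketch in which the working directory, package layout and GPU availability are all assumptions.

```python
# Hedged sketch: smoke-test the JIT-compiled in-place ABN through the module API.
# Assumes this is run from models/BiSeNet so that `modules` is importable, and that
# a CUDA device plus a toolchain matching the installed PyTorch are available.
import torch
from modules import InPlaceABN

abn = InPlaceABN(8).cuda()
x = torch.randn(2, 8, 16, 16, device='cuda')
print(abn(x).shape)   # torch.Size([2, 8, 16, 16])
```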
models/BiSeNet/modules/misc.py ADDED
@@ -0,0 +1,21 @@
+ import torch.nn as nn
+ import torch
+ import torch.distributed as dist
+
+ class GlobalAvgPool2d(nn.Module):
+ def __init__(self):
+ """Global average pooling over the input's spatial dimensions"""
+ super(GlobalAvgPool2d, self).__init__()
+
+ def forward(self, inputs):
+ in_size = inputs.size()
+ return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
+
+ class SingleGPU(nn.Module):
+ def __init__(self, module):
+ super(SingleGPU, self).__init__()
+ self.module=module
+
+ def forward(self, input):
+ return self.module(input.cuda(non_blocking=True))
+
models/BiSeNet/modules/residual.py ADDED
@@ -0,0 +1,88 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch.nn as nn
4
+
5
+ from .bn import ABN
6
+
7
+
8
+ class IdentityResidualBlock(nn.Module):
9
+ def __init__(self,
10
+ in_channels,
11
+ channels,
12
+ stride=1,
13
+ dilation=1,
14
+ groups=1,
15
+ norm_act=ABN,
16
+ dropout=None):
17
+ """Configurable identity-mapping residual block
18
+
19
+ Parameters
20
+ ----------
21
+ in_channels : int
22
+ Number of input channels.
23
+ channels : list of int
24
+ Number of channels in the internal feature maps. Can either have two or three elements: if two, construct
+ a residual block with two `3 x 3` convolutions; if three, construct a bottleneck block with `1 x 1`, then
+ `3 x 3`, then `1 x 1` convolutions.
27
+ stride : int
28
+ Stride of the first `3 x 3` convolution
29
+ dilation : int
30
+ Dilation to apply to the `3 x 3` convolutions.
31
+ groups : int
32
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
33
+ bottleneck blocks.
34
+ norm_act : callable
35
+ Function to create normalization / activation Module.
36
+ dropout: callable
37
+ Function to create Dropout Module.
38
+ """
39
+ super(IdentityResidualBlock, self).__init__()
40
+
41
+ # Check parameters for inconsistencies
42
+ if len(channels) != 2 and len(channels) != 3:
43
+ raise ValueError("channels must contain either two or three values")
44
+ if len(channels) == 2 and groups != 1:
45
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
46
+
47
+ is_bottleneck = len(channels) == 3
48
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
49
+
50
+ self.bn1 = norm_act(in_channels)
51
+ if not is_bottleneck:
52
+ layers = [
53
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
54
+ dilation=dilation)),
55
+ ("bn2", norm_act(channels[0])),
56
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
57
+ dilation=dilation))
58
+ ]
59
+ if dropout is not None:
60
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
61
+ else:
62
+ layers = [
63
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
64
+ ("bn2", norm_act(channels[0])),
65
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
66
+ groups=groups, dilation=dilation)),
67
+ ("bn3", norm_act(channels[1])),
68
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
69
+ ]
70
+ if dropout is not None:
71
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
72
+ self.convs = nn.Sequential(OrderedDict(layers))
73
+
74
+ if need_proj_conv:
75
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
76
+
77
+ def forward(self, x):
78
+ if hasattr(self, "proj_conv"):
79
+ bn1 = self.bn1(x)
80
+ shortcut = self.proj_conv(bn1)
81
+ else:
82
+ shortcut = x.clone()
83
+ bn1 = self.bn1(x)
84
+
85
+ out = self.convs(bn1)
86
+ out.add_(shortcut)
87
+
88
+ return out
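
A hedged sketch of the two `channels` forms described in the (corrected) docstring above; importing `modules.residual` pulls in the compiled ABN extension, so this only runs where that build succeeds.

```python
# Hedged sketch: basic vs. bottleneck construction of IdentityResidualBlock.
# Output shapes follow from the code above; the import requires the in-place ABN
# extension to build, since modules.residual imports ABN from modules.bn.
import torch
from modules.residual import IdentityResidualBlock

basic = IdentityResidualBlock(64, [64, 64])             # two 3x3 convolutions
bottleneck = IdentityResidualBlock(64, [64, 64, 256])   # 1x1 -> 3x3 -> 1x1, projection shortcut

x = torch.randn(1, 64, 56, 56)
print(basic(x).shape)        # torch.Size([1, 64, 56, 56])
print(bottleneck(x).shape)   # torch.Size([1, 256, 56, 56])
```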
models/BiSeNet/modules/src/checks.h ADDED
@@ -0,0 +1,15 @@
+ #pragma once
+
+ #include <ATen/ATen.h>
+
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
+ #ifndef AT_CHECK
+ #define AT_CHECK AT_ASSERT
+ #endif
+
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
+
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
models/BiSeNet/modules/src/inplace_abn.cpp ADDED
@@ -0,0 +1,95 @@
1
+ #include <torch/extension.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "inplace_abn.h"
6
+
7
+ std::vector<at::Tensor> mean_var(at::Tensor x) {
8
+ if (x.is_cuda()) {
9
+ if (x.type().scalarType() == at::ScalarType::Half) {
10
+ return mean_var_cuda_h(x);
11
+ } else {
12
+ return mean_var_cuda(x);
13
+ }
14
+ } else {
15
+ return mean_var_cpu(x);
16
+ }
17
+ }
18
+
19
+ at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
20
+ bool affine, float eps) {
21
+ if (x.is_cuda()) {
22
+ if (x.type().scalarType() == at::ScalarType::Half) {
23
+ return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
24
+ } else {
25
+ return forward_cuda(x, mean, var, weight, bias, affine, eps);
26
+ }
27
+ } else {
28
+ return forward_cpu(x, mean, var, weight, bias, affine, eps);
29
+ }
30
+ }
31
+
32
+ std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
33
+ bool affine, float eps) {
34
+ if (z.is_cuda()) {
35
+ if (z.type().scalarType() == at::ScalarType::Half) {
36
+ return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
37
+ } else {
38
+ return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
39
+ }
40
+ } else {
41
+ return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
42
+ }
43
+ }
44
+
45
+ at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
46
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
47
+ if (z.is_cuda()) {
48
+ if (z.type().scalarType() == at::ScalarType::Half) {
49
+ return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
50
+ } else {
51
+ return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
52
+ }
53
+ } else {
54
+ return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
55
+ }
56
+ }
57
+
58
+ void leaky_relu_forward(at::Tensor z, float slope) {
59
+ at::leaky_relu_(z, slope);
60
+ }
61
+
62
+ void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
63
+ if (z.is_cuda()) {
64
+ if (z.type().scalarType() == at::ScalarType::Half) {
65
+ return leaky_relu_backward_cuda_h(z, dz, slope);
66
+ } else {
67
+ return leaky_relu_backward_cuda(z, dz, slope);
68
+ }
69
+ } else {
70
+ return leaky_relu_backward_cpu(z, dz, slope);
71
+ }
72
+ }
73
+
74
+ void elu_forward(at::Tensor z) {
75
+ at::elu_(z);
76
+ }
77
+
78
+ void elu_backward(at::Tensor z, at::Tensor dz) {
79
+ if (z.is_cuda()) {
80
+ return elu_backward_cuda(z, dz);
81
+ } else {
82
+ return elu_backward_cpu(z, dz);
83
+ }
84
+ }
85
+
86
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
87
+ m.def("mean_var", &mean_var, "Mean and variance computation");
88
+ m.def("forward", &forward, "In-place forward computation");
89
+ m.def("edz_eydz", &edz_eydz, "First part of backward computation");
90
+ m.def("backward", &backward, "Second part of backward computation");
91
+ m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
92
+ m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
93
+ m.def("elu_forward", &elu_forward, "Elu forward computation");
94
+ m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
95
+ }
models/BiSeNet/modules/src/inplace_abn.h ADDED
@@ -0,0 +1,88 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ #include <vector>
6
+
7
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
8
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
9
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
10
+
11
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
12
+ bool affine, float eps);
13
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
14
+ bool affine, float eps);
15
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
16
+ bool affine, float eps);
17
+
18
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
19
+ bool affine, float eps);
20
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
21
+ bool affine, float eps);
22
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
23
+ bool affine, float eps);
24
+
25
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
26
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
27
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
28
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
29
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
30
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
31
+
32
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
33
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
34
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
35
+
36
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz);
37
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz);
38
+
39
+ static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
40
+ num = x.size(0);
41
+ chn = x.size(1);
42
+ sp = 1;
43
+ for (int64_t i = 2; i < x.ndimension(); ++i)
44
+ sp *= x.size(i);
45
+ }
46
+
47
+ /*
48
+ * Specialized CUDA reduction functions for BN
49
+ */
50
+ #ifdef __CUDACC__
51
+
52
+ #include "utils/cuda.cuh"
53
+
54
+ template <typename T, typename Op>
55
+ __device__ T reduce(Op op, int plane, int N, int S) {
56
+ T sum = (T)0;
57
+ for (int batch = 0; batch < N; ++batch) {
58
+ for (int x = threadIdx.x; x < S; x += blockDim.x) {
59
+ sum += op(batch, plane, x);
60
+ }
61
+ }
62
+
63
+ // sum over NumThreads within a warp
64
+ sum = warpSum(sum);
65
+
66
+ // 'transpose', and reduce within warp again
67
+ __shared__ T shared[32];
68
+ __syncthreads();
69
+ if (threadIdx.x % WARP_SIZE == 0) {
70
+ shared[threadIdx.x / WARP_SIZE] = sum;
71
+ }
72
+ if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
73
+ // zero out the other entries in shared
74
+ shared[threadIdx.x] = (T)0;
75
+ }
76
+ __syncthreads();
77
+ if (threadIdx.x / WARP_SIZE == 0) {
78
+ sum = warpSum(shared[threadIdx.x]);
79
+ if (threadIdx.x == 0) {
80
+ shared[0] = sum;
81
+ }
82
+ }
83
+ __syncthreads();
84
+
85
+ // Everyone picks it up, should be broadcast into the whole gradInput
86
+ return shared[0];
87
+ }
88
+ #endif
models/BiSeNet/modules/src/inplace_abn_cpu.cpp ADDED
@@ -0,0 +1,119 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "utils/checks.h"
6
+ #include "inplace_abn.h"
7
+
8
+ at::Tensor reduce_sum(at::Tensor x) {
9
+ if (x.ndimension() == 2) {
10
+ return x.sum(0);
11
+ } else {
12
+ auto x_view = x.view({x.size(0), x.size(1), -1});
13
+ return x_view.sum(-1).sum(0);
14
+ }
15
+ }
16
+
17
+ at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
18
+ if (x.ndimension() == 2) {
19
+ return v;
20
+ } else {
21
+ std::vector<int64_t> broadcast_size = {1, -1};
22
+ for (int64_t i = 2; i < x.ndimension(); ++i)
23
+ broadcast_size.push_back(1);
24
+
25
+ return v.view(broadcast_size);
26
+ }
27
+ }
28
+
29
+ int64_t count(at::Tensor x) {
30
+ int64_t count = x.size(0);
31
+ for (int64_t i = 2; i < x.ndimension(); ++i)
32
+ count *= x.size(i);
33
+
34
+ return count;
35
+ }
36
+
37
+ at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
38
+ if (affine) {
39
+ return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
40
+ } else {
41
+ return z;
42
+ }
43
+ }
44
+
45
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
46
+ auto num = count(x);
47
+ auto mean = reduce_sum(x) / num;
48
+ auto diff = x - broadcast_to(mean, x);
49
+ auto var = reduce_sum(diff.pow(2)) / num;
50
+
51
+ return {mean, var};
52
+ }
53
+
54
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
55
+ bool affine, float eps) {
56
+ auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
57
+ auto mul = at::rsqrt(var + eps) * gamma;
58
+
59
+ x.sub_(broadcast_to(mean, x));
60
+ x.mul_(broadcast_to(mul, x));
61
+ if (affine) x.add_(broadcast_to(bias, x));
62
+
63
+ return x;
64
+ }
65
+
66
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
67
+ bool affine, float eps) {
68
+ auto edz = reduce_sum(dz);
69
+ auto y = invert_affine(z, weight, bias, affine, eps);
70
+ auto eydz = reduce_sum(y * dz);
71
+
72
+ return {edz, eydz};
73
+ }
74
+
75
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
76
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
77
+ auto y = invert_affine(z, weight, bias, affine, eps);
78
+ auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
79
+
80
+ auto num = count(z);
81
+ auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
82
+ return dx;
83
+ }
84
+
85
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
86
+ CHECK_CPU_INPUT(z);
87
+ CHECK_CPU_INPUT(dz);
88
+
89
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
90
+ int64_t count = z.numel();
91
+ auto *_z = z.data<scalar_t>();
92
+ auto *_dz = dz.data<scalar_t>();
93
+
94
+ for (int64_t i = 0; i < count; ++i) {
95
+ if (_z[i] < 0) {
96
+ _z[i] *= 1 / slope;
97
+ _dz[i] *= slope;
98
+ }
99
+ }
100
+ }));
101
+ }
102
+
103
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
104
+ CHECK_CPU_INPUT(z);
105
+ CHECK_CPU_INPUT(dz);
106
+
107
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
108
+ int64_t count = z.numel();
109
+ auto *_z = z.data<scalar_t>();
110
+ auto *_dz = dz.data<scalar_t>();
111
+
112
+ for (int64_t i = 0; i < count; ++i) {
113
+ if (_z[i] < 0) {
114
+ _z[i] = log1p(_z[i]);
115
+ _dz[i] *= (_z[i] + 1.f);
116
+ }
117
+ }
118
+ }));
119
+ }
models/BiSeNet/modules/src/inplace_abn_cuda.cu ADDED
@@ -0,0 +1,333 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <thrust/device_ptr.h>
4
+ #include <thrust/transform.h>
5
+
6
+ #include <vector>
7
+
8
+ #include "utils/checks.h"
9
+ #include "utils/cuda.cuh"
10
+ #include "inplace_abn.h"
11
+
12
+ #include <ATen/cuda/CUDAContext.h>
13
+
14
+ // Operations for reduce
15
+ template<typename T>
16
+ struct SumOp {
17
+ __device__ SumOp(const T *t, int c, int s)
18
+ : tensor(t), chn(c), sp(s) {}
19
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
20
+ return tensor[(batch * chn + plane) * sp + n];
21
+ }
22
+ const T *tensor;
23
+ const int chn;
24
+ const int sp;
25
+ };
26
+
27
+ template<typename T>
28
+ struct VarOp {
29
+ __device__ VarOp(T m, const T *t, int c, int s)
30
+ : mean(m), tensor(t), chn(c), sp(s) {}
31
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
32
+ T val = tensor[(batch * chn + plane) * sp + n];
33
+ return (val - mean) * (val - mean);
34
+ }
35
+ const T mean;
36
+ const T *tensor;
37
+ const int chn;
38
+ const int sp;
39
+ };
40
+
41
+ template<typename T>
42
+ struct GradOp {
43
+ __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
44
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
45
+ __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
46
+ T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
47
+ T _dz = dz[(batch * chn + plane) * sp + n];
48
+ return Pair<T>(_dz, _y * _dz);
49
+ }
50
+ const T weight;
51
+ const T bias;
52
+ const T *z;
53
+ const T *dz;
54
+ const int chn;
55
+ const int sp;
56
+ };
57
+
58
+ /***********
59
+ * mean_var
60
+ ***********/
61
+
62
+ template<typename T>
63
+ __global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
64
+ int plane = blockIdx.x;
65
+ T norm = T(1) / T(num * sp);
66
+
67
+ T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
68
+ __syncthreads();
69
+ T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
70
+
71
+ if (threadIdx.x == 0) {
72
+ mean[plane] = _mean;
73
+ var[plane] = _var;
74
+ }
75
+ }
76
+
77
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
78
+ CHECK_CUDA_INPUT(x);
79
+
80
+ // Extract dimensions
81
+ int64_t num, chn, sp;
82
+ get_dims(x, num, chn, sp);
83
+
84
+ // Prepare output tensors
85
+ auto mean = at::empty({chn}, x.options());
86
+ auto var = at::empty({chn}, x.options());
87
+
88
+ // Run kernel
89
+ dim3 blocks(chn);
90
+ dim3 threads(getNumThreads(sp));
91
+ auto stream = at::cuda::getCurrentCUDAStream();
92
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
93
+ mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
94
+ x.data<scalar_t>(),
95
+ mean.data<scalar_t>(),
96
+ var.data<scalar_t>(),
97
+ num, chn, sp);
98
+ }));
99
+
+   return {mean, var};
+ }
+
+ /**********
+  * forward
+  **********/
+
+ template<typename T>
+ __global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
+                                bool affine, float eps, int num, int chn, int sp) {
+   int plane = blockIdx.x;
+
+   T _mean = mean[plane];
+   T _var = var[plane];
+   T _weight = affine ? abs(weight[plane]) + eps : T(1);
+   T _bias = affine ? bias[plane] : T(0);
+
+   T mul = rsqrt(_var + eps) * _weight;
+
+   for (int batch = 0; batch < num; ++batch) {
+     for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+       T _x = x[(batch * chn + plane) * sp + n];
+       T _y = (_x - _mean) * mul + _bias;
+
+       x[(batch * chn + plane) * sp + n] = _y;
+     }
+   }
+ }
+
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                         bool affine, float eps) {
+   CHECK_CUDA_INPUT(x);
+   CHECK_CUDA_INPUT(mean);
+   CHECK_CUDA_INPUT(var);
+   CHECK_CUDA_INPUT(weight);
+   CHECK_CUDA_INPUT(bias);
+
+   // Extract dimensions
+   int64_t num, chn, sp;
+   get_dims(x, num, chn, sp);
+
+   // Run kernel
+   dim3 blocks(chn);
+   dim3 threads(getNumThreads(sp));
+   auto stream = at::cuda::getCurrentCUDAStream();
+   AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
+     forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+         x.data<scalar_t>(),
+         mean.data<scalar_t>(),
+         var.data<scalar_t>(),
+         weight.data<scalar_t>(),
+         bias.data<scalar_t>(),
+         affine, eps, num, chn, sp);
+   }));
+
+   return x;
+ }
+
+ /***********
+  * edz_eydz
+  ***********/
+
+ template<typename T>
+ __global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
+                                 T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
+   int plane = blockIdx.x;
+
+   T _weight = affine ? abs(weight[plane]) + eps : 1.f;
+   T _bias = affine ? bias[plane] : 0.f;
+
+   Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
+   __syncthreads();
+
+   if (threadIdx.x == 0) {
+     edz[plane] = res.v1;
+     eydz[plane] = res.v2;
+   }
+ }
+
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                       bool affine, float eps) {
+   CHECK_CUDA_INPUT(z);
+   CHECK_CUDA_INPUT(dz);
+   CHECK_CUDA_INPUT(weight);
+   CHECK_CUDA_INPUT(bias);
+
+   // Extract dimensions
+   int64_t num, chn, sp;
+   get_dims(z, num, chn, sp);
+
+   auto edz = at::empty({chn}, z.options());
+   auto eydz = at::empty({chn}, z.options());
+
+   // Run kernel
+   dim3 blocks(chn);
+   dim3 threads(getNumThreads(sp));
+   auto stream = at::cuda::getCurrentCUDAStream();
+   AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
+     edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+         z.data<scalar_t>(),
+         dz.data<scalar_t>(),
+         weight.data<scalar_t>(),
+         bias.data<scalar_t>(),
+         edz.data<scalar_t>(),
+         eydz.data<scalar_t>(),
+         affine, eps, num, chn, sp);
+   }));
+
+   return {edz, eydz};
+ }
+
+ /***********
+  * backward
+  ***********/
+
+ template<typename T>
+ __global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
+                                 const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
+   int plane = blockIdx.x;
+
+   T _weight = affine ? abs(weight[plane]) + eps : 1.f;
+   T _bias = affine ? bias[plane] : 0.f;
+   T _var = var[plane];
+   T _edz = edz[plane];
+   T _eydz = eydz[plane];
+
+   T _mul = _weight * rsqrt(_var + eps);
+   T count = T(num * sp);
+
+   for (int batch = 0; batch < num; ++batch) {
+     for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+       T _dz = dz[(batch * chn + plane) * sp + n];
+       T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
+
+       dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
+     }
+   }
+ }
+
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                          at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
+   CHECK_CUDA_INPUT(z);
+   CHECK_CUDA_INPUT(dz);
+   CHECK_CUDA_INPUT(var);
+   CHECK_CUDA_INPUT(weight);
+   CHECK_CUDA_INPUT(bias);
+   CHECK_CUDA_INPUT(edz);
+   CHECK_CUDA_INPUT(eydz);
+
+   // Extract dimensions
+   int64_t num, chn, sp;
+   get_dims(z, num, chn, sp);
+
+   auto dx = at::zeros_like(z);
+
+   // Run kernel
+   dim3 blocks(chn);
+   dim3 threads(getNumThreads(sp));
+   auto stream = at::cuda::getCurrentCUDAStream();
+   AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
+     backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+         z.data<scalar_t>(),
+         dz.data<scalar_t>(),
+         var.data<scalar_t>(),
+         weight.data<scalar_t>(),
+         bias.data<scalar_t>(),
+         edz.data<scalar_t>(),
+         eydz.data<scalar_t>(),
+         dx.data<scalar_t>(),
+         affine, eps, num, chn, sp);
+   }));
+
+   return dx;
+ }
+
+ /**************
+  * activations
+  **************/
+
+ template<typename T>
+ inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
+   // Create thrust pointers
+   thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
+   thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
+
+   auto stream = at::cuda::getCurrentCUDAStream();
+   thrust::transform_if(thrust::cuda::par.on(stream),
+                        th_dz, th_dz + count, th_z, th_dz,
+                        [slope] __device__ (const T& dz) { return dz * slope; },
+                        [] __device__ (const T& z) { return z < 0; });
+   thrust::transform_if(thrust::cuda::par.on(stream),
+                        th_z, th_z + count, th_z,
+                        [slope] __device__ (const T& z) { return z / slope; },
+                        [] __device__ (const T& z) { return z < 0; });
+ }
+
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
+   CHECK_CUDA_INPUT(z);
+   CHECK_CUDA_INPUT(dz);
+
+   int64_t count = z.numel();
+
+   AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
+     leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
+   }));
+ }
+
+ template<typename T>
+ inline void elu_backward_impl(T *z, T *dz, int64_t count) {
+   // Create thrust pointers
+   thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
+   thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
+
+   auto stream = at::cuda::getCurrentCUDAStream();
+   thrust::transform_if(thrust::cuda::par.on(stream),
+                        th_dz, th_dz + count, th_z, th_z, th_dz,
+                        [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
+                        [] __device__ (const T& z) { return z < 0; });
+   thrust::transform_if(thrust::cuda::par.on(stream),
+                        th_z, th_z + count, th_z,
+                        [] __device__ (const T& z) { return log1p(z); },
+                        [] __device__ (const T& z) { return z < 0; });
+ }
+
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
+   CHECK_CUDA_INPUT(z);
+   CHECK_CUDA_INPUT(dz);
+
+   int64_t count = z.numel();
+
+   AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
+     elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
+   }));
+ }
models/BiSeNet/modules/src/inplace_abn_cuda_half.cu ADDED
@@ -0,0 +1,275 @@
+ #include <ATen/ATen.h>
+
+ #include <cuda_fp16.h>
+
+ #include <vector>
+
+ #include "utils/checks.h"
+ #include "utils/cuda.cuh"
+ #include "inplace_abn.h"
+
+ #include <ATen/cuda/CUDAContext.h>
+
+ // Operations for reduce
+ struct SumOpH {
+   __device__ SumOpH(const half *t, int c, int s)
+       : tensor(t), chn(c), sp(s) {}
+   __device__ __forceinline__ float operator()(int batch, int plane, int n) {
+     return __half2float(tensor[(batch * chn + plane) * sp + n]);
+   }
+   const half *tensor;
+   const int chn;
+   const int sp;
+ };
+
+ struct VarOpH {
+   __device__ VarOpH(float m, const half *t, int c, int s)
+       : mean(m), tensor(t), chn(c), sp(s) {}
+   __device__ __forceinline__ float operator()(int batch, int plane, int n) {
+     const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
+     return (t - mean) * (t - mean);
+   }
+   const float mean;
+   const half *tensor;
+   const int chn;
+   const int sp;
+ };
+
+ struct GradOpH {
+   __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
+       : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
+   __device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
+     float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
+     float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
+     return Pair<float>(_dz, _y * _dz);
+   }
+   const float weight;
+   const float bias;
+   const half *z;
+   const half *dz;
+   const int chn;
+   const int sp;
+ };
+
+ /***********
+  * mean_var
+  ***********/
+
+ __global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
+   int plane = blockIdx.x;
+   float norm = 1.f / static_cast<float>(num * sp);
+
+   float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
+   __syncthreads();
+   float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
+
+   if (threadIdx.x == 0) {
+     mean[plane] = _mean;
+     var[plane] = _var;
+   }
+ }
+
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
+   CHECK_CUDA_INPUT(x);
+
+   // Extract dimensions
+   int64_t num, chn, sp;
+   get_dims(x, num, chn, sp);
+
+   // Prepare output tensors
+   auto mean = at::empty({chn}, x.options().dtype(at::kFloat));
+   auto var = at::empty({chn}, x.options().dtype(at::kFloat));
+
+   // Run kernel
+   dim3 blocks(chn);
+   dim3 threads(getNumThreads(sp));
+   auto stream = at::cuda::getCurrentCUDAStream();
+   mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
+       reinterpret_cast<half*>(x.data<at::Half>()),
+       mean.data<float>(),
+       var.data<float>(),
+       num, chn, sp);
+
+   return {mean, var};
+ }
+
+ /**********
+  * forward
+  **********/
+
+ __global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
+                                  bool affine, float eps, int num, int chn, int sp) {
+   int plane = blockIdx.x;
+
+   const float _mean = mean[plane];
+   const float _var = var[plane];
+   const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
+   const float _bias = affine ? bias[plane] : 0.f;
+
+   const float mul = rsqrt(_var + eps) * _weight;
+
+   for (int batch = 0; batch < num; ++batch) {
+     for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+       half *x_ptr = x + (batch * chn + plane) * sp + n;
+       float _x = __half2float(*x_ptr);
+       float _y = (_x - _mean) * mul + _bias;
+
+       *x_ptr = __float2half(_y);
+     }
+   }
+ }
+
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                           bool affine, float eps) {
+   CHECK_CUDA_INPUT(x);
+   CHECK_CUDA_INPUT(mean);
+   CHECK_CUDA_INPUT(var);
+   CHECK_CUDA_INPUT(weight);
+   CHECK_CUDA_INPUT(bias);
+
+   // Extract dimensions
+   int64_t num, chn, sp;
+   get_dims(x, num, chn, sp);
+
+   // Run kernel
+   dim3 blocks(chn);
+   dim3 threads(getNumThreads(sp));
+   auto stream = at::cuda::getCurrentCUDAStream();
+   forward_kernel_h<<<blocks, threads, 0, stream>>>(
+       reinterpret_cast<half*>(x.data<at::Half>()),
+       mean.data<float>(),
+       var.data<float>(),
+       weight.data<float>(),
+       bias.data<float>(),
+       affine, eps, num, chn, sp);
+
+   return x;
+ }
+
+ __global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
+                                   float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
+   int plane = blockIdx.x;
+
+   float _weight = affine ? abs(weight[plane]) + eps : 1.f;
+   float _bias = affine ? bias[plane] : 0.f;
+
+   Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
+   __syncthreads();
+
+   if (threadIdx.x == 0) {
+     edz[plane] = res.v1;
+     eydz[plane] = res.v2;
+   }
+ }
+
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                         bool affine, float eps) {
+   CHECK_CUDA_INPUT(z);
+   CHECK_CUDA_INPUT(dz);
+   CHECK_CUDA_INPUT(weight);
+   CHECK_CUDA_INPUT(bias);
+
+   // Extract dimensions
+   int64_t num, chn, sp;
+   get_dims(z, num, chn, sp);
+
+   auto edz = at::empty({chn}, z.options().dtype(at::kFloat));
+   auto eydz = at::empty({chn}, z.options().dtype(at::kFloat));
+
+   // Run kernel
+   dim3 blocks(chn);
+   dim3 threads(getNumThreads(sp));
+   auto stream = at::cuda::getCurrentCUDAStream();
+   edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
+       reinterpret_cast<half*>(z.data<at::Half>()),
+       reinterpret_cast<half*>(dz.data<at::Half>()),
+       weight.data<float>(),
+       bias.data<float>(),
+       edz.data<float>(),
+       eydz.data<float>(),
+       affine, eps, num, chn, sp);
+
+   return {edz, eydz};
+ }
+
+ __global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
+                                   const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
+   int plane = blockIdx.x;
+
+   float _weight = affine ? abs(weight[plane]) + eps : 1.f;
+   float _bias = affine ? bias[plane] : 0.f;
+   float _var = var[plane];
+   float _edz = edz[plane];
+   float _eydz = eydz[plane];
+
+   float _mul = _weight * rsqrt(_var + eps);
+   float count = float(num * sp);
+
+   for (int batch = 0; batch < num; ++batch) {
+     for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+       float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
+       float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
+
+       dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
+     }
+   }
+ }
+
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                            at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
+   CHECK_CUDA_INPUT(z);
+   CHECK_CUDA_INPUT(dz);
+   CHECK_CUDA_INPUT(var);
+   CHECK_CUDA_INPUT(weight);
+   CHECK_CUDA_INPUT(bias);
+   CHECK_CUDA_INPUT(edz);
+   CHECK_CUDA_INPUT(eydz);
+
+   // Extract dimensions
+   int64_t num, chn, sp;
+   get_dims(z, num, chn, sp);
+
+   auto dx = at::zeros_like(z);
+
+   // Run kernel
+   dim3 blocks(chn);
+   dim3 threads(getNumThreads(sp));
+   auto stream = at::cuda::getCurrentCUDAStream();
+   backward_kernel_h<<<blocks, threads, 0, stream>>>(
+       reinterpret_cast<half*>(z.data<at::Half>()),
+       reinterpret_cast<half*>(dz.data<at::Half>()),
+       var.data<float>(),
+       weight.data<float>(),
+       bias.data<float>(),
+       edz.data<float>(),
+       eydz.data<float>(),
+       reinterpret_cast<half*>(dx.data<at::Half>()),
+       affine, eps, num, chn, sp);
+
+   return dx;
+ }
+
+ __global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
+   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
+     float _z = __half2float(z[i]);
+     if (_z < 0) {
+       dz[i] = __float2half(__half2float(dz[i]) * slope);
+       z[i] = __float2half(_z / slope);
+     }
+   }
+ }
+
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
+   CHECK_CUDA_INPUT(z);
+   CHECK_CUDA_INPUT(dz);
+
+   int64_t count = z.numel();
+   dim3 threads(getNumThreads(count));
+   dim3 blocks = (count + threads.x - 1) / threads.x;
+   auto stream = at::cuda::getCurrentCUDAStream();
+   leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
+       reinterpret_cast<half*>(z.data<at::Half>()),
+       reinterpret_cast<half*>(dz.data<at::Half>()),
+       slope, count);
+ }
+
models/BiSeNet/modules/src/utils/checks.h ADDED
@@ -0,0 +1,15 @@
+ #pragma once
+
+ #include <ATen/ATen.h>
+
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
+ #ifndef AT_CHECK
+ #define AT_CHECK AT_ASSERT
+ #endif
+
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
+
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
models/BiSeNet/modules/src/utils/common.h ADDED
@@ -0,0 +1,49 @@
+ #pragma once
+
+ #include <ATen/ATen.h>
+
+ /*
+  * Functions to share code between CPU and GPU
+  */
+
+ #ifdef __CUDACC__
+ // CUDA versions
+
+ #define HOST_DEVICE __host__ __device__
+ #define INLINE_HOST_DEVICE __host__ __device__ inline
+ #define FLOOR(x) floor(x)
+
+ #if __CUDA_ARCH__ >= 600
+ // Recent compute capabilities have block-level atomicAdd for all data types, so we use that
+ #define ACCUM(x,y) atomicAdd_block(&(x),(y))
+ #else
+ // Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
+ // and use the known atomicCAS-based implementation for double
+ template<typename data_t>
+ __device__ inline data_t atomic_add(data_t *address, data_t val) {
+   return atomicAdd(address, val);
+ }
+
+ template<>
+ __device__ inline double atomic_add(double *address, double val) {
+   unsigned long long int* address_as_ull = (unsigned long long int*)address;
+   unsigned long long int old = *address_as_ull, assumed;
+   do {
+     assumed = old;
+     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+   } while (assumed != old);
+   return __longlong_as_double(old);
+ }
+
+ #define ACCUM(x,y) atomic_add(&(x),(y))
+ #endif // #if __CUDA_ARCH__ >= 600
+
+ #else
+ // CPU versions
+
+ #define HOST_DEVICE
+ #define INLINE_HOST_DEVICE inline
+ #define FLOOR(x) std::floor(x)
+ #define ACCUM(x,y) (x) += (y)
+
+ #endif // #ifdef __CUDACC__
models/BiSeNet/modules/src/utils/cuda.cuh ADDED
@@ -0,0 +1,71 @@
+ #pragma once
+
+ /*
+  * General settings and functions
+  */
+ const int WARP_SIZE = 32;
+ const int MAX_BLOCK_SIZE = 1024;
+
+ static int getNumThreads(int nElem) {
+   int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
+   for (int i = 0; i < 6; ++i) {
+     if (nElem <= threadSizes[i]) {
+       return threadSizes[i];
+     }
+   }
+   return MAX_BLOCK_SIZE;
+ }
+
+ /*
+  * Reduction utilities
+  */
+ template <typename T>
+ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
+                                            unsigned int mask = 0xffffffff) {
+ #if CUDART_VERSION >= 9000
+   return __shfl_xor_sync(mask, value, laneMask, width);
+ #else
+   return __shfl_xor(value, laneMask, width);
+ #endif
+ }
+
+ __device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
+
+ template<typename T>
+ struct Pair {
+   T v1, v2;
+   __device__ Pair() {}
+   __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
+   __device__ Pair(T v) : v1(v), v2(v) {}
+   __device__ Pair(int v) : v1(v), v2(v) {}
+   __device__ Pair &operator+=(const Pair<T> &a) {
+     v1 += a.v1;
+     v2 += a.v2;
+     return *this;
+   }
+ };
+
+ template<typename T>
+ static __device__ __forceinline__ T warpSum(T val) {
+ #if __CUDA_ARCH__ >= 300
+   for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
+     val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
+   }
+ #else
+   __shared__ T values[MAX_BLOCK_SIZE];
+   values[threadIdx.x] = val;
+   __threadfence_block();
+   const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
+   for (int i = 1; i < WARP_SIZE; i++) {
+     val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
+   }
+ #endif
+   return val;
+ }
+
+ template<typename T>
+ static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
+   value.v1 = warpSum(value.v1);
+   value.v2 = warpSum(value.v2);
+   return value;
+ }
models/BiSeNet/optimizer.py ADDED
@@ -0,0 +1,69 @@
+ #!/usr/bin/python
+ # -*- encoding: utf-8 -*-
+
+
+ import torch
+ import logging
+
+ logger = logging.getLogger()
+
+ class Optimizer(object):
+     def __init__(self,
+                  model,
+                  lr0,
+                  momentum,
+                  wd,
+                  warmup_steps,
+                  warmup_start_lr,
+                  max_iter,
+                  power,
+                  *args, **kwargs):
+         self.warmup_steps = warmup_steps
+         self.warmup_start_lr = warmup_start_lr
+         self.lr0 = lr0
+         self.lr = self.lr0
+         self.max_iter = float(max_iter)
+         self.power = power
+         self.it = 0
+         wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params = model.get_params()
+         param_list = [
+                 {'params': wd_params},
+                 {'params': nowd_params, 'weight_decay': 0},
+                 {'params': lr_mul_wd_params, 'lr_mul': True},
+                 {'params': lr_mul_nowd_params, 'weight_decay': 0, 'lr_mul': True}]
+         self.optim = torch.optim.SGD(
+                 param_list,
+                 lr = lr0,
+                 momentum = momentum,
+                 weight_decay = wd)
+         self.warmup_factor = (self.lr0/self.warmup_start_lr)**(1./self.warmup_steps)
+
+
+     def get_lr(self):
+         if self.it <= self.warmup_steps:
+             lr = self.warmup_start_lr*(self.warmup_factor**self.it)
+         else:
+             factor = (1-(self.it-self.warmup_steps)/(self.max_iter-self.warmup_steps))**self.power
+             lr = self.lr0 * factor
+         return lr
+
+
+     def step(self):
+         self.lr = self.get_lr()
+         for pg in self.optim.param_groups:
+             if pg.get('lr_mul', False):
+                 pg['lr'] = self.lr * 10
+             else:
+                 pg['lr'] = self.lr
+         if self.optim.defaults.get('lr_mul', False):
+             self.optim.defaults['lr'] = self.lr * 10
+         else:
+             self.optim.defaults['lr'] = self.lr
+         self.it += 1
+         self.optim.step()
+         if self.it == self.warmup_steps+2:
+             logger.info('==> warmup done, start to implement poly lr strategy')
+
+     def zero_grad(self):
+         self.optim.zero_grad()
+
models/BiSeNet/prepropess_data.py ADDED
@@ -0,0 +1,38 @@
+ #!/usr/bin/python
+ # -*- encoding: utf-8 -*-
+
+ import os.path as osp
+ import os
+ import cv2
+ from transform import *
+ from PIL import Image
+
+ face_data = '/home/zll/data/CelebAMask-HQ/CelebA-HQ-img'
+ face_sep_mask = '/home/zll/data/CelebAMask-HQ/CelebAMask-HQ-mask-anno'
+ mask_path = '/home/zll/data/CelebAMask-HQ/mask'
+ counter = 0
+ total = 0
+ for i in range(15):
+
+     atts = ['skin', 'l_brow', 'r_brow', 'l_eye', 'r_eye', 'eye_g', 'l_ear', 'r_ear', 'ear_r',
+             'nose', 'mouth', 'u_lip', 'l_lip', 'neck', 'neck_l', 'cloth', 'hair', 'hat']
+
+     for j in range(i * 2000, (i + 1) * 2000):
+
+         mask = np.zeros((512, 512))
+
+         for l, att in enumerate(atts, 1):
+             total += 1
+             file_name = ''.join([str(j).rjust(5, '0'), '_', att, '.png'])
+             path = osp.join(face_sep_mask, str(i), file_name)
+
+             if os.path.exists(path):
+                 counter += 1
+                 sep_mask = np.array(Image.open(path).convert('P'))
+                 # print(np.unique(sep_mask))
+
+                 mask[sep_mask == 225] = l
+         cv2.imwrite('{}/{}.png'.format(mask_path, j), mask)
+         print(j)
+
+ print(counter, total)
models/BiSeNet/resnet.py ADDED
@@ -0,0 +1,109 @@
+ #!/usr/bin/python
+ # -*- encoding: utf-8 -*-
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.utils.model_zoo as modelzoo
+
+ # from modules.bn import InPlaceABNSync as BatchNorm2d
+
+ resnet18_url = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'
+
+
+ def conv3x3(in_planes, out_planes, stride=1):
+     """3x3 convolution with padding"""
+     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                      padding=1, bias=False)
+
+
+ class BasicBlock(nn.Module):
+     def __init__(self, in_chan, out_chan, stride=1):
+         super(BasicBlock, self).__init__()
+         self.conv1 = conv3x3(in_chan, out_chan, stride)
+         self.bn1 = nn.BatchNorm2d(out_chan)
+         self.conv2 = conv3x3(out_chan, out_chan)
+         self.bn2 = nn.BatchNorm2d(out_chan)
+         self.relu = nn.ReLU(inplace=True)
+         self.downsample = None
+         if in_chan != out_chan or stride != 1:
+             self.downsample = nn.Sequential(
+                 nn.Conv2d(in_chan, out_chan,
+                           kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm2d(out_chan),
+                 )
+
+     def forward(self, x):
+         residual = self.conv1(x)
+         residual = F.relu(self.bn1(residual))
+         residual = self.conv2(residual)
+         residual = self.bn2(residual)
+
+         shortcut = x
+         if self.downsample is not None:
+             shortcut = self.downsample(x)
+
+         out = shortcut + residual
+         out = self.relu(out)
+         return out
+
+
+ def create_layer_basic(in_chan, out_chan, bnum, stride=1):
+     layers = [BasicBlock(in_chan, out_chan, stride=stride)]
+     for i in range(bnum-1):
+         layers.append(BasicBlock(out_chan, out_chan, stride=1))
+     return nn.Sequential(*layers)
+
+
+ class Resnet18(nn.Module):
+     def __init__(self):
+         super(Resnet18, self).__init__()
+         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+                                bias=False)
+         self.bn1 = nn.BatchNorm2d(64)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+         self.layer1 = create_layer_basic(64, 64, bnum=2, stride=1)
+         self.layer2 = create_layer_basic(64, 128, bnum=2, stride=2)
+         self.layer3 = create_layer_basic(128, 256, bnum=2, stride=2)
+         self.layer4 = create_layer_basic(256, 512, bnum=2, stride=2)
+         self.init_weight()
+
+     def forward(self, x):
+         x = self.conv1(x)
+         x = F.relu(self.bn1(x))
+         x = self.maxpool(x)
+
+         x = self.layer1(x)
+         feat8 = self.layer2(x) # 1/8
+         feat16 = self.layer3(feat8) # 1/16
+         feat32 = self.layer4(feat16) # 1/32
+         return feat8, feat16, feat32
+
+     def init_weight(self):
+         state_dict = modelzoo.load_url(resnet18_url)
+         self_state_dict = self.state_dict()
+         for k, v in state_dict.items():
+             if 'fc' in k: continue
+             self_state_dict.update({k: v})
+         self.load_state_dict(self_state_dict)
+
+     def get_params(self):
+         wd_params, nowd_params = [], []
+         for name, module in self.named_modules():
+             if isinstance(module, (nn.Linear, nn.Conv2d)):
+                 wd_params.append(module.weight)
+                 if not module.bias is None:
+                     nowd_params.append(module.bias)
+             elif isinstance(module, nn.BatchNorm2d):
+                 nowd_params += list(module.parameters())
+         return wd_params, nowd_params
+
+
+ if __name__ == "__main__":
+     net = Resnet18()
+     x = torch.randn(16, 3, 224, 224)
+     out = net(x)
+     print(out[0].size())
+     print(out[1].size())
+     print(out[2].size())
+     net.get_params()
models/BiSeNet/test.py ADDED
@@ -0,0 +1,90 @@
+ #!/usr/bin/python
+ # -*- encoding: utf-8 -*-
+
+ from logger import setup_logger
+ from model import BiSeNet
+
+ import torch
+
+ import os
+ import os.path as osp
+ import numpy as np
+ from PIL import Image
+ import torchvision.transforms as transforms
+ import cv2
+
+ def vis_parsing_maps(im, parsing_anno, stride, save_im=False, save_path='vis_results/parsing_map_on_im.jpg'):
+     # Colors for all 20 parts
+     part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
+                    [255, 0, 85], [255, 0, 170],
+                    [0, 255, 0], [85, 255, 0], [170, 255, 0],
+                    [0, 255, 85], [0, 255, 170],
+                    [0, 0, 255], [85, 0, 255], [170, 0, 255],
+                    [0, 85, 255], [0, 170, 255],
+                    [255, 255, 0], [255, 255, 85], [255, 255, 170],
+                    [255, 0, 255], [255, 85, 255], [255, 170, 255],
+                    [0, 255, 255], [85, 255, 255], [170, 255, 255]]
+
+     im = np.array(im)
+     vis_im = im.copy().astype(np.uint8)
+     vis_parsing_anno = parsing_anno.copy().astype(np.uint8)
+     vis_parsing_anno = cv2.resize(vis_parsing_anno, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)
+     vis_parsing_anno_color = np.zeros((vis_parsing_anno.shape[0], vis_parsing_anno.shape[1], 3)) + 255
+
+     num_of_class = np.max(vis_parsing_anno)
+
+     for pi in range(1, num_of_class + 1):
+         index = np.where(vis_parsing_anno == pi)
+         vis_parsing_anno_color[index[0], index[1], :] = part_colors[pi]
+
+     vis_parsing_anno_color = vis_parsing_anno_color.astype(np.uint8)
+     # print(vis_parsing_anno_color.shape, vis_im.shape)
+     vis_im = cv2.addWeighted(cv2.cvtColor(vis_im, cv2.COLOR_RGB2BGR), 0.4, vis_parsing_anno_color, 0.6, 0)
+
+     # Save result or not
+     if save_im:
+         cv2.imwrite(save_path[:-4] +'.png', vis_parsing_anno)
+         cv2.imwrite(save_path, vis_im, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
+
+     # return vis_im
+
+ def evaluate(respth='./res/test_res', dspth='./data', cp='model_final_diss.pth'):
+
+     if not os.path.exists(respth):
+         os.makedirs(respth)
+
+     n_classes = 19
+     net = BiSeNet(n_classes=n_classes)
+     net.cuda()
+     save_pth = osp.join('res/cp', cp)
+     net.load_state_dict(torch.load(save_pth))
+     net.eval()
+
+     to_tensor = transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+     ])
+     with torch.no_grad():
+         for image_path in os.listdir(dspth):
+             img = Image.open(osp.join(dspth, image_path))
+             image = img.resize((512, 512), Image.BILINEAR)
+             img = to_tensor(image)
+             img = torch.unsqueeze(img, 0)
+             img = img.cuda()
+             out = net(img)[0]
+             parsing = out.squeeze(0).cpu().numpy().argmax(0)
+             # print(parsing)
+             print(np.unique(parsing))
+
+             vis_parsing_maps(image, parsing, stride=1, save_im=True, save_path=osp.join(respth, image_path))
+
+
+
+
+
+
+
+
+ if __name__ == "__main__":
+     evaluate(dspth='/home/zll/data/CelebAMask-HQ/test-img', cp='79999_iter.pth')
+
+
models/BiSeNet/train.py ADDED
@@ -0,0 +1,179 @@
+ #!/usr/bin/python
+ # -*- encoding: utf-8 -*-
+
+ from logger import setup_logger
+ from model import BiSeNet
+ from face_dataset import FaceMask
+ from loss import OhemCELoss
+ from evaluate import evaluate
+ from optimizer import Optimizer
+ import cv2
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader
+ import torch.nn.functional as F
+ import torch.distributed as dist
+
+ import os
+ import os.path as osp
+ import logging
+ import time
+ import datetime
+ import argparse
+
+
+ respth = './res'
+ if not osp.exists(respth):
+     os.makedirs(respth)
+ logger = logging.getLogger()
+
+
+ def parse_args():
+     parse = argparse.ArgumentParser()
+     parse.add_argument(
+             '--local_rank',
+             dest = 'local_rank',
+             type = int,
+             default = -1,
+             )
+     return parse.parse_args()
+
+
+ def train():
+     args = parse_args()
+     torch.cuda.set_device(args.local_rank)
+     dist.init_process_group(
+                 backend = 'nccl',
+                 init_method = 'tcp://127.0.0.1:33241',
+                 world_size = torch.cuda.device_count(),
+                 rank=args.local_rank
+                 )
+     setup_logger(respth)
+
+     # dataset
+     n_classes = 19
+     n_img_per_gpu = 16
+     n_workers = 8
+     cropsize = [448, 448]
+     data_root = '/home/zll/data/CelebAMask-HQ/'
+
+     ds = FaceMask(data_root, cropsize=cropsize, mode='train')
+     sampler = torch.utils.data.distributed.DistributedSampler(ds)
+     dl = DataLoader(ds,
+                     batch_size = n_img_per_gpu,
+                     shuffle = False,
+                     sampler = sampler,
+                     num_workers = n_workers,
+                     pin_memory = True,
+                     drop_last = True)
+
+     # model
+     ignore_idx = -100
+     net = BiSeNet(n_classes=n_classes)
+     net.cuda()
+     net.train()
+     net = nn.parallel.DistributedDataParallel(net,
+             device_ids = [args.local_rank, ],
+             output_device = args.local_rank
+             )
+     score_thres = 0.7
+     n_min = n_img_per_gpu * cropsize[0] * cropsize[1]//16
+     LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
+     Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
+     Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
+
+     ## optimizer
+     momentum = 0.9
+     weight_decay = 5e-4
+     lr_start = 1e-2
+     max_iter = 80000
+     power = 0.9
+     warmup_steps = 1000
+     warmup_start_lr = 1e-5
+     optim = Optimizer(
+             model = net.module,
+             lr0 = lr_start,
+             momentum = momentum,
+             wd = weight_decay,
+             warmup_steps = warmup_steps,
+             warmup_start_lr = warmup_start_lr,
+             max_iter = max_iter,
+             power = power)
+
+     ## train loop
+     msg_iter = 50
+     loss_avg = []
+     st = glob_st = time.time()
+     diter = iter(dl)
+     epoch = 0
+     for it in range(max_iter):
+         try:
+             im, lb = next(diter)
+             if not im.size()[0] == n_img_per_gpu:
+                 raise StopIteration
+         except StopIteration:
+             epoch += 1
+             sampler.set_epoch(epoch)
+             diter = iter(dl)
+             im, lb = next(diter)
+         im = im.cuda()
+         lb = lb.cuda()
+         H, W = im.size()[2:]
+         lb = torch.squeeze(lb, 1)
+
+         optim.zero_grad()
+         out, out16, out32 = net(im)
+         lossp = LossP(out, lb)
+         loss2 = Loss2(out16, lb)
+         loss3 = Loss3(out32, lb)
+         loss = lossp + loss2 + loss3
+         loss.backward()
+         optim.step()
+
+         loss_avg.append(loss.item())
+
+         # print training log message
+         if (it+1) % msg_iter == 0:
+             loss_avg = sum(loss_avg) / len(loss_avg)
+             lr = optim.lr
+             ed = time.time()
+             t_intv, glob_t_intv = ed - st, ed - glob_st
+             eta = int((max_iter - it) * (glob_t_intv / it))
+             eta = str(datetime.timedelta(seconds=eta))
+             msg = ', '.join([
+                     'it: {it}/{max_it}',
+                     'lr: {lr:4f}',
+                     'loss: {loss:.4f}',
+                     'eta: {eta}',
+                     'time: {time:.4f}',
+                 ]).format(
+                     it = it+1,
+                     max_it = max_iter,
+                     lr = lr,
+                     loss = loss_avg,
+                     time = t_intv,
+                     eta = eta
+                 )
+             logger.info(msg)
+             loss_avg = []
+             st = ed
+         if dist.get_rank() == 0:
+             if (it+1) % 5000 == 0:
+                 state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
+                 if dist.get_rank() == 0:
+                     torch.save(state, './res/cp/{}_iter.pth'.format(it))
+                 evaluate(dspth='/home/zll/data/CelebAMask-HQ/test-img', cp='{}_iter.pth'.format(it))
+
+     # dump the final model
+     save_pth = osp.join(respth, 'model_final_diss.pth')
+     # net.cpu()
+     state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
+     if dist.get_rank() == 0:
+         torch.save(state, save_pth)
+     logger.info('training done, model saved to: {}'.format(save_pth))
+
+
+ if __name__ == "__main__":
+     train()
models/BiSeNet/transform.py ADDED
@@ -0,0 +1,129 @@
+ #!/usr/bin/python
+ # -*- encoding: utf-8 -*-
+
+
+ from PIL import Image
+ import PIL.ImageEnhance as ImageEnhance
+ import random
+ import numpy as np
+
+ class RandomCrop(object):
+     def __init__(self, size, *args, **kwargs):
+         self.size = size
+
+     def __call__(self, im_lb):
+         im = im_lb['im']
+         lb = im_lb['lb']
+         assert im.size == lb.size
+         W, H = self.size
+         w, h = im.size
+
+         if (W, H) == (w, h): return dict(im=im, lb=lb)
+         if w < W or h < H:
+             scale = float(W) / w if w < h else float(H) / h
+             w, h = int(scale * w + 1), int(scale * h + 1)
+             im = im.resize((w, h), Image.BILINEAR)
+             lb = lb.resize((w, h), Image.NEAREST)
+         sw, sh = random.random() * (w - W), random.random() * (h - H)
+         crop = int(sw), int(sh), int(sw) + W, int(sh) + H
+         return dict(
+                 im = im.crop(crop),
+                 lb = lb.crop(crop)
+                 )
+
+
+ class HorizontalFlip(object):
+     def __init__(self, p=0.5, *args, **kwargs):
+         self.p = p
+
+     def __call__(self, im_lb):
+         if random.random() > self.p:
+             return im_lb
+         else:
+             im = im_lb['im']
+             lb = im_lb['lb']
+
+             # atts = [1 'skin', 2 'l_brow', 3 'r_brow', 4 'l_eye', 5 'r_eye', 6 'eye_g', 7 'l_ear', 8 'r_ear', 9 'ear_r',
+             # 10 'nose', 11 'mouth', 12 'u_lip', 13 'l_lip', 14 'neck', 15 'neck_l', 16 'cloth', 17 'hair', 18 'hat']
+
+             flip_lb = np.array(lb)
+             flip_lb[lb == 2] = 3
+             flip_lb[lb == 3] = 2
+             flip_lb[lb == 4] = 5
+             flip_lb[lb == 5] = 4
+             flip_lb[lb == 7] = 8
+             flip_lb[lb == 8] = 7
+             flip_lb = Image.fromarray(flip_lb)
+             return dict(im = im.transpose(Image.FLIP_LEFT_RIGHT),
+                         lb = flip_lb.transpose(Image.FLIP_LEFT_RIGHT),
+                         )
+
+
+ class RandomScale(object):
+     def __init__(self, scales=(1, ), *args, **kwargs):
+         self.scales = scales
+
+     def __call__(self, im_lb):
+         im = im_lb['im']
+         lb = im_lb['lb']
+         W, H = im.size
+         scale = random.choice(self.scales)
+         w, h = int(W * scale), int(H * scale)
+         return dict(im = im.resize((w, h), Image.BILINEAR),
+                     lb = lb.resize((w, h), Image.NEAREST),
+                     )
+
+
+ class ColorJitter(object):
+     def __init__(self, brightness=None, contrast=None, saturation=None, *args, **kwargs):
+         if not brightness is None and brightness>0:
+             self.brightness = [max(1-brightness, 0), 1+brightness]
+         if not contrast is None and contrast>0:
+             self.contrast = [max(1-contrast, 0), 1+contrast]
+         if not saturation is None and saturation>0:
+             self.saturation = [max(1-saturation, 0), 1+saturation]
+
+     def __call__(self, im_lb):
+         im = im_lb['im']
+         lb = im_lb['lb']
+         r_brightness = random.uniform(self.brightness[0], self.brightness[1])
+         r_contrast = random.uniform(self.contrast[0], self.contrast[1])
+         r_saturation = random.uniform(self.saturation[0], self.saturation[1])
+         im = ImageEnhance.Brightness(im).enhance(r_brightness)
+         im = ImageEnhance.Contrast(im).enhance(r_contrast)
+         im = ImageEnhance.Color(im).enhance(r_saturation)
+         return dict(im = im,
+                     lb = lb,
+                     )
+
+
+ class MultiScale(object):
+     def __init__(self, scales):
+         self.scales = scales
+
+     def __call__(self, img):
+         W, H = img.size
+         sizes = [(int(W*ratio), int(H*ratio)) for ratio in self.scales]
+         imgs = []
+         [imgs.append(img.resize(size, Image.BILINEAR)) for size in sizes]
+         return imgs
+
+
+ class Compose(object):
+     def __init__(self, do_list):
+         self.do_list = do_list
+
+     def __call__(self, im_lb):
+         for comp in self.do_list:
+             im_lb = comp(im_lb)
+         return im_lb
+
+
+
+
+ if __name__ == '__main__':
+     flip = HorizontalFlip(p = 1)
+     crop = RandomCrop((321, 321))
+     rscales = RandomScale((0.75, 1.0, 1.5, 1.75, 2.0))
+     img = Image.open('data/img.jpg')
+     lb = Image.open('data/label.png')
models/BiSeNet_pretrained_for_ConsistentID.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:468e13ca13a9b43cc0881a9f99083a430e9c0a38abd935431d1c28ee94b26567
+ size 53289463
models/LLaVA/.devcontainer/Dockerfile ADDED
@@ -0,0 +1,53 @@
+ FROM mcr.microsoft.com/devcontainers/base:ubuntu-20.04
+
+ SHELL [ "bash", "-c" ]
+
+ # update apt and install packages
+ RUN apt update && \
+     apt install -yq \
+         ffmpeg \
+         dkms \
+         build-essential
+
+ # add user tools
+ RUN sudo apt install -yq \
+     jq \
+     jp \
+     tree \
+     tldr
+
+ # add git-lfs and install
+ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \
+     sudo apt-get install -yq git-lfs && \
+     git lfs install
+
+ ############################################
+ # Setup user
+ ############################################
+
+ USER vscode
+
+ # install azcopy, a tool to copy to/from blob storage
+ # for more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file
+ RUN cd /tmp && \
+     wget https://azcopyvnext.azureedge.net/release20230123/azcopy_linux_amd64_10.17.0.tar.gz && \
+     tar xvf azcopy_linux_amd64_10.17.0.tar.gz && \
+     mkdir -p ~/.local/bin && \
+     mv azcopy_linux_amd64_10.17.0/azcopy ~/.local/bin && \
+     chmod +x ~/.local/bin/azcopy && \
+     rm -rf azcopy_linux_amd64*
+
+ # Setup conda
+ RUN cd /tmp && \
+     wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+     bash ./Miniconda3-latest-Linux-x86_64.sh -b && \
+     rm ./Miniconda3-latest-Linux-x86_64.sh
+
+ # Install dotnet
+ RUN cd /tmp && \
+     wget https://dot.net/v1/dotnet-install.sh && \
+     chmod +x dotnet-install.sh && \
+     ./dotnet-install.sh --channel 7.0 && \
+     ./dotnet-install.sh --channel 3.1 && \
+     rm ./dotnet-install.sh
+
models/LLaVA/.devcontainer/devcontainer.env ADDED
@@ -0,0 +1,2 @@
+ SAMPLE_ENV_VAR1="Sample Value"
+ SAMPLE_ENV_VAR2=332431bf-68bf
models/LLaVA/.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,71 @@
+ {
+     "name": "LLaVA",
+     "build": {
+         "dockerfile": "Dockerfile",
+         "context": "..",
+         "args": {}
+     },
+     "features": {
+         "ghcr.io/devcontainers/features/docker-in-docker:2": {},
+         "ghcr.io/devcontainers/features/azure-cli:1": {},
+         "ghcr.io/azure/azure-dev/azd:0": {},
+         "ghcr.io/devcontainers/features/powershell:1": {},
+         "ghcr.io/devcontainers/features/common-utils:2": {},
+         "ghcr.io/devcontainers-contrib/features/zsh-plugins:0": {},
+     },
+     // "forwardPorts": [],
+     "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh",
+     "customizations": {
+         "vscode": {
+             "settings": {
+                 "python.analysis.autoImportCompletions": true,
+                 "python.analysis.autoImportUserSymbols": true,
+                 "python.defaultInterpreterPath": "~/miniconda3/envs/llava/bin/python",
+                 "python.formatting.provider": "yapf",
+                 "python.linting.enabled": true,
+                 "python.linting.flake8Enabled": true,
+                 "isort.check": true,
+                 "dev.containers.copyGitConfig": true,
+                 "terminal.integrated.defaultProfile.linux": "zsh",
+                 "terminal.integrated.profiles.linux": {
+                     "zsh": {
+                         "path": "/usr/bin/zsh"
+                     },
+                 }
+             },
+             "extensions": [
+                 "aaron-bond.better-comments",
+                 "eamodio.gitlens",
+                 "EditorConfig.EditorConfig",
+                 "foxundermoon.shell-format",
+                 "GitHub.copilot-chat",
+                 "GitHub.copilot-labs",
+                 "GitHub.copilot",
+                 "lehoanganh298.json-lines-viewer",
+                 "mhutchie.git-graph",
+                 "ms-azuretools.vscode-docker",
+                 "ms-dotnettools.dotnet-interactive-vscode",
+                 "ms-python.flake8",
+                 "ms-python.isort",
+                 "ms-python.python",
+                 "ms-python.vscode-pylance",
+                 "njpwerner.autodocstring",
+                 "redhat.vscode-yaml",
+                 "stkb.rewrap",
+                 "yzhang.markdown-all-in-one",
+             ]
+         }
+     },
+     "mounts": [],
+     "runArgs": [
+         "--gpus",
+         "all",
+         // "--ipc",
+         // "host",
+         "--ulimit",
+         "memlock=-1",
+         "--env-file",
+         ".devcontainer/devcontainer.env"
+     ],
+     // "remoteUser": "root"
+ }