Spaces:

grostaco
/

IRRA

Sleeping

App Files Files Community

grostaco committed on Nov 12, 2023

Commit

ba2ab36

•

1 Parent(s): b4c1a13

initial commit

Browse files

Files changed (12) hide show

.gitignore +1 -0
README.md +3 -2
app.py +40 -0
lib/IRRA/image.py +23 -0
lib/IRRA/model/__init__.py +1 -0
lib/IRRA/model/build.py +150 -0
lib/IRRA/model/clip_model.py +602 -0
lib/IRRA/model/objectives.py +119 -0
lib/IRRA/tokenizer.py +153 -0
lib/__init__.py +0 -0
lib/components/__init__.py +0 -0
lib/utils/model.py +31 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

README.md CHANGED Viewed

@@ -9,5 +9,6 @@ app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
- f

 pinned: false
 ---
+# IRRA space
+Space for Text-To-Image Person retrieval for the [IRRA](https://github.com/anosorae/IRRA/tree/main) model

app.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import streamlit as st
+from lib.utils.model import get_model, get_similarities
+from PIL import Image
+st.title('IRRA Text-To-Image-Retrival')
+st.header('Inputs')
+caption = st.text_input('Description Input')
+images = st.file_uploader('Upload images', accept_multiple_files=True)
+if images is not None:
+    st.image(images) # type: ignore
+st.header('Options')
+st.subheader('Ranks')
+ranks = st.slider('slider_ranks', min_value=1, max_value=10, label_visibility='collapsed',value=5)
+button = st.button('Match most similar', disabled=len(images) == 0 or caption == '')
+if button:
+    st.header('Results')
+    with st.spinner('Loading model'):
+        model = get_model()
+    st.text(f'IRRA model loaded with {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters')
+    with st.spinner('Computing and ranking similarities'):
+        similarities = get_similarities(caption, images, model)
+    indices = similarities.argsort(descending=True).squeeze(0).cpu().tolist()[:ranks]
+    for i, idx in enumerate(indices):
+        c1, c2 = st.columns(2)
+        with c1:
+            st.text(f'Rank {i + 1}')
+        with c2:
+            st.image(images[idx])

lib/IRRA/image.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import torch
+import torchvision.transforms as T
+from PIL import Image
+def prepare_images(files: list[str]):
+    mean = [0.48145466, 0.4578275, 0.40821073]
+    std = [0.26862954, 0.26130258, 0.27577711]
+    transforms = T.Compose([
+            T.Resize((384, 128)),
+            T.RandomHorizontalFlip(0.5),
+            T.ToTensor(),
+            T.Normalize(mean=mean, std=std),
+        ])
+    tensors = []
+    for file in files:
+        tensors.append(transforms(Image.open(file).convert('RGB')).unsqueeze(0))
+    return torch.cat(tensors, dim=0)

lib/IRRA/model/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .build import build_model

lib/IRRA/model/build.py ADDED Viewed

	@@ -0,0 +1,150 @@

+from . import objectives
+from .clip_model import Transformer, QuickGELU, LayerNorm, build_CLIP_from_openai_pretrained, convert_weights
+import numpy as np
+import torch
+import torch.nn as nn
+from collections import OrderedDict
+class IRRA(nn.Module):
+    def __init__(self, args, num_classes=11003):
+        super().__init__()
+        self.args = args
+        self.num_classes = num_classes
+        self._set_task()
+        self.base_model, base_cfg = build_CLIP_from_openai_pretrained(args.pretrain_choice, args.img_size, args.stride_size)
+        self.embed_dim = base_cfg['embed_dim']
+        self.logit_scale = torch.ones([]) * (1 / args.temperature)
+        if 'id' in args.loss_names:
+            self.classifier = nn.Linear(self.embed_dim, self.num_classes)
+            nn.init.normal_(self.classifier.weight.data, std=0.001)
+            nn.init.constant_(self.classifier.bias.data, val=0.0)
+        if 'mlm' in args.loss_names:
+            self.cross_attn = nn.MultiheadAttention(self.embed_dim,
+                                                    self.embed_dim // 64,
+                                                    batch_first=True)
+            self.cross_modal_transformer = Transformer(width=self.embed_dim,
+                                                       layers=args.cmt_depth,
+                                                       heads=self.embed_dim //
+                                                       64)
+            scale = self.cross_modal_transformer.width**-0.5
+            self.ln_pre_t = LayerNorm(self.embed_dim)
+            self.ln_pre_i = LayerNorm(self.embed_dim)
+            self.ln_post = LayerNorm(self.embed_dim)
+            proj_std = scale * ((2 * self.cross_modal_transformer.layers)**-0.5)
+            attn_std = scale
+            fc_std = (2 * self.cross_modal_transformer.width)**-0.5
+            for block in self.cross_modal_transformer.resblocks:
+                nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+                nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+                nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+                nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+            # init cross attn
+            nn.init.normal_(self.cross_attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(self.cross_attn.out_proj.weight, std=proj_std)
+            self.mlm_head = nn.Sequential(
+                OrderedDict([('dense', nn.Linear(self.embed_dim, self.embed_dim)),
+                            ('gelu', QuickGELU()),
+                            ('ln', LayerNorm(self.embed_dim)),
+                            ('fc', nn.Linear(self.embed_dim, args.vocab_size))]))
+            # init mlm head
+            nn.init.normal_(self.mlm_head.dense.weight, std=fc_std)
+            nn.init.normal_(self.mlm_head.fc.weight, std=proj_std)
+    def _set_task(self):
+        loss_names = self.args.loss_names
+        self.current_task = [l.strip() for l in loss_names.split('+')]
+        print(f'Training Model with {self.current_task} tasks')
+    def cross_former(self, q, k, v):
+        x = self.cross_attn(
+                self.ln_pre_t(q),
+                self.ln_pre_i(k),
+                self.ln_pre_i(v),
+                need_weights=False)[0]
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.cross_modal_transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_post(x)
+        return x
+    def encode_image(self, image):
+        x = self.base_model.encode_image(image)
+        return x[:, 0, :].float()
+        # return x.float() # for CLIP ResNet visual model
+    def encode_text(self, text):
+        x = self.base_model.encode_text(text)
+        return x[torch.arange(x.shape[0]), text.argmax(dim=-1)].float()
+    def forward(self, batch):
+        ret = dict()
+        images = batch['images']
+        caption_ids = batch['caption_ids']
+        image_feats, text_feats = self.base_model(images, caption_ids)
+        i_feats = image_feats[:, 0, :].float()
+        # i_feats = image_feats.float() # for CLIP ResNet visual model
+        t_feats = text_feats[torch.arange(text_feats.shape[0]), caption_ids.argmax(dim=-1)].float()
+        logit_scale = self.logit_scale
+        ret.update({'temperature': 1 / logit_scale})
+        if 'itc' in self.current_task:
+            ret.update({'itc_loss':objectives.compute_itc(i_feats, t_feats, logit_scale)})
+        if 'sdm' in self.current_task:
+            ret.update({'sdm_loss':objectives.compute_sdm(i_feats, t_feats, batch['pids'], logit_scale)})
+        if 'cmpm' in self.current_task:
+            ret.update({'cmpm_loss':objectives.compute_cmpm(i_feats, t_feats, batch['pids'])})
+        if 'id' in self.current_task:
+            image_logits = self.classifier(i_feats.half()).float()
+            text_logits = self.classifier(t_feats.half()).float()
+            ret.update({'id_loss':objectives.compute_id(image_logits, text_logits, batch['pids'])*self.args.id_loss_weight})
+            image_pred = torch.argmax(image_logits, dim=1)
+            text_pred = torch.argmax(text_logits, dim=1)
+            image_precision = (image_pred == batch['pids']).float().mean()
+            text_precision = (text_pred == batch['pids']).float().mean()
+            ret.update({'img_acc': image_precision})
+            ret.update({'txt_acc': text_precision})
+        if 'mlm' in self.current_task:
+            mlm_ids = batch['mlm_ids']
+            mlm_feats = self.base_model.encode_text(mlm_ids)
+            x = self.cross_former(mlm_feats, image_feats, image_feats)
+            x = self.mlm_head(x)  # [batch_size, text_len, num_colors]
+            scores = x.float().reshape(-1, self.args.vocab_size)
+            mlm_labels = batch['mlm_labels'].reshape(-1)
+            ret.update({'mlm_loss': objectives.compute_mlm(scores, mlm_labels)*self.args.mlm_loss_weight})
+            pred = scores.max(1)[1]
+            mlm_label_idx = torch.nonzero(mlm_labels)
+            acc = (pred[mlm_label_idx] == mlm_labels[mlm_label_idx]).float().mean()
+            ret.update({'mlm_acc': acc})
+        return ret
+def build_model(args, num_classes=11003):
+    model = IRRA(args, num_classes)
+    # covert model to fp16
+    convert_weights(model)
+    return model

lib/IRRA/model/clip_model.py ADDED Viewed

	@@ -0,0 +1,602 @@

+""" CLIP Model
+Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
+"""
+from collections import OrderedDict
+import logging
+import math
+import os
+from typing import List, Tuple, Union
+import hashlib
+import urllib
+from tqdm import tqdm
+import warnings
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+# Module logger, registered under the "IRRA.model" name.
+logger = logging.getLogger("IRRA.model")
+# Model name -> download URL of the pretrained checkpoint.
+# The second-to-last path segment of each URL is the file's expected SHA256 digest;
+# _download() extracts it from the URL to verify the downloaded file.
+_MODELS = {
+    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
+    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
+    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
+    "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
+    "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
+    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
+}
+def available_models() -> List[str]:
+    """Returns the names of available CLIP models"""
+    return list(_MODELS.keys())
+def _download(url: str, root: str):
+    os.makedirs(root, exist_ok=True)
+    filename = os.path.basename(url)
+    expected_sha256 = url.split("/")[-2]
+    download_target = os.path.join(root, filename)
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(f"{download_target} exists and is not a regular file")
+    if os.path.isfile(download_target):
+        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+            return download_target
+        else:
+            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
+    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+                output.write(buffer)
+                loop.update(len(buffer))
+    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
+        raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
+    return download_target
+class Bottleneck(nn.Module):
+    """Residual block of three convolutions (1x1 -> 3x3 -> 1x1) with ``planes * expansion`` output channels.
+    Spatial downsampling is done with average pooling instead of strided convolutions.
+    """
+    # Ratio between the block's output channels and ``planes``.
+    expansion = 4
+    def __init__(self, inplanes, planes, stride=1):
+        """
+        Parameters
+        ----------
+        inplanes : number of input channels.
+        planes : channel width of the first two convolutions.
+        stride : downsampling factor; values > 1 enable the average pools.
+        """
+        super().__init__()
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+        # The skip path needs its own projection whenever the main path changes the
+        # spatial size or the channel count.
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(OrderedDict([
+                ("-1", nn.AvgPool2d(stride)),
+                ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
+                ("1", nn.BatchNorm2d(planes * self.expansion))
+            ]))
+    def forward(self, x: torch.Tensor):
+        """Apply the three conv/BN stages and add the (optionally projected) input."""
+        identity = x
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)
+        out = self.bn3(self.conv3(out))
+        if self.downsample is not None:
+            identity = self.downsample(x)
+        # Residual addition, followed by the final activation.
+        out += identity
+        out = self.relu(out)
+        return out
+class AttentionPool2d(nn.Module):
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        # self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+        self.positional_embedding = nn.Parameter(torch.randn((spacial_dim[0] * spacial_dim[1]) + 1, embed_dim)/ embed_dim ** 0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+    def forward(self, x):
+        x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x, key=x, value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False
+        )
+        return x[0]
+class ModifiedResNet(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.avgpool = nn.AvgPool2d(2)
+        self.relu = nn.ReLU(inplace=True)
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+        embed_dim = width * 32  # the ResNet feature dimension
+        spacial_dim = (
+            input_resolution[0] // 32,
+            input_resolution[1] // 32,
+        )
+        self.attnpool = AttentionPool2d(spacial_dim, embed_dim, heads, output_dim)
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+        return nn.Sequential(*layers)
+    def forward(self, x):
+        def stem(x):
+            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
+                x = self.relu(bn(conv(x)))
+            x = self.avgpool(x)
+            return x
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.attnpool(x)
+        return x
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(OrderedDict([
+            ("c_fc", nn.Linear(d_model, d_model * 4)),
+            ("gelu", QuickGELU()),
+            ("c_proj", nn.Linear(d_model * 4, d_model))
+        ]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+class Transformer(nn.Module):
+    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+class VisionTransformer(nn.Module):
+    """ViT image encoder that returns projected features for every token (class token first).
+    Patches are extracted by a convolution whose stride may differ from the patch size,
+    and the input may be non-square.
+    """
+    def __init__(self, input_resolution: Tuple[int, int], patch_size: int, stride_size: int, width: int, layers: int, heads: int, output_dim: int):
+        """
+        Parameters
+        ----------
+        input_resolution : (height, width) of the input images.
+        patch_size : side length of the patch-extraction kernel.
+        stride_size : stride of the patch-extraction kernel.
+        width : token embedding size.
+        layers : number of transformer blocks.
+        heads : number of attention heads per block.
+        output_dim : size of the projected output features.
+        """
+        super().__init__()
+        self.input_resolution = input_resolution # (384, 128)
+        # Number of patch positions along the image width (num_x) and height (num_y).
+        self.num_x = (input_resolution[1] - patch_size) // stride_size + 1
+        self.num_y = (input_resolution[0] - patch_size) // stride_size + 1
+        num_patches = self.num_x * self.num_y
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=stride_size, bias=False)
+        scale = width ** -0.5 # 1/sqrt(768)
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        # One position per patch plus one for the class token.
+        self.positional_embedding = nn.Parameter(scale * torch.randn(num_patches + 1, width))
+        self.ln_pre = LayerNorm(width)
+        self.transformer = Transformer(width, layers, heads)
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+    def forward(self, x: torch.Tensor):
+        """Encode an image batch into per-token features; token 0 is the class token."""
+        x = self.conv1(x)  # shape = [*, width, num_y, num_x] for inputs of size input_resolution
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, num_patches]
+        x = x.permute(0, 2, 1)  # shape = [*, num_patches, width]
+        # Prepend the class embedding to every sample; adding it to zeros broadcasts it over the batch.
+        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, num_patches + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        # All tokens are normalised and projected, not only the class token.
+        # x = self.ln_post(x[:, 0, :])
+        x = self.ln_post(x)
+        if self.proj is not None:
+            x = x @ self.proj
+        return x
+class CLIP(nn.Module):
+    def __init__(self,
+                 embed_dim: int,
+                 # vision
+                 image_resolution: Union[int, Tuple[int, int]],
+                 vision_layers: Union[Tuple[int, int, int, int], int],
+                 vision_width: int,
+                 vision_patch_size: int,
+                 stride_size: int,
+                 # text
+                 context_length: int,
+                 vocab_size: int,
+                 transformer_width: int,
+                 transformer_heads: int,
+                 transformer_layers: int
+                 ):
+        super().__init__()
+        self.context_length = context_length
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width
+            )
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                stride_size=stride_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim
+            )
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask()
+        )
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
+        # self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.initialize_parameters()
+    def initialize_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features ** -0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
+        attn_std = self.transformer.width ** -0.5
+        fc_std = (2 * self.transformer.width) ** -0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+        if self.text_projection is not None:
+            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+    @property
+    def dtype(self):
+        return self.visual.conv1.weight.dtype
+    def encode_image(self, image):
+        return self.visual(image.type(self.dtype))
+    def encode_text(self, text):
+        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
+        x = x + self.positional_embedding.type(self.dtype)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        # x.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        # x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+        x = x @ self.text_projection
+        return x
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        # # normalized features
+        # image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+        # text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+        # # cosine similarity as logits
+        # logit_scale = self.logit_scale.exp()
+        # logits_per_image = logit_scale * image_features @ text_features.t()
+        # logits_per_text = logits_per_image.t()
+        # # shape = [global_batch_size, global_batch_size]
+        # return logits_per_image, logits_per_text
+        return image_features, text_features
+    def load_param(self, state_dict):
+        # 将pretrained_dict里不属于model_dict的键剔除掉
+        param_dict =  {k: v for k, v in state_dict.items() if k in self.state_dict()}
+        if 'model' in param_dict:
+            param_dict = param_dict['model']
+        if 'state_dict' in param_dict:
+            param_dict = param_dict['state_dict']
+        for k, v in param_dict.items():
+            if k == 'visual.positional_embedding' and v.shape != self.visual.positional_embedding.shape:
+                v = resize_pos_embed(v, self.visual.positional_embedding, self.visual.num_y, self.visual.num_x)
+            elif k == 'positional_embedding' and v.shape != self.positional_embedding.shape:
+                v = resize_text_pos_embed(v, self.context_length)
+            try:
+                self.state_dict()[k].copy_(v)
+            except:
+                print(f'===========================ERROR occur in copy {k}, {v.shape}=========================')
+                print('shape do not match in k :{}: param_dict{} vs self.state_dict(){}'.format(k, v.shape, self.state_dict()[k].shape))
+def resize_pos_embed(posemb, posemb_new, hight, width):
+    # Rescale the grid of position embeddings when loading from state_dict. Adapted from
+    # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
+    posemb = posemb.unsqueeze(0)
+    posemb_new = posemb_new.unsqueeze(0)
+    posemb_token, posemb_grid = posemb[:, :1], posemb[0, 1:]
+    gs_old = int(math.sqrt(len(posemb_grid)))
+    print('Resized position embedding from size:{} to size: {} with height:{} width: {}'.format(posemb.shape, posemb_new.shape, hight, width))
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(posemb_grid, size=(hight, width), mode='bilinear')
+    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, hight * width, -1)
+    posemb = torch.cat([posemb_token, posemb_grid], dim=1)
+    return posemb.squeeze(0)
+def convert_weights(model: nn.Module):
+    """Hook for converting applicable model parameters to fp16.
+    The conversion code below is commented out, so the function applied to every
+    submodule currently does nothing and the model keeps its existing dtypes.
+    """
+    def _convert_weights_to_fp16(l):
+        # Disabled fp16 conversion, kept for reference:
+        # if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+        #     l.weight.data = l.weight.data.half()
+        #     if l.bias is not None:
+        #         l.bias.data = l.bias.data.half()
+        # if isinstance(l, nn.MultiheadAttention):
+        #     for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+        #         tensor = getattr(l, attr)
+        #         if tensor is not None:
+        #             tensor.data = tensor.data.half()
+        # for name in ["text_projection", "proj", "mcq_proj"]:
+        #     if hasattr(l, name):
+        #         attr = getattr(l, name)
+        #         if attr is not None:
+        #             attr.data = attr.data.half()
+        ...
+    # Module.apply visits the model and all of its submodules.
+    model.apply(_convert_weights_to_fp16)
def build_CLIP_from_openai_pretrained(name: str, image_size: Union[int, Tuple[int, int]], stride_size: int, jit: bool = False, download_root: Union[str, None] = None):
    """Build a CLIP model configured from, and initialised with, a pretrained checkpoint.

    Parameters
    ----------
    name : str
        A key of `_MODELS` (fetched with `_download`), or the path to an existing checkpoint file
    image_size : Union[int, Tuple[int, int]]
        Input image size; it replaces the resolution inferred from the checkpoint.
        In Re-ID task, image size commonly set to 384x128, instead of 224x224
    stride_size : int
        Forwarded unchanged to the `CLIP` constructor as `stride_size`
    jit : bool
        Only controls whether a warning is emitted when the file turns out not to be a
        JIT archive; the returned model is always a freshly constructed `CLIP`
    download_root : str, optional
        path to download the model files; by default, it uses "~/.cache/clip"

    Returns
    -------
    model : CLIP
        The model built from the inferred configuration, after `load_param(state_dict)`
    model_cfg : dict
        The keyword arguments that were passed to `CLIP`

    Raises
    ------
    RuntimeError
        If `name` is neither a known model name nor an existing file
    """
    if name in _MODELS:
        model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

    try:
        # loading JIT archive
        jit_archive = torch.jit.load(model_path, map_location="cpu")
    except RuntimeError:
        # loading saved state dict
        if jit:
            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
        state_dict = torch.load(model_path, map_location="cpu")
    else:
        # Only the weights of the archive are needed; binding them here (instead of
        # `state_dict or model.state_dict()`) never touches an unbound `model`.
        state_dict = jit_archive.state_dict()

    # The presence of "visual.proj" selects the ViT-style key layout; otherwise the
    # ResNet-style layout ("visual.layer1..4", "visual.attnpool") is read.
    vit = "visual.proj" in state_dict
    if vit:
        vision_width = state_dict["visual.conv1.weight"].shape[0]
        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
        # positional embedding holds one extra entry besides the square patch grid
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:
        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
        vision_layers = tuple(counts)
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))

    model_cfg = {
        'embed_dim': embed_dim,
        'image_resolution': image_resolution,
        'vision_layers': vision_layers,
        'vision_width': vision_width,
        'vision_patch_size': vision_patch_size,
        'context_length': context_length,
        'vocab_size': vocab_size,
        'transformer_width': transformer_width,
        'transformer_heads': transformer_heads,
        'transformer_layers': transformer_layers
    }

    # modify image resolution to adapt Re-ID task
    model_cfg['image_resolution'] = image_size
    model_cfg['stride_size'] = stride_size
    logger.info(f"Load pretrained {name} CLIP model with model config: {model_cfg}")
    model = CLIP(**model_cfg)

    # fp16 conversion (convert_weights) is intentionally not applied here
    # load_param receives the checkpoint weights (the original note says it resizes
    # the modified positional embedding)
    model.load_param(state_dict)
    return model, model_cfg

lib/IRRA/model/objectives.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
def compute_sdm(image_fetures, text_fetures, pid, logit_scale, image_id=None, factor=0.3, epsilon=1e-8):
    """
    Similarity Distribution Matching

    For both directions (image-to-text and text-to-image) the softmax over the
    scaled cosine similarities is compared, via a KL-style term, with the
    row-normalised label matrix; the two batch means are added.

    :param image_fetures: 2-D tensor, first dimension is the batch
    :param text_fetures: 2-D tensor with the same batch size
    :param pid: identity labels, reshaped to [batch_size, 1]; equal ids are positives
    :param logit_scale: multiplier applied to the cosine similarities
    :param image_id: optional image ids; when given, pairs sharing an image id get
        label 1 and pairs that only share the pid get label `factor`
    :param factor: soft-label weight for same-pid / different-image pairs
    :param epsilon: added inside the log of the target distribution to avoid log(0)
    :return: scalar tensor, i2t loss + t2i loss
    """
    batch_size = image_fetures.shape[0]
    pid = pid.reshape((batch_size, 1)) # make sure pid size is [batch_size, 1]
    pid_dist = pid - pid.t()
    labels = (pid_dist == 0).float()

    if image_id is not None:
        # Mix PID and ImageID to create soft label.
        image_id = image_id.reshape((-1, 1))
        image_id_dist = image_id - image_id.t()
        image_id_mask = (image_id_dist == 0).float()
        labels = (labels - image_id_mask) * factor + image_id_mask

    image_norm = image_fetures / image_fetures.norm(dim=1, keepdim=True)
    text_norm = text_fetures / text_fetures.norm(dim=1, keepdim=True)

    t2i_cosine_theta = text_norm @ image_norm.t()
    i2t_cosine_theta = t2i_cosine_theta.t()

    text_proj_image = logit_scale * t2i_cosine_theta
    image_proj_text = logit_scale * i2t_cosine_theta

    # normalize the true matching distribution
    # keepdim=True divides every row by its own sum, so each row sums to 1 along
    # dim=1 (the dimension the softmax runs over). Without it the row sums were
    # broadcast across columns, which is only equivalent for hard pid labels.
    labels_distribute = labels / labels.sum(dim=1, keepdim=True)

    i2t_pred = F.softmax(image_proj_text, dim=1)
    i2t_loss = i2t_pred * (F.log_softmax(image_proj_text, dim=1) - torch.log(labels_distribute + epsilon))
    t2i_pred = F.softmax(text_proj_image, dim=1)
    t2i_loss = t2i_pred * (F.log_softmax(text_proj_image, dim=1) - torch.log(labels_distribute + epsilon))

    loss = torch.mean(torch.sum(i2t_loss, dim=1)) + torch.mean(torch.sum(t2i_loss, dim=1))
    return loss
def compute_mlm(scores, labels):
    """Mean cross-entropy of `scores` against `labels`; targets equal to 0 are ignored."""
    return F.cross_entropy(scores, labels, ignore_index=0)
def compute_itc(image_features, text_features, logit_scale):
    """
    image-text contrastive (ITC) loss, InfoNCE

    Row i of each modality is treated as the positive of row i of the other;
    the image-to-text and text-to-image cross-entropies are averaged.
    """
    # unit-length features, so the matrix product below is a cosine similarity
    image_unit = image_features / image_features.norm(dim=-1, keepdim=True)
    text_unit = text_features / text_features.norm(dim=-1, keepdim=True)

    image_to_text = logit_scale * image_unit @ text_unit.t()
    text_to_image = image_to_text.t()

    # the matching pair of sample i sits on the diagonal
    num_samples = image_features.shape[0]
    targets = torch.arange(start=0, end=num_samples, dtype=torch.int64).to(image_features.device)

    image_loss = F.cross_entropy(image_to_text, targets)
    text_loss = F.cross_entropy(text_to_image, targets)
    return (image_loss + text_loss) / 2
def compute_id(image_logits, text_logits, labels):
    """
    Instance loss proposed at http://arxiv.org/abs/1711.05535

    Average of the mean cross-entropy of the image logits and of the text
    logits, both scored against the same `labels`.
    """
    image_loss = F.cross_entropy(image_logits, labels, reduction="mean")
    text_loss = F.cross_entropy(text_logits, labels, reduction="mean")
    return (image_loss + text_loss) / 2
def compute_cmpm(image_embeddings, text_embeddings, labels, epsilon=1e-8):
    """
    Cross-Modal Projection Matching Loss(CMPM)
    :param image_embeddings: Tensor with dtype torch.float32
    :param text_embeddings: Tensor with dtype torch.float32
    :param labels: Tensor with dtype torch.int32
    :param epsilon: added inside the log of the target distribution to avoid log(0)
    :return:
        cmpm_loss: scalar tensor, the image-to-text term plus the text-to-image term
    """
    num_samples = image_embeddings.shape[0]

    # pairwise "same label" indicator, [num_samples, num_samples]
    label_column = labels.reshape(num_samples, 1)
    same_label = ((label_column - label_column.t()) == 0).float()

    # normalize the true matching distribution: the mask is divided by its per-row
    # L2 norm, broadcast along the last dimension
    target = same_label / same_label.norm(dim=1)
    log_target = torch.log(target + epsilon)

    image_unit = image_embeddings / image_embeddings.norm(dim=1, keepdim=True)
    text_unit = text_embeddings / text_embeddings.norm(dim=1, keepdim=True)

    # raw embeddings of one modality projected onto unit vectors of the other
    i2t_logits = image_embeddings @ text_unit.t()
    t2i_logits = text_embeddings @ image_unit.t()

    i2t_terms = F.softmax(i2t_logits, dim=1) * (F.log_softmax(i2t_logits, dim=1) - log_target)
    t2i_terms = F.softmax(t2i_logits, dim=1) * (F.log_softmax(t2i_logits, dim=1) - log_target)

    return i2t_terms.sum(dim=1).mean() + t2i_terms.sum(dim=1).mean()

lib/IRRA/tokenizer.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import gzip
+import html
+import os
+from functools import lru_cache
+import ftfy
+import regex as re
+import torch
@lru_cache()
def default_bpe():
    """Return the relative path of the gzip-compressed BPE file used by default."""
    bpe_vocab_path = "./model/bpe_simple_vocab_16e6.txt.gz"
    return bpe_vocab_path
@lru_cache()
def bytes_to_unicode():
    """
    Build a dict mapping each of the 256 byte values to a one-character string.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with
    the same code point. Every other byte (taken in ascending order) is assigned
    the next unused code point starting at 256, so no byte maps to a
    whitespace/control character the bpe code barfs on.

    The insertion order is: the three ranges above first, then the remaining
    bytes in ascending order.
    """
    kept_as_is = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    shifted = [byte for byte in range(2**8) if byte not in kept_as_is]

    byte_values = kept_as_is + shifted
    code_points = kept_as_is + [2**8 + offset for offset in range(len(shifted))]
    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).

    Each pair is (symbol, following symbol). A word with fewer than two symbols,
    including an empty one, yields an empty set.
    """
    # zip stops at the shorter argument, so empty and one-symbol words need no
    # special casing (indexing word[0] used to raise IndexError on an empty word)
    return set(zip(word, word[1:]))
def basic_clean(text):
    """Run `ftfy.fix_text`, HTML-unescape the result twice, and strip surrounding whitespace."""
    repaired = ftfy.fix_text(text)
    # two passes, so doubly-escaped entities are fully decoded
    unescaped_once = html.unescape(repaired)
    return html.unescape(unescaped_once).strip()
def whitespace_clean(text):
    """Collapse every run of whitespace into a single space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer with `<|mask|>`, `<|startoftext|>` and `<|endoftext|>` special tokens.

    Attributes set by the constructor:
        byte_encoder / byte_decoder: byte value <-> one-character string tables
        encoder / decoder: vocabulary string <-> integer id tables
        bpe_ranks: merge rule (pair of symbols) -> priority, lower merges first
        cache: memo of already-segmented tokens; special tokens map to themselves
        pat: compiled pattern that splits text into special tokens, contractions,
            letter runs, single digits and runs of other non-space characters
    """

    def __init__(self, bpe_path: str = default_bpe()):
        """Load the merge rules from the gzip file at `bpe_path` and build the vocabulary."""
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # read inside a context manager so the gzip handle is closed afterwards
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split('\n')
        # drop the first line (presumably a header) and keep a fixed number of rules
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.pop(-1) # remove last one in vocab(jekyll) to keep vocab_size unchanged
        vocab.extend(['<|mask|>', '<|startoftext|>', '<|endoftext|>']) # vocab_size 49408
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|mask|>': '<|mask|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|mask\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Segment one token into BPE symbols and return them joined by single spaces.

        The last character carries the `</w>` end-of-word marker. Results are
        memoised in `self.cache`.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # pick the adjacent pair with the best (lowest) merge rank
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # `first` does not occur again: copy the remainder unchanged
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case `text`, then return its list of vocabulary ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # re-express the token's UTF-8 bytes with the byte_encoder alphabet
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn a sequence of vocabulary ids back into text; `</w>` markers become spaces."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text
def tokenize(caption: str, tokenizer, text_length=77, truncate=True) -> torch.LongTensor:
    """Encode `caption` into a fixed-length, zero-padded tensor of token ids.

    The ids are wrapped in the tokenizer's `<|startoftext|>` / `<|endoftext|>`
    ids. A sequence longer than `text_length` is cut to that length with the
    last id replaced by the end id when `truncate` is true; otherwise a
    RuntimeError is raised.
    """
    start_id = tokenizer.encoder["<|startoftext|>"]
    end_id = tokenizer.encoder["<|endoftext|>"]
    ids = [start_id, *tokenizer.encode(caption), end_id]

    padded = torch.zeros(text_length, dtype=torch.long)
    too_long = len(ids) > text_length
    if too_long and not truncate:
        raise RuntimeError(
            f"Input {caption} is too long for context length {text_length}"
        )
    if too_long:
        ids = ids[:text_length]
        ids[-1] = end_id  # keep the sequence terminated after cutting

    padded[:len(ids)] = torch.tensor(ids)
    return padded # type: ignore

lib/__init__.py ADDED Viewed

File without changes

lib/components/__init__.py ADDED Viewed

File without changes

lib/utils/model.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import streamlit as st
+import yaml
+import torch
+from lib.IRRA.tokenizer import tokenize, SimpleTokenizer
+from lib.IRRA.image import prepare_images
+from lib.IRRA.model.build import build_model, IRRA
+from easydict import EasyDict
@st.cache_resource
def get_model():
    """Build the model described by 'model/configs.yaml' with `training` set to False.

    Decorated with `st.cache_resource`, so Streamlit reuses the returned object
    instead of rebuilding it on every call.
    """
    # read inside a context manager so the config file handle is closed
    with open('model/configs.yaml') as config_file:
        args = yaml.load(config_file, Loader=yaml.FullLoader)
    args = EasyDict(args)
    args['training'] = False
    model = build_model(args)
    return model
def get_similarities(text: str, images: list[str], model: IRRA) -> torch.Tensor:
    """Score one caption against a batch of images.

    :param text: caption to tokenize and encode with `model.encode_text`
    :param images: inputs accepted by `prepare_images`
    :param model: model exposing `encode_image` and `encode_text`
    :return: `text_feats @ image_feats.t()`; with a single caption this is
        presumably of shape (1, number of images) — confirm against the encoders
    """
    tokenizer = SimpleTokenizer()
    txt = tokenize(text, tokenizer)
    imgs = prepare_images(images)
    # inference only: no autograd graph is needed for ranking
    with torch.no_grad():
        image_feats = model.encode_image(imgs)
        text_feats = model.encode_text(txt.unsqueeze(0))
    # NOTE(review): the features are used as returned by the encoders, so this is a
    # plain dot product; it equals cosine similarity only if they are already
    # normalised — confirm.
    return text_feats @ image_feats.t()