Add model from Chameleon
- app.py +100 -37
- chameleon/LICENSE +51 -0
- chameleon/image_tokenizer.py +124 -0
- chameleon/vqgan.py +675 -0
app.py
CHANGED
@@ -1,18 +1,27 @@
-from typing import
+from typing import Literal
 import gradio as gr
 import torch
 import numpy as np
 import colorsys
+import yaml
 
+from huggingface_hub import hf_hub_download
 from diffusers import VQModel
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.pipelines.wuerstchen.modeling_paella_vq_model import PaellaVQModel
-
+
+from chameleon.image_tokenizer import ImageTokenizer
+
 import torch.backends
 import torch.mps
 from PIL import Image
+
 import spaces
 
+
+Model = Literal["vqgan", "paella", "chameleon"]
+models = ["vqgan", "paella", "chameleon"]
+
 if torch.cuda.is_available():
     device = torch.device("cuda")
 elif torch.backends.mps.is_available():
@@ -21,9 +30,7 @@ else:
     device = torch.device("cpu")
 
 
-# abstract class VQImageRoundtripPipeline:
 class ImageRoundtripPipeline:
-    @abstractmethod
     def roundtrip_image(self, image, output_type="pil"): ...
 
 
@@ -63,6 +70,12 @@ class VQImageRoundtripPipeline(ImageRoundtripPipeline):
         latents = self.vqvae.quantize(latents)[2][2].reshape(
             batch_size, latents_height, latents_width
         )
+        # replace 20% of latents with random values
+        # random_latents = torch.randint(
+        #     0, self.vqvae.config.num_vq_embeddings, latents.shape, device=device
+        # )
+        # random_mask = torch.rand(latents.shape, device=device) < 0.2
+        # latents = torch.where(random_mask, random_latents, latents)
         output = self.vqvae.decode(
             latents,
             force_not_quantize=True,
@@ -81,6 +94,55 @@ class VQImageRoundtripPipeline(ImageRoundtripPipeline):
         return output[0], latents.cpu().numpy(), self.vqvae.config.num_vq_embeddings
 
 
+class ChameleonVQImageRoundtripPipeline(ImageRoundtripPipeline):
+    tokenizer: ImageTokenizer
+    n_embed: int
+    vae_scale_factor: int
+
+    def __init__(self):
+        vqgan_path = hf_hub_download(
+            "darknoon/chameleon-tokenizer", "tokenizer/vqgan.ckpt"
+        )
+        vqgan_config_path = hf_hub_download(
+            "darknoon/chameleon-tokenizer", "tokenizer/vqgan.yaml"
+        )
+        self.tokenizer = ImageTokenizer(
+            cfg_path=vqgan_config_path, ckpt_path=vqgan_path, device=device
+        )
+        with open(vqgan_config_path) as f:
+            vq_config = yaml.safe_load(f)
+
+        self.n_embed = vq_config["model"]["params"]["n_embed"]
+        self.vae_scale_factor = 16
+        print("Chameleon VQGan model loaded", self.tokenizer._vq_model, self.n_embed)
+
+    def preprocess(self, image: Image):
+        # copied from _vqgan_input_from
+        np_img = np.array(image) / 255.0  # Normalize to [0, 1]
+        np_img = np_img * 2 - 1  # Scale to [-1, 1]
+        tensor_img = (
+            torch.from_numpy(np_img).permute(2, 0, 1).float()
+        )  # (Channels, Height, Width) format.
+
+        # Add batch dimension.
+        return tensor_img.unsqueeze(0)
+
+    def roundtrip_image(self, image, output_type="pil"):
+        # image = self.tokenizer._vqgan_input_from(image).to(device)
+        image = self.preprocess(image).to(device)
+        _, _, [_, _, latents] = self.tokenizer._vq_model.encode(image)
+        # emb_dim = self._vq_model.quantize.embedding.weight.shape[-1]
+        output = self.tokenizer.pil_from_img_toks(latents)
+        # we actually do want this to be a grid, sorry!
+        latents = latents.reshape(1, 32, 32)
+
+        return (
+            output,
+            latents.cpu().numpy(),
+            self.n_embed,
+        )
+
+
 class PaellaImageRoundtripPipeline(ImageRoundtripPipeline):
     vqgan: PaellaVQModel
     vae_scale_factor: int
@@ -127,6 +189,7 @@ class PaellaImageRoundtripPipeline(ImageRoundtripPipeline):
 
 pipeline_paella = PaellaImageRoundtripPipeline()
 pipeline_vq = VQImageRoundtripPipeline()
+pipeline_vq_chameleon = ChameleonVQImageRoundtripPipeline()
 
 
 # Function to generate a list of unique colors
@@ -171,26 +234,27 @@ def vqgan_tokens_to_image(tokens, codebook_size, downscale_factor):
     return img
 
 
-# This is a gradio space that lets you encode an image with various encoder-decoder pairs, eg VQ-GAN, SDXL's VAE, etc and check the image quality
-
-
-# def image_grid_to_string(image_grid):
-#     """Convert a latent vq index "image" grid to a string, input shape is (1, height, width)"""
-#     return "\n".join(
-#         [" ".join([str(int(x)) for x in row]) for row in image_grid.squeeze()]
-#     )
-
-
 def describe_shape(shape):
     return f"Shape: {shape} num elements: {np.prod(shape)}"
 
 
+def calc_psnr(img1: Image, img2: Image):
+    if img1.size != img2.size:
+        raise ValueError("Images must have the same dimensions")
+    img1 = np.array(img1)
+    img2 = np.array(img2)
+    mse = np.mean((img1 - img2) ** 2)
+    if mse == 0:
+        return float("inf")
+    return 2 * 10 * np.log10(255.0 / np.sqrt(mse))
+
+
 @spaces.GPU(duration=32)
 @torch.no_grad()
 def roundtrip_image(
     image,
-    model:
-    size:
+    model: Model,
+    size: Literal["256x256", "512x512", "1024x1024"],
     output_type="pil",
 ):
     if size == "256x256":
@@ -202,41 +266,40 @@ def roundtrip_image(
     else:
         raise ValueError(f"Unknown size {size}")
 
+    image_orig = image
     if model == "vqgan":
-
-        return (
-            image,
-            vqgan_tokens_to_image(
-                latents, codebook_size, downscale_factor=pipeline_vq.vae_scale_factor
-            ),
-            describe_shape(latents.shape),
-        )
+        pipeline = pipeline_vq
     elif model == "paella":
-
-
-
-        return (
-            image,
-            vqgan_tokens_to_image(
-                latents, codebook_size, downscale_factor=pipeline_vq.vae_scale_factor
-            ),
-            describe_shape(latents.shape),
-        )
+        pipeline = pipeline_paella
+    elif model == "chameleon":
+        pipeline = pipeline_vq_chameleon
     else:
         raise ValueError(f"Unknown model {model}")
 
+    image, latents, codebook_size = pipeline.roundtrip_image(image, output_type)
+
+    return (
+        image,
+        vqgan_tokens_to_image(
+            latents, codebook_size, downscale_factor=pipeline.vae_scale_factor
+        ),
+        describe_shape(latents.shape),
+        f"{calc_psnr(image_orig, image):.2f}",
+    )
+
 
 demo = gr.Interface(
     fn=roundtrip_image,
     inputs=[
         gr.Image(type="pil"),
-        gr.Dropdown(
+        gr.Dropdown(models, label="Model", value="vqgan"),
         gr.Dropdown(["256x256", "512x512", "1024x1024"], label="Size", value="512x512"),
     ],
    outputs=[
-        gr.Image(label="Reconstructed"),
-        gr.Image(label="Tokens"),
+        gr.Image(label="Reconstructed", format="png"),
+        gr.Image(label="Tokens", format="png"),
         gr.Text(label="VQ Shape"),
+        gr.Text(label="PSNR"),
     ],
     title="Image Tokenizer Playground",
     description="Round-trip an image through an encode-decoder pair to see the quality loss from the VQ-GAN for image generation, etc.",
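
Note (not part of the diff): the calc_psnr helper added above is the standard peak signal-to-noise ratio, PSNR = 20 * log10(MAX / sqrt(MSE)) with MAX = 255 for 8-bit images; the committed 2 * 10 * np.log10(...) expression is that same formula. A minimal stand-alone sketch for reference, with an explicit float cast added as an extra precaution (differencing uint8 NumPy arrays directly wraps around modulo 256):

import numpy as np
from PIL import Image

def psnr(img1: Image.Image, img2: Image.Image) -> float:
    # Cast to float first so uint8 subtraction cannot wrap around.
    a = np.asarray(img1, dtype=np.float64)
    b = np.asarray(img2, dtype=np.float64)
    mse = np.mean((a - b) ** 2)
    if mse == 0:
        return float("inf")  # identical images
    return 20 * np.log10(255.0 / np.sqrt(mse))
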
chameleon/LICENSE
ADDED
@@ -0,0 +1,51 @@
Chameleon Research License
Chameleon Version Release Date: June 18, 2024

This Chameleon Research License ("Agreement") contains the terms and conditions that govern your access and use of the Chameleon Materials (as defined below). You may not use the Chameleon Materials if you do not accept this Agreement. By clicking "I Accept" to accept, or accessing, using, or distributing any portion or element of the Chameleon Materials you hereby agree to be bound by the terms of this Agreement. If you are agreeing to be bound by the Agreement on behalf of your employer or other entity, you represent and warrant to Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland) ("Meta") that you have full legal authority to bind your employer or such entity to this Agreement. If you do not have requisite authority, you may not accept the Agreement or access the Chameleon Materials on behalf of your employer or other entity.

This Agreement is effective upon the earlier of the date that you first access the Chameleon Materials or accept this Agreement ("Effective Date"), and is entered into by and between Meta, and you, or if you are entering into this Agreement on behalf of your employer or other entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules, or regulations to provide legal consent and, your employer or other entity and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf ("Licensee" or "You").

1. Definitions.
    1. "Documentation" means the specifications, manuals and documentation accompanying Chameleon distributed by Meta at https://github.com/facebookresearch/chameleon and https://ai.meta.com/resources/models-and-libraries/chameleon-downloads/.

    2. "Noncommercial Research Uses" means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.

    3. "Chameleon" means the models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta at [INSERT RESOURCE HYPERLINK].

    4. "Chameleon Materials" means, collectively, Meta's proprietary Chameleon and Documentation (and any portion thereof) made available under this Agreement.

    5. "Trade Control Laws" means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.

    6. "Acceptable Use Policy" means the Acceptable Use Policy applicable to Chameleon Materials ([INSERT Chameleon AUP HYPERLINK]) that is incorporated into this Agreement.

2. License Rights and Redistribution. Subject to Your compliance with the terms and conditions of this Agreement, Meta hereby grants you the following:
    1. Grant of Rights. You are hereby granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta's intellectual property or other rights owned by Meta embodied in the Chameleon Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Chameleon Materials solely for Noncommercial Research Uses.
    2. Redistribution and Use.
        1. Distribution of Chameleon Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Chameleon Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
        2. If you submit for publication the results of research you perform on, using, or otherwise in connection with Chameleon Materials, you must acknowledge the use of Chameleon Materials in your publication as follows (or an equivalent acknowledgement of your choosing): "This material is based on work supported by the Chameleon Research License, Copyright (c) Meta Platforms, Inc. All Rights Reserved."
        3. You must retain in all copies of the Chameleon Materials that you distribute and include the following attribution notice within a "Notice" text file distributed as a part of such copies: "Chameleon is licensed under the Chameleon Research License, Copyright (c) Meta Platforms, Inc. All Rights Reserved."
        4. Your use of the Chameleon Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the Acceptable Use Policy for the Chameleon Materials (https://ai.meta.com/resources/models-and-libraries/chameleon-use-policy/) which is hereby incorporated by reference into this Agreement.
3. Restrictions. You will not, and will not permit, assist or cause any third party to:
    1. use the Chameleon Materials or any outputs or results of the Chameleon Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;
    2. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Meta in connection with the Chameleon Materials, or to circumvent or remove any usage restrictions or other safety measures, or to enable functionality disabled by Meta;
    3. disguise your or their location through IP proxying or other methods;
    4. use or download Chameleon Materials if you or they are: (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) will use Chameleon Materials for any purpose prohibited by Trade Control Laws; or
    5. directly or indirectly export, re-export, provide, or otherwise transfer Chameleon Materials: (a) to any individual, entity, or country prohibited by Trade Control Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Trade Control Laws, including nuclear, chemical or biological weapons, or missile technology applications.
4. User Support. Your Noncommercial Research Use of the Chameleon Materials is done at your own discretion; Meta does not provide any service in relation to such use. Meta is under no obligation to provide any support services for the Chameleon Materials. Any support provided is "as is", "with all faults", and without warranty of any kind.
5. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE Chameleon MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE Chameleon MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE Chameleon MATERIALS AND ANY OUTPUT AND RESULTS.
6. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
7. Intellectual Property.
    1. No trademark licenses are granted under this Agreement, and in connection with the Chameleon Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Chameleon Materials.
    2. Subject to Meta's ownership of Chameleon Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Chameleon Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
    3. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Chameleon Materials or Chameleon outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses and rights granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Chameleon Materials.
8. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Chameleon Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Chameleon Materials. Sections 3, 4, 5, 6(c), 7, 8 and 9 shall survive the termination of this Agreement.
9. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
10. Modifications and Amendments. Meta may modify this Agreement from time to time by posting a revised version at https://ai.meta.com/resources/models-and-libraries/chameleon-license/
11. ; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Chameleon Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no other modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
chameleon/image_tokenizer.py
ADDED
@@ -0,0 +1,124 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
#
# This source code is licensed under the Chameleon License found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import PIL
import torch
import yaml
from PIL import Image

from .vqgan import VQModel


class ImageTokenizer:
    def __init__(
        self,
        cfg_path: str,
        ckpt_path: str,
        device: str | torch.device | None = None,
    ):
        with open(cfg_path) as f:
            config = yaml.safe_load(f)

        params = config["model"]["params"]
        if "lossconfig" in params:
            del params["lossconfig"]
        params["ckpt_path"] = ckpt_path

        self._vq_model = VQModel(**params)
        self._vq_model.eval()

        if device is None:
            devices = {p.device for p in self._vq_model.parameters()}
            assert len(devices) == 1
            device = devices.pop()
        else:
            self._vq_model.to(device)
        self._device = device

        dtypes = {p.dtype for p in self._vq_model.parameters()}
        assert len(dtypes) == 1
        self._dtype = dtypes.pop()

    def _whiten_transparency(self, img: PIL.Image) -> PIL.Image:
        # Check if it's already in RGB format.
        if img.mode == "RGB":
            return img

        vals_rgba = np.array(img.convert("RGBA"))

        # If there is no transparency layer, simple convert and return.
        if not (vals_rgba[:, :, 3] < 255).any():
            return img.convert("RGB")

        # There is a transparency layer, blend it with a white background.

        # Calculate the alpha proportion for blending.
        alpha = vals_rgba[:, :, 3] / 255.0
        # Blend with white background.
        vals_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[
            :, :, np.newaxis
        ] * vals_rgba[:, :, :3]
        return PIL.Image.fromarray(vals_rgb.astype("uint8"), "RGB")

    def _vqgan_input_from(self, img: PIL.Image, target_image_size=512) -> torch.Tensor:
        # Resize with aspect ratio preservation.
        s = min(img.size)
        scale = target_image_size / s
        new_size = (round(scale * img.size[0]), round(scale * img.size[1]))
        img = img.resize(new_size, PIL.Image.LANCZOS)

        # Center crop.
        x0 = (img.width - target_image_size) // 2
        y0 = (img.height - target_image_size) // 2
        img = img.crop((x0, y0, x0 + target_image_size, y0 + target_image_size))

        # Convert to tensor.
        np_img = np.array(img) / 255.0  # Normalize to [0, 1]
        np_img = np_img * 2 - 1  # Scale to [-1, 1]
        tensor_img = (
            torch.from_numpy(np_img).permute(2, 0, 1).float()
        )  # (Channels, Height, Width) format.

        # Add batch dimension.
        return tensor_img.unsqueeze(0)

    def img_tokens_from_pil(self, image: PIL.Image) -> list[int]:
        image = self._whiten_transparency(image)
        vqgan_input = self._vqgan_input_from(image).to(self._device).to(self._dtype)
        _, _, [_, _, img_toks] = self._vq_model.encode(vqgan_input)
        return img_toks

    def _pil_from_chw_tensor(self, chw_tensor: torch.Tensor) -> PIL.Image:
        # Ensure detachment and move tensor to CPU.
        detached_chw_tensor = chw_tensor.detach().cpu()

        # Normalize tensor to [0, 1] range from [-1, 1] range.
        normalized_chw_tensor = (
            torch.clamp(detached_chw_tensor, -1.0, 1.0) + 1.0
        ) / 2.0

        # Permute CHW tensor to HWC format and convert to NumPy array.
        hwc_array = normalized_chw_tensor.permute(1, 2, 0).numpy()

        # Convert to an 8-bit unsigned integer format.
        image_array_uint8 = (hwc_array * 255).astype(np.uint8)

        # Convert NumPy array to PIL Image.
        pil_image = Image.fromarray(image_array_uint8)

        # Convert image to RGB if it is not already.
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")

        return pil_image

    def pil_from_img_toks(self, img_tensor: torch.Tensor) -> PIL.Image:
        emb_dim = self._vq_model.quantize.embedding.weight.shape[-1]
        codebook_entry = self._vq_model.quantize.get_codebook_entry(
            img_tensor, (1, 32, 32, emb_dim)
        )
        pixels = self._vq_model.decode(codebook_entry)
        return self._pil_from_chw_tensor(pixels[0])
chameleon/vqgan.py
ADDED
@@ -0,0 +1,675 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

# This source code is licensed under the Chameleon License found in the
# LICENSE file in the root directory of this source tree.

"""
Contents of this file are taken from https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/models/vqgan.py
[with minimal dependencies]

This implementation is inference-only -- training steps and optimizer components
introduce significant additional dependencies
"""

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class VectorQuantizer2(nn.Module):
    """
    Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly
    avoids costly matrix multiplications and allows for post-hoc remapping of indices.
    """

    # NOTE: due to a bug the beta term was applied to the wrong term. for
    # backwards compatibility we use the buggy version by default, but you can
    # specify legacy=False to fix it.
    def __init__(
        self,
        n_e,
        e_dim,
        beta,
        remap=None,
        unknown_index="random",
        sane_index_shape=False,
        legacy=True,
    ):
        super().__init__()
        self.n_e = n_e
        self.e_dim = e_dim
        self.beta = beta
        self.legacy = legacy

        self.embedding = nn.Embedding(self.n_e, self.e_dim)
        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)

        self.remap = remap
        if self.remap is not None:
            self.register_buffer("used", torch.tensor(np.load(self.remap)))
            self.re_embed = self.used.shape[0]
            self.unknown_index = unknown_index  # "random" or "extra" or integer
            if self.unknown_index == "extra":
                self.unknown_index = self.re_embed
                self.re_embed = self.re_embed + 1
            print(
                f"Remapping {self.n_e} indices to {self.re_embed} indices. "
                f"Using {self.unknown_index} for unknown indices."
            )
        else:
            self.re_embed = n_e

        self.sane_index_shape = sane_index_shape

    def remap_to_used(self, inds):
        ishape = inds.shape
        assert len(ishape) > 1
        inds = inds.reshape(ishape[0], -1)
        used = self.used.to(inds)
        match = (inds[:, :, None] == used[None, None, ...]).long()
        new = match.argmax(-1)
        unknown = match.sum(2) < 1
        if self.unknown_index == "random":
            new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(
                device=new.device
            )
        else:
            new[unknown] = self.unknown_index
        return new.reshape(ishape)

    def unmap_to_all(self, inds):
        ishape = inds.shape
        assert len(ishape) > 1
        inds = inds.reshape(ishape[0], -1)
        used = self.used.to(inds)
        if self.re_embed > self.used.shape[0]:  # extra token
            inds[inds >= self.used.shape[0]] = 0  # simply set to zero
        back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
        return back.reshape(ishape)

    def forward(self, z, temp=None, rescale_logits=False, return_logits=False):
        assert temp is None or temp == 1.0, "Only for interface compatible with Gumbel"
        assert rescale_logits is False, "Only for interface compatible with Gumbel"
        assert return_logits is False, "Only for interface compatible with Gumbel"
        # reshape z -> (batch, height, width, channel) and flatten
        z = z.permute(0, 2, 3, 1).contiguous()
        z_flattened = z.view(-1, self.e_dim)
        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z

        d = (
            torch.sum(z_flattened**2, dim=1, keepdim=True)
            + torch.sum(self.embedding.weight**2, dim=1)
            - 2
            * torch.einsum(
                "bd,dn->bn", z_flattened, self.embedding.weight.transpose(0, 1)
            )
        )

        min_encoding_indices = torch.argmin(d, dim=1)
        z_q = self.embedding(min_encoding_indices).view(z.shape)
        perplexity = None
        min_encodings = None

        # compute loss for embedding
        if not self.legacy:
            loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean(
                (z_q - z.detach()) ** 2
            )
        else:
            loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean(
                (z_q - z.detach()) ** 2
            )

        # preserve gradients
        z_q = z + (z_q - z).detach()

        # reshape back to match original input shape
        z_q = z_q.permute(0, 3, 1, 2).contiguous()

        if self.remap is not None:
            min_encoding_indices = min_encoding_indices.reshape(
                z.shape[0], -1
            )  # add batch axis
            min_encoding_indices = self.remap_to_used(min_encoding_indices)
            min_encoding_indices = min_encoding_indices.reshape(-1, 1)  # flatten

        if self.sane_index_shape:
            min_encoding_indices = min_encoding_indices.reshape(
                z_q.shape[0], z_q.shape[2], z_q.shape[3]
            )

        return z_q, loss, (perplexity, min_encodings, min_encoding_indices)

    def get_codebook_entry(self, indices, shape):
        # shape specifying (batch, height, width, channel)
        if self.remap is not None:
            indices = indices.reshape(shape[0], -1)  # add batch axis
            indices = self.unmap_to_all(indices)
            indices = indices.reshape(-1)  # flatten again

        # get quantized latent vectors
        z_q = self.embedding(indices)

        if shape is not None:
            z_q = z_q.view(shape)
            # reshape back to match original input shape
            z_q = z_q.permute(0, 3, 1, 2).contiguous()

        return z_q


# Alias
VectorQuantizer = VectorQuantizer2


def nonlinearity(x):
    # swish
    return x * torch.sigmoid(x)


def Normalize(in_channels, num_groups=32):
    return torch.nn.GroupNorm(
        num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
    )


class Upsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=1, padding=1
            )

    def forward(self, x):
        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
        if self.with_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=2, padding=0
            )

    def forward(self, x):
        if self.with_conv:
            pad = (0, 1, 0, 1)
            x = F.pad(x, pad, mode="constant", value=0)
            x = self.conv(x)
        else:
            x = F.avg_pool2d(x, kernel_size=2, stride=2)
        return x


class ResnetBlock(nn.Module):
    def __init__(
        self,
        *,
        in_channels,
        out_channels=None,
        conv_shortcut=False,
        dropout,
        temb_channels=512,
    ):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(
                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
                )
            else:
                self.nin_shortcut = torch.nn.Conv2d(
                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
                )

    def forward(self, x, temb):
        h = x
        h = self.norm1(h)
        h = nonlinearity(h)
        h = self.conv1(h)

        if temb is not None:
            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]

        h = self.norm2(h)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)

        return x + h


class AttnBlock(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        self.q = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.k = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.v = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.proj_out = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

    def forward(self, x):
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = q.reshape(b, c, h * w)
        q = q.permute(0, 2, 1)  # b,hw,c
        k = k.reshape(b, c, h * w)  # b,c,hw
        w_ = torch.bmm(q, k)  # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
        w_ = w_ * (int(c) ** (-0.5))
        w_ = F.softmax(w_, dim=2)

        # attend to values
        v = v.reshape(b, c, h * w)
        w_ = w_.permute(0, 2, 1)  # b,hw,hw (first hw of k, second of q)
        h_ = torch.bmm(v, w_)  # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
        h_ = h_.reshape(b, c, h, w)

        h_ = self.proj_out(h_)

        return x + h_


def make_attn(in_channels, attn_type="vanilla"):
    assert attn_type in ["vanilla", "linear", "none"], f"attn_type {attn_type} unknown"
    # print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
    if attn_type == "vanilla":
        return AttnBlock(in_channels)
    elif attn_type == "none":
        return nn.Identity(in_channels)
    else:
        raise ValueError("Unexpected attention type")


class Encoder(nn.Module):
    def __init__(
        self,
        *,
        ch,
        out_ch,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks,
        attn_resolutions,
        dropout=0.0,
        resamp_with_conv=True,
        in_channels,
        resolution,
        z_channels,
        double_z=True,
        use_linear_attn=False,
        attn_type="vanilla",
        **ignore_kwargs,
    ):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1
        )

        curr_res = resolution
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(
                    ResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                    )
                )
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in,
            2 * z_channels if double_z else z_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )

    def forward(self, x):
        # timestep embedding
        temb = None

        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1], temb)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    def __init__(
        self,
        *,
        ch,
        out_ch,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks,
        attn_resolutions,
        dropout=0.0,
        resamp_with_conv=True,
        in_channels,
        resolution,
        z_channels,
        give_pre_end=False,
        tanh_out=False,
        use_linear_attn=False,
        attn_type="vanilla",
        **ignorekwargs,
    ):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out

        # compute in_ch_mult, block_in and curr_res at lowest res
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(
                    ResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                    )
                )
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, out_ch, kernel_size=3, stride=1, padding=1
        )

    def forward(self, z):
        # assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # timestep embedding
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h, temb)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        if self.tanh_out:
            h = torch.tanh(h)
        return h


class VQModel(nn.Module):
    def __init__(
        self,
        ddconfig,
        n_embed,
        embed_dim,
        ckpt_path=None,
        ignore_keys=[],
        image_key="image",
        colorize_nlabels=None,
        monitor=None,
        scheduler_config=None,
        lr_g_factor=1.0,
        remap=None,
        sane_index_shape=False,  # tell vector quantizer to return indices as bhw
    ):
        super().__init__()
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.quantize = VectorQuantizer(
            n_embed,
            embed_dim,
            beta=0.25,
            remap=remap,
            sane_index_shape=sane_index_shape,
        )
        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
        self.image_key = image_key
        if colorize_nlabels is not None:
            assert isinstance(colorize_nlabels, int)
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor
        self.scheduler_config = scheduler_config
        self.lr_g_factor = lr_g_factor

    def init_from_ckpt(self, path, ignore_keys=list()):
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        self.load_state_dict(sd, strict=False)
        print(f"VQModel loaded from {path}")

    def encode(self, x):
        h = self.encoder(x)
        h = self.quant_conv(h)
        quant, emb_loss, info = self.quantize(h)
        return quant, emb_loss, info

    def decode(self, quant):
        quant = self.post_quant_conv(quant)
        dec = self.decoder(quant)
        return dec

    def decode_code(self, code_b):
        quant_b = self.quantize.embed_code(code_b)
        dec = self.decode(quant_b)
        return dec

    def forward(self, input):
        quant, diff, _ = self.encode(input)
        dec = self.decode(quant)
        return dec, diff

    def get_input(self, batch, k):
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
        return x.float()

    def get_last_layer(self):
        return self.decoder.conv_out.weight

    def log_images(self, batch, **kwargs):
        log = dict()
        x = self.get_input(batch, self.image_key)
        x = x.to(self.device)
        xrec, _ = self(x)
        if x.shape[1] > 3:
            # colorize with random projection
            assert xrec.shape[1] > 3
            x = self.to_rgb(x)
            xrec = self.to_rgb(xrec)
        log["inputs"] = x
        log["reconstructions"] = xrec
        return log

    def to_rgb(self, x):
        assert self.image_key == "segmentation"
        if not hasattr(self, "colorize"):
            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
        x = F.conv2d(x, weight=self.colorize)
        x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
        return x