patrickvonplaten committed on Oct 20, 2023

Commit

03c0c42

•

0 Parent(s):

Duplicate from hf-internal-testing/tiny-sdxl-custom-components

Browse files

Files changed (44) hide show

.gitattributes +35 -0
README.md +13 -0
model_index.json +33 -0
my_pipeline.py +974 -0
scheduler/my_scheduler.py +514 -0
scheduler/scheduler_config.json +14 -0
text_encoder/config.json +23 -0
text_encoder/flax_model.msgpack +3 -0
text_encoder/model.onnx +3 -0
text_encoder/openvino_model.bin +3 -0
text_encoder/openvino_model.xml +0 -0
text_encoder/pytorch_model.bin +3 -0
text_encoder_2/config.json +23 -0
text_encoder_2/flax_model.msgpack +3 -0
text_encoder_2/model.onnx +3 -0
text_encoder_2/openvino_model.bin +3 -0
text_encoder_2/openvino_model.xml +0 -0
text_encoder_2/pytorch_model.bin +3 -0
tokenizer/merges.txt +647 -0
tokenizer/special_tokens_map.json +24 -0
tokenizer/tokenizer_config.json +33 -0
tokenizer/vocab.json +1002 -0
tokenizer_2/merges.txt +647 -0
tokenizer_2/special_tokens_map.json +24 -0
tokenizer_2/tokenizer_config.json +33 -0
tokenizer_2/vocab.json +1002 -0
unet/config.json +64 -0
unet/diffusion_flax_model.msgpack +3 -0
unet/diffusion_pytorch_model.bin +3 -0
unet/model.onnx +3 -0
unet/my_unet_model.py +1129 -0
unet/openvino_model.bin +3 -0
unet/openvino_model.xml +0 -0
vae/config.json +24 -0
vae/diffusion_flax_model.msgpack +3 -0
vae/diffusion_pytorch_model.bin +3 -0
vae_decoder/config.json +24 -0
vae_decoder/model.onnx +3 -0
vae_decoder/openvino_model.bin +3 -0
vae_decoder/openvino_model.xml +0 -0
vae_encoder/config.json +24 -0
vae_encoder/model.onnx +3 -0
vae_encoder/openvino_model.bin +3 -0
vae_encoder/openvino_model.xml +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+library_name: diffusers
+tags:
+- text-to-image
+---
+```python
+from diffusers import DiffusionPipeline
+pipe = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
+```
+The pipeline was created using this [Colab Notebook](https://colab.research.google.com/gist/sayakpaul/a7b986af7e9ea26562eed4ec1410d766/scratchpad.ipynb).

model_index.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_class_name": "StableDiffusionXLPipeline",
+  "_diffusers_version": "0.18.1",
+  "force_zeros_for_empty_prompt": true,
+  "scheduler": [
+    "my_scheduler",
+    "MyScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "text_encoder_2": [
+    "transformers",
+    "CLIPTextModelWithProjection"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "tokenizer_2": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "my_unet_model",
+    "MyUNetModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

my_pipeline.py ADDED Viewed

	@@ -0,0 +1,974 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    LoRAAttnProcessor2_0,
+    LoRAXFormersAttnProcessor,
+    XFormersAttnProcessor,
+)
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers import DiffusionPipeline
+# torch_xla is an optional dependency: import it only when it is reported as available and
+# record the outcome in XLA_AVAILABLE so later code can branch on the flag.
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Usage example text kept as a module-level constant.
+# NOTE(review): `MyPipeline` is defined in this file; the `from diffusers import MyPipeline` line in the
+# example below presumably does not resolve as written -- confirm before relying on it.
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import MyPipeline
+        >>> pipe = MyPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> image = pipe(prompt).images[0]
+        ```
+"""
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+class MyPipeline(
+    DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    In addition the pipeline inherits the following loading methods:
+        - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`]
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        """
+        Register the sub-models and derive the helper attributes used by the pipeline.
+        See the class docstring for a description of each argument.
+        """
+        super().__init__()
+        # Modules must be registered before any `self.<module>` access below.
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        # Power of two derived from the number of VAE block output channels; `prepare_latents` divides the
+        # requested pixel height/width by this factor to get the latent size.
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = self.unet.config.sample_size
+        # Resolve the flag: an explicit value wins, otherwise fall back to package availability.
+        # NOTE(review): the resolved value is not used afterwards in this method; `self.watermark` is set to
+        # None unconditionally, so no watermarker is ever attached here.
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+        self.watermark = None
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        Returns:
+            A 4-tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
+            negative_pooled_prompt_embeds)`. The two negative entries are returned as passed in (possibly `None`)
+            when `do_classifier_free_guidance` is `False`.
+        """
+        device = device or self._execution_device
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+        # A single string prompt is wrapped in a list so the batch size can be taken from its length.
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            # No text prompt: the batch size comes from the pre-computed embeddings.
+            batch_size = prompt_embeds.shape[0]
+        # Define tokenizers and text encoders
+        # When the first tokenizer / text encoder is None, only the second pair is used.
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+            # textual inversion: process multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            # NOTE: the loop variable rebinds `prompt`; after the loop it holds the last prompt processed here.
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                text_input_ids = text_inputs.input_ids
+                # Tokenize again without truncation, only to detect and report text that was cut off.
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                # (this assignment is overwritten on every iteration, so the last encoder's value is kept)
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+                prompt_embeds_list.append(prompt_embeds)
+            # Join the per-encoder hidden states along the last (feature) dimension.
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            # No negative prompt and zeros are forced: use all-zero tensors shaped like the positive embeddings.
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+            negative_prompt_embeds_list = []
+            # NOTE: as above, the loop variable rebinds `negative_prompt`.
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+                # Pad/truncate the negative prompt to the sequence length of the positive embeddings.
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                # `clip_skip` is not consulted here: the penultimate hidden state is always used.
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+        # Cast to the dtype of the second text encoder when it exists, otherwise to the UNet dtype.
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        # Pooled embeddings are repeated the same way so each generated image has its own row.
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder)
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2)
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    def _get_add_time_ids(
+        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+    ):
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+        The suffixes after the scaling factors represent the stages where they are being applied.
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled, by delegating to `self.unet.disable_freeu()`.
+        Unlike `enable_freeu`, no check for the presence of `unet` is made before the call.
+        """
+        self.unet.disable_freeu()
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.MyPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion_xl.MyPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.MyPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 3. Encode input prompt
+        lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=clip_skip,
+        )
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        # 8.1 Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        return (image,)

scheduler/my_scheduler.py ADDED Viewed

	@@ -0,0 +1,514 @@

+# Copyright 2023 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+@dataclass
+class MySchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop. This field is required.
+        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance. Optional; defaults to `None`.
+    """
+    # sample for the previous timestep; the only mandatory field
+    prev_sample: torch.FloatTensor
+    # predicted fully denoised sample; left as `None` when it is not provided
+    pred_original_sample: Optional[torch.FloatTensor] = None
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+    elif alpha_transform_type == "exp":
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+    else:
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+class MyScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `MyScheduler` explores the connections between denoising score matching and Langevin dynamics sampling.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        variance_type (`str`, defaults to `"fixed_small"`):
+            Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
+            `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps. You can use a combination of `offset=1` and
+            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
+            Diffusion.
+    """
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        variance_type: str = "fixed_small",
+        clip_sample: bool = True,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        steps_offset: int = 0,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            )
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        elif beta_schedule == "sigmoid":
+            # GeoDiff sigmoid schedule
+            betas = torch.linspace(-6, 6, num_train_timesteps)
+            self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+        else:
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.one = torch.tensor(1.0)
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+        # setable values
+        self.custom_timesteps = False
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+        self.variance_type = variance_type
+    def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+        Args:
+            sample (`torch.FloatTensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+        Returns:
+            `torch.FloatTensor`:
+                A scaled input sample.
+        """
+        return sample
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[int]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model. If used,
+                `timesteps` must be `None`.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed,
+                `num_inference_steps` must be `None`.
+        """
+        if num_inference_steps is not None and timesteps is not None:
+            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
+        if timesteps is not None:
+            for i in range(1, len(timesteps)):
+                if timesteps[i] >= timesteps[i - 1]:
+                    raise ValueError("`custom_timesteps` must be in descending order.")
+            if timesteps[0] >= self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`timesteps` must start before `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps}."
+                )
+            timesteps = np.array(timesteps, dtype=np.int64)
+            self.custom_timesteps = True
+        else:
+            if num_inference_steps > self.config.num_train_timesteps:
+                raise ValueError(
+                    f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                    f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                    f" maximal {self.config.num_train_timesteps} timesteps."
+                )
+            self.num_inference_steps = num_inference_steps
+            self.custom_timesteps = False
+            # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+            if self.config.timestep_spacing == "linspace":
+                timesteps = (
+                    np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                    .round()[::-1]
+                    .copy()
+                    .astype(np.int64)
+                )
+            elif self.config.timestep_spacing == "leading":
+                step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+                timesteps += self.config.steps_offset
+            elif self.config.timestep_spacing == "trailing":
+                step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+                # creates integer timesteps by multiplying by ratio
+                # casting to int to avoid issues when num_inference_step is power of 3
+                timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+                timesteps -= 1
+            else:
+                raise ValueError(
+                    f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+                )
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+    def _get_variance(self, t, predicted_variance=None, variance_type=None):
+        prev_t = self.previous_timestep(t)
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
+        current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # and sample from it to get previous sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t
+        # we always take the log of variance, so clamp it to ensure it's not 0
+        variance = torch.clamp(variance, min=1e-20)
+        if variance_type is None:
+            variance_type = self.config.variance_type
+        # hacks - were probably added for training stability
+        if variance_type == "fixed_small":
+            variance = variance
+        # for rl-diffuser https://arxiv.org/abs/2205.09991
+        elif variance_type == "fixed_small_log":
+            variance = torch.log(variance)
+            variance = torch.exp(0.5 * variance)
+        elif variance_type == "fixed_large":
+            variance = current_beta_t
+        elif variance_type == "fixed_large_log":
+            # Glide max_log
+            variance = torch.log(current_beta_t)
+        elif variance_type == "learned":
+            return predicted_variance
+        elif variance_type == "learned_range":
+            min_log = torch.log(variance)
+            max_log = torch.log(current_beta_t)
+            frac = (predicted_variance + 1) / 2
+            variance = frac * max_log + (1 - frac) * min_log
+        return variance
+    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+        return sample
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        sample: torch.FloatTensor,
+        generator=None,
+        return_dict: bool = True,
+    ) -> Union[MySchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_ddpm.MySchedulerOutput`] or `tuple`.
+        Returns:
+            [`~schedulers.scheduling_ddpm.MySchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddpm.MySchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        t = timestep
+        prev_t = self.previous_timestep(t)
+        if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+            model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+        else:
+            predicted_variance = None
+        # 1. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+        current_alpha_t = alpha_prod_t / alpha_prod_t_prev
+        current_beta_t = 1 - current_alpha_t
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction`  for the MyScheduler."
+            )
+        # 3. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            pred_original_sample = self._threshold_sample(pred_original_sample)
+        elif self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
+        current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
+        # 5. Compute predicted previous sample µ_t
+        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+        # 6. Add noise
+        variance = 0
+        if t > 0:
+            device = model_output.device
+            variance_noise = randn_tensor(
+                model_output.shape, generator=generator, device=device, dtype=model_output.dtype
+            )
+            if self.variance_type == "fixed_small_log":
+                variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise
+            elif self.variance_type == "learned_range":
+                variance = self._get_variance(t, predicted_variance=predicted_variance)
+                variance = torch.exp(0.5 * variance) * variance_noise
+            else:
+                variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
+        pred_prev_sample = pred_prev_sample + variance
+        if not return_dict:
+            return (pred_prev_sample,)
+        return MySchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+    def add_noise(
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.FloatTensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+    def get_velocity(
+        self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
+    ) -> torch.FloatTensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as sample
+        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+        return velocity
+    def __len__(self):
+        return self.config.num_train_timesteps
+    def previous_timestep(self, timestep):
+        if self.custom_timesteps:
+            index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
+            if index == self.timesteps.shape[0] - 1:
+                prev_t = torch.tensor(-1)
+            else:
+                prev_t = self.timesteps[index + 1]
+        else:
+            num_inference_steps = (
+                self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps
+            )
+            prev_t = timestep - self.config.num_train_timesteps // num_inference_steps
+        return prev_t

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "_class_name": "EulerDiscreteScheduler",
+  "_diffusers_version": "0.18.1",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "interpolation_type": "linear",
+  "num_train_timesteps": 1000,
+  "prediction_type": "epsilon",
+  "steps_offset": 1,
+  "timestep_spacing": "leading",
+  "trained_betas": null,
+  "use_karras_sigmas": false
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 32,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 37,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 5,
+  "pad_token_id": 1,
+  "projection_dim": 32,
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.2",
+  "vocab_size": 1000
+}

text_encoder/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0f56336d7bb3ca2c416bb9d74d452c67d9443609084e712e59e57de96dac918
+size 276381

text_encoder/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dca7ac73495ecfc94f5840e567a33958390faa352296eef7b5cd72f3a7661f83
+size 426918

text_encoder/openvino_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc33d238032a59513693649443a7a7cee4767e614275dd73584b22b608b5d8f1
+size 268300

text_encoder/openvino_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

text_encoder/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8db031dc7a69f514ab2b725e1653abb62f13146b92fd9a1c0a6258b63a4d71eb
+size 301680

text_encoder_2/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "architectures": [
+    "CLIPTextModelWithProjection"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 32,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 37,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 5,
+  "pad_token_id": 1,
+  "projection_dim": 32,
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.2",
+  "vocab_size": 1000
+}

text_encoder_2/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b99248c7d146aac446d888daa22351a30ee7c60ca4b4a02f5dc04b9a1694d160
+size 280520

text_encoder_2/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00e31e9d12a7527fcdd90c94333c4ddf50cecc6efc4cbea8691f1c21d6c45663
+size 431174

text_encoder_2/openvino_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c89c490e82f3bd6ca2a7cc1951846bf4ce961e442d030d1563070cb280b6e4f
+size 272396

text_encoder_2/openvino_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

text_encoder_2/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21086f503ac508b05ea74abf51b676c1b99c6f9c23c28f7aecc71cdd138dc385
+size 306099

tokenizer/merges.txt ADDED Viewed

	@@ -0,0 +1,647 @@

+#version: 0.2
+Ġ t
+Ġt h
+Ġ a
+Ġth e</w>
+i n
+Ġ o
+Ġ ,</w>
+Ġ s
+e d</w>
+Ġ w
+e r
+Ġ .</w>
+Ġ i
+r e
+Ġ c
+n d</w>
+Ġ f
+Ġ b
+a t
+Ġo f</w>
+e r</w>
+e n
+a r
+o r
+i t
+Ġ p
+Ġ h
+Ġa nd</w>
+o n
+in g</w>
+a n
+r o
+Ġ m
+Ġ d
+e s</w>
+Ġi n</w>
+o n</w>
+Ġt o</w>
+o u
+i s
+Ġ a</w>
+i c
+Ġ T
+a l
+Ġ l
+Ġ =</w>
+Ġ re
+Ġ "</w>
+e s
+Ġ S
+a s</w>
+a l</w>
+i l
+e l
+i on</w>
+Ġ A
+Ġ C
+Ġ 1
+Ġ Ċ</w>
+u r
+ĠT h
+Ġ n
+a s
+Ġ @
+e c
+o m
+a c
+Ġ e
+Ġw as</w>
+Ġ M
+o r</w>
+a n</w>
+a m
+e n</w>
+o l
+Ġ in
+Ġ g
+Ġ '</w>
+Ġ B
+l y</w>
+a t</w>
+i v
+t s</w>
+ĠTh e</w>
+u s
+- @</w>
+Ġ@ -@</w>
+i s</w>
+Ġ I
+Ġw h
+i g
+Ġ H
+Ġs t
+o s
+u n
+t h
+Ġ P
+Ġw it
+Ġth at</w>
+i r
+Ġa s</w>
+e m
+Ġo n</w>
+r a
+Ġf or</w>
+Ġ R
+e t
+o w
+Ġ 2
+i d
+Ġ D
+l e</w>
+Ġwit h</w>
+l a
+en t</w>
+i m
+Ġ F
+e a
+i on
+Ġb y</w>
+Ġ )</w>
+Ġ (</w>
+Ġa l
+Ġc on
+en t
+Ġ W
+Ġi s</w>
+er e</w>
+Ġ G
+Ġ N
+Ġ L
+Ġh a
+er s</w>
+r i
+t h</w>
+t ed</w>
+u c
+Ġ J
+Ġ1 9
+e v
+u l
+Ġ v
+c e</w>
+at ion</w>
+ro m</w>
+Ġb e
+Ġ E
+i n</w>
+Ġth e
+Ġf rom</w>
+Ġ O
+t er</w>
+Ġp ro
+Ġa r
+a d
+Ġc om
+i c</w>
+a g
+Ġh is</w>
+Ġs h
+Ġa t</w>
+o v
+i es</w>
+o o
+p p
+s t
+c h
+Ġ r
+Ġ2 0
+a y</w>
+i f
+Ġw ere</w>
+Ġc h
+u t</w>
+s t</w>
+u t
+d s</w>
+o p
+u m
+Ġi t</w>
+o c
+t er
+l e
+ig h
+u d
+Ġe x
+ion s</w>
+at e</w>
+it y</w>
+at ed</w>
+Ġ un
+e p
+q u
+Ġn o
+Ġ K
+iv e</w>
+is t
+Ġo n
+am e</w>
+ou n
+i r</w>
+a b
+Ġ â
+in g
+Ġh e</w>
+l d</w>
+u g
+ic h</w>
+Ġa n</w>
+e d
+Ġ k
+Ġâ Ģ
+Ġha d</w>
+v e</w>
+a in
+Ġs e
+t ion</w>
+or e</w>
+re s
+Ġwh ich</w>
+ĠI n</w>
+o d
+th er</w>
+a k
+Ġs p
+a r</w>
+Ġ y
+ĠC h
+on g</w>
+Ġa c
+es t</w>
+Ġ U
+a p
+f f
+al ly</w>
+r it
+ĠS t
+u b
+g e</w>
+b er</w>
+e t</w>
+Ġb e</w>
+e ar
+Ġre c
+er s
+Ġf ir
+o t
+Ġar e</w>
+Ġa n
+c h</w>
+o g
+i a</w>
+es t
+in e</w>
+il l
+an d
+e l</w>
+ar y</w>
+e w</w>
+i d</w>
+Ġf or
+Ġ ;</w>
+Ġcom p
+Ġ V
+Ġin c
+t r
+Ġ20 0
+Ġthe ir</w>
+u s</w>
+Ġb ut</w>
+r an
+ic al</w>
+Ġfir st</w>
+Ġd e
+Ġin t
+Ġ ro
+s o</w>
+ĠâĢ ĵ</w>
+Ġno t</w>
+d ing</w>
+f ter</w>
+ur e</w>
+Ġp ar
+Ġ :</w>
+i an</w>
+Ġt w
+ou ld</w>
+Ġal so</w>
+Ġi ts</w>
+Ġw or
+u m</w>
+Ġo r</w>
+os t</w>
+0 0</w>
+ou r
+ar d</w>
+Ġre s
+m p
+u e</w>
+Ġa b
+is h</w>
+Ġcon t
+Ġa d
+ow n</w>
+al l</w>
+ou g
+Ġh er</w>
+as t</w>
+Ġ en
+om e</w>
+al l
+d ed</w>
+o w</w>
+Ġha ve</w>
+Ġ us
+ea r</w>
+ac k</w>
+d uc
+i al</w>
+s s
+en ts</w>
+a in</w>
+t ing</w>
+Ġon e</w>
+es s
+Ġh as</w>
+igh t</w>
+a v
+Ġe v
+ou t</w>
+a y
+en ce</w>
+Ġbe en</w>
+e w
+Ġtw o</w>
+Ġc l
+d er</w>
+im e</w>
+k s</w>
+es s</w>
+is h
+. @</w>
+Ġ@ .@</w>
+Ġp la
+Ġp l
+Ġo r
+u p</w>
+m ent</w>
+ur ing</w>
+ol l
+ĠI n
+Ġth is</w>
+Ġb ec
+Ġcom m
+Ġd is
+at er</w>
+ag e</w>
+Ġa pp
+ou s</w>
+e y</w>
+i l</w>
+p er
+ĠA l
+ion al</w>
+l ud
+el y</w>
+t t
+il e</w>
+i z
+Ġ j
+Ġwh o</w>
+Ġa g
+i b
+Ġthe y</w>
+f or
+Ġo v
+at h
+e g
+Ġs c
+i p
+Ġ20 1
+Ġ 3
+Ġp er
+or y</w>
+Ġd es
+id e</w>
+Ġs er
+s e</w>
+ĠH e</w>
+la nd</w>
+at ions</w>
+r ic
+i t</w>
+re s</w>
+er ed</w>
+Ġp re
+ĠS h
+an ce</w>
+or t</w>
+an t</w>
+, @</w>
+Ġ@ ,@</w>
+el l</w>
+Ġ Y
+n ed</w>
+el l
+it e</w>
+Ġinc lud
+Ġre p
+Ġa fter</w>
+Ġs uc
+re e</w>
+an y</w>
+i m</w>
+or t
+Ġ1 8
+Ġs u
+ad e</w>
+ou r</w>
+ĠU n
+ĠI t</w>
+i k
+ĠM ar
+em ber</w>
+Ġ 1</w>
+e en</w>
+a nd</w>
+Ġs ec
+ic e</w>
+Ġt ime</w>
+ĠA n
+Ġint o</w>
+Ġf in
+Ġo ther</w>
+Ġa tt
+il l</w>
+re n
+ac h
+as s
+er al</w>
+es e</w>
+s h
+al s</w>
+it ion</w>
+oug h</w>
+l es</w>
+am p
+Ġw ould</w>
+Ġm ore</w>
+ro ug
+ri b
+er y</w>
+ac e</w>
+Ġ A</w>
+Ġpla y
+it ed</w>
+k ed</w>
+is t</w>
+i ed</w>
+Ġ 2</w>
+as ed</w>
+ing s</w>
+an g
+a m</w>
+i p</w>
+Ġb o
+ab le</w>
+t y</w>
+Ġch ar
+Ġc ent
+et w
+at es</w>
+ro p
+Ġ I</w>
+u nd</w>
+ĠA m
+c es</w>
+o in
+Ġin ter
+u p
+c t
+on e</w>
+Ġt ra
+an t
+ec t
+Ġal l</w>
+e f
+Ġcon s
+ub l
+n ing</w>
+an s</w>
+Ġf e
+us t</w>
+Ġ 0
+Ġre m
+as e</w>
+on g
+Ġwh en</w>
+e b
+ĠW h
+Ġe ar
+ev er</w>
+Ġov er</w>
+Ġk n
+a us
+Ġp os
+a d</w>
+er m
+Ġsh e</w>
+Ġ ra
+Ġd uring</w>
+as on</w>
+v i
+Ġex p
+Ġl ea
+Ġ el
+Ġ 4
+Ġon ly</w>
+o nd</w>
+Ġd ec
+Ġac c
+Ġo ff
+is s
+Ġf l
+ĠE n
+o t</w>
+en s
+os e</w>
+ak e</w>
+o m</w>
+Ġs ev
+ac h</w>
+etw een</w>
+er n
+Ġ 3</w>
+Ġp r
+Ġg ro
+r uc
+Ġd i
+Ġ19 9
+ĠA r
+Ġg ame</w>
+Ġh im</w>
+oo k</w>
+Ġ up</w>
+Ġab out</w>
+Ġre l
+for m
+Ġth ree</w>
+at t
+ĠC om
+Ġs a
+ear s</w>
+Ġ 5
+r y</w>
+Ġi mp
+Ġm ost</w>
+f er
+Ġp res
+Ġf il
+Ġb etween</w>
+Ġbe g
+p h
+or s</w>
+Ġth an</w>
+Ġrec or
+o b
+er ic
+at ing</w>
+Ġth roug
+k ing</w>
+Ġo ut</w>
+Ġn um
+oo d</w>
+oll ow
+ac t
+u il
+Ġc re
+ol og
+at ional</w>
+Ġpro duc
+Ġwh ile</w>
+Ġl ater</w>
+Ġw rit
+e x
+Ġst ar
+Ġsp ec
+e e
+ish ed</w>
+Ġre g
+is ion</w>
+ou th</w>
+Ġre le
+Ġa ss
+Ġse ason</w>
+Ġm ade</w>
+il y</w>
+r u
+o y
+t ur
+t e</w>
+Ġ qu
+Ġm ov
+ur y</w>
+ĠAm eric
+em ent</w>
+c c
+ou nd</w>
+Ġl ar
+Ġfor m
+ec t</w>
+Ġde f
+Ġm us
+ĠP ar
+Ġm e
+Ġs ub
+w ay</w>
+o p</w>
+o h
+el d</w>
+i e</w>
+em p
+am es</w>
+er n</w>
+Ġn or
+iv ed</w>
+ev el
+Ġsuc h</w>
+ar ds</w>
+Ġin d
+ik e</w>
+Ġg en
+er t
+Ġy ear</w>
+Ġus ed</w>
+Ġn ew</w>
+Ġ 5</w>
+Ġal b
+s p
+y p
+Ġwit h
+Ġwh ere</w>
+ic s</w>
+ĠTh is</w>
+Ġthe m</w>
+w n</w>

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/vocab.json ADDED Viewed

	@@ -0,0 +1,1002 @@

+{
+  "!": 2,
+  "!</w>": 345,
+  "\"": 3,
+  "\"</w>": 344,
+  "#": 4,
+  "#</w>": 325,
+  "$": 5,
+  "$</w>": 348,
+  "%": 6,
+  "%</w>": 351,
+  "&": 7,
+  "&</w>": 352,
+  "'": 8,
+  "'</w>": 296,
+  "(": 9,
+  "(</w>": 318,
+  ")": 10,
+  ")</w>": 330,
+  "*": 11,
+  "*</w>": 327,
+  "+": 12,
+  "+</w>": 341,
+  ",": 13,
+  ",</w>": 279,
+  ",@</w>": 754,
+  "-": 14,
+  "-</w>": 276,
+  "-@</w>": 439,
+  ".": 15,
+  ".</w>": 253,
+  ".@</w>": 695,
+  "/": 16,
+  "/</w>": 350,
+  "0": 17,
+  "00</w>": 647,
+  "0</w>": 216,
+  "1": 18,
+  "1</w>": 222,
+  "2": 19,
+  "2</w>": 231,
+  "3": 20,
+  "3</w>": 243,
+  "4": 21,
+  "4</w>": 233,
+  "5": 22,
+  "5</w>": 240,
+  "6": 23,
+  "6</w>": 226,
+  "7": 24,
+  "7</w>": 215,
+  "8": 25,
+  "8</w>": 236,
+  "9": 26,
+  "9</w>": 242,
+  ":": 27,
+  ":</w>": 353,
+  ";": 28,
+  ";</w>": 317,
+  "<": 29,
+  "<</w>": 340,
+  "<|endoftext|>": 1,
+  "<|startoftext|>": 0,
+  "=": 30,
+  "=</w>": 342,
+  ">": 31,
+  "></w>": 300,
+  "?": 32,
+  "?</w>": 346,
+  "@": 33,
+  "@</w>": 320,
+  "A": 34,
+  "A</w>": 227,
+  "B": 35,
+  "B</w>": 258,
+  "C": 36,
+  "C</w>": 239,
+  "D": 37,
+  "D</w>": 255,
+  "E": 38,
+  "E</w>": 246,
+  "F": 39,
+  "F</w>": 213,
+  "G": 40,
+  "G</w>": 283,
+  "H": 41,
+  "H</w>": 219,
+  "I": 42,
+  "I</w>": 237,
+  "J": 43,
+  "J</w>": 251,
+  "K": 44,
+  "K</w>": 254,
+  "L": 45,
+  "L</w>": 218,
+  "M": 46,
+  "M</w>": 234,
+  "N": 47,
+  "N</w>": 238,
+  "O": 48,
+  "O</w>": 265,
+  "P": 49,
+  "P</w>": 245,
+  "Q": 50,
+  "Q</w>": 309,
+  "R": 51,
+  "R</w>": 264,
+  "S": 52,
+  "S</w>": 230,
+  "T": 53,
+  "T</w>": 235,
+  "U": 54,
+  "U</w>": 268,
+  "V": 55,
+  "V</w>": 248,
+  "W": 56,
+  "W</w>": 274,
+  "X": 57,
+  "X</w>": 263,
+  "Y": 58,
+  "Y</w>": 310,
+  "Z": 59,
+  "Z</w>": 207,
+  "[": 60,
+  "[</w>": 270,
+  "\\": 61,
+  "\\</w>": 338,
+  "]": 62,
+  "]</w>": 289,
+  "^": 63,
+  "^</w>": 331,
+  "_": 64,
+  "_</w>": 334,
+  "`": 65,
+  "`</w>": 347,
+  "a": 66,
+  "a</w>": 197,
+  "ab": 555,
+  "able</w>": 820,
+  "ac": 420,
+  "ace</w>": 806,
+  "ach": 791,
+  "ach</w>": 885,
+  "ack</w>": 670,
+  "act": 929,
+  "ad": 508,
+  "ad</w>": 860,
+  "ade</w>": 771,
+  "ag": 511,
+  "age</w>": 710,
+  "ain": 568,
+  "ain</w>": 675,
+  "ak": 577,
+  "ake</w>": 882,
+  "al": 397,
+  "al</w>": 405,
+  "all": 664,
+  "all</w>": 658,
+  "ally</w>": 588,
+  "als</w>": 796,
+  "am": 426,
+  "am</w>": 817,
+  "ame</w>": 552,
+  "ames</w>": 976,
+  "amp": 800,
+  "an": 384,
+  "an</w>": 425,
+  "ance</w>": 751,
+  "and": 609,
+  "and</w>": 780,
+  "ang": 816,
+  "ans</w>": 844,
+  "ant": 837,
+  "ant</w>": 753,
+  "any</w>": 766,
+  "ap": 586,
+  "ar": 376,
+  "ar</w>": 579,
+  "ard</w>": 649,
+  "ards</w>": 982,
+  "ary</w>": 611,
+  "as": 416,
+  "as</w>": 404,
+  "ase</w>": 849,
+  "ased</w>": 814,
+  "ason</w>": 865,
+  "ass": 792,
+  "ast</w>": 661,
+  "at": 372,
+  "at</w>": 434,
+  "ate</w>": 541,
+  "ated</w>": 543,
+  "ater</w>": 709,
+  "ates</w>": 825,
+  "ath": 730,
+  "ating</w>": 922,
+  "ation</w>": 497,
+  "ational</w>": 933,
+  "ations</w>": 744,
+  "att": 903,
+  "aus": 858,
+  "av": 681,
+  "ay": 684,
+  "ay</w>": 523,
+  "b": 67,
+  "b</w>": 212,
+  "ber</w>": 593,
+  "c": 68,
+  "c</w>": 224,
+  "cc": 960,
+  "ce</w>": 496,
+  "ces</w>": 830,
+  "ch": 520,
+  "ch</w>": 603,
+  "ct": 834,
+  "d": 69,
+  "d</w>": 196,
+  "ded</w>": 665,
+  "der</w>": 690,
+  "ding</w>": 633,
+  "ds</w>": 530,
+  "duc": 671,
+  "e": 70,
+  "e</w>": 195,
+  "ea": 471,
+  "ear": 596,
+  "ear</w>": 669,
+  "ears</w>": 906,
+  "eb": 852,
+  "ec": 418,
+  "ect": 838,
+  "ect</w>": 964,
+  "ed": 563,
+  "ed</w>": 362,
+  "ee": 941,
+  "een</w>": 779,
+  "ef": 840,
+  "eg": 731,
+  "el": 407,
+  "el</w>": 610,
+  "eld</w>": 973,
+  "ell": 759,
+  "ell</w>": 756,
+  "ely</w>": 719,
+  "em": 455,
+  "ember</w>": 777,
+  "ement</w>": 959,
+  "emp": 975,
+  "en": 375,
+  "en</w>": 427,
+  "ence</w>": 685,
+  "ens": 880,
+  "ent": 478,
+  "ent</w>": 468,
+  "ents</w>": 674,
+  "ep": 545,
+  "er": 364,
+  "er</w>": 374,
+  "eral</w>": 793,
+  "ere</w>": 481,
+  "ered</w>": 748,
+  "eric": 921,
+  "erm": 861,
+  "ern": 887,
+  "ern</w>": 977,
+  "ers": 598,
+  "ers</w>": 486,
+  "ert": 986,
+  "ery</w>": 805,
+  "es": 402,
+  "es</w>": 388,
+  "ese</w>": 794,
+  "ess": 678,
+  "ess</w>": 693,
+  "est": 606,
+  "est</w>": 584,
+  "et": 460,
+  "et</w>": 594,
+  "etw": 824,
+  "etween</w>": 886,
+  "ev": 493,
+  "evel": 980,
+  "ever</w>": 855,
+  "ew": 687,
+  "ew</w>": 612,
+  "ex": 938,
+  "ey</w>": 713,
+  "f": 71,
+  "f</w>": 209,
+  "fer": 911,
+  "ff": 587,
+  "for": 728,
+  "form": 901,
+  "fter</w>": 634,
+  "g": 72,
+  "g</w>": 214,
+  "ge</w>": 592,
+  "h": 73,
+  "h</w>": 203,
+  "i": 74,
+  "i</w>": 205,
+  "ia</w>": 605,
+  "ial</w>": 672,
+  "ian</w>": 638,
+  "ib": 726,
+  "ic": 395,
+  "ic</w>": 510,
+  "ical</w>": 625,
+  "ice</w>": 782,
+  "ich</w>": 561,
+  "ics</w>": 996,
+  "id": 463,
+  "id</w>": 613,
+  "ide</w>": 739,
+  "ie</w>": 974,
+  "ied</w>": 812,
+  "ies</w>": 516,
+  "if": 524,
+  "ig": 444,
+  "igh": 537,
+  "ight</w>": 680,
+  "ik": 775,
+  "ike</w>": 984,
+  "il": 406,
+  "il</w>": 714,
+  "ile</w>": 721,
+  "ill": 608,
+  "ill</w>": 789,
+  "ily</w>": 950,
+  "im": 469,
+  "im</w>": 767,
+  "ime</w>": 691,
+  "in": 358,
+  "in</w>": 501,
+  "ine</w>": 607,
+  "ing": 557,
+  "ing</w>": 383,
+  "ings</w>": 815,
+  "ion": 472,
+  "ion</w>": 408,
+  "ional</w>": 717,
+  "ions</w>": 540,
+  "ip": 733,
+  "ip</w>": 818,
+  "ir": 453,
+  "ir</w>": 554,
+  "is": 393,
+  "is</w>": 441,
+  "ish": 694,
+  "ish</w>": 654,
+  "ished</w>": 942,
+  "ision</w>": 944,
+  "iss": 876,
+  "ist": 550,
+  "ist</w>": 811,
+  "it": 378,
+  "it</w>": 746,
+  "ite</w>": 760,
+  "ited</w>": 809,
+  "ition</w>": 797,
+  "ity</w>": 542,
+  "iv": 435,
+  "ive</w>": 549,
+  "ived</w>": 979,
+  "iz": 722,
+  "j": 75,
+  "j</w>": 288,
+  "k": 76,
+  "k</w>": 210,
+  "ked</w>": 810,
+  "king</w>": 924,
+  "ks</w>": 692,
+  "l": 77,
+  "l</w>": 201,
+  "la": 467,
+  "land</w>": 743,
+  "ld</w>": 559,
+  "le": 536,
+  "le</w>": 465,
+  "les</w>": 799,
+  "lud": 718,
+  "ly</w>": 433,
+  "m": 78,
+  "m</w>": 202,
+  "ment</w>": 701,
+  "mp": 651,
+  "n": 79,
+  "n</w>": 199,
+  "nd</w>": 369,
+  "ned</w>": 758,
+  "ning</w>": 843,
+  "o": 80,
+  "o</w>": 198,
+  "ob": 920,
+  "oc": 534,
+  "od": 575,
+  "og": 604,
+  "oh": 972,
+  "oin": 831,
+  "ol": 428,
+  "oll": 703,
+  "ollow": 928,
+  "olog": 932,
+  "om": 419,
+  "om</w>": 883,
+  "ome</w>": 663,
+  "on": 382,
+  "on</w>": 390,
+  "ond</w>": 872,
+  "one</w>": 835,
+  "ong": 850,
+  "ong</w>": 582,
+  "oo": 517,
+  "ood</w>": 927,
+  "ook</w>": 897,
+  "op": 531,
+  "op</w>": 971,
+  "or": 377,
+  "or</w>": 424,
+  "ore</w>": 571,
+  "ors</w>": 917,
+  "ort": 768,
+  "ort</w>": 752,
+  "ory</w>": 737,
+  "os": 447,
+  "ose</w>": 881,
+  "ost</w>": 646,
+  "ot": 600,
+  "ot</w>": 879,
+  "ou": 392,
+  "oug": 659,
+  "ough</w>": 798,
+  "ould</w>": 640,
+  "oun": 553,
+  "ound</w>": 961,
+  "our": 648,
+  "our</w>": 772,
+  "ous</w>": 712,
+  "out</w>": 683,
+  "outh</w>": 945,
+  "ov": 515,
+  "ow": 461,
+  "ow</w>": 666,
+  "own</w>": 657,
+  "oy": 952,
+  "p": 81,
+  "p</w>": 217,
+  "per": 715,
+  "ph": 916,
+  "pp": 518,
+  "q": 82,
+  "q</w>": 280,
+  "qu": 546,
+  "r": 83,
+  "r</w>": 204,
+  "ra": 457,
+  "ran": 624,
+  "re": 367,
+  "ree</w>": 765,
+  "ren": 790,
+  "res": 572,
+  "res</w>": 747,
+  "ri": 487,
+  "rib": 804,
+  "ric": 745,
+  "rit": 589,
+  "ro": 385,
+  "rom</w>": 498,
+  "rop": 826,
+  "roug": 803,
+  "ru": 951,
+  "ruc": 891,
+  "ry</w>": 908,
+  "s": 84,
+  "s</w>": 206,
+  "se</w>": 741,
+  "sh": 795,
+  "so</w>": 630,
+  "sp": 992,
+  "ss": 673,
+  "st": 519,
+  "st</w>": 528,
+  "t": 85,
+  "t</w>": 208,
+  "te</w>": 954,
+  "ted</w>": 489,
+  "ter": 535,
+  "ter</w>": 505,
+  "th": 449,
+  "th</w>": 488,
+  "ther</w>": 576,
+  "ting</w>": 676,
+  "tion</w>": 570,
+  "tr": 619,
+  "ts</w>": 436,
+  "tt": 720,
+  "tur": 953,
+  "ty</w>": 821,
+  "u": 86,
+  "u</w>": 229,
+  "ub": 591,
+  "ubl": 842,
+  "uc": 490,
+  "ud": 538,
+  "ue</w>": 652,
+  "ug": 560,
+  "uil": 930,
+  "ul": 494,
+  "um": 532,
+  "um</w>": 644,
+  "un": 448,
+  "und</w>": 828,
+  "up": 833,
+  "up</w>": 700,
+  "ur": 413,
+  "ure</w>": 635,
+  "uring</w>": 702,
+  "ury</w>": 957,
+  "us": 438,
+  "us</w>": 622,
+  "ust</w>": 846,
+  "ut": 529,
+  "ut</w>": 527,
+  "v": 87,
+  "v</w>": 232,
+  "ve</w>": 567,
+  "vi": 866,
+  "w": 88,
+  "w</w>": 250,
+  "way</w>": 970,
+  "wn</w>": 999,
+  "x": 89,
+  "x</w>": 269,
+  "y": 90,
+  "y</w>": 211,
+  "yp": 993,
+  "z": 91,
+  "z</w>": 228,
+  "|": 92,
+  "|</w>": 304,
+  "}": 93,
+  "}</w>": 336,
+  "~": 94,
+  "~</w>": 343,
+  "¡": 95,
+  "¡</w>": 220,
+  "¢": 96,
+  "¢</w>": 306,
+  "£": 97,
+  "£</w>": 323,
+  "¤": 98,
+  "¤</w>": 292,
+  "¥": 99,
+  "¥</w>": 339,
+  "¦": 100,
+  "¦</w>": 303,
+  "§": 101,
+  "§</w>": 275,
+  "¨": 102,
+  "¨</w>": 282,
+  "©": 103,
+  "©</w>": 259,
+  "ª": 104,
+  "ª</w>": 286,
+  "«": 105,
+  "«</w>": 266,
+  "¬": 106,
+  "¬</w>": 319,
+  "®": 107,
+  "®</w>": 329,
+  "¯": 108,
+  "¯</w>": 287,
+  "°": 109,
+  "°</w>": 298,
+  "±": 110,
+  "±</w>": 200,
+  "²": 111,
+  "²</w>": 284,
+  "³": 112,
+  "³</w>": 272,
+  "´": 113,
+  "´</w>": 307,
+  "µ": 114,
+  "µ</w>": 261,
+  "¶": 115,
+  "¶</w>": 301,
+  "·": 116,
+  "·</w>": 326,
+  "¸": 117,
+  "¸</w>": 257,
+  "¹": 118,
+  "¹</w>": 241,
+  "º": 119,
+  "º</w>": 260,
+  "»": 120,
+  "»</w>": 247,
+  "¼": 121,
+  "¼</w>": 305,
+  "½": 122,
+  "½</w>": 294,
+  "¾": 123,
+  "¾</w>": 316,
+  "¿": 124,
+  "¿</w>": 271,
+  "Â": 125,
+  "Ã": 126,
+  "Ä": 127,
+  "Å": 128,
+  "Æ": 129,
+  "Ç": 130,
+  "È": 131,
+  "É": 132,
+  "Ê": 133,
+  "Ë": 134,
+  "Ì": 135,
+  "Í": 136,
+  "Î": 137,
+  "Ï": 138,
+  "Ð": 139,
+  "Ñ": 140,
+  "Ö": 141,
+  "×": 142,
+  "Ø": 143,
+  "Ù": 144,
+  "Ü": 145,
+  "à": 146,
+  "á": 147,
+  "â": 148,
+  "ã": 149,
+  "ä": 150,
+  "å": 151,
+  "æ": 152,
+  "ç": 153,
+  "è": 154,
+  "é": 155,
+  "ë": 156,
+  "ì": 157,
+  "ï": 158,
+  "Ċ": 159,
+  "Ċ</w>": 349,
+  "Ġ": 160,
+  "Ġ\"</w>": 401,
+  "Ġ'</w>": 431,
+  "Ġ(</w>": 475,
+  "Ġ)</w>": 474,
+  "Ġ,</w>": 360,
+  "Ġ.</w>": 365,
+  "Ġ0": 847,
+  "Ġ1": 411,
+  "Ġ18": 769,
+  "Ġ19": 492,
+  "Ġ199": 893,
+  "Ġ1</w>": 778,
+  "Ġ2": 462,
+  "Ġ20": 522,
+  "Ġ200": 620,
+  "Ġ201": 734,
+  "Ġ2</w>": 813,
+  "Ġ3": 735,
+  "Ġ3</w>": 888,
+  "Ġ4": 870,
+  "Ġ5": 907,
+  "Ġ5</w>": 990,
+  "Ġ:</w>": 637,
+  "Ġ;</w>": 615,
+  "Ġ</w>": 333,
+  "Ġ=</w>": 399,
+  "Ġ@": 417,
+  "Ġ@,@</w>": 755,
+  "Ġ@-@</w>": 440,
+  "Ġ@.@</w>": 696,
+  "ĠA": 409,
+  "ĠA</w>": 807,
+  "ĠAl": 716,
+  "ĠAm": 829,
+  "ĠAmeric": 958,
+  "ĠAn": 784,
+  "ĠAr": 894,
+  "ĠB": 432,
+  "ĠC": 410,
+  "ĠCh": 581,
+  "ĠCom": 904,
+  "ĠD": 464,
+  "ĠE": 500,
+  "ĠEn": 878,
+  "ĠF": 470,
+  "ĠG": 482,
+  "ĠH": 445,
+  "ĠHe</w>": 742,
+  "ĠI": 442,
+  "ĠI</w>": 827,
+  "ĠIn": 704,
+  "ĠIn</w>": 574,
+  "ĠIt</w>": 774,
+  "ĠJ": 491,
+  "ĠK": 548,
+  "ĠL": 484,
+  "ĠM": 423,
+  "ĠMar": 776,
+  "ĠN": 483,
+  "ĠO": 504,
+  "ĠP": 450,
+  "ĠPar": 967,
+  "ĠR": 459,
+  "ĠS": 403,
+  "ĠSh": 750,
+  "ĠSt": 590,
+  "ĠT": 396,
+  "ĠTh": 414,
+  "ĠThe</w>": 437,
+  "ĠThis</w>": 997,
+  "ĠU": 585,
+  "ĠUn": 773,
+  "ĠV": 617,
+  "ĠW": 479,
+  "ĠWh": 853,
+  "ĠY": 757,
+  "Ġa": 356,
+  "Ġa</w>": 394,
+  "Ġab": 653,
+  "Ġabout</w>": 899,
+  "Ġac": 583,
+  "Ġacc": 874,
+  "Ġad": 656,
+  "Ġafter</w>": 763,
+  "Ġag": 725,
+  "Ġal": 476,
+  "Ġalb": 991,
+  "Ġall</w>": 839,
+  "Ġalso</w>": 641,
+  "Ġan": 602,
+  "Ġan</w>": 562,
+  "Ġand</w>": 381,
+  "Ġapp": 711,
+  "Ġar": 507,
+  "Ġare</w>": 601,
+  "Ġas</w>": 454,
+  "Ġass": 947,
+  "Ġat</w>": 514,
+  "Ġatt": 788,
+  "Ġb": 371,
+  "Ġbe": 499,
+  "Ġbe</w>": 595,
+  "Ġbec": 706,
+  "Ġbeen</w>": 686,
+  "Ġbeg": 915,
+  "Ġbetween</w>": 914,
+  "Ġbo": 819,
+  "Ġbut</w>": 623,
+  "Ġby</w>": 473,
+  "Ġc": 368,
+  "Ġcent": 823,
+  "Ġch": 526,
+  "Ġchar": 822,
+  "Ġcl": 689,
+  "Ġcom": 509,
+  "Ġcomm": 707,
+  "Ġcomp": 616,
+  "Ġcon": 477,
+  "Ġcons": 841,
+  "Ġcont": 655,
+  "Ġcre": 931,
+  "Ġd": 387,
+  "Ġde": 627,
+  "Ġdec": 873,
+  "Ġdef": 965,
+  "Ġdes": 738,
+  "Ġdi": 892,
+  "Ġdis": 708,
+  "Ġduring</w>": 864,
+  "Ġe": 421,
+  "Ġear": 854,
+  "Ġel": 869,
+  "Ġen": 662,
+  "Ġev": 682,
+  "Ġex": 539,
+  "Ġexp": 867,
+  "Ġf": 370,
+  "Ġfe": 845,
+  "Ġfil": 913,
+  "Ġfin": 786,
+  "Ġfir": 599,
+  "Ġfirst</w>": 626,
+  "Ġfl": 877,
+  "Ġfor": 614,
+  "Ġfor</w>": 458,
+  "Ġform": 963,
+  "Ġfrom</w>": 503,
+  "Ġg": 430,
+  "Ġgame</w>": 895,
+  "Ġgen": 985,
+  "Ġgro": 890,
+  "Ġh": 380,
+  "Ġha": 485,
+  "Ġhad</w>": 566,
+  "Ġhas</w>": 679,
+  "Ġhave</w>": 667,
+  "Ġhe</w>": 558,
+  "Ġher</w>": 660,
+  "Ġhim</w>": 896,
+  "Ġhis</w>": 512,
+  "Ġi": 366,
+  "Ġimp": 909,
+  "Ġin": 429,
+  "Ġin</w>": 389,
+  "Ġinc": 618,
+  "Ġinclud": 761,
+  "Ġind": 983,
+  "Ġint": 628,
+  "Ġinter": 832,
+  "Ġinto</w>": 785,
+  "Ġis</w>": 480,
+  "Ġit</w>": 533,
+  "Ġits</w>": 642,
+  "Ġj": 723,
+  "Ġk": 564,
+  "Ġkn": 857,
+  "Ġl": 398,
+  "Ġlar": 962,
+  "Ġlater</w>": 936,
+  "Ġlea": 868,
+  "Ġm": 386,
+  "Ġmade</w>": 949,
+  "Ġme": 968,
+  "Ġmore</w>": 802,
+  "Ġmost</w>": 910,
+  "Ġmov": 956,
+  "Ġmus": 966,
+  "Ġn": 415,
+  "Ġnew</w>": 989,
+  "Ġno": 547,
+  "Ġnor": 978,
+  "Ġnot</w>": 632,
+  "Ġnum": 926,
+  "Ġo": 359,
+  "Ġof</w>": 373,
+  "Ġoff": 875,
+  "Ġon": 551,
+  "Ġon</w>": 456,
+  "Ġone</w>": 677,
+  "Ġonly</w>": 871,
+  "Ġor": 699,
+  "Ġor</w>": 645,
+  "Ġother</w>": 787,
+  "Ġout</w>": 925,
+  "Ġov": 729,
+  "Ġover</w>": 856,
+  "Ġp": 379,
+  "Ġpar": 636,
+  "Ġper": 736,
+  "Ġpl": 698,
+  "Ġpla": 697,
+  "Ġplay": 808,
+  "Ġpos": 859,
+  "Ġpr": 889,
+  "Ġpre": 749,
+  "Ġpres": 912,
+  "Ġpro": 506,
+  "Ġproduc": 934,
+  "Ġqu": 955,
+  "Ġr": 521,
+  "Ġra": 863,
+  "Ġre": 400,
+  "Ġrec": 597,
+  "Ġrecor": 919,
+  "Ġreg": 943,
+  "Ġrel": 900,
+  "Ġrele": 946,
+  "Ġrem": 848,
+  "Ġrep": 762,
+  "Ġres": 650,
+  "Ġro": 629,
+  "Ġs": 361,
+  "Ġsa": 905,
+  "Ġsc": 732,
+  "Ġse": 569,
+  "Ġseason</w>": 948,
+  "Ġsec": 781,
+  "Ġser": 740,
+  "Ġsev": 884,
+  "Ġsh": 513,
+  "Ġshe</w>": 862,
+  "Ġsp": 578,
+  "Ġspec": 940,
+  "Ġst": 446,
+  "Ġstar": 939,
+  "Ġsu": 770,
+  "Ġsub": 969,
+  "Ġsuc": 764,
+  "Ġsuch</w>": 981,
+  "Ġt": 354,
+  "Ġth": 355,
+  "Ġthan</w>": 918,
+  "Ġthat</w>": 452,
+  "Ġthe": 502,
+  "Ġthe</w>": 357,
+  "Ġtheir</w>": 621,
+  "Ġthem</w>": 998,
+  "Ġthey</w>": 727,
+  "Ġthis</w>": 705,
+  "Ġthree</w>": 902,
+  "Ġthroug": 923,
+  "Ġtime</w>": 783,
+  "Ġto</w>": 391,
+  "Ġtra": 836,
+  "Ġtw": 639,
+  "Ġtwo</w>": 688,
+  "Ġun": 544,
+  "Ġup</w>": 898,
+  "Ġus": 668,
+  "Ġused</w>": 988,
+  "Ġv": 495,
+  "Ġw": 363,
+  "Ġwas</w>": 422,
+  "Ġwere</w>": 525,
+  "Ġwh": 443,
+  "Ġwhen</w>": 851,
+  "Ġwhere</w>": 995,
+  "Ġwhich</w>": 573,
+  "Ġwhile</w>": 935,
+  "Ġwho</w>": 724,
+  "Ġwit": 451,
+  "Ġwith": 994,
+  "Ġwith</w>": 466,
+  "Ġwor": 643,
+  "Ġwould</w>": 801,
+  "Ġwrit": 937,
+  "Ġy": 580,
+  "Ġyear</w>": 987,
+  "Ġâ": 556,
+  "ĠâĢ": 565,
+  "ĠâĢĵ</w>": 631,
+  "ĠĊ</w>": 412,
+  "Ģ": 161,
+  "Ģ</w>": 223,
+  "ģ": 162,
+  "ģ</w>": 273,
+  "Ĥ": 163,
+  "Ĥ</w>": 262,
+  "ĥ": 164,
+  "ĥ</w>": 337,
+  "Ħ": 165,
+  "Ħ</w>": 278,
+  "ħ": 166,
+  "ħ</w>": 281,
+  "Ĩ": 167,
+  "Ĩ</w>": 308,
+  "ĩ": 168,
+  "ĩ</w>": 225,
+  "Ī": 169,
+  "Ī</w>": 221,
+  "ī": 170,
+  "ī</w>": 244,
+  "Ĭ": 171,
+  "Ĭ</w>": 315,
+  "ĭ": 172,
+  "ĭ</w>": 321,
+  "Į": 173,
+  "Į</w>": 324,
+  "į": 174,
+  "į</w>": 302,
+  "İ": 175,
+  "İ</w>": 249,
+  "ı": 176,
+  "ı</w>": 332,
+  "Ĳ": 177,
+  "Ĳ</w>": 295,
+  "ĳ": 178,
+  "ĳ</w>": 313,
+  "Ĵ": 179,
+  "Ĵ</w>": 328,
+  "ĵ": 180,
+  "ĵ</w>": 312,
+  "Ķ": 181,
+  "Ķ</w>": 256,
+  "ķ": 182,
+  "ķ</w>": 314,
+  "ĸ": 183,
+  "ĸ</w>": 277,
+  "Ĺ": 184,
+  "Ĺ</w>": 322,
+  "ĺ": 185,
+  "ĺ</w>": 285,
+  "Ļ": 186,
+  "Ļ</w>": 267,
+  "ļ": 187,
+  "ļ</w>": 290,
+  "Ľ": 188,
+  "Ľ</w>": 311,
+  "ľ": 189,
+  "ľ</w>": 299,
+  "Ŀ": 190,
+  "Ŀ</w>": 291,
+  "ŀ": 191,
+  "ŀ</w>": 293,
+  "Ł": 192,
+  "Ł</w>": 335,
+  "ł": 193,
+  "ł</w>": 252,
+  "Ń": 194,
+  "Ń</w>": 297
+}

tokenizer_2/merges.txt ADDED Viewed

	@@ -0,0 +1,647 @@

+#version: 0.2
+Ġ t
+Ġt h
+Ġ a
+Ġth e</w>
+i n
+Ġ o
+Ġ ,</w>
+Ġ s
+e d</w>
+Ġ w
+e r
+Ġ .</w>
+Ġ i
+r e
+Ġ c
+n d</w>
+Ġ f
+Ġ b
+a t
+Ġo f</w>
+e r</w>
+e n
+a r
+o r
+i t
+Ġ p
+Ġ h
+Ġa nd</w>
+o n
+in g</w>
+a n
+r o
+Ġ m
+Ġ d
+e s</w>
+Ġi n</w>
+o n</w>
+Ġt o</w>
+o u
+i s
+Ġ a</w>
+i c
+Ġ T
+a l
+Ġ l
+Ġ =</w>
+Ġ re
+Ġ "</w>
+e s
+Ġ S
+a s</w>
+a l</w>
+i l
+e l
+i on</w>
+Ġ A
+Ġ C
+Ġ 1
+Ġ Ċ</w>
+u r
+ĠT h
+Ġ n
+a s
+Ġ @
+e c
+o m
+a c
+Ġ e
+Ġw as</w>
+Ġ M
+o r</w>
+a n</w>
+a m
+e n</w>
+o l
+Ġ in
+Ġ g
+Ġ '</w>
+Ġ B
+l y</w>
+a t</w>
+i v
+t s</w>
+ĠTh e</w>
+u s
+- @</w>
+Ġ@ -@</w>
+i s</w>
+Ġ I
+Ġw h
+i g
+Ġ H
+Ġs t
+o s
+u n
+t h
+Ġ P
+Ġw it
+Ġth at</w>
+i r
+Ġa s</w>
+e m
+Ġo n</w>
+r a
+Ġf or</w>
+Ġ R
+e t
+o w
+Ġ 2
+i d
+Ġ D
+l e</w>
+Ġwit h</w>
+l a
+en t</w>
+i m
+Ġ F
+e a
+i on
+Ġb y</w>
+Ġ )</w>
+Ġ (</w>
+Ġa l
+Ġc on
+en t
+Ġ W
+Ġi s</w>
+er e</w>
+Ġ G
+Ġ N
+Ġ L
+Ġh a
+er s</w>
+r i
+t h</w>
+t ed</w>
+u c
+Ġ J
+Ġ1 9
+e v
+u l
+Ġ v
+c e</w>
+at ion</w>
+ro m</w>
+Ġb e
+Ġ E
+i n</w>
+Ġth e
+Ġf rom</w>
+Ġ O
+t er</w>
+Ġp ro
+Ġa r
+a d
+Ġc om
+i c</w>
+a g
+Ġh is</w>
+Ġs h
+Ġa t</w>
+o v
+i es</w>
+o o
+p p
+s t
+c h
+Ġ r
+Ġ2 0
+a y</w>
+i f
+Ġw ere</w>
+Ġc h
+u t</w>
+s t</w>
+u t
+d s</w>
+o p
+u m
+Ġi t</w>
+o c
+t er
+l e
+ig h
+u d
+Ġe x
+ion s</w>
+at e</w>
+it y</w>
+at ed</w>
+Ġ un
+e p
+q u
+Ġn o
+Ġ K
+iv e</w>
+is t
+Ġo n
+am e</w>
+ou n
+i r</w>
+a b
+Ġ â
+in g
+Ġh e</w>
+l d</w>
+u g
+ic h</w>
+Ġa n</w>
+e d
+Ġ k
+Ġâ Ģ
+Ġha d</w>
+v e</w>
+a in
+Ġs e
+t ion</w>
+or e</w>
+re s
+Ġwh ich</w>
+ĠI n</w>
+o d
+th er</w>
+a k
+Ġs p
+a r</w>
+Ġ y
+ĠC h
+on g</w>
+Ġa c
+es t</w>
+Ġ U
+a p
+f f
+al ly</w>
+r it
+ĠS t
+u b
+g e</w>
+b er</w>
+e t</w>
+Ġb e</w>
+e ar
+Ġre c
+er s
+Ġf ir
+o t
+Ġar e</w>
+Ġa n
+c h</w>
+o g
+i a</w>
+es t
+in e</w>
+il l
+an d
+e l</w>
+ar y</w>
+e w</w>
+i d</w>
+Ġf or
+Ġ ;</w>
+Ġcom p
+Ġ V
+Ġin c
+t r
+Ġ20 0
+Ġthe ir</w>
+u s</w>
+Ġb ut</w>
+r an
+ic al</w>
+Ġfir st</w>
+Ġd e
+Ġin t
+Ġ ro
+s o</w>
+ĠâĢ ĵ</w>
+Ġno t</w>
+d ing</w>
+f ter</w>
+ur e</w>
+Ġp ar
+Ġ :</w>
+i an</w>
+Ġt w
+ou ld</w>
+Ġal so</w>
+Ġi ts</w>
+Ġw or
+u m</w>
+Ġo r</w>
+os t</w>
+0 0</w>
+ou r
+ar d</w>
+Ġre s
+m p
+u e</w>
+Ġa b
+is h</w>
+Ġcon t
+Ġa d
+ow n</w>
+al l</w>
+ou g
+Ġh er</w>
+as t</w>
+Ġ en
+om e</w>
+al l
+d ed</w>
+o w</w>
+Ġha ve</w>
+Ġ us
+ea r</w>
+ac k</w>
+d uc
+i al</w>
+s s
+en ts</w>
+a in</w>
+t ing</w>
+Ġon e</w>
+es s
+Ġh as</w>
+igh t</w>
+a v
+Ġe v
+ou t</w>
+a y
+en ce</w>
+Ġbe en</w>
+e w
+Ġtw o</w>
+Ġc l
+d er</w>
+im e</w>
+k s</w>
+es s</w>
+is h
+. @</w>
+Ġ@ .@</w>
+Ġp la
+Ġp l
+Ġo r
+u p</w>
+m ent</w>
+ur ing</w>
+ol l
+ĠI n
+Ġth is</w>
+Ġb ec
+Ġcom m
+Ġd is
+at er</w>
+ag e</w>
+Ġa pp
+ou s</w>
+e y</w>
+i l</w>
+p er
+ĠA l
+ion al</w>
+l ud
+el y</w>
+t t
+il e</w>
+i z
+Ġ j
+Ġwh o</w>
+Ġa g
+i b
+Ġthe y</w>
+f or
+Ġo v
+at h
+e g
+Ġs c
+i p
+Ġ20 1
+Ġ 3
+Ġp er
+or y</w>
+Ġd es
+id e</w>
+Ġs er
+s e</w>
+ĠH e</w>
+la nd</w>
+at ions</w>
+r ic
+i t</w>
+re s</w>
+er ed</w>
+Ġp re
+ĠS h
+an ce</w>
+or t</w>
+an t</w>
+, @</w>
+Ġ@ ,@</w>
+el l</w>
+Ġ Y
+n ed</w>
+el l
+it e</w>
+Ġinc lud
+Ġre p
+Ġa fter</w>
+Ġs uc
+re e</w>
+an y</w>
+i m</w>
+or t
+Ġ1 8
+Ġs u
+ad e</w>
+ou r</w>
+ĠU n
+ĠI t</w>
+i k
+ĠM ar
+em ber</w>
+Ġ 1</w>
+e en</w>
+a nd</w>
+Ġs ec
+ic e</w>
+Ġt ime</w>
+ĠA n
+Ġint o</w>
+Ġf in
+Ġo ther</w>
+Ġa tt
+il l</w>
+re n
+ac h
+as s
+er al</w>
+es e</w>
+s h
+al s</w>
+it ion</w>
+oug h</w>
+l es</w>
+am p
+Ġw ould</w>
+Ġm ore</w>
+ro ug
+ri b
+er y</w>
+ac e</w>
+Ġ A</w>
+Ġpla y
+it ed</w>
+k ed</w>
+is t</w>
+i ed</w>
+Ġ 2</w>
+as ed</w>
+ing s</w>
+an g
+a m</w>
+i p</w>
+Ġb o
+ab le</w>
+t y</w>
+Ġch ar
+Ġc ent
+et w
+at es</w>
+ro p
+Ġ I</w>
+u nd</w>
+ĠA m
+c es</w>
+o in
+Ġin ter
+u p
+c t
+on e</w>
+Ġt ra
+an t
+ec t
+Ġal l</w>
+e f
+Ġcon s
+ub l
+n ing</w>
+an s</w>
+Ġf e
+us t</w>
+Ġ 0
+Ġre m
+as e</w>
+on g
+Ġwh en</w>
+e b
+ĠW h
+Ġe ar
+ev er</w>
+Ġov er</w>
+Ġk n
+a us
+Ġp os
+a d</w>
+er m
+Ġsh e</w>
+Ġ ra
+Ġd uring</w>
+as on</w>
+v i
+Ġex p
+Ġl ea
+Ġ el
+Ġ 4
+Ġon ly</w>
+o nd</w>
+Ġd ec
+Ġac c
+Ġo ff
+is s
+Ġf l
+ĠE n
+o t</w>
+en s
+os e</w>
+ak e</w>
+o m</w>
+Ġs ev
+ac h</w>
+etw een</w>
+er n
+Ġ 3</w>
+Ġp r
+Ġg ro
+r uc
+Ġd i
+Ġ19 9
+ĠA r
+Ġg ame</w>
+Ġh im</w>
+oo k</w>
+Ġ up</w>
+Ġab out</w>
+Ġre l
+for m
+Ġth ree</w>
+at t
+ĠC om
+Ġs a
+ear s</w>
+Ġ 5
+r y</w>
+Ġi mp
+Ġm ost</w>
+f er
+Ġp res
+Ġf il
+Ġb etween</w>
+Ġbe g
+p h
+or s</w>
+Ġth an</w>
+Ġrec or
+o b
+er ic
+at ing</w>
+Ġth roug
+k ing</w>
+Ġo ut</w>
+Ġn um
+oo d</w>
+oll ow
+ac t
+u il
+Ġc re
+ol og
+at ional</w>
+Ġpro duc
+Ġwh ile</w>
+Ġl ater</w>
+Ġw rit
+e x
+Ġst ar
+Ġsp ec
+e e
+ish ed</w>
+Ġre g
+is ion</w>
+ou th</w>
+Ġre le
+Ġa ss
+Ġse ason</w>
+Ġm ade</w>
+il y</w>
+r u
+o y
+t ur
+t e</w>
+Ġ qu
+Ġm ov
+ur y</w>
+ĠAm eric
+em ent</w>
+c c
+ou nd</w>
+Ġl ar
+Ġfor m
+ec t</w>
+Ġde f
+Ġm us
+ĠP ar
+Ġm e
+Ġs ub
+w ay</w>
+o p</w>
+o h
+el d</w>
+i e</w>
+em p
+am es</w>
+er n</w>
+Ġn or
+iv ed</w>
+ev el
+Ġsuc h</w>
+ar ds</w>
+Ġin d
+ik e</w>
+Ġg en
+er t
+Ġy ear</w>
+Ġus ed</w>
+Ġn ew</w>
+Ġ 5</w>
+Ġal b
+s p
+y p
+Ġwit h
+Ġwh ere</w>
+ic s</w>
+ĠTh is</w>
+Ġthe m</w>
+w n</w>

tokenizer_2/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_2/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_2/vocab.json ADDED Viewed

	@@ -0,0 +1,1002 @@

+{
+  "!": 2,
+  "!</w>": 345,
+  "\"": 3,
+  "\"</w>": 344,
+  "#": 4,
+  "#</w>": 325,
+  "$": 5,
+  "$</w>": 348,
+  "%": 6,
+  "%</w>": 351,
+  "&": 7,
+  "&</w>": 352,
+  "'": 8,
+  "'</w>": 296,
+  "(": 9,
+  "(</w>": 318,
+  ")": 10,
+  ")</w>": 330,
+  "*": 11,
+  "*</w>": 327,
+  "+": 12,
+  "+</w>": 341,
+  ",": 13,
+  ",</w>": 279,
+  ",@</w>": 754,
+  "-": 14,
+  "-</w>": 276,
+  "-@</w>": 439,
+  ".": 15,
+  ".</w>": 253,
+  ".@</w>": 695,
+  "/": 16,
+  "/</w>": 350,
+  "0": 17,
+  "00</w>": 647,
+  "0</w>": 216,
+  "1": 18,
+  "1</w>": 222,
+  "2": 19,
+  "2</w>": 231,
+  "3": 20,
+  "3</w>": 243,
+  "4": 21,
+  "4</w>": 233,
+  "5": 22,
+  "5</w>": 240,
+  "6": 23,
+  "6</w>": 226,
+  "7": 24,
+  "7</w>": 215,
+  "8": 25,
+  "8</w>": 236,
+  "9": 26,
+  "9</w>": 242,
+  ":": 27,
+  ":</w>": 353,
+  ";": 28,
+  ";</w>": 317,
+  "<": 29,
+  "<</w>": 340,
+  "<|endoftext|>": 1,
+  "<|startoftext|>": 0,
+  "=": 30,
+  "=</w>": 342,
+  ">": 31,
+  "></w>": 300,
+  "?": 32,
+  "?</w>": 346,
+  "@": 33,
+  "@</w>": 320,
+  "A": 34,
+  "A</w>": 227,
+  "B": 35,
+  "B</w>": 258,
+  "C": 36,
+  "C</w>": 239,
+  "D": 37,
+  "D</w>": 255,
+  "E": 38,
+  "E</w>": 246,
+  "F": 39,
+  "F</w>": 213,
+  "G": 40,
+  "G</w>": 283,
+  "H": 41,
+  "H</w>": 219,
+  "I": 42,
+  "I</w>": 237,
+  "J": 43,
+  "J</w>": 251,
+  "K": 44,
+  "K</w>": 254,
+  "L": 45,
+  "L</w>": 218,
+  "M": 46,
+  "M</w>": 234,
+  "N": 47,
+  "N</w>": 238,
+  "O": 48,
+  "O</w>": 265,
+  "P": 49,
+  "P</w>": 245,
+  "Q": 50,
+  "Q</w>": 309,
+  "R": 51,
+  "R</w>": 264,
+  "S": 52,
+  "S</w>": 230,
+  "T": 53,
+  "T</w>": 235,
+  "U": 54,
+  "U</w>": 268,
+  "V": 55,
+  "V</w>": 248,
+  "W": 56,
+  "W</w>": 274,
+  "X": 57,
+  "X</w>": 263,
+  "Y": 58,
+  "Y</w>": 310,
+  "Z": 59,
+  "Z</w>": 207,
+  "[": 60,
+  "[</w>": 270,
+  "\\": 61,
+  "\\</w>": 338,
+  "]": 62,
+  "]</w>": 289,
+  "^": 63,
+  "^</w>": 331,
+  "_": 64,
+  "_</w>": 334,
+  "`": 65,
+  "`</w>": 347,
+  "a": 66,
+  "a</w>": 197,
+  "ab": 555,
+  "able</w>": 820,
+  "ac": 420,
+  "ace</w>": 806,
+  "ach": 791,
+  "ach</w>": 885,
+  "ack</w>": 670,
+  "act": 929,
+  "ad": 508,
+  "ad</w>": 860,
+  "ade</w>": 771,
+  "ag": 511,
+  "age</w>": 710,
+  "ain": 568,
+  "ain</w>": 675,
+  "ak": 577,
+  "ake</w>": 882,
+  "al": 397,
+  "al</w>": 405,
+  "all": 664,
+  "all</w>": 658,
+  "ally</w>": 588,
+  "als</w>": 796,
+  "am": 426,
+  "am</w>": 817,
+  "ame</w>": 552,
+  "ames</w>": 976,
+  "amp": 800,
+  "an": 384,
+  "an</w>": 425,
+  "ance</w>": 751,
+  "and": 609,
+  "and</w>": 780,
+  "ang": 816,
+  "ans</w>": 844,
+  "ant": 837,
+  "ant</w>": 753,
+  "any</w>": 766,
+  "ap": 586,
+  "ar": 376,
+  "ar</w>": 579,
+  "ard</w>": 649,
+  "ards</w>": 982,
+  "ary</w>": 611,
+  "as": 416,
+  "as</w>": 404,
+  "ase</w>": 849,
+  "ased</w>": 814,
+  "ason</w>": 865,
+  "ass": 792,
+  "ast</w>": 661,
+  "at": 372,
+  "at</w>": 434,
+  "ate</w>": 541,
+  "ated</w>": 543,
+  "ater</w>": 709,
+  "ates</w>": 825,
+  "ath": 730,
+  "ating</w>": 922,
+  "ation</w>": 497,
+  "ational</w>": 933,
+  "ations</w>": 744,
+  "att": 903,
+  "aus": 858,
+  "av": 681,
+  "ay": 684,
+  "ay</w>": 523,
+  "b": 67,
+  "b</w>": 212,
+  "ber</w>": 593,
+  "c": 68,
+  "c</w>": 224,
+  "cc": 960,
+  "ce</w>": 496,
+  "ces</w>": 830,
+  "ch": 520,
+  "ch</w>": 603,
+  "ct": 834,
+  "d": 69,
+  "d</w>": 196,
+  "ded</w>": 665,
+  "der</w>": 690,
+  "ding</w>": 633,
+  "ds</w>": 530,
+  "duc": 671,
+  "e": 70,
+  "e</w>": 195,
+  "ea": 471,
+  "ear": 596,
+  "ear</w>": 669,
+  "ears</w>": 906,
+  "eb": 852,
+  "ec": 418,
+  "ect": 838,
+  "ect</w>": 964,
+  "ed": 563,
+  "ed</w>": 362,
+  "ee": 941,
+  "een</w>": 779,
+  "ef": 840,
+  "eg": 731,
+  "el": 407,
+  "el</w>": 610,
+  "eld</w>": 973,
+  "ell": 759,
+  "ell</w>": 756,
+  "ely</w>": 719,
+  "em": 455,
+  "ember</w>": 777,
+  "ement</w>": 959,
+  "emp": 975,
+  "en": 375,
+  "en</w>": 427,
+  "ence</w>": 685,
+  "ens": 880,
+  "ent": 478,
+  "ent</w>": 468,
+  "ents</w>": 674,
+  "ep": 545,
+  "er": 364,
+  "er</w>": 374,
+  "eral</w>": 793,
+  "ere</w>": 481,
+  "ered</w>": 748,
+  "eric": 921,
+  "erm": 861,
+  "ern": 887,
+  "ern</w>": 977,
+  "ers": 598,
+  "ers</w>": 486,
+  "ert": 986,
+  "ery</w>": 805,
+  "es": 402,
+  "es</w>": 388,
+  "ese</w>": 794,
+  "ess": 678,
+  "ess</w>": 693,
+  "est": 606,
+  "est</w>": 584,
+  "et": 460,
+  "et</w>": 594,
+  "etw": 824,
+  "etween</w>": 886,
+  "ev": 493,
+  "evel": 980,
+  "ever</w>": 855,
+  "ew": 687,
+  "ew</w>": 612,
+  "ex": 938,
+  "ey</w>": 713,
+  "f": 71,
+  "f</w>": 209,
+  "fer": 911,
+  "ff": 587,
+  "for": 728,
+  "form": 901,
+  "fter</w>": 634,
+  "g": 72,
+  "g</w>": 214,
+  "ge</w>": 592,
+  "h": 73,
+  "h</w>": 203,
+  "i": 74,
+  "i</w>": 205,
+  "ia</w>": 605,
+  "ial</w>": 672,
+  "ian</w>": 638,
+  "ib": 726,
+  "ic": 395,
+  "ic</w>": 510,
+  "ical</w>": 625,
+  "ice</w>": 782,
+  "ich</w>": 561,
+  "ics</w>": 996,
+  "id": 463,
+  "id</w>": 613,
+  "ide</w>": 739,
+  "ie</w>": 974,
+  "ied</w>": 812,
+  "ies</w>": 516,
+  "if": 524,
+  "ig": 444,
+  "igh": 537,
+  "ight</w>": 680,
+  "ik": 775,
+  "ike</w>": 984,
+  "il": 406,
+  "il</w>": 714,
+  "ile</w>": 721,
+  "ill": 608,
+  "ill</w>": 789,
+  "ily</w>": 950,
+  "im": 469,
+  "im</w>": 767,
+  "ime</w>": 691,
+  "in": 358,
+  "in</w>": 501,
+  "ine</w>": 607,
+  "ing": 557,
+  "ing</w>": 383,
+  "ings</w>": 815,
+  "ion": 472,
+  "ion</w>": 408,
+  "ional</w>": 717,
+  "ions</w>": 540,
+  "ip": 733,
+  "ip</w>": 818,
+  "ir": 453,
+  "ir</w>": 554,
+  "is": 393,
+  "is</w>": 441,
+  "ish": 694,
+  "ish</w>": 654,
+  "ished</w>": 942,
+  "ision</w>": 944,
+  "iss": 876,
+  "ist": 550,
+  "ist</w>": 811,
+  "it": 378,
+  "it</w>": 746,
+  "ite</w>": 760,
+  "ited</w>": 809,
+  "ition</w>": 797,
+  "ity</w>": 542,
+  "iv": 435,
+  "ive</w>": 549,
+  "ived</w>": 979,
+  "iz": 722,
+  "j": 75,
+  "j</w>": 288,
+  "k": 76,
+  "k</w>": 210,
+  "ked</w>": 810,
+  "king</w>": 924,
+  "ks</w>": 692,
+  "l": 77,
+  "l</w>": 201,
+  "la": 467,
+  "land</w>": 743,
+  "ld</w>": 559,
+  "le": 536,
+  "le</w>": 465,
+  "les</w>": 799,
+  "lud": 718,
+  "ly</w>": 433,
+  "m": 78,
+  "m</w>": 202,
+  "ment</w>": 701,
+  "mp": 651,
+  "n": 79,
+  "n</w>": 199,
+  "nd</w>": 369,
+  "ned</w>": 758,
+  "ning</w>": 843,
+  "o": 80,
+  "o</w>": 198,
+  "ob": 920,
+  "oc": 534,
+  "od": 575,
+  "og": 604,
+  "oh": 972,
+  "oin": 831,
+  "ol": 428,
+  "oll": 703,
+  "ollow": 928,
+  "olog": 932,
+  "om": 419,
+  "om</w>": 883,
+  "ome</w>": 663,
+  "on": 382,
+  "on</w>": 390,
+  "ond</w>": 872,
+  "one</w>": 835,
+  "ong": 850,
+  "ong</w>": 582,
+  "oo": 517,
+  "ood</w>": 927,
+  "ook</w>": 897,
+  "op": 531,
+  "op</w>": 971,
+  "or": 377,
+  "or</w>": 424,
+  "ore</w>": 571,
+  "ors</w>": 917,
+  "ort": 768,
+  "ort</w>": 752,
+  "ory</w>": 737,
+  "os": 447,
+  "ose</w>": 881,
+  "ost</w>": 646,
+  "ot": 600,
+  "ot</w>": 879,
+  "ou": 392,
+  "oug": 659,
+  "ough</w>": 798,
+  "ould</w>": 640,
+  "oun": 553,
+  "ound</w>": 961,
+  "our": 648,
+  "our</w>": 772,
+  "ous</w>": 712,
+  "out</w>": 683,
+  "outh</w>": 945,
+  "ov": 515,
+  "ow": 461,
+  "ow</w>": 666,
+  "own</w>": 657,
+  "oy": 952,
+  "p": 81,
+  "p</w>": 217,
+  "per": 715,
+  "ph": 916,
+  "pp": 518,
+  "q": 82,
+  "q</w>": 280,
+  "qu": 546,
+  "r": 83,
+  "r</w>": 204,
+  "ra": 457,
+  "ran": 624,
+  "re": 367,
+  "ree</w>": 765,
+  "ren": 790,
+  "res": 572,
+  "res</w>": 747,
+  "ri": 487,
+  "rib": 804,
+  "ric": 745,
+  "rit": 589,
+  "ro": 385,
+  "rom</w>": 498,
+  "rop": 826,
+  "roug": 803,
+  "ru": 951,
+  "ruc": 891,
+  "ry</w>": 908,
+  "s": 84,
+  "s</w>": 206,
+  "se</w>": 741,
+  "sh": 795,
+  "so</w>": 630,
+  "sp": 992,
+  "ss": 673,
+  "st": 519,
+  "st</w>": 528,
+  "t": 85,
+  "t</w>": 208,
+  "te</w>": 954,
+  "ted</w>": 489,
+  "ter": 535,
+  "ter</w>": 505,
+  "th": 449,
+  "th</w>": 488,
+  "ther</w>": 576,
+  "ting</w>": 676,
+  "tion</w>": 570,
+  "tr": 619,
+  "ts</w>": 436,
+  "tt": 720,
+  "tur": 953,
+  "ty</w>": 821,
+  "u": 86,
+  "u</w>": 229,
+  "ub": 591,
+  "ubl": 842,
+  "uc": 490,
+  "ud": 538,
+  "ue</w>": 652,
+  "ug": 560,
+  "uil": 930,
+  "ul": 494,
+  "um": 532,
+  "um</w>": 644,
+  "un": 448,
+  "und</w>": 828,
+  "up": 833,
+  "up</w>": 700,
+  "ur": 413,
+  "ure</w>": 635,
+  "uring</w>": 702,
+  "ury</w>": 957,
+  "us": 438,
+  "us</w>": 622,
+  "ust</w>": 846,
+  "ut": 529,
+  "ut</w>": 527,
+  "v": 87,
+  "v</w>": 232,
+  "ve</w>": 567,
+  "vi": 866,
+  "w": 88,
+  "w</w>": 250,
+  "way</w>": 970,
+  "wn</w>": 999,
+  "x": 89,
+  "x</w>": 269,
+  "y": 90,
+  "y</w>": 211,
+  "yp": 993,
+  "z": 91,
+  "z</w>": 228,
+  "|": 92,
+  "|</w>": 304,
+  "}": 93,
+  "}</w>": 336,
+  "~": 94,
+  "~</w>": 343,
+  "¡": 95,
+  "¡</w>": 220,
+  "¢": 96,
+  "¢</w>": 306,
+  "£": 97,
+  "£</w>": 323,
+  "¤": 98,
+  "¤</w>": 292,
+  "¥": 99,
+  "¥</w>": 339,
+  "¦": 100,
+  "¦</w>": 303,
+  "§": 101,
+  "§</w>": 275,
+  "¨": 102,
+  "¨</w>": 282,
+  "©": 103,
+  "©</w>": 259,
+  "ª": 104,
+  "ª</w>": 286,
+  "«": 105,
+  "«</w>": 266,
+  "¬": 106,
+  "¬</w>": 319,
+  "®": 107,
+  "®</w>": 329,
+  "¯": 108,
+  "¯</w>": 287,
+  "°": 109,
+  "°</w>": 298,
+  "±": 110,
+  "±</w>": 200,
+  "²": 111,
+  "²</w>": 284,
+  "³": 112,
+  "³</w>": 272,
+  "´": 113,
+  "´</w>": 307,
+  "µ": 114,
+  "µ</w>": 261,
+  "¶": 115,
+  "¶</w>": 301,
+  "·": 116,
+  "·</w>": 326,
+  "¸": 117,
+  "¸</w>": 257,
+  "¹": 118,
+  "¹</w>": 241,
+  "º": 119,
+  "º</w>": 260,
+  "»": 120,
+  "»</w>": 247,
+  "¼": 121,
+  "¼</w>": 305,
+  "½": 122,
+  "½</w>": 294,
+  "¾": 123,
+  "¾</w>": 316,
+  "¿": 124,
+  "¿</w>": 271,
+  "Â": 125,
+  "Ã": 126,
+  "Ä": 127,
+  "Å": 128,
+  "Æ": 129,
+  "Ç": 130,
+  "È": 131,
+  "É": 132,
+  "Ê": 133,
+  "Ë": 134,
+  "Ì": 135,
+  "Í": 136,
+  "Î": 137,
+  "Ï": 138,
+  "Ð": 139,
+  "Ñ": 140,
+  "Ö": 141,
+  "×": 142,
+  "Ø": 143,
+  "Ù": 144,
+  "Ü": 145,
+  "à": 146,
+  "á": 147,
+  "â": 148,
+  "ã": 149,
+  "ä": 150,
+  "å": 151,
+  "æ": 152,
+  "ç": 153,
+  "è": 154,
+  "é": 155,
+  "ë": 156,
+  "ì": 157,
+  "ï": 158,
+  "Ċ": 159,
+  "Ċ</w>": 349,
+  "Ġ": 160,
+  "Ġ\"</w>": 401,
+  "Ġ'</w>": 431,
+  "Ġ(</w>": 475,
+  "Ġ)</w>": 474,
+  "Ġ,</w>": 360,
+  "Ġ.</w>": 365,
+  "Ġ0": 847,
+  "Ġ1": 411,
+  "Ġ18": 769,
+  "Ġ19": 492,
+  "Ġ199": 893,
+  "Ġ1</w>": 778,
+  "Ġ2": 462,
+  "Ġ20": 522,
+  "Ġ200": 620,
+  "Ġ201": 734,
+  "Ġ2</w>": 813,
+  "Ġ3": 735,
+  "Ġ3</w>": 888,
+  "Ġ4": 870,
+  "Ġ5": 907,
+  "Ġ5</w>": 990,
+  "Ġ:</w>": 637,
+  "Ġ;</w>": 615,
+  "Ġ</w>": 333,
+  "Ġ=</w>": 399,
+  "Ġ@": 417,
+  "Ġ@,@</w>": 755,
+  "Ġ@-@</w>": 440,
+  "Ġ@.@</w>": 696,
+  "ĠA": 409,
+  "ĠA</w>": 807,
+  "ĠAl": 716,
+  "ĠAm": 829,
+  "ĠAmeric": 958,
+  "ĠAn": 784,
+  "ĠAr": 894,
+  "ĠB": 432,
+  "ĠC": 410,
+  "ĠCh": 581,
+  "ĠCom": 904,
+  "ĠD": 464,
+  "ĠE": 500,
+  "ĠEn": 878,
+  "ĠF": 470,
+  "ĠG": 482,
+  "ĠH": 445,
+  "ĠHe</w>": 742,
+  "ĠI": 442,
+  "ĠI</w>": 827,
+  "ĠIn": 704,
+  "ĠIn</w>": 574,
+  "ĠIt</w>": 774,
+  "ĠJ": 491,
+  "ĠK": 548,
+  "ĠL": 484,
+  "ĠM": 423,
+  "ĠMar": 776,
+  "ĠN": 483,
+  "ĠO": 504,
+  "ĠP": 450,
+  "ĠPar": 967,
+  "ĠR": 459,
+  "ĠS": 403,
+  "ĠSh": 750,
+  "ĠSt": 590,
+  "ĠT": 396,
+  "ĠTh": 414,
+  "ĠThe</w>": 437,
+  "ĠThis</w>": 997,
+  "ĠU": 585,
+  "ĠUn": 773,
+  "ĠV": 617,
+  "ĠW": 479,
+  "ĠWh": 853,
+  "ĠY": 757,
+  "Ġa": 356,
+  "Ġa</w>": 394,
+  "Ġab": 653,
+  "Ġabout</w>": 899,
+  "Ġac": 583,
+  "Ġacc": 874,
+  "Ġad": 656,
+  "Ġafter</w>": 763,
+  "Ġag": 725,
+  "Ġal": 476,
+  "Ġalb": 991,
+  "Ġall</w>": 839,
+  "Ġalso</w>": 641,
+  "Ġan": 602,
+  "Ġan</w>": 562,
+  "Ġand</w>": 381,
+  "Ġapp": 711,
+  "Ġar": 507,
+  "Ġare</w>": 601,
+  "Ġas</w>": 454,
+  "Ġass": 947,
+  "Ġat</w>": 514,
+  "Ġatt": 788,
+  "Ġb": 371,
+  "Ġbe": 499,
+  "Ġbe</w>": 595,
+  "Ġbec": 706,
+  "Ġbeen</w>": 686,
+  "Ġbeg": 915,
+  "Ġbetween</w>": 914,
+  "Ġbo": 819,
+  "Ġbut</w>": 623,
+  "Ġby</w>": 473,
+  "Ġc": 368,
+  "Ġcent": 823,
+  "Ġch": 526,
+  "Ġchar": 822,
+  "Ġcl": 689,
+  "Ġcom": 509,
+  "Ġcomm": 707,
+  "Ġcomp": 616,
+  "Ġcon": 477,
+  "Ġcons": 841,
+  "Ġcont": 655,
+  "Ġcre": 931,
+  "Ġd": 387,
+  "Ġde": 627,
+  "Ġdec": 873,
+  "Ġdef": 965,
+  "Ġdes": 738,
+  "Ġdi": 892,
+  "Ġdis": 708,
+  "Ġduring</w>": 864,
+  "Ġe": 421,
+  "Ġear": 854,
+  "Ġel": 869,
+  "Ġen": 662,
+  "Ġev": 682,
+  "Ġex": 539,
+  "Ġexp": 867,
+  "Ġf": 370,
+  "Ġfe": 845,
+  "Ġfil": 913,
+  "Ġfin": 786,
+  "Ġfir": 599,
+  "Ġfirst</w>": 626,
+  "Ġfl": 877,
+  "Ġfor": 614,
+  "Ġfor</w>": 458,
+  "Ġform": 963,
+  "Ġfrom</w>": 503,
+  "Ġg": 430,
+  "Ġgame</w>": 895,
+  "Ġgen": 985,
+  "Ġgro": 890,
+  "Ġh": 380,
+  "Ġha": 485,
+  "Ġhad</w>": 566,
+  "Ġhas</w>": 679,
+  "Ġhave</w>": 667,
+  "Ġhe</w>": 558,
+  "Ġher</w>": 660,
+  "Ġhim</w>": 896,
+  "Ġhis</w>": 512,
+  "Ġi": 366,
+  "Ġimp": 909,
+  "Ġin": 429,
+  "Ġin</w>": 389,
+  "Ġinc": 618,
+  "Ġinclud": 761,
+  "Ġind": 983,
+  "Ġint": 628,
+  "Ġinter": 832,
+  "Ġinto</w>": 785,
+  "Ġis</w>": 480,
+  "Ġit</w>": 533,
+  "Ġits</w>": 642,
+  "Ġj": 723,
+  "Ġk": 564,
+  "Ġkn": 857,
+  "Ġl": 398,
+  "Ġlar": 962,
+  "Ġlater</w>": 936,
+  "Ġlea": 868,
+  "Ġm": 386,
+  "Ġmade</w>": 949,
+  "Ġme": 968,
+  "Ġmore</w>": 802,
+  "Ġmost</w>": 910,
+  "Ġmov": 956,
+  "Ġmus": 966,
+  "Ġn": 415,
+  "Ġnew</w>": 989,
+  "Ġno": 547,
+  "Ġnor": 978,
+  "Ġnot</w>": 632,
+  "Ġnum": 926,
+  "Ġo": 359,
+  "Ġof</w>": 373,
+  "Ġoff": 875,
+  "Ġon": 551,
+  "Ġon</w>": 456,
+  "Ġone</w>": 677,
+  "Ġonly</w>": 871,
+  "Ġor": 699,
+  "Ġor</w>": 645,
+  "Ġother</w>": 787,
+  "Ġout</w>": 925,
+  "Ġov": 729,
+  "Ġover</w>": 856,
+  "Ġp": 379,
+  "Ġpar": 636,
+  "Ġper": 736,
+  "Ġpl": 698,
+  "Ġpla": 697,
+  "Ġplay": 808,
+  "Ġpos": 859,
+  "Ġpr": 889,
+  "Ġpre": 749,
+  "Ġpres": 912,
+  "Ġpro": 506,
+  "Ġproduc": 934,
+  "Ġqu": 955,
+  "Ġr": 521,
+  "Ġra": 863,
+  "Ġre": 400,
+  "Ġrec": 597,
+  "Ġrecor": 919,
+  "Ġreg": 943,
+  "Ġrel": 900,
+  "Ġrele": 946,
+  "Ġrem": 848,
+  "Ġrep": 762,
+  "Ġres": 650,
+  "Ġro": 629,
+  "Ġs": 361,
+  "Ġsa": 905,
+  "Ġsc": 732,
+  "Ġse": 569,
+  "Ġseason</w>": 948,
+  "Ġsec": 781,
+  "Ġser": 740,
+  "Ġsev": 884,
+  "Ġsh": 513,
+  "Ġshe</w>": 862,
+  "Ġsp": 578,
+  "Ġspec": 940,
+  "Ġst": 446,
+  "Ġstar": 939,
+  "Ġsu": 770,
+  "Ġsub": 969,
+  "Ġsuc": 764,
+  "Ġsuch</w>": 981,
+  "Ġt": 354,
+  "Ġth": 355,
+  "Ġthan</w>": 918,
+  "Ġthat</w>": 452,
+  "Ġthe": 502,
+  "Ġthe</w>": 357,
+  "Ġtheir</w>": 621,
+  "Ġthem</w>": 998,
+  "Ġthey</w>": 727,
+  "Ġthis</w>": 705,
+  "Ġthree</w>": 902,
+  "Ġthroug": 923,
+  "Ġtime</w>": 783,
+  "Ġto</w>": 391,
+  "Ġtra": 836,
+  "Ġtw": 639,
+  "Ġtwo</w>": 688,
+  "Ġun": 544,
+  "Ġup</w>": 898,
+  "Ġus": 668,
+  "Ġused</w>": 988,
+  "Ġv": 495,
+  "Ġw": 363,
+  "Ġwas</w>": 422,
+  "Ġwere</w>": 525,
+  "Ġwh": 443,
+  "Ġwhen</w>": 851,
+  "Ġwhere</w>": 995,
+  "Ġwhich</w>": 573,
+  "Ġwhile</w>": 935,
+  "Ġwho</w>": 724,
+  "Ġwit": 451,
+  "Ġwith": 994,
+  "Ġwith</w>": 466,
+  "Ġwor": 643,
+  "Ġwould</w>": 801,
+  "Ġwrit": 937,
+  "Ġy": 580,
+  "Ġyear</w>": 987,
+  "Ġâ": 556,
+  "ĠâĢ": 565,
+  "ĠâĢĵ</w>": 631,
+  "ĠĊ</w>": 412,
+  "Ģ": 161,
+  "Ģ</w>": 223,
+  "ģ": 162,
+  "ģ</w>": 273,
+  "Ĥ": 163,
+  "Ĥ</w>": 262,
+  "ĥ": 164,
+  "ĥ</w>": 337,
+  "Ħ": 165,
+  "Ħ</w>": 278,
+  "ħ": 166,
+  "ħ</w>": 281,
+  "Ĩ": 167,
+  "Ĩ</w>": 308,
+  "ĩ": 168,
+  "ĩ</w>": 225,
+  "Ī": 169,
+  "Ī</w>": 221,
+  "ī": 170,
+  "ī</w>": 244,
+  "Ĭ": 171,
+  "Ĭ</w>": 315,
+  "ĭ": 172,
+  "ĭ</w>": 321,
+  "Į": 173,
+  "Į</w>": 324,
+  "į": 174,
+  "į</w>": 302,
+  "İ": 175,
+  "İ</w>": 249,
+  "ı": 176,
+  "ı</w>": 332,
+  "Ĳ": 177,
+  "Ĳ</w>": 295,
+  "ĳ": 178,
+  "ĳ</w>": 313,
+  "Ĵ": 179,
+  "Ĵ</w>": 328,
+  "ĵ": 180,
+  "ĵ</w>": 312,
+  "Ķ": 181,
+  "Ķ</w>": 256,
+  "ķ": 182,
+  "ķ</w>": 314,
+  "ĸ": 183,
+  "ĸ</w>": 277,
+  "Ĺ": 184,
+  "Ĺ</w>": 322,
+  "ĺ": 185,
+  "ĺ</w>": 285,
+  "Ļ": 186,
+  "Ļ</w>": 267,
+  "ļ": 187,
+  "ļ</w>": 290,
+  "Ľ": 188,
+  "Ľ</w>": 311,
+  "ľ": 189,
+  "ľ</w>": 299,
+  "Ŀ": 190,
+  "Ŀ</w>": 291,
+  "ŀ": 191,
+  "ŀ</w>": 293,
+  "Ł": 192,
+  "Ł</w>": 335,
+  "ł": 193,
+  "ł</w>": 252,
+  "Ń": 194,
+  "Ń</w>": 297
+}

unet/config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.18.1",
+  "act_fn": "silu",
+  "addition_embed_type": "text_time",
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": 8,
+  "attention_head_dim": [
+    2,
+    4
+  ],
+  "block_out_channels": [
+    32,
+    64
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 64,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "DownBlock2D",
+    "CrossAttnDownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 80,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "sample_size": 32,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": [
+    1,
+    2
+  ],
+  "up_block_types": [
+    "CrossAttnUpBlock2D",
+    "UpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": true
+}

unet/diffusion_flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fed31ac1c90efa34dc35ea81f5b15c1f04f52cba3cb0a965d6473dd81afeddfd
+size 7919640

unet/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d6093265f73fe68d5f3f5ed18228bfa91fa23b11baaf9f7f9663b25cc605d26
+size 8083273

unet/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b24b7a6214d413f104e91756d7bd08e5926f7906afb055d23dcd2d53e10469bb
+size 9126212

unet/my_unet_model.py ADDED Viewed

	@@ -0,0 +1,1129 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import UNet2DConditionLoadersMixin
+from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from diffusers.models.embeddings import (
+    GaussianFourierProjection,
+    ImageHintTimeEmbedding,
+    ImageProjection,
+    ImageTimeEmbedding,
+    PositionNet,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.unet_2d_blocks import (
+    UNetMidBlock2DCrossAttn,
+    UNetMidBlock2DSimpleCrossAttn,
+    get_down_block,
+    get_up_block,
+)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+    """
+    The output of [`MyUNetModel`].
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+    # The field defaults to `None`, so the dataclass can be constructed without arguments.
+    sample: torch.FloatTensor = None
+class MyUNetModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+    for all models (such as downloading or saving).
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or
+            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers are skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None`,
+            `"text"`, `"text_image"`, `"text_time"`, `"image"`, or `"image_hint"`. `"text"` will use the
+            `TextTimeEmbedding` layer.
+        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+            Dimension for the timestep embeddings.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
+        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
+        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
+            otherwise.
+    """
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        resnet_skip_time_act: bool = False,
+        resnet_out_scale_factor: int = 1.0,
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        attention_type: str = "default",
+        class_embeddings_concat: bool = False,
+        mid_block_only_cross_attention: Optional[bool] = None,
+        cross_attention_norm: Optional[str] = None,
+        addition_embed_type_num_heads=64,
+    ):
+        super().__init__()
+        self.sample_size = sample_size
+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+        # time
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `encoder_hid_dim_type == "text_image_proj"` (Kandinsky 2.1)
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = ImageProjection(
+                image_embed_dim=encoder_hid_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2 ControlNet
+            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        else:
+            self.time_embed_act = get_activation(time_embedding_act_fn)
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+        if isinstance(only_cross_attention, bool):
+            if mid_block_only_cross_attention is None:
+                mid_block_only_cross_attention = only_cross_attention
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+        if mid_block_only_cross_attention is None:
+            mid_block_only_cross_attention = False
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.down_blocks.append(down_block)
+        # mid
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                attention_type=attention_type,
+            )
+        elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
+            self.mid_block = UNetMidBlock2DSimpleCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                attention_head_dim=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                skip_time_act=resnet_skip_time_act,
+                only_cross_attention=mid_block_only_cross_attention,
+                cross_attention_norm=cross_attention_norm,
+            )
+        elif mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resolution_idx=i,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+            self.conv_act = get_activation(act_fn)
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+        if attention_type in ["gated", "gated-text-image"]:
+            positive_len = 768
+            if isinstance(cross_attention_dim, int):
+                positive_len = cross_attention_dim
+            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+                positive_len = cross_attention_dim[0]
+            feature_type = "text-only" if attention_type == "gated" else "text-image"
+            self.position_net = PositionNet(
+                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
+            )
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+        self.set_attn_processor(processor, _remove_lora=True)
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+        num_sliceable_layers = len(sliceable_head_dims)
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k) is not None:
+                    setattr(upsample_block, k, None)
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`MyUNetModel`] forward method.
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings
+                (or concatenated to them when `config.class_embeddings_concat` is set). Required whenever the model
+                has a class embedding.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+                Its optional `"scale"` entry is read as the LoRA scale (default `1.0`), and a non-`None` `"gligen"`
+                entry is replaced by `{"objs": self.position_net(**gligen_args)}` on a copy of the dictionary.
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks. Which keys are required (`image_embeds`, `text_embeds`,
+                `time_ids`, `hint`) depends on `config.addition_embed_type` and `config.encoder_hid_dim_type`.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                Additional residuals to be added to UNet long skip connections from down blocks to up blocks, for
+                example from ControlNet side model(s). Passing it without `mid_block_additional_residual` is the
+                deprecated T2I-Adapter usage and is treated as `down_intrablock_additional_residuals`.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                Additional residual to be added to UNet mid block output, for example from ControlNet side model.
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                Additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s).
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
+        Raises:
+            ValueError: if the model has a class embedding but `class_labels` is `None`, or if `added_cond_kwargs`
+                lacks a key required by the configured `addition_embed_type` / `encoder_hid_dim_type`.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            # Forward upsample size to force interpolation output size.
+            forward_upsample_size = True
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            # 64-bit dtypes are replaced by their 32-bit counterparts when the sample lives on an "mps" device
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            # 0-dim tensor: add a leading dimension so it can be expanded to the batch size below
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+        # NOTE(review): `added_cond_kwargs` defaults to `None`, and every branch below except "text" runs a
+        # membership test on it, which raises `TypeError` on `None` rather than the descriptive `ValueError`.
+        # Callers must pass a dict for those config modes.
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            # project the flattened ids, then regroup them so there is one row per `text_embeds` row
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
+            # the hint returned by `add_embedding` is appended to the sample along dim 1
+            sample = torch.cat([sample, hint], dim=1)
+        emb = emb + aug_emb if aug_emb is not None else emb
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        # 2. pre-process
+        sample = self.conv_in(sample)
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            # work on a shallow copy so the caller's dictionary keeps its original "gligen" entry
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+        # 3. down
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        # ControlNet residuals are only applied when BOTH the mid and the down residuals are supplied
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
+        # NOTE(review): the adapter residuals are consumed with `.pop(0)` below, which mutates the caller's
+        # sequence in place and needs a list-like object. The `Tuple` annotation in the signature does not match
+        # that usage, because tuples have no `pop`. Confirm against callers.
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlock2D
+                additional_residuals = {}
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
+            down_block_res_samples += res_samples
+        if is_controlnet:
+            # add the ControlNet residuals pairwise; `zip` stops at the shorter of the two sequences
+            new_down_block_res_samples = ()
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                cross_attention_kwargs=cross_attention_kwargs,
+                encoder_attention_mask=encoder_attention_mask,
+            )
+            # To support T2I-Adapter-XL
+            # (a leftover adapter residual is added only when its shape matches the mid-block output exactly)
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+            # take one skip tensor per resnet of this block from the end of the stack, then drop them from it
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    scale=lora_scale,
+                )
+        # 6. post-process
+        # `conv_norm_out` and `conv_act` are applied together, and only when `conv_norm_out` is truthy
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self)
+        if not return_dict:
+            return (sample,)
+        return UNet2DConditionOutput(sample=sample)

unet/openvino_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d88961abcf8276c903b23ee1f1226960cfc314f9351bec493cdf774c4a3150b9
+size 7875412

unet/openvino_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

vae/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.18.1",
+  "act_fn": "silu",
+  "block_out_channels": [
+    32,
+    64
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 1,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 128,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vae/diffusion_flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f660614068b8211de0076bbf4739ed5b4bb34a1d752ebfd6b350bf142643883a
+size 2637326

vae/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e84bb0d30f9de5f723541259119fa2702639a8c73465fe8263085739154eff9f
+size 2681001

vae_decoder/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.18.1",
+  "act_fn": "silu",
+  "block_out_channels": [
+    32,
+    64
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 1,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 128,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vae_decoder/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d43e163a98640b88a7fc09bdab4ce4e94dd14eb9d75405d3cd78704484c12de
+size 1682764

vae_decoder/openvino_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18cb02fba3ec844ab18b875931d13e48144a1e294e8d1ff7b92490864fbffecd
+size 1603072

vae_decoder/openvino_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

vae_encoder/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.18.1",
+  "act_fn": "silu",
+  "block_out_channels": [
+    32,
+    64
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 1,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 128,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vae_encoder/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:155849e8986089d7b02e137f01aacca23f8e3c5a133f69209d50cd4a296a48e9
+size 1095370

vae_encoder/openvino_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e20a8b2c826a5e08ace068bd50e19f5c845ea6a33f6357a6710d726e433ca015
+size 1021888

vae_encoder/openvino_model.xml ADDED Viewed

The diff for this file is too large to render. See raw diff