sironagasuyagi committed on
Commit
910e2ad
1 Parent(s): 43fbfb0

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitignore +119 -0
  2. .gradio/certificate.pem +31 -0
  3. LICENSE +21 -0
  4. README - コピー.md +295 -0
  5. README.md +295 -12
  6. annotation/image_text.jsonl +20 -0
  7. annotation/video_text.jsonl +17 -0
  8. app.py +356 -0
  9. app_multigpu.py +143 -0
  10. assets/motivation.jpg +0 -0
  11. assets/the_great_wall.jpg +0 -0
  12. assets/user_study.jpg +0 -0
  13. assets/vbench.jpg +0 -0
  14. causal_video_vae_demo.ipynb +221 -0
  15. dataset/__init__.py +12 -0
  16. dataset/bucket_loader.py +148 -0
  17. dataset/dataloaders.py +190 -0
  18. dataset/dataset_cls.py +377 -0
  19. diffusion_schedulers/__init__.py +2 -0
  20. diffusion_schedulers/scheduling_cosine_ddpm.py +137 -0
  21. diffusion_schedulers/scheduling_flow_matching.py +297 -0
  22. docs/DiT.md +54 -0
  23. docs/VAE.md +42 -0
  24. image_generation_demo.ipynb +123 -0
  25. inference_multigpu.py +123 -0
  26. pyramid_dit/__init__.py +3 -0
  27. pyramid_dit/flux_modules/__init__.py +3 -0
  28. pyramid_dit/flux_modules/modeling_embedding.py +201 -0
  29. pyramid_dit/flux_modules/modeling_flux_block.py +1044 -0
  30. pyramid_dit/flux_modules/modeling_normalization.py +249 -0
  31. pyramid_dit/flux_modules/modeling_pyramid_flux.py +543 -0
  32. pyramid_dit/flux_modules/modeling_text_encoder.py +134 -0
  33. pyramid_dit/mmdit_modules/__init__.py +3 -0
  34. pyramid_dit/mmdit_modules/modeling_embedding.py +390 -0
  35. pyramid_dit/mmdit_modules/modeling_mmdit_block.py +671 -0
  36. pyramid_dit/mmdit_modules/modeling_normalization.py +179 -0
  37. pyramid_dit/mmdit_modules/modeling_pyramid_mmdit.py +497 -0
  38. pyramid_dit/mmdit_modules/modeling_text_encoder.py +140 -0
  39. pyramid_dit/pyramid_dit_for_video_gen_pipeline.py +1279 -0
  40. pyramid_flow_model.lnk +0 -0
  41. pyramid_flow_model/.gitattributes +35 -0
  42. pyramid_flow_model/README.md +191 -0
  43. pyramid_flow_model/causal_video_vae/config.json +92 -0
  44. pyramid_flow_model/causal_video_vae/diffusion_pytorch_model.bin +3 -0
  45. pyramid_flow_model/diffusion_transformer_384p/config.json +21 -0
  46. pyramid_flow_model/diffusion_transformer_384p/diffusion_pytorch_model.safetensors +3 -0
  47. pyramid_flow_model/diffusion_transformer_768p/config.json +21 -0
  48. pyramid_flow_model/diffusion_transformer_768p/diffusion_pytorch_model.safetensors +3 -0
  49. pyramid_flow_model/diffusion_transformer_image/config.json +21 -0
  50. pyramid_flow_model/diffusion_transformer_image/diffusion_pytorch_model.safetensors +3 -0
.gitignore ADDED
@@ -0,0 +1,119 @@
1
+ # Xcode
2
+ .DS_Store
3
+ .idea
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ # C extensions
10
+ *.so
11
+ onnx_model/*.onnx
12
+ onnx_model/antelope/*.onnx
13
+
14
+
15
+ logs/
16
+ prompts/
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ wheels/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # celery beat schedule file
89
+ celerybeat-schedule
90
+
91
+ # SageMath parsed files
92
+ *.sage.py
93
+
94
+ # Environments
95
+ .env
96
+ .pt2/
97
+ .venv
98
+ env/
99
+ venv/
100
+ ENV/
101
+ env.bak/
102
+ venv.bak/
103
+
104
+ # Spyder project settings
105
+ .spyderproject
106
+ .spyproject
107
+
108
+ # Rope project settings
109
+ .ropeproject
110
+
111
+ # mkdocs documentation
112
+ /site
113
+
114
+ # mypy
115
+ .mypy_cache/
116
+ .dmypy.json
117
+ dmypy.json
118
+ .bak
119
+
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Yang Jin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README - コピー.md ADDED
@@ -0,0 +1,295 @@
1
+ ---
2
+ title: Pyramid-Flow
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.6.0
6
+ ---
7
+ <div align="center">
8
+
9
+ # ⚡️Pyramid Flow⚡️
10
+
11
+ [[Paper]](https://arxiv.org/abs/2410.05954) [[Project Page ✨]](https://pyramid-flow.github.io) [[miniFLUX Model 🚀]](https://huggingface.co/rain1011/pyramid-flow-miniflux) [[SD3 Model ⚡️]](https://huggingface.co/rain1011/pyramid-flow-sd3) [[demo 🤗](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow)]
12
+
13
+ </div>
14
+
15
+ This is the official repository for Pyramid Flow, a training-efficient **Autoregressive Video Generation** method based on **Flow Matching**. By training only on **open-source datasets**, it can generate high-quality 10-second videos at 768p resolution and 24 FPS, and naturally supports image-to-video generation.
16
+
17
+ <table class="center" border="0" style="width: 100%; text-align: left;">
18
+ <tr>
19
+ <th>10s, 768p, 24fps</th>
20
+ <th>5s, 768p, 24fps</th>
21
+ <th>Image-to-video</th>
22
+ </tr>
23
+ <tr>
24
+ <td><video src="https://github.com/user-attachments/assets/9935da83-ae56-4672-8747-0f46e90f7b2b" autoplay muted loop playsinline></video></td>
25
+ <td><video src="https://github.com/user-attachments/assets/3412848b-64db-4d9e-8dbf-11403f6d02c5" autoplay muted loop playsinline></video></td>
26
+ <td><video src="https://github.com/user-attachments/assets/3bd7251f-7b2c-4bee-951d-656fdb45f427" autoplay muted loop playsinline></video></td>
27
+ </tr>
28
+ </table>
29
+
30
+ ## News
31
+ * `2024.11.13` 🚀🚀🚀 We release the [768p miniFLUX checkpoint](https://huggingface.co/rain1011/pyramid-flow-miniflux) (up to 10s).
32
+
33
+ > We have switched the model structure from SD3 to a mini FLUX to fix human structure issues. Please try our 1024p image checkpoint, 384p video checkpoint (up to 5s), and 768p video checkpoint (up to 10s). The new miniFLUX model shows a great improvement in human structure and motion stability.
34
+
35
+ * `2024.10.29` ⚡️⚡️⚡️ We release [training code for VAE](#1-training-vae), [finetuning code for DiT](#2-finetuning-dit) and [new model checkpoints](https://huggingface.co/rain1011/pyramid-flow-miniflux) with FLUX structure trained from scratch.
36
+
37
+
38
+ * `2024.10.13` ✨✨✨ [Multi-GPU inference](#3-multi-gpu-inference) and [CPU offloading](#cpu-offloading) are supported. Use it with **less than 8GB** of GPU memory, with great speedup on multiple GPUs.
39
+
40
+ * `2024.10.11` 🤗🤗🤗 [Hugging Face demo](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow) is available. Thanks [@multimodalart](https://huggingface.co/multimodalart) for the commit!
41
+
42
+ * `2024.10.10` 🚀🚀🚀 We release the [technical report](https://arxiv.org/abs/2410.05954), [project page](https://pyramid-flow.github.io) and [model checkpoint](https://huggingface.co/rain1011/pyramid-flow-sd3) of Pyramid Flow.
43
+
44
+ ## Table of Contents
45
+
46
+ * [Introduction](#introduction)
47
+ * [Installation](#installation)
48
+ * [Inference](#inference)
49
+ 1. [Quick Start with Gradio](#1-quick-start-with-gradio)
50
+ 2. [Inference Code](#2-inference-code)
51
+ 3. [Multi-GPU Inference](#3-multi-gpu-inference)
52
+ 4. [Usage Tips](#4-usage-tips)
53
+ * [Training](#Training)
54
+ 1. [Training VAE](#training-vae)
55
+ 2. [Finetuning DiT](#finetuning-dit)
56
+ * [Gallery](#gallery)
57
+ * [Comparison](#comparison)
58
+ * [Acknowledgement](#acknowledgement)
59
+ * [Citation](#citation)
60
+
61
+ ## Introduction
62
+
63
+ ![motivation](assets/motivation.jpg)
64
+
65
+ Existing video diffusion models operate at full resolution, spending a lot of computation on very noisy latents. By contrast, our method harnesses the flexibility of flow matching ([Lipman et al., 2023](https://openreview.net/forum?id=PqvMRDCJT9t); [Liu et al., 2023](https://openreview.net/forum?id=XVjTT1nw5z); [Albergo & Vanden-Eijnden, 2023](https://openreview.net/forum?id=li7qeBbCR1t)) to interpolate between latents of different resolutions and noise levels, allowing for simultaneous generation and decompression of visual content with better computational efficiency. The entire framework is end-to-end optimized with a single DiT ([Peebles & Xie, 2023](http://openaccess.thecvf.com/content/ICCV2023/html/Peebles_Scalable_Diffusion_Models_with_Transformers_ICCV_2023_paper.html)), generating high-quality 10-second videos at 768p resolution and 24 FPS within 20.7k A100 GPU training hours.
66
+
67
+ ## Installation
68
+
69
+ We recommend setting up the environment with conda. The codebase currently uses Python 3.8.10 and PyTorch 2.1.2 ([guide](https://pytorch.org/get-started/previous-versions/#v212)), and we are actively working to support a wider range of versions.
70
+
71
+ ```bash
72
+ git clone https://github.com/jy0205/Pyramid-Flow
73
+ cd Pyramid-Flow
74
+
75
+ # create env using conda
76
+ conda create -n pyramid python==3.8.10
77
+ conda activate pyramid
78
+ pip install -r requirements.txt
79
+ ```
80
+
81
+ Then, download the model from [Huggingface](https://huggingface.co/rain1011) (there are two variants: [miniFLUX](https://huggingface.co/rain1011/pyramid-flow-miniflux) or [SD3](https://huggingface.co/rain1011/pyramid-flow-sd3)). The miniFLUX models support 1024p image, 384p and 768p video generation, and the SD3-based models support 768p and 384p video generation. The 384p checkpoint generates 5-second video at 24FPS, while the 768p checkpoint generates up to 10-second video at 24FPS.
82
+
83
+ ```python
84
+ from huggingface_hub import snapshot_download
85
+
86
+ model_path = 'PATH' # The local directory to save downloaded checkpoint
87
+ snapshot_download("rain1011/pyramid-flow-miniflux", local_dir=model_path, local_dir_use_symlinks=False, repo_type='model')
88
+ ```
89
+
90
+ ## Inference
91
+
92
+ ### 1. Quick start with Gradio
93
+
94
+ To get started, first install [Gradio](https://www.gradio.app/guides/quickstart), set your model path at [#L36](https://github.com/jy0205/Pyramid-Flow/blob/3777f8b84bddfa2aa2b497ca919b3f40567712e6/app.py#L36), and then run on your local machine:
95
+
96
+ ```bash
97
+ python app.py
98
+ ```
99
+
100
+ The Gradio demo will open in a browser. Thanks to [@tpc2233](https://github.com/tpc2233) for the commit; see [#48](https://github.com/jy0205/Pyramid-Flow/pull/48) for details.
101
+
102
+ Or, try it out effortlessly on [Hugging Face Space 🤗](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow) created by [@multimodalart](https://huggingface.co/multimodalart). Due to GPU limits, this online demo can only generate 25 frames (export at 8FPS or 24FPS). Duplicate the space to generate longer videos.
103
+
104
+ #### Quick Start on Google Colab
105
+
106
+ To quickly try out Pyramid Flow on Google Colab, run the code below:
107
+
108
+ ```
109
+ # Setup
110
+ !git clone https://github.com/jy0205/Pyramid-Flow
111
+ %cd Pyramid-Flow
112
+ !pip install -r requirements.txt
113
+ !pip install gradio
114
+
115
+ # This code downloads miniFLUX
116
+ from huggingface_hub import snapshot_download
117
+
118
+ model_path = '/content/Pyramid-Flow'
119
+ snapshot_download("rain1011/pyramid-flow-miniflux", local_dir=model_path, local_dir_use_symlinks=False, repo_type='model')
120
+
121
+ # Start
122
+ !python app.py
123
+ ```
124
+
125
+ ### 2. Inference Code
126
+
127
+ To use our model, please follow the inference code in `video_generation_demo.ipynb` at [this link](https://github.com/jy0205/Pyramid-Flow/blob/main/video_generation_demo.ipynb). We strongly recommend trying the latest published pyramid-miniflux, which shows a great improvement in human structure and motion stability. Set the `model_name` parameter to `pyramid_flux` to use it. We further simplify it into the following two-step procedure. First, load the downloaded model:
128
+
129
+ ```python
130
+ import torch
131
+ from PIL import Image
132
+ from pyramid_dit import PyramidDiTForVideoGeneration
133
+ from diffusers.utils import load_image, export_to_video
134
+
135
+ torch.cuda.set_device(0)
136
+ model_dtype, torch_dtype = 'bf16', torch.bfloat16 # Use bf16 (fp16 is not supported yet)
137
+
138
+ model = PyramidDiTForVideoGeneration(
139
+ 'PATH', # The downloaded checkpoint dir
140
+ model_name="pyramid_flux",
141
+ model_dtype=model_dtype,
142
+ model_variant='diffusion_transformer_768p',
143
+ )
144
+
145
+ model.vae.enable_tiling()
146
+ # model.vae.to("cuda")
147
+ # model.dit.to("cuda")
148
+ # model.text_encoder.to("cuda")
149
+
150
+ # if you're not using sequential offloading below, uncomment the lines above ^
151
+ model.enable_sequential_cpu_offload()
152
+ ```
153
+
154
+ Then, you can try text-to-video generation with your own prompts. Note that the 384p version only supports 5-second videos for now (set `temp` up to 16)!
155
+
156
+ ```python
157
+ prompt = "A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors"
158
+
159
+ # used for 384p model variant
160
+ # width = 640
161
+ # height = 384
162
+
163
+ # used for 768p model variant
164
+ width = 1280
165
+ height = 768
166
+
167
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
168
+ frames = model.generate(
169
+ prompt=prompt,
170
+ num_inference_steps=[20, 20, 20],
171
+ video_num_inference_steps=[10, 10, 10],
172
+ height=height,
173
+ width=width,
174
+ temp=16, # temp=16: 5s, temp=31: 10s
175
+ guidance_scale=7.0, # The guidance for the first frame, set it to 7 for 384p variant
176
+ video_guidance_scale=5.0, # The guidance for the other video latent
177
+ output_type="pil",
178
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
179
+ )
180
+
181
+ export_to_video(frames, "./text_to_video_sample.mp4", fps=24)
182
+ ```
183
+
184
+ As an autoregressive model, our model also supports (text conditioned) image-to-video generation:
185
+
186
+ ```python
187
+ # used for 384p model variant
188
+ # width = 640
189
+ # height = 384
190
+
191
+ # used for 768p model variant
192
+ width = 1280
193
+ height = 768
194
+
195
+ image = Image.open('assets/the_great_wall.jpg').convert("RGB").resize((width, height))
196
+ prompt = "FPV flying over the Great Wall"
197
+
198
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
199
+ frames = model.generate_i2v(
200
+ prompt=prompt,
201
+ input_image=image,
202
+ num_inference_steps=[10, 10, 10],
203
+ temp=16,
204
+ video_guidance_scale=4.0,
205
+ output_type="pil",
206
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
207
+ )
208
+
209
+ export_to_video(frames, "./image_to_video_sample.mp4", fps=24)
210
+ ```
211
+
212
+ #### CPU offloading
213
+
214
+ We also support two types of CPU offloading to reduce GPU memory requirements. Note that they may sacrifice efficiency.
215
+ * Adding a `cpu_offloading=True` parameter to the generate function allows inference with **less than 12GB** of GPU memory. This feature was contributed by [@Ednaordinary](https://github.com/Ednaordinary), see [#23](https://github.com/jy0205/Pyramid-Flow/pull/23) for details.
216
+ * Calling `model.enable_sequential_cpu_offload()` before the above procedure allows inference with **less than 8GB** of GPU memory. This feature was contributed by [@rodjjo](https://github.com/rodjjo), see [#75](https://github.com/jy0205/Pyramid-Flow/pull/75) for details.
217
+
218
+ #### MPS backend
219
+
220
+ Thanks to [@niw](https://github.com/niw), Apple Silicon users (e.g. MacBook Pro with M2 24GB) can also try our model using the MPS backend! Please see [#113](https://github.com/jy0205/Pyramid-Flow/pull/113) for the details.
221
+
222
+ ### 3. Multi-GPU Inference
223
+
224
+ For users with multiple GPUs, we provide an [inference script](https://github.com/jy0205/Pyramid-Flow/blob/main/scripts/inference_multigpu.sh) that uses sequence parallelism to save memory on each GPU. This also brings a big speedup, taking only 2.5 minutes to generate a 5s, 768p, 24fps video on 4 A100 GPUs (vs. 5.5 minutes on a single A100 GPU). Run it on 2 GPUs with the following command:
225
+
226
+ ```bash
227
+ CUDA_VISIBLE_DEVICES=0,1 sh scripts/inference_multigpu.sh
228
+ ```
229
+
230
+ It currently supports 2 or 4 GPUs (for the SD3 version), with more configurations available in the original script. You can also launch a [multi-GPU Gradio demo](https://github.com/jy0205/Pyramid-Flow/blob/main/scripts/app_multigpu_engine.sh) created by [@tpc2233](https://github.com/tpc2233); see [#59](https://github.com/jy0205/Pyramid-Flow/pull/59) for details.
231
+
232
+ > Spoiler: We didn't even use sequence parallelism in training, thanks to our efficient pyramid flow designs.
233
+
234
+ ### 4. Usage tips
235
+
236
+ * The `guidance_scale` parameter controls the visual quality. We suggest using a guidance within [7, 9] for the 768p checkpoint during text-to-video generation, and 7 for the 384p checkpoint.
237
+ * The `video_guidance_scale` parameter controls the motion. A larger value increases the dynamic degree and mitigates the autoregressive generation degradation, while a smaller value stabilizes the video.
238
+ * For 10-second video generation, we recommend using a guidance scale of 7 and a video guidance scale of 5.
239
+
240
+ ## Training
241
+
242
+ ### 1. Training VAE
243
+
244
+ The hardware requirements for training VAE are at least 8 A100 GPUs. Please refer to [this document](https://github.com/jy0205/Pyramid-Flow/blob/main/docs/VAE.md). This is a [MAGVIT-v2](https://arxiv.org/abs/2310.05737) like continuous 3D VAE, which should be quite flexible. Feel free to build your own video generative model on this part of VAE training code.
245
+
246
+ ### 2. Finetuning DiT
247
+
248
+ The hardware requirements for finetuning DiT are at least 8 A100 GPUs. Please refer to [this document](https://github.com/jy0205/Pyramid-Flow/blob/main/docs/DiT.md). We provide instructions for both autoregressive and non-autoregressive versions of Pyramid Flow. The former is more research oriented and the latter is more stable (but less efficient without temporal pyramid).
249
+
250
+ ## Gallery
251
+
252
+ The following video examples are generated at 5s, 768p, 24fps. For more results, please visit our [project page](https://pyramid-flow.github.io).
253
+
254
+ <table class="center" border="0" style="width: 100%; text-align: left;">
255
+ <tr>
256
+ <td><video src="https://github.com/user-attachments/assets/5b44a57e-fa08-4554-84a2-2c7a99f2b343" autoplay muted loop playsinline></video></td>
257
+ <td><video src="https://github.com/user-attachments/assets/5afd5970-de72-40e2-900d-a20d18308e8e" autoplay muted loop playsinline></video></td>
258
+ </tr>
259
+ <tr>
260
+ <td><video src="https://github.com/user-attachments/assets/1d44daf8-017f-40e9-bf18-1e19c0a8983b" autoplay muted loop playsinline></video></td>
261
+ <td><video src="https://github.com/user-attachments/assets/7f5dd901-b7d7-48cc-b67a-3c5f9e1546d2" autoplay muted loop playsinline></video></td>
262
+ </tr>
263
+ </table>
264
+
265
+ ## Comparison
266
+
267
+ On VBench ([Huang et al., 2024](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard)), our method surpasses all the compared open-source baselines. Even with only public video data, it achieves comparable performance to commercial models like Kling ([Kuaishou, 2024](https://kling.kuaishou.com/en)) and Gen-3 Alpha ([Runway, 2024](https://runwayml.com/research/introducing-gen-3-alpha)), especially in the quality score (84.74 vs. 84.11 of Gen-3) and motion smoothness.
268
+
269
+ ![vbench](assets/vbench.jpg)
270
+
271
+ We conduct an additional user study with 20+ participants. As can be seen, our method is preferred over open-source models such as [Open-Sora](https://github.com/hpcaitech/Open-Sora) and [CogVideoX-2B](https://github.com/THUDM/CogVideo) especially in terms of motion smoothness.
272
+
273
+ ![user_study](assets/user_study.jpg)
274
+
275
+ ## Acknowledgement
276
+
277
+ We are grateful for the following awesome projects when implementing Pyramid Flow:
278
+
279
+ * [SD3 Medium](https://huggingface.co/stabilityai/stable-diffusion-3-medium) and [Flux 1.0](https://huggingface.co/black-forest-labs/FLUX.1-dev): State-of-the-art image generation models based on flow matching.
280
+ * [Diffusion Forcing](https://boyuan.space/diffusion-forcing) and [GameNGen](https://gamengen.github.io): Next-token prediction meets full-sequence diffusion.
281
+ * [WebVid-10M](https://github.com/m-bain/webvid), [OpenVid-1M](https://github.com/NJU-PCALab/OpenVid-1M) and [Open-Sora Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan): Large-scale datasets for text-to-video generation.
282
+ * [CogVideoX](https://github.com/THUDM/CogVideo): An open-source text-to-video generation model that shares many training details.
283
+ * [Video-LLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2): An open-source video LLM for our video recaptioning.
284
+
285
+ ## Citation
286
+
287
+ Consider giving this repository a star and citing Pyramid Flow in your publications if it helps your research.
288
+ ```
289
+ @article{jin2024pyramidal,
290
+ title={Pyramidal Flow Matching for Efficient Video Generative Modeling},
291
+ author={Jin, Yang and Sun, Zhicheng and Li, Ningyuan and Xu, Kun and Xu, Kun and Jiang, Hao and Zhuang, Nan and Huang, Quzhe and Song, Yang and Mu, Yadong and Lin, Zhouchen},
292
+ journal={arXiv preprint arXiv:2410.05954},
293
+ year={2024}
294
+ }
295
+ ```
README.md CHANGED
@@ -1,12 +1,295 @@
1
- ---
2
- title: Pyramid Flow
3
- emoji: 😻
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.6.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ ---
2
+ title: Pyramid-Flow
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.6.0
6
+ ---
7
+ <div align="center">
8
+
9
+ # Pyramid Flow
10
+
11
+ [[Paper]](https://arxiv.org/abs/2410.05954) [[Project Page]](https://pyramid-flow.github.io) [[miniFLUX Model]](https://huggingface.co/rain1011/pyramid-flow-miniflux) [[SD3 Model]](https://huggingface.co/rain1011/pyramid-flow-sd3) [[demo](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow)]
12
+
13
+ </div>
14
+
15
+ This is the official repository for Pyramid Flow, a training-efficient **Autoregressive Video Generation** method based on **Flow Matching**. By training only on **open-source datasets**, it can generate high-quality 10-second videos at 768p resolution and 24 FPS, and naturally supports image-to-video generation.
16
+
17
+ <table class="center" border="0" style="width: 100%; text-align: left;">
18
+ <tr>
19
+ <th>10s, 768p, 24fps</th>
20
+ <th>5s, 768p, 24fps</th>
21
+ <th>Image-to-video</th>
22
+ </tr>
23
+ <tr>
24
+ <td><video src="https://github.com/user-attachments/assets/9935da83-ae56-4672-8747-0f46e90f7b2b" autoplay muted loop playsinline></video></td>
25
+ <td><video src="https://github.com/user-attachments/assets/3412848b-64db-4d9e-8dbf-11403f6d02c5" autoplay muted loop playsinline></video></td>
26
+ <td><video src="https://github.com/user-attachments/assets/3bd7251f-7b2c-4bee-951d-656fdb45f427" autoplay muted loop playsinline></video></td>
27
+ </tr>
28
+ </table>
29
+
30
+ ## News
31
+ * `2024.11.13` We release the [768p miniFLUX checkpoint](https://huggingface.co/rain1011/pyramid-flow-miniflux) (up to 10s).
32
+
33
+ > We have switched the model structure from SD3 to a mini FLUX to fix human structure issues. Please try our 1024p image checkpoint, 384p video checkpoint (up to 5s), and 768p video checkpoint (up to 10s). The new miniFLUX model shows a great improvement in human structure and motion stability.
34
+
35
+ * `2024.10.29` We release [training code for VAE](#1-training-vae), [finetuning code for DiT](#2-finetuning-dit) and [new model checkpoints](https://huggingface.co/rain1011/pyramid-flow-miniflux) with FLUX structure trained from scratch.
36
+
37
+
38
+ * `2024.10.13` [Multi-GPU inference](#3-multi-gpu-inference) and [CPU offloading](#cpu-offloading) are supported. Use it with **less than 8GB** of GPU memory, with great speedup on multiple GPUs.
39
+
40
+ * `2024.10.11` [Hugging Face demo](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow) is available. Thanks [@multimodalart](https://huggingface.co/multimodalart) for the commit!
41
+
42
+ * `2024.10.10` We release the [technical report](https://arxiv.org/abs/2410.05954), [project page](https://pyramid-flow.github.io) and [model checkpoint](https://huggingface.co/rain1011/pyramid-flow-sd3) of Pyramid Flow.
43
+
44
+ ## Table of Contents
45
+
46
+ * [Introduction](#introduction)
47
+ * [Installation](#installation)
48
+ * [Inference](#inference)
49
+ 1. [Quick Start with Gradio](#1-quick-start-with-gradio)
50
+ 2. [Inference Code](#2-inference-code)
51
+ 3. [Multi-GPU Inference](#3-multi-gpu-inference)
52
+ 4. [Usage Tips](#4-usage-tips)
53
+ * [Training](#Training)
54
+ 1. [Training VAE](#training-vae)
55
+ 2. [Finetuning DiT](#finetuning-dit)
56
+ * [Gallery](#gallery)
57
+ * [Comparison](#comparison)
58
+ * [Acknowledgement](#acknowledgement)
59
+ * [Citation](#citation)
60
+
61
+ ## Introduction
62
+
63
+ ![motivation](assets/motivation.jpg)
64
+
65
+ Existing video diffusion models operate at full resolution, spending a lot of computation on very noisy latents. By contrast, our method harnesses the flexibility of flow matching ([Lipman et al., 2023](https://openreview.net/forum?id=PqvMRDCJT9t); [Liu et al., 2023](https://openreview.net/forum?id=XVjTT1nw5z); [Albergo & Vanden-Eijnden, 2023](https://openreview.net/forum?id=li7qeBbCR1t)) to interpolate between latents of different resolutions and noise levels, allowing for simultaneous generation and decompression of visual content with better computational efficiency. The entire framework is end-to-end optimized with a single DiT ([Peebles & Xie, 2023](http://openaccess.thecvf.com/content/ICCV2023/html/Peebles_Scalable_Diffusion_Models_with_Transformers_ICCV_2023_paper.html)), generating high-quality 10-second videos at 768p resolution and 24 FPS within 20.7k A100 GPU training hours.
66
+
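As an editorial toy sketch (not the repository's implementation), the core idea can be pictured as blending an upsampled coarse latent with noise along a straight-line flow-matching path; the helper name and tensor shapes below are illustrative assumptions only.

```python
import torch
import torch.nn.functional as F

def toy_pyramid_interpolate(x1_lowres: torch.Tensor, noise: torch.Tensor, t: float) -> torch.Tensor:
    """Toy sketch: upsample a coarse latent to the target resolution and blend it with noise at time t."""
    x1_up = F.interpolate(x1_lowres, size=noise.shape[-2:], mode="bilinear", align_corners=False)
    return (1.0 - t) * noise + t * x1_up  # straight-line interpolation, as in flow matching

# Example: a 4-channel latent at half resolution, blended halfway toward the full-resolution stage.
latent = toy_pyramid_interpolate(torch.randn(1, 4, 24, 40), torch.randn(1, 4, 48, 80), t=0.5)
```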
67
+ ## Installation
68
+
69
+ We recommend setting up the environment with conda. The codebase currently uses Python 3.8.10 and PyTorch 2.1.2 ([guide](https://pytorch.org/get-started/previous-versions/#v212)), and we are actively working to support a wider range of versions.
70
+
71
+ ```bash
72
+ git clone https://github.com/jy0205/Pyramid-Flow
73
+ cd Pyramid-Flow
74
+
75
+ # create env using conda
76
+ conda create -n pyramid python==3.8.10
77
+ conda activate pyramid
78
+ pip install -r requirements.txt
79
+ ```
80
+
81
+ Then, download the model from [Huggingface](https://huggingface.co/rain1011) (there are two variants: [miniFLUX](https://huggingface.co/rain1011/pyramid-flow-miniflux) or [SD3](https://huggingface.co/rain1011/pyramid-flow-sd3)). The miniFLUX models support 1024p image, 384p and 768p video generation, and the SD3-based models support 768p and 384p video generation. The 384p checkpoint generates 5-second video at 24FPS, while the 768p checkpoint generates up to 10-second video at 24FPS.
82
+
83
+ ```python
84
+ from huggingface_hub import snapshot_download
85
+
86
+ model_path = 'PATH' # The local directory to save downloaded checkpoint
87
+ snapshot_download("rain1011/pyramid-flow-miniflux", local_dir=model_path, local_dir_use_symlinks=False, repo_type='model')
88
+ ```
89
+
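If you only want one variant and want to save disk space, `snapshot_download` also accepts an `allow_patterns` argument. The folder patterns below are an assumption based on the checkpoint layout shown in this commit's file list; check the repository's file list before relying on them.

```python
from huggingface_hub import snapshot_download

model_path = 'PATH'  # The local directory to save the downloaded checkpoint
snapshot_download(
    "rain1011/pyramid-flow-miniflux",
    local_dir=model_path,
    local_dir_use_symlinks=False,
    repo_type='model',
    # Fetch only the 384p DiT plus the shared VAE, text encoders, tokenizers and configs.
    allow_patterns=[
        "diffusion_transformer_384p/*",
        "causal_video_vae/*",
        "text_encoder*/*",
        "tokenizer*/*",
        "*.json",
        "*.md",
    ],
)
```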
90
+ ## Inference
91
+
92
+ ### 1. Quick start with Gradio
93
+
94
+ To get started, first install [Gradio](https://www.gradio.app/guides/quickstart), set your model path at [#L36](https://github.com/jy0205/Pyramid-Flow/blob/3777f8b84bddfa2aa2b497ca919b3f40567712e6/app.py#L36), and then run on your local machine:
95
+
96
+ ```bash
97
+ python app.py
98
+ ```
99
+
100
+ The Gradio demo will open in a browser. Thanks to [@tpc2233](https://github.com/tpc2233) for the commit; see [#48](https://github.com/jy0205/Pyramid-Flow/pull/48) for details.
101
+
102
+ Or, try it out effortlessly on [Hugging Face Space](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow) created by [@multimodalart](https://huggingface.co/multimodalart). Due to GPU limits, this online demo can only generate 25 frames (export at 8FPS or 24FPS). Duplicate the space to generate longer videos.
103
+
104
+ #### Quick Start on Google Colab
105
+
106
+ To quickly try out Pyramid Flow on Google Colab, run the code below:
107
+
108
+ ```
109
+ # Setup
110
+ !git clone https://github.com/jy0205/Pyramid-Flow
111
+ %cd Pyramid-Flow
112
+ !pip install -r requirements.txt
113
+ !pip install gradio
114
+
115
+ # This code downloads miniFLUX
116
+ from huggingface_hub import snapshot_download
117
+
118
+ model_path = '/content/Pyramid-Flow'
119
+ snapshot_download("rain1011/pyramid-flow-miniflux", local_dir=model_path, local_dir_use_symlinks=False, repo_type='model')
120
+
121
+ # Start
122
+ !python app.py
123
+ ```
124
+
125
+ ### 2. Inference Code
126
+
127
+ To use our model, please follow the inference code in `video_generation_demo.ipynb` at [this link](https://github.com/jy0205/Pyramid-Flow/blob/main/video_generation_demo.ipynb). We strongly recommend trying the latest published pyramid-miniflux, which shows a great improvement in human structure and motion stability. Set the `model_name` parameter to `pyramid_flux` to use it. We further simplify it into the following two-step procedure. First, load the downloaded model:
128
+
129
+ ```python
130
+ import torch
131
+ from PIL import Image
132
+ from pyramid_dit import PyramidDiTForVideoGeneration
133
+ from diffusers.utils import load_image, export_to_video
134
+
135
+ torch.cuda.set_device(0)
136
+ model_dtype, torch_dtype = 'bf16', torch.bfloat16 # Use bf16 (fp16 is not supported yet)
137
+
138
+ model = PyramidDiTForVideoGeneration(
139
+ 'PATH', # The downloaded checkpoint dir
140
+ model_name="pyramid_flux",
141
+ model_dtype=model_dtype,
142
+ model_variant='diffusion_transformer_768p',
143
+ )
144
+
145
+ model.vae.enable_tiling()
146
+ # model.vae.to("cuda")
147
+ # model.dit.to("cuda")
148
+ # model.text_encoder.to("cuda")
149
+
150
+ # if you're not using sequential offloading below, uncomment the lines above ^
151
+ model.enable_sequential_cpu_offload()
152
+ ```
153
+
154
+ Then, you can try text-to-video generation with your own prompts. Note that the 384p version only supports 5-second videos for now (set `temp` up to 16)!
155
+
156
+ ```python
157
+ prompt = "A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors"
158
+
159
+ # used for 384p model variant
160
+ # width = 640
161
+ # height = 384
162
+
163
+ # used for 768p model variant
164
+ width = 1280
165
+ height = 768
166
+
167
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
168
+ frames = model.generate(
169
+ prompt=prompt,
170
+ num_inference_steps=[20, 20, 20],
171
+ video_num_inference_steps=[10, 10, 10],
172
+ height=height,
173
+ width=width,
174
+ temp=16, # temp=16: 5s, temp=31: 10s
175
+ guidance_scale=7.0, # The guidance for the first frame, set it to 7 for 384p variant
176
+ video_guidance_scale=5.0, # The guidance for the other video latent
177
+ output_type="pil",
178
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
179
+ )
180
+
181
+ export_to_video(frames, "./text_to_video_sample.mp4", fps=24)
182
+ ```
183
+
184
+ As an autoregressive model, our model also supports (text conditioned) image-to-video generation:
185
+
186
+ ```python
187
+ # used for 384p model variant
188
+ # width = 640
189
+ # height = 384
190
+
191
+ # used for 768p model variant
192
+ width = 1280
193
+ height = 768
194
+
195
+ image = Image.open('assets/the_great_wall.jpg').convert("RGB").resize((width, height))
196
+ prompt = "FPV flying over the Great Wall"
197
+
198
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
199
+ frames = model.generate_i2v(
200
+ prompt=prompt,
201
+ input_image=image,
202
+ num_inference_steps=[10, 10, 10],
203
+ temp=16,
204
+ video_guidance_scale=4.0,
205
+ output_type="pil",
206
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
207
+ )
208
+
209
+ export_to_video(frames, "./image_to_video_sample.mp4", fps=24)
210
+ ```
211
+
212
+ #### CPU offloading
213
+
214
+ We also support two types of CPU offloading to reduce GPU memory requirements. Note that they may sacrifice efficiency.
215
+ * Adding a `cpu_offloading=True` parameter to the generate function allows inference with **less than 12GB** of GPU memory (see the sketch after this list). This feature was contributed by [@Ednaordinary](https://github.com/Ednaordinary), see [#23](https://github.com/jy0205/Pyramid-Flow/pull/23) for details.
216
+ * Calling `model.enable_sequential_cpu_offload()` before the above procedure allows inference with **less than 8GB** of GPU memory. This feature was contributed by [@rodjjo](https://github.com/rodjjo), see [#75](https://github.com/jy0205/Pyramid-Flow/pull/75) for details.
217
+
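As a rough sketch of the first option, reusing the `model.generate` call from the inference section above (all other arguments unchanged):

```python
# Sketch: the same text-to-video call as above, with the offloading flag described in the first bullet.
with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
    frames = model.generate(
        prompt=prompt,
        num_inference_steps=[20, 20, 20],
        video_num_inference_steps=[10, 10, 10],
        height=height,
        width=width,
        temp=16,
        guidance_scale=7.0,
        video_guidance_scale=5.0,
        output_type="pil",
        save_memory=True,
        cpu_offloading=True,  # keeps GPU memory usage under roughly 12GB, as noted above
    )
```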
218
+ #### MPS backend
219
+
220
+ Thanks to [@niw](https://github.com/niw), Apple Silicon users (e.g. MacBook Pro with M2 24GB) can also try our model using the MPS backend! Please see [#113](https://github.com/jy0205/Pyramid-Flow/pull/113) for the details.
221
+
222
+ ### 3. Multi-GPU Inference
223
+
224
+ For users with multiple GPUs, we provide an [inference script](https://github.com/jy0205/Pyramid-Flow/blob/main/scripts/inference_multigpu.sh) that uses sequence parallelism to save memory on each GPU. This also brings a big speedup, taking only 2.5 minutes to generate a 5s, 768p, 24fps video on 4 A100 GPUs (vs. 5.5 minutes on a single A100 GPU). Run it on 2 GPUs with the following command:
225
+
226
+ ```bash
227
+ CUDA_VISIBLE_DEVICES=0,1 sh scripts/inference_multigpu.sh
228
+ ```
229
+
230
+ It currently supports 2 or 4 GPUs (for the SD3 version), with more configurations available in the original script. You can also launch a [multi-GPU Gradio demo](https://github.com/jy0205/Pyramid-Flow/blob/main/scripts/app_multigpu_engine.sh) created by [@tpc2233](https://github.com/tpc2233); see [#59](https://github.com/jy0205/Pyramid-Flow/pull/59) for details.
231
+
232
+ > Spoiler: We didn't even use sequence parallelism in training, thanks to our efficient pyramid flow designs.
233
+
234
+ ### 4. Usage tips
235
+
236
+ * The `guidance_scale` parameter controls the visual quality. We suggest using a guidance within [7, 9] for the 768p checkpoint during text-to-video generation, and 7 for the 384p checkpoint.
237
+ * The `video_guidance_scale` parameter controls the motion. A larger value increases the dynamic degree and mitigates the autoregressive generation degradation, while a smaller value stabilizes the video.
238
+ * For 10-second video generation, we recommend using a guidance scale of 7 and a video guidance scale of 5, as sketched below.
239
+
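Putting these tips together, a 10-second, 768p text-to-video call might look like the following sketch (same API as in the inference section above; only the recommended values change):

```python
# Sketch of the recommended 10-second settings: guidance 7, video guidance 5, temp=31 (~10s at 24 FPS).
with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
    frames = model.generate(
        prompt=prompt,
        num_inference_steps=[20, 20, 20],
        video_num_inference_steps=[10, 10, 10],
        height=768,
        width=1280,
        temp=31,                   # temp=31 corresponds to a 10-second clip
        guidance_scale=7.0,        # visual quality; stay within [7, 9] for the 768p checkpoint
        video_guidance_scale=5.0,  # motion strength for the later video latents
        output_type="pil",
        save_memory=True,
    )

export_to_video(frames, "./text_to_video_10s.mp4", fps=24)
```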
240
+ ## Training
241
+
242
+ ### 1. Training VAE
243
+
244
+ The hardware requirements for training VAE are at least 8 A100 GPUs. Please refer to [this document](https://github.com/jy0205/Pyramid-Flow/blob/main/docs/VAE.md). This is a [MAGVIT-v2](https://arxiv.org/abs/2310.05737) like continuous 3D VAE, which should be quite flexible. Feel free to build your own video generative model on this part of VAE training code.
245
+
246
+ ### 2. Finetuning DiT
247
+
248
+ The hardware requirements for finetuning DiT are at least 8 A100 GPUs. Please refer to [this document](https://github.com/jy0205/Pyramid-Flow/blob/main/docs/DiT.md). We provide instructions for both autoregressive and non-autoregressive versions of Pyramid Flow. The former is more research oriented and the latter is more stable (but less efficient without temporal pyramid).
249
+
250
+ ## Gallery
251
+
252
+ The following video examples are generated at 5s, 768p, 24fps. For more results, please visit our [project page](https://pyramid-flow.github.io).
253
+
254
+ <table class="center" border="0" style="width: 100%; text-align: left;">
255
+ <tr>
256
+ <td><video src="https://github.com/user-attachments/assets/5b44a57e-fa08-4554-84a2-2c7a99f2b343" autoplay muted loop playsinline></video></td>
257
+ <td><video src="https://github.com/user-attachments/assets/5afd5970-de72-40e2-900d-a20d18308e8e" autoplay muted loop playsinline></video></td>
258
+ </tr>
259
+ <tr>
260
+ <td><video src="https://github.com/user-attachments/assets/1d44daf8-017f-40e9-bf18-1e19c0a8983b" autoplay muted loop playsinline></video></td>
261
+ <td><video src="https://github.com/user-attachments/assets/7f5dd901-b7d7-48cc-b67a-3c5f9e1546d2" autoplay muted loop playsinline></video></td>
262
+ </tr>
263
+ </table>
264
+
265
+ ## Comparison
266
+
267
+ On VBench ([Huang et al., 2024](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard)), our method surpasses all the compared open-source baselines. Even with only public video data, it achieves comparable performance to commercial models like Kling ([Kuaishou, 2024](https://kling.kuaishou.com/en)) and Gen-3 Alpha ([Runway, 2024](https://runwayml.com/research/introducing-gen-3-alpha)), especially in the quality score (84.74 vs. 84.11 of Gen-3) and motion smoothness.
268
+
269
+ ![vbench](assets/vbench.jpg)
270
+
271
+ We conduct an additional user study with 20+ participants. As can be seen, our method is preferred over open-source models such as [Open-Sora](https://github.com/hpcaitech/Open-Sora) and [CogVideoX-2B](https://github.com/THUDM/CogVideo) especially in terms of motion smoothness.
272
+
273
+ ![user_study](assets/user_study.jpg)
274
+
275
+ ## Acknowledgement
276
+
277
+ We are grateful for the following awesome projects when implementing Pyramid Flow:
278
+
279
+ * [SD3 Medium](https://huggingface.co/stabilityai/stable-diffusion-3-medium) and [Flux 1.0](https://huggingface.co/black-forest-labs/FLUX.1-dev): State-of-the-art image generation models based on flow matching.
280
+ * [Diffusion Forcing](https://boyuan.space/diffusion-forcing) and [GameNGen](https://gamengen.github.io): Next-token prediction meets full-sequence diffusion.
281
+ * [WebVid-10M](https://github.com/m-bain/webvid), [OpenVid-1M](https://github.com/NJU-PCALab/OpenVid-1M) and [Open-Sora Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan): Large-scale datasets for text-to-video generation.
282
+ * [CogVideoX](https://github.com/THUDM/CogVideo): An open-source text-to-video generation model that shares many training details.
283
+ * [Video-LLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2): An open-source video LLM for our video recaptioning.
284
+
285
+ ## Citation
286
+
287
+ Consider giving this repository a star and citing Pyramid Flow in your publications if it helps your research.
288
+ ```
289
+ @article{jin2024pyramidal,
290
+ title={Pyramidal Flow Matching for Efficient Video Generative Modeling},
291
+ author={Jin, Yang and Sun, Zhicheng and Li, Ningyuan and Xu, Kun and Xu, Kun and Jiang, Hao and Zhuang, Nan and Huang, Quzhe and Song, Yang and Mu, Yadong and Lin, Zhouchen},
292
+ journal={arXiv preprint arXiv:2410.05954},
293
+ year={2024}
294
+ }
295
+ ```
annotation/image_text.jsonl ADDED
@@ -0,0 +1,20 @@
1
+ {"image": "SAM_filter/000424/sa_4749867.jpg", "text": "a cityscape with a large body of water, such as a lake or a river, in the foreground"}
2
+ {"image": "SAM_filter/000311/sa_3490721.jpg", "text": "a large, stately building with a white and blue color scheme, which gives it a grand and elegant appearance"}
3
+ {"image": "SAM_filter/000273/sa_3059407.jpg", "text": "a close-up of a green bag containing a package of Japanese soybeans, along with a bottle of sake, a traditional Japanese alcoholic beverage"}
4
+ {"image": "SAM_filter/000745/sa_8344729.jpg", "text": "a large, old-fashioned building with a red and white color scheme"}
5
+ {"image": "SAM_filter/000832/sa_9310794.jpg", "text": "a cityscape with a large tower, likely the Eiffel Tower, as the main focal point"}
6
+ {"image": "SAM_filter/000427/sa_4779422.jpg", "text": "a large cruise ship, specifically a Royal Caribbean cruise ship, docked at a pier in a harbor"}
7
+ {"image": "SAM_filter/000105/sa_1178255.jpg", "text": "a close-up view of a computer screen with a magnifying glass placed over it"}
8
+ {"image": "SAM_filter/000765/sa_8560467.jpg", "text": "a tree with a sign attached to it, which is located in a lush green field"}
9
+ {"image": "SAM_filter/000216/sa_2417372.jpg", "text": "a large airport terminal with a long blue and white rope-style security line"}
10
+ {"image": "SAM_filter/000385/sa_4308806.jpg", "text": "a close-up of a cell phone screen displaying a blue and white logo, which appears to be a bank logo"}
11
+ {"image": "SAM_filter/000931/sa_10425835.jpg", "text": "a large body of water, possibly a lake, with a lush green landscape surrounding it"}
12
+ {"image": "SAM_filter/000364/sa_4079002.jpg", "text": "a large, empty airport terminal with a long row of gray metal chairs arranged in a straight line"}
13
+ {"image": "SAM_filter/000474/sa_5306222.jpg", "text": "a large, modern building with a tall, glass structure, which is likely a museum"}
14
+ {"image": "SAM_filter/000584/sa_6536849.jpg", "text": "a city street scene with a black car parked in a parking lot, a building with a balcony, and a city skyline in the background"}
15
+ {"image": "SAM_filter/000188/sa_2104485.jpg", "text": "a large jet fighter airplane flying through the sky, captured in a high-quality photograph"}
16
+ {"image": "SAM_filter/000219/sa_2458908.jpg", "text": "a stone structure with a tall tower, which is situated in a lush green garden"}
17
+ {"image": "SAM_filter/000440/sa_4929413.jpg", "text": "a large city street with a mix of architectural styles, including a Gothic-style building and a modern building"}
18
+ {"image": "SAM_filter/000739/sa_8279296.jpg", "text": "a vintage blue and white bus parked on the side of a dirt road, with a building in the background"}
19
+ {"image": "SAM_filter/000809/sa_9052304.jpg", "text": "a large, old stone building with a clock tower, which is situated in a small town"}
20
+ {"image": "SAM_filter/000294/sa_3300200.jpg", "text": "a table with various utensils, including a bowl, spoon, and fork, placed on a wooden surface"}
annotation/video_text.jsonl ADDED
@@ -0,0 +1,17 @@
1
+ {"video": "webvid10m/train/010451_010500/23388121.mp4", "text": "the serene beauty of a valley with a river, mountains, and clouds", "latent": "webvid10m/train/010451_010500/23388121-latent-384-2.pt", "text_fea": "text_feature/webvid10m/train/010451_010500/23388121-text.pt"}
2
+ {"video": "pexels/8440980-uhd_3840_2160_25fps.mp4", "text": "A group of people, including two men and two women, are seen sitting at a table, smiling and waving at the camera, and appear to be in a good mood", "latent": "pexels/8440980-uhd_3840_2160_25fps-latent-384-2.pt", "text_fea": "text_feature/pexels/8440980-uhd_3840_2160_25fps-text.pt"}
3
+ {"video": "webvid10m/train/176251_176300/1011015221.mp4", "text": "an aerial view of a large wheat field with a road running through it, and a car driving on the road", "latent": "webvid10m/train/176251_176300/1011015221-latent-384-4.pt", "text_fea": "text_feature/webvid10m/train/176251_176300/1011015221-text.pt"}
4
+ {"video": "webvid10m/train/005801_005850/22143805.mp4", "text": "a close-up of paint mixing in water, creating swirling patterns", "latent": "webvid10m/train/005801_005850/22143805-latent-384-8.pt", "text_fea": "text_feature/webvid10m/train/005801_005850/22143805-text.pt"}
5
+ {"video": "OpenVid-1M/videos/qsXY7FkNFwE_2_0to743.mp4", "text": "A baby girl in a pink shirt and striped pants sits in a high chair, eats a piece of bread, and looks at the camera", "latent": "OpenVid-1M/videos/qsXY7FkNFwE_2_0to743-latent-384-0.pt", "text_fea": "text_feature/OpenVid-1M/videos/qsXY7FkNFwE_2_0to743-text.pt"}
6
+ {"video": "webvid10m/train/134901_134950/1037990273.mp4", "text": "a field of green wheat waving in the wind", "latent": "webvid10m/train/134901_134950/1037990273-latent-384-6.pt", "text_fea": "text_feature/webvid10m/train/134901_134950/1037990273-text.pt"}
7
+ {"video": "pexels/5263258-uhd_2160_4096_30fps.mp4", "text": "A dog sits patiently in front of its bowl, waiting for it to be filled with food", "latent": "pexels/5263258-uhd_2160_4096_30fps-latent-384-6.pt", "text_fea": "text_feature/pexels/5263258-uhd_2160_4096_30fps-text.pt"}
8
+ {"video": "webvid10m/train/117851_117900/6461432.mp4", "text": "A ladybug crawls along a blade of grass in a serene natural setting", "latent": "webvid10m/train/117851_117900/6461432-latent-384-4.pt", "text_fea": "text_feature/webvid10m/train/117851_117900/6461432-text.pt"}
9
+ {"video": "webvid10m/train/053051_053100/1058396656.mp4", "text": "a group of construction workers working on a rooftop, with a supervisor overseeing the work", "latent": "webvid10m/train/053051_053100/1058396656-latent-384-10.pt", "text_fea": "text_feature/webvid10m/train/053051_053100/1058396656-text.pt"}
10
+ {"video": "webvid10m/train/073651_073700/1021916425.mp4", "text": "an aerial view of a beautiful coastline with rocky islands, blue water, and a white cloud in the sky", "latent": "webvid10m/train/073651_073700/1021916425-latent-384-4.pt", "text_fea": "text_feature/webvid10m/train/073651_073700/1021916425-text.pt"}
11
+ {"video": "webvid10m/train/027051_027100/1032549941.mp4", "text": "a young woman waking up in bed, smiling at the camera, and then lying back down on the bed", "latent": "webvid10m/train/027051_027100/1032549941-latent-384-10.pt", "text_fea": "text_feature/webvid10m/train/027051_027100/1032549941-text.pt"}
12
+ {"video": "pexels/5564564-uhd_3840_2160_24fps.mp4", "text": "a person rolling out dough on a table using a rolling pin", "latent": "pexels/5564564-uhd_3840_2160_24fps-latent-384-8.pt", "text_fea": "text_feature/pexels/5564564-uhd_3840_2160_24fps-text.pt"}
13
+ {"video": "webvid10m/train/073701_073750/24008116.mp4", "text": "a cityscape with a moon in the sky, and the camera pans across the city", "latent": "webvid10m/train/073701_073750/24008116-latent-384-2.pt", "text_fea": "text_feature/webvid10m/train/073701_073750/24008116-text.pt"}
14
+ {"video": "webvid10m/train/118351_118400/23370991.mp4", "text": "a group of dolphins swimming in the ocean, with a person on a boat nearby", "latent": "webvid10m/train/118351_118400/23370991-latent-384-2.pt", "text_fea": "text_feature/webvid10m/train/118351_118400/23370991-text.pt"}
15
+ {"video": "webvid10m/train/022001_022050/1023013066.mp4", "text": "a bird's eye view of a beachfront city, highlighting the hotels, pools, and proximity to the ocean", "latent": "webvid10m/train/022001_022050/1023013066-latent-384-10.pt", "text_fea": "text_feature/webvid10m/train/022001_022050/1023013066-text.pt"}
16
+ {"video": "webvid10m/train/004601_004650/1015979020.mp4", "text": "a bridge over a body of water, with a boat passing under it", "latent": "webvid10m/train/004601_004650/1015979020-latent-384-4.pt", "text_fea": "text_feature/webvid10m/train/004601_004650/1015979020-text.pt"}
17
+ {"video": "webvid10m/train/149701_149750/1034525579.mp4", "text": "a group of owls and a moon, with the moon appearing to grow larger as the video progresses", "latent": "webvid10m/train/149701_149750/1034525579-latent-384-2.pt", "text_fea": "text_feature/webvid10m/train/149701_149750/1034525579-text.pt"}
app.py ADDED
@@ -0,0 +1,356 @@
1
+ import os
2
+ import uuid
3
+ import gradio as gr
4
+ import torch
5
+ import PIL
6
+ from PIL import Image
7
+ from pyramid_dit import PyramidDiTForVideoGeneration
8
+ from diffusers.utils import export_to_video
9
+ from huggingface_hub import snapshot_download
10
+ import threading
11
+ import random
12
+
13
+ # Global model cache
14
+ model_cache = {}
15
+
16
+ # Lock to ensure thread-safe access to the model cache
17
+ model_cache_lock = threading.Lock()
18
+
19
+ # Configuration
20
+ model_name = "pyramid_flux" # or pyramid_mmdit
21
+ model_repo = "rain1011/pyramid-flow-sd3" if model_name == "pyramid_mmdit" else "rain1011/pyramid-flow-miniflux"
22
+
23
+ model_dtype = "bf16" # Support bf16 and fp32
24
+ variants = {
25
+ 'high': 'diffusion_transformer_768p', # For high-resolution version
26
+ 'low': 'diffusion_transformer_384p' # For low-resolution version
27
+ }
28
+ required_file = 'config.json' # Ensure config.json is present
29
+ width_high = 1280
30
+ height_high = 768
31
+ width_low = 640
32
+ height_low = 384
33
+ cpu_offloading = True # enable cpu_offloading by default
34
+
35
+ # Get the current working directory and create a folder to store the model
36
+ current_directory = os.getcwd()
37
+ model_path = os.path.join(current_directory, "pyramid_flow_model") # Directory to store the model
38
+
39
+ # Download the model if not already present
40
+ def download_model_from_hf(model_repo, model_dir, variants, required_file):
41
+ need_download = False
42
+ if not os.path.exists(model_dir):
43
+ print(f"[INFO] Model directory '{model_dir}' does not exist. Initiating download...")
44
+ need_download = True
45
+ else:
46
+ # Check if all required files exist for each variant
47
+ for variant_key, variant_dir in variants.items():
48
+ variant_path = os.path.join(model_dir, variant_dir)
49
+ file_path = os.path.join(variant_path, required_file)
50
+ if not os.path.exists(file_path):
51
+ print(f"[WARNING] Required file '{required_file}' missing in '{variant_path}'.")
52
+ need_download = True
53
+ break
54
+
55
+ if need_download:
56
+ print(f"[INFO] Downloading model from '{model_repo}' to '{model_dir}'...")
57
+ try:
58
+ snapshot_download(
59
+ repo_id=model_repo,
60
+ local_dir=model_dir,
61
+ local_dir_use_symlinks=False,
62
+ repo_type='model'
63
+ )
64
+ print("[INFO] Model download complete.")
65
+ except Exception as e:
66
+ print(f"[ERROR] Failed to download the model: {e}")
67
+ raise
68
+ else:
69
+ print(f"[INFO] All required model files are present in '{model_dir}'. Skipping download.")
70
+
71
+ # Download model from Hugging Face if not present
72
+ download_model_from_hf(model_repo, model_path, variants, required_file)
73
+
74
+ # Function to initialize the model based on user options
75
+ def initialize_model(variant):
76
+ print(f"[INFO] Initializing model with variant='{variant}', using bf16 precision...")
77
+
78
+ # Determine the correct variant directory
79
+ variant_dir = variants['high'] if variant == '768p' else variants['low']
80
+ base_path = model_path # Pass the base model path
81
+
82
+ print(f"[DEBUG] Model base path: {base_path}")
83
+
84
+ # Verify that config.json exists in the variant directory
85
+ config_path = os.path.join(model_path, variant_dir, 'config.json')
86
+ if not os.path.exists(config_path):
87
+ print(f"[ERROR] config.json not found in '{os.path.join(model_path, variant_dir)}'.")
88
+ raise FileNotFoundError(f"config.json not found in '{os.path.join(model_path, variant_dir)}'.")
89
+
90
+ if model_dtype == "bf16":
91
+ torch_dtype_selected = torch.bfloat16
92
+ else:
93
+ torch_dtype_selected = torch.float32
94
+
95
+ # Initialize the model
96
+ try:
97
+
98
+ model = PyramidDiTForVideoGeneration(
99
+ base_path, # Pass the base model path
100
+ model_name=model_name, # set to pyramid_flux or pyramid_mmdit
101
+ model_dtype=model_dtype, # Use bf16
102
+ model_variant=variant_dir, # Pass the variant directory name
103
+ cpu_offloading=cpu_offloading, # Pass the CPU offloading flag
104
+ )
105
+
106
+ # Always enable tiling for the VAE
107
+ model.vae.enable_tiling()
108
+
109
+ # Remove manual device placement when using CPU offloading
110
+ # The components will be moved to the appropriate devices automatically
111
+ if torch.cuda.is_available():
112
+ torch.cuda.set_device(0)
113
+ # Manual device placement when not using CPU offloading
114
+ if not cpu_offloading:
115
+ model.vae.to("cuda")
116
+ model.dit.to("cuda")
117
+ model.text_encoder.to("cuda")
118
+ else:
119
+ print("[WARNING] CUDA is not available. Proceeding without GPU.")
120
+
121
+ print("[INFO] Model initialized successfully.")
122
+ return model, torch_dtype_selected
123
+ except Exception as e:
124
+ print(f"[ERROR] Error initializing model: {e}")
125
+ raise
126
+
127
+ # Function to get the model from cache or initialize it
128
+ def initialize_model_cached(variant, seed):
129
+ key = variant
130
+
131
+ if seed == 0:
132
+ seed = random.randint(0, 2**8 - 1)
133
+ torch.manual_seed(seed)
134
+ if torch.cuda.is_available():
135
+ torch.cuda.manual_seed(seed)
136
+ torch.cuda.manual_seed_all(seed)
137
+
138
+ # Check if the model is already in the cache
139
+ if key not in model_cache:
140
+ with model_cache_lock:
141
+ # Double-checked locking to prevent race conditions
142
+ if key not in model_cache:
143
+ model, dtype = initialize_model(variant)
144
+ model_cache[key] = (model, dtype)
145
+
146
+ return model_cache[key]
147
+
148
+ def resize_crop_image(img: PIL.Image.Image, tgt_width, tgt_height):
149
+ ori_width, ori_height = img.width, img.height
150
+ scale = max(tgt_width / ori_width, tgt_height / ori_height)
151
+ resized_width = round(ori_width * scale)
152
+ resized_height = round(ori_height * scale)
153
+ img = img.resize((resized_width, resized_height), resample=PIL.Image.LANCZOS)
154
+
155
+ left = (resized_width - tgt_width) / 2
156
+ top = (resized_height - tgt_height) / 2
157
+ right = (resized_width + tgt_width) / 2
158
+ bottom = (resized_height + tgt_height) / 2
159
+
160
+ # Crop the center of the image
161
+ img = img.crop((left, top, right, bottom))
162
+
163
+ return img
164
+
165
+ # Function to generate text-to-video
166
+ def generate_text_to_video(prompt, temp, guidance_scale, video_guidance_scale, resolution, seed, progress=gr.Progress()):
167
+ progress(0, desc="Loading model")
168
+ print("[DEBUG] generate_text_to_video called.")
169
+ variant = '768p' if resolution == "768p" else '384p'
170
+ height = height_high if resolution == "768p" else height_low
171
+ width = width_high if resolution == "768p" else width_low
172
+
173
+ def progress_callback(i, m):
174
+ progress(i/m)
175
+
176
+ # Initialize model based on user options using cached function
177
+ try:
178
+ model, torch_dtype_selected = initialize_model_cached(variant, seed)
179
+ except Exception as e:
180
+ print(f"[ERROR] Model initialization failed: {e}")
181
+ return f"Model initialization failed: {e}"
182
+
183
+ try:
184
+ print("[INFO] Starting text-to-video generation...")
185
+ with torch.no_grad(), torch.autocast('cuda', dtype=torch_dtype_selected):
186
+ frames = model.generate(
187
+ prompt=prompt,
188
+ num_inference_steps=[20, 20, 20],
189
+ video_num_inference_steps=[10, 10, 10],
190
+ height=height,
191
+ width=width,
192
+ temp=temp,
193
+ guidance_scale=guidance_scale,
194
+ video_guidance_scale=video_guidance_scale,
195
+ output_type="pil",
196
+ cpu_offloading=cpu_offloading,
197
+ save_memory=True,
198
+ callback=progress_callback,
199
+ )
200
+ print("[INFO] Text-to-video generation completed.")
201
+ except Exception as e:
202
+ print(f"[ERROR] Error during text-to-video generation: {e}")
203
+ return f"Error during video generation: {e}"
204
+
205
+ video_path = f"{str(uuid.uuid4())}_text_to_video_sample.mp4"
206
+ try:
207
+ export_to_video(frames, video_path, fps=24)
208
+ print(f"[INFO] Video exported to {video_path}.")
209
+ except Exception as e:
210
+ print(f"[ERROR] Error exporting video: {e}")
211
+ return f"Error exporting video: {e}"
212
+ return video_path
213
+
214
+ # Function to generate image-to-video
215
+ def generate_image_to_video(image, prompt, temp, video_guidance_scale, resolution, seed, progress=gr.Progress()):
216
+ progress(0, desc="Loading model")
217
+ print("[DEBUG] generate_image_to_video called.")
218
+ variant = '768p' if resolution == "768p" else '384p'
219
+ height = height_high if resolution == "768p" else height_low
220
+ width = width_high if resolution == "768p" else width_low
221
+
222
+ try:
223
+ image = resize_crop_image(image, width, height)
224
+ print("[INFO] Image resized and cropped successfully.")
225
+ except Exception as e:
226
+ print(f"[ERROR] Error processing image: {e}")
227
+ return f"Error processing image: {e}"
228
+
229
+ def progress_callback(i, m):
230
+ progress(i/m)
231
+
232
+ # Initialize model based on user options using cached function
233
+ try:
234
+ model, torch_dtype_selected = initialize_model_cached(variant, seed)
235
+ except Exception as e:
236
+ print(f"[ERROR] Model initialization failed: {e}")
237
+ return f"Model initialization failed: {e}"
238
+
239
+ try:
240
+ print("[INFO] Starting image-to-video generation...")
241
+ with torch.no_grad(), torch.autocast('cuda', dtype=torch_dtype_selected):
242
+ frames = model.generate_i2v(
243
+ prompt=prompt,
244
+ input_image=image,
245
+ num_inference_steps=[10, 10, 10],
246
+ temp=temp,
247
+ video_guidance_scale=video_guidance_scale,
248
+ output_type="pil",
249
+ cpu_offloading=cpu_offloading,
250
+ save_memory=True,
251
+ callback=progress_callback,
252
+ )
253
+ print("[INFO] Image-to-video generation completed.")
254
+ except Exception as e:
255
+ print(f"[ERROR] Error during image-to-video generation: {e}")
256
+ return f"Error during video generation: {e}"
257
+
258
+ video_path = f"{str(uuid.uuid4())}_image_to_video_sample.mp4"
259
+ try:
260
+ export_to_video(frames, video_path, fps=24)
261
+ print(f"[INFO] Video exported to {video_path}.")
262
+ except Exception as e:
263
+ print(f"[ERROR] Error exporting video: {e}")
264
+ return f"Error exporting video: {e}"
265
+ return video_path
266
+
267
+ def update_slider(resolution):
268
+ if resolution == "768p":
269
+ return [gr.update(maximum=31), gr.update(maximum=31)]
270
+ else:
271
+ return [gr.update(maximum=16), gr.update(maximum=16)]
272
+
273
+ # Gradio interface
274
+ with gr.Blocks() as demo:
275
+ gr.Markdown(
276
+ """
277
+ # Pyramid Flow Video Generation Demo
278
+
279
+ Pyramid Flow is a training-efficient **Autoregressive Video Generation** model based on **Flow Matching**. It is trained only on open-source datasets within 20.7k A100 GPU hours.
280
+
281
+ [[Paper]](https://arxiv.org/abs/2410.05954) [[Project Page]](https://pyramid-flow.github.io) [[Code]](https://github.com/jy0205/Pyramid-Flow) [[Model]](https://huggingface.co/rain1011/pyramid-flow-sd3)
282
+ """
283
+ )
284
+
285
+ # Shared settings
286
+ with gr.Row():
287
+ resolution_dropdown = gr.Dropdown(
288
+ choices=["768p", "384p"],
289
+ value="384p",
290
+ label="Model Resolution"
291
+ )
292
+
293
+ with gr.Tab("Text-to-Video"):
294
+ with gr.Row():
295
+ with gr.Column():
296
+ text_prompt = gr.Textbox(label="Prompt (Less than 128 words)", placeholder="Enter a text prompt for the video", lines=2)
297
+ temp_slider = gr.Slider(1, 16, value=16, step=1, label="Duration")
298
+ guidance_scale_slider = gr.Slider(1.0, 15.0, value=9.0, step=0.1, label="Guidance Scale")
299
+ video_guidance_scale_slider = gr.Slider(1.0, 10.0, value=5.0, step=0.1, label="Video Guidance Scale")
300
+ text_seed = gr.Number(label="Inference Seed (Enter a positive number, 0 for random)", value=0)
301
+ txt_generate = gr.Button("Generate Video")
302
+ with gr.Column():
303
+ txt_output = gr.Video(label="Generated Video")
304
+ gr.Examples(
305
+ examples=[
306
+ ["A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors", 16, 7.0, 5.0, "384p"],
307
+ ["Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes", 16, 7.0, 5.0, "384p"],
308
+ # ["Extreme close-up of chicken and green pepper kebabs grilling on a barbeque with flames. Shallow focus and light smoke. vivid colours", 31, 9.0, 5.0, "768p"],
309
+ ],
310
+ inputs=[text_prompt, temp_slider, guidance_scale_slider, video_guidance_scale_slider, resolution_dropdown, text_seed],
311
+ outputs=[txt_output],
312
+ fn=generate_text_to_video,
313
+ cache_examples='lazy',
314
+ )
315
+
316
+ with gr.Tab("Image-to-Video"):
317
+ with gr.Row():
318
+ with gr.Column():
319
+ image_input = gr.Image(type="pil", label="Input Image")
320
+ image_prompt = gr.Textbox(label="Prompt (Less than 128 words)", placeholder="Enter a text prompt for the video", lines=2)
321
+ image_temp_slider = gr.Slider(2, 16, value=16, step=1, label="Duration")
322
+ image_video_guidance_scale_slider = gr.Slider(1.0, 7.0, value=4.0, step=0.1, label="Video Guidance Scale")
323
+ image_seed = gr.Number(label="Inference Seed (Enter a positive number, 0 for random)", value=0)
324
+ img_generate = gr.Button("Generate Video")
325
+ with gr.Column():
326
+ img_output = gr.Video(label="Generated Video")
327
+ gr.Examples(
328
+ examples=[
329
+ ['assets/the_great_wall.jpg', 'FPV flying over the Great Wall', 16, 4.0, "384p"]
330
+ ],
331
+ inputs=[image_input, image_prompt, image_temp_slider, image_video_guidance_scale_slider, resolution_dropdown, image_seed],
332
+ outputs=[img_output],
333
+ fn=generate_image_to_video,
334
+ cache_examples='lazy',
335
+ )
336
+
337
+ # Update generate functions to include resolution options
338
+ txt_generate.click(
339
+ generate_text_to_video,
340
+ inputs=[text_prompt, temp_slider, guidance_scale_slider, video_guidance_scale_slider, resolution_dropdown, text_seed],
341
+ outputs=txt_output
342
+ )
343
+
344
+ img_generate.click(
345
+ generate_image_to_video,
346
+ inputs=[image_input, image_prompt, image_temp_slider, image_video_guidance_scale_slider, resolution_dropdown, image_seed],
347
+ outputs=img_output
348
+ )
349
+ resolution_dropdown.change(
350
+ fn=update_slider,
351
+ inputs=resolution_dropdown,
352
+ outputs=[temp_slider, image_temp_slider]
353
+ )
354
+
355
+ # Launch Gradio app
356
+ demo.launch(share=True)
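
For reference, a minimal headless sketch of the same text-to-video call that the Gradio handlers above make, assuming the 384p checkpoint has already been downloaded to ./pyramid_flow_model as done by download_model_from_hf (the prompt and output filename are placeholders):

import torch
from pyramid_dit import PyramidDiTForVideoGeneration
from diffusers.utils import export_to_video

# Build the low-resolution (384p) pipeline with CPU offloading, mirroring initialize_model() above
model = PyramidDiTForVideoGeneration(
    "./pyramid_flow_model",
    model_name="pyramid_flux",
    model_dtype="bf16",
    model_variant="diffusion_transformer_384p",
    cpu_offloading=True,
)
model.vae.enable_tiling()

with torch.no_grad(), torch.autocast("cuda", dtype=torch.bfloat16):
    frames = model.generate(
        prompt="A hand-held shot of waves crashing on a rocky beach at sunset",  # placeholder prompt
        num_inference_steps=[20, 20, 20],
        video_num_inference_steps=[10, 10, 10],
        height=384,
        width=640,
        temp=16,
        guidance_scale=7.0,
        video_guidance_scale=5.0,
        output_type="pil",
        cpu_offloading=True,
        save_memory=True,
    )
export_to_video(frames, "text_to_video_sample.mp4", fps=24)
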
app_multigpu.py ADDED
@@ -0,0 +1,143 @@
1
+ import os
2
+ import uuid
3
+ import gradio as gr
4
+ import subprocess
5
+ import tempfile
6
+ import shutil
7
+
8
+ def run_inference_multigpu(gpus, variant, model_path, temp, guidance_scale, video_guidance_scale, resolution, prompt):
9
+ """
10
+ Runs the external multi-GPU inference script and returns the path to the generated video.
11
+ """
12
+ # Create a temporary directory to store inputs and outputs
13
+ with tempfile.TemporaryDirectory() as tmpdir:
14
+ output_video = os.path.join(tmpdir, f"{uuid.uuid4()}_output.mp4")
15
+
16
+ # Path to the external shell script
17
+ script_path = "./scripts/app_multigpu_engine.sh" # Updated script path
18
+
19
+ # Prepare the command
20
+ cmd = [
21
+ script_path,
22
+ str(gpus),
23
+ variant,
24
+ model_path,
25
+ 't2v', # Task is always 't2v' since 'i2v' is removed
26
+ str(temp),
27
+ str(guidance_scale),
28
+ str(video_guidance_scale),
29
+ resolution,
30
+ output_video,
31
+ prompt # Pass the prompt directly as an argument
32
+ ]
33
+
34
+ try:
35
+ # Run the external script
36
+ subprocess.run(cmd, check=True)
37
+ except subprocess.CalledProcessError as e:
38
+ raise RuntimeError(f"Error during video generation: {e}")
39
+
40
+ # After generation, move the video to a permanent location
41
+ final_output = os.path.join("generated_videos", f"{uuid.uuid4()}_output.mp4")
42
+ os.makedirs("generated_videos", exist_ok=True)
43
+ shutil.move(output_video, final_output)
44
+
45
+ return final_output
46
+
47
+ def generate_text_to_video(prompt, temp, guidance_scale, video_guidance_scale, resolution, gpus):
48
+ model_path = "./pyramid_flow_model" # Use the model path as specified
49
+ # Determine variant based on resolution
50
+ if resolution == "768p":
51
+ variant = "diffusion_transformer_768p"
52
+ else:
53
+ variant = "diffusion_transformer_384p"
54
+ return run_inference_multigpu(gpus, variant, model_path, temp, guidance_scale, video_guidance_scale, resolution, prompt)
55
+
56
+ # Gradio interface
57
+ with gr.Blocks() as demo:
58
+ gr.Markdown(
59
+ """
60
+ # Pyramid Flow Video Generation Demo
61
+
62
+ Pyramid Flow is a training-efficient **Autoregressive Video Generation** model based on **Flow Matching**. It is trained only on open-source datasets within 20.7k A100 GPU hours.
63
+
64
+ [[Paper]](https://arxiv.org/abs/2410.05954) [[Project Page]](https://pyramid-flow.github.io) [[Code]](https://github.com/jy0205/Pyramid-Flow) [[Model]](https://huggingface.co/rain1011/pyramid-flow-sd3)
65
+ """
66
+ )
67
+
68
+ # Shared settings
69
+ with gr.Row():
70
+ gpus_dropdown = gr.Dropdown(
71
+ choices=[2, 4],
72
+ value=4,
73
+ label="Number of GPUs"
74
+ )
75
+ resolution_dropdown = gr.Dropdown(
76
+ choices=["768p", "384p"],
77
+ value="768p",
78
+ label="Model Resolution"
79
+ )
80
+
81
+ with gr.Tab("Text-to-Video"):
82
+ with gr.Row():
83
+ with gr.Column():
84
+ text_prompt = gr.Textbox(
85
+ label="Prompt (Less than 128 words)",
86
+ placeholder="Enter a text prompt for the video",
87
+ lines=2
88
+ )
89
+ temp_slider = gr.Slider(1, 31, value=16, step=1, label="Duration")
90
+ guidance_scale_slider = gr.Slider(1.0, 15.0, value=9.0, step=0.1, label="Guidance Scale")
91
+ video_guidance_scale_slider = gr.Slider(1.0, 10.0, value=5.0, step=0.1, label="Video Guidance Scale")
92
+ txt_generate = gr.Button("Generate Video")
93
+ with gr.Column():
94
+ txt_output = gr.Video(label="Generated Video")
95
+ gr.Examples(
96
+ examples=[
97
+ [
98
+ "A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors",
99
+ 16,
100
+ 9.0,
101
+ 5.0,
102
+ "768p",
103
+ 4
104
+ ],
105
+ [
106
+ "Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes",
107
+ 16,
108
+ 9.0,
109
+ 5.0,
110
+ "768p",
111
+ 4
112
+ ],
113
+ [
114
+ "Extreme close-up of chicken and green pepper kebabs grilling on a barbeque with flames. Shallow focus and light smoke. vivid colours",
115
+ 31,
116
+ 9.0,
117
+ 5.0,
118
+ "768p",
119
+ 4
120
+ ],
121
+ ],
122
+ inputs=[text_prompt, temp_slider, guidance_scale_slider, video_guidance_scale_slider, resolution_dropdown, gpus_dropdown],
123
+ outputs=[txt_output],
124
+ fn=generate_text_to_video,
125
+ cache_examples='lazy',
126
+ )
127
+
128
+ # Update generate function for Text-to-Video
129
+ txt_generate.click(
130
+ generate_text_to_video,
131
+ inputs=[
132
+ text_prompt,
133
+ temp_slider,
134
+ guidance_scale_slider,
135
+ video_guidance_scale_slider,
136
+ resolution_dropdown,
137
+ gpus_dropdown
138
+ ],
139
+ outputs=txt_output
140
+ )
141
+
142
+ # Launch Gradio app
143
+ demo.launch(share=True)
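
The positional argument order that run_inference_multigpu() passes to scripts/app_multigpu_engine.sh can also be reproduced directly. A minimal sketch, assuming the script and the ./pyramid_flow_model checkpoint exist as referenced above (the output filename and prompt are placeholders):

import subprocess

# Argument order must match what run_inference_multigpu() builds:
# GPUS, VARIANT, MODEL_PATH, TASK, TEMP, GUIDANCE, VIDEO_GUIDANCE, RESOLUTION, OUTPUT, PROMPT
cmd = [
    "./scripts/app_multigpu_engine.sh",
    "4",                               # number of GPUs
    "diffusion_transformer_768p",      # model variant directory
    "./pyramid_flow_model",            # base model path
    "t2v",                             # task (this demo only exposes text-to-video)
    "16",                              # temp (duration)
    "9.0",                             # guidance scale
    "5.0",                             # video guidance scale
    "768p",                            # resolution
    "output.mp4",                      # output video path
    "A drone shot over a snow-covered mountain ridge at sunrise",  # placeholder prompt
]
subprocess.run(cmd, check=True)
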
assets/motivation.jpg ADDED
assets/the_great_wall.jpg ADDED
assets/user_study.jpg ADDED
assets/vbench.jpg ADDED
causal_video_vae_demo.ipynb ADDED
@@ -0,0 +1,221 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "import cv2\n",
12
+ "import torch\n",
13
+ "import numpy as np\n",
14
+ "import PIL\n",
15
+ "from PIL import Image\n",
16
+ "from einops import rearrange\n",
17
+ "from video_vae import CausalVideoVAELossWrapper\n",
18
+ "from torchvision import transforms as pth_transforms\n",
19
+ "from torchvision.transforms.functional import InterpolationMode\n",
20
+ "from IPython.display import Image as ipython_image\n",
21
+ "from diffusers.utils import load_image, export_to_video, export_to_gif\n",
22
+ "from IPython.display import HTML"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "model_path = \"pyramid-flow-miniflux/causal_video_vae\" # The video-vae checkpoint dir\n",
32
+ "model_dtype = 'bf16'\n",
33
+ "\n",
34
+ "device_id = 3\n",
35
+ "torch.cuda.set_device(device_id)\n",
36
+ "\n",
37
+ "model = CausalVideoVAELossWrapper(\n",
38
+ " model_path,\n",
39
+ " model_dtype,\n",
40
+ " interpolate=False, \n",
41
+ " add_discriminator=False,\n",
42
+ ")\n",
43
+ "model = model.to(\"cuda\")\n",
44
+ "\n",
45
+ "if model_dtype == \"bf16\":\n",
46
+ " torch_dtype = torch.bfloat16 \n",
47
+ "elif model_dtype == \"fp16\":\n",
48
+ " torch_dtype = torch.float16\n",
49
+ "else:\n",
50
+ " torch_dtype = torch.float32\n",
51
+ "\n",
52
+ "def image_transform(images, resize_width, resize_height):\n",
53
+ " transform_list = pth_transforms.Compose([\n",
54
+ " pth_transforms.Resize((resize_height, resize_width), InterpolationMode.BICUBIC, antialias=True),\n",
55
+ " pth_transforms.ToTensor(),\n",
56
+ " pth_transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
57
+ " ])\n",
58
+ " return torch.stack([transform_list(image) for image in images])\n",
59
+ "\n",
60
+ "\n",
61
+ "def get_transform(width, height, new_width=None, new_height=None, resize=False,):\n",
62
+ " transform_list = []\n",
63
+ "\n",
64
+ " if resize:\n",
65
+ " if new_width is None:\n",
66
+ " new_width = width // 8 * 8\n",
67
+ " if new_height is None:\n",
68
+ " new_height = height // 8 * 8\n",
69
+ " transform_list.append(pth_transforms.Resize((new_height, new_width), InterpolationMode.BICUBIC, antialias=True))\n",
70
+ " \n",
71
+ " transform_list.extend([\n",
72
+ " pth_transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),\n",
73
+ " ])\n",
74
+ " transform_list = pth_transforms.Compose(transform_list)\n",
75
+ "\n",
76
+ " return transform_list\n",
77
+ "\n",
78
+ "\n",
79
+ "def load_video_and_transform(video_path, frame_number, new_width=None, new_height=None, max_frames=600, sample_fps=24, resize=False):\n",
80
+ " try:\n",
81
+ " video_capture = cv2.VideoCapture(video_path)\n",
82
+ " fps = video_capture.get(cv2.CAP_PROP_FPS)\n",
83
+ " frames = []\n",
84
+ " pil_frames = []\n",
85
+ " while True:\n",
86
+ " flag, frame = video_capture.read()\n",
87
+ " if not flag:\n",
88
+ " break\n",
89
+ " \n",
90
+ " pil_frames.append(np.ascontiguousarray(frame[:, :, ::-1]))\n",
91
+ " frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
92
+ " frame = torch.from_numpy(frame)\n",
93
+ " frame = frame.permute(2, 0, 1)\n",
94
+ " frames.append(frame)\n",
95
+ " if len(frames) >= max_frames:\n",
96
+ " break\n",
97
+ "\n",
98
+ " video_capture.release()\n",
99
+ " interval = max(int(fps / sample_fps), 1)\n",
100
+ " pil_frames = pil_frames[::interval][:frame_number]\n",
101
+ " frames = frames[::interval][:frame_number]\n",
102
+ " frames = torch.stack(frames).float() / 255\n",
103
+ " width = frames.shape[-1]\n",
104
+ " height = frames.shape[-2]\n",
105
+ " video_transform = get_transform(width, height, new_width, new_height, resize=resize)\n",
106
+ " frames = video_transform(frames)\n",
107
+ " pil_frames = [Image.fromarray(frame).convert(\"RGB\") for frame in pil_frames]\n",
108
+ "\n",
109
+ " if resize:\n",
110
+ " if new_width is None:\n",
111
+ " new_width = width // 32 * 32\n",
112
+ " if new_height is None:\n",
113
+ " new_height = height // 32 * 32\n",
114
+ " pil_frames = [frame.resize((new_width or width, new_height or height), PIL.Image.BICUBIC) for frame in pil_frames]\n",
115
+ " return frames, pil_frames\n",
116
+ " except Exception:\n",
117
+ " return None\n",
118
+ "\n",
119
+ "\n",
120
+ "def show_video(ori_path, rec_path, width=\"100%\"):\n",
121
+ " html = ''\n",
122
+ " if ori_path is not None:\n",
123
+ " html += f\"\"\"<video controls=\"\" name=\"media\" data-fullscreen-container=\"true\" width=\"{width}\">\n",
124
+ " <source src=\"{ori_path}\" type=\"video/mp4\">\n",
125
+ " </video>\n",
126
+ " \"\"\"\n",
127
+ " \n",
128
+ " html += f\"\"\"<video controls=\"\" name=\"media\" data-fullscreen-container=\"true\" width=\"{width}\">\n",
129
+ " <source src=\"{rec_path}\" type=\"video/mp4\">\n",
130
+ " </video>\n",
131
+ " \"\"\"\n",
132
+ " return HTML(html)"
133
+ ]
134
+ },
135
+ {
136
+ "attachments": {},
137
+ "cell_type": "markdown",
138
+ "metadata": {},
139
+ "source": [
140
+ "### Image Reconstruction"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "image_path = 'image_path'\n",
150
+ "\n",
151
+ "image = Image.open(image_path).convert(\"RGB\")\n",
152
+ "resize_width = image.width // 8 * 8\n",
153
+ "resize_height = image.height // 8 * 8\n",
154
+ "input_image_tensor = image_transform([image], resize_width, resize_height)\n",
155
+ "input_image_tensor = input_image_tensor.permute(1, 0, 2, 3).unsqueeze(0)\n",
156
+ "\n",
157
+ "with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):\n",
158
+ " latent = model.encode_latent(input_image_tensor.to(\"cuda\"), sample=True)\n",
159
+ " rec_images = model.decode_latent(latent)\n",
160
+ "\n",
161
+ "display(image)\n",
162
+ "display(rec_images[0])"
163
+ ]
164
+ },
165
+ {
166
+ "attachments": {},
167
+ "cell_type": "markdown",
168
+ "metadata": {},
169
+ "source": [
170
+ "### Video Reconstruction"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": null,
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": [
179
+ "video_path = 'video_path'\n",
180
+ "\n",
181
+ "frame_number = 57 # x*8 + 1\n",
182
+ "width = 640\n",
183
+ "height = 384\n",
184
+ "\n",
185
+ "video_frames_tensor, pil_video_frames = load_video_and_transform(video_path, frame_number, new_width=width, new_height=height, resize=True)\n",
186
+ "video_frames_tensor = video_frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)\n",
187
+ "print(video_frames_tensor.shape)\n",
188
+ "\n",
189
+ "with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):\n",
190
+ " latent = model.encode_latent(video_frames_tensor.to(\"cuda\"), sample=False, window_size=8, temporal_chunk=True)\n",
191
+ " rec_frames = model.decode_latent(latent.float(), window_size=2, temporal_chunk=True)\n",
192
+ "\n",
193
+ "export_to_video(pil_video_frames, './ori_video.mp4', fps=24)\n",
194
+ "export_to_video(rec_frames, \"./rec_video.mp4\", fps=24)\n",
195
+ "show_video('./ori_video.mp4', \"./rec_video.mp4\", \"60%\")"
196
+ ]
197
+ }
198
+ ],
199
+ "metadata": {
200
+ "kernelspec": {
201
+ "display_name": "Python 3",
202
+ "language": "python",
203
+ "name": "python3"
204
+ },
205
+ "language_info": {
206
+ "codemirror_mode": {
207
+ "name": "ipython",
208
+ "version": 3
209
+ },
210
+ "file_extension": ".py",
211
+ "mimetype": "text/x-python",
212
+ "name": "python",
213
+ "nbconvert_exporter": "python",
214
+ "pygments_lexer": "ipython3",
215
+ "version": "3.8.10"
216
+ },
217
+ "orig_nbformat": 4
218
+ },
219
+ "nbformat": 4,
220
+ "nbformat_minor": 2
221
+ }
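
As the comment in the video-reconstruction cell notes, frame_number is set to 57 because the causal video VAE operates on clips of 8*k + 1 frames. A small hypothetical helper (not part of the repo) makes that rounding explicit:

def nearest_valid_frame_count(n: int) -> int:
    """Round n down to the nearest frame count of the form 8*k + 1 expected by the causal video VAE."""
    return max(1, ((n - 1) // 8) * 8 + 1)

assert nearest_valid_frame_count(60) == 57
assert nearest_valid_frame_count(57) == 57
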
dataset/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ from .dataset_cls import (
2
+ ImageTextDataset,
3
+ LengthGroupedVideoTextDataset,
4
+ ImageDataset,
5
+ VideoDataset,
6
+ )
7
+
8
+ from .dataloaders import (
9
+ create_image_text_dataloaders,
10
+ create_length_grouped_video_text_dataloader,
11
+ create_mixed_dataloaders,
12
+ )
dataset/bucket_loader.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch
2
+ import torchvision
3
+ import numpy as np
4
+ import math
5
+ import random
6
+ import time
7
+
8
+
9
+ class Bucketeer:
10
+ def __init__(
11
+ self, dataloader,
12
+ sizes=[(256, 256), (192, 384), (192, 320), (384, 192), (320, 192)],
13
+ is_infinite=True, epoch=0,
14
+ ):
15
+ # Ratios and Sizes : (w h)
16
+ self.sizes = sizes
17
+ self.batch_size = dataloader.batch_size
18
+ self._dataloader = dataloader
19
+ self.iterator = iter(dataloader)
20
+ self.sampler = dataloader.sampler
21
+ self.buckets = {s: [] for s in self.sizes}
22
+ self.is_infinite = is_infinite
23
+ self._epoch = epoch
24
+
25
+ def get_available_batch(self):
26
+ available_size = []
27
+ for b in self.buckets:
28
+ if len(self.buckets[b]) >= self.batch_size:
29
+ available_size.append(b)
30
+
31
+ if len(available_size) == 0:
32
+ return None
33
+ else:
34
+ b = random.choice(available_size)
35
+ batch = self.buckets[b][:self.batch_size]
36
+ self.buckets[b] = self.buckets[b][self.batch_size:]
37
+ return batch
38
+
39
+ def __next__(self):
40
+ batch = self.get_available_batch()
41
+ while batch is None:
42
+ try:
43
+ elements = next(self.iterator)
44
+ except StopIteration:
45
+ # To make the iterator infinite
46
+ if self.is_infinite:
47
+ self._epoch += 1
48
+ if hasattr(self._dataloader.sampler, "set_epoch"):
49
+ self._dataloader.sampler.set_epoch(self._epoch)
50
+ time.sleep(2) # Prevent possible deadlock during epoch transition
51
+ self.iterator = iter(self._dataloader)
52
+ elements = next(self.iterator)
53
+ else:
54
+ raise StopIteration
55
+
56
+ for dct in elements:
57
+ try:
58
+ img = dct['video']
59
+ size = (img.shape[-1], img.shape[-2])
60
+ self.buckets[size].append({**{'video': img}, **{k:dct[k] for k in dct if k != 'video'}})
61
+ except Exception as e:
62
+ continue
63
+
64
+ batch = self.get_available_batch()
65
+
66
+ out = {k:[batch[i][k] for i in range(len(batch))] for k in batch[0]}
67
+ return {k: torch.stack(o, dim=0) if isinstance(o[0], torch.Tensor) else o for k, o in out.items()}
68
+
69
+ def __iter__(self):
70
+ return self
71
+
72
+ def __len__(self):
73
+ return len(self.iterator)
74
+
75
+
76
+ class TemporalLengthBucketeer:
77
+ def __init__(
78
+ self, dataloader, max_frames=16, epoch=0,
79
+ ):
80
+ self.batch_size = dataloader.batch_size
81
+ self._dataloader = dataloader
82
+ self.iterator = iter(dataloader)
83
+ self.buckets = {temp: [] for temp in range(1, max_frames + 1)}
84
+ self._epoch = epoch
85
+
86
+ def get_available_batch(self):
87
+ available_size = []
88
+ for b in self.buckets:
89
+ if len(self.buckets[b]) >= self.batch_size:
90
+ available_size.append(b)
91
+
92
+ if len(available_size) == 0:
93
+ return None
94
+ else:
95
+ b = random.choice(available_size)
96
+ batch = self.buckets[b][:self.batch_size]
97
+ self.buckets[b] = self.buckets[b][self.batch_size:]
98
+ return batch
99
+
100
+ def __next__(self):
101
+ batch = self.get_available_batch()
102
+ while batch is None:
103
+ try:
104
+ elements = next(self.iterator)
105
+ except StopIteration:
106
+ # To make the iterator infinite
107
+ self._epoch += 1
108
+ if hasattr(self._dataloader.sampler, "set_epoch"):
109
+ self._dataloader.sampler.set_epoch(self._epoch)
110
+ time.sleep(2) # Prevent possible deadlock during epoch transition
111
+ self.iterator = iter(self._dataloader)
112
+ elements = next(self.iterator)
113
+
114
+ for dct in elements:
115
+ try:
116
+ video_latent = dct['video']
117
+ temp = video_latent.shape[2]
118
+ self.buckets[temp].append({**{'video': video_latent}, **{k:dct[k] for k in dct if k != 'video'}})
119
+ except Exception as e:
120
+ continue
121
+
122
+ batch = self.get_available_batch()
123
+
124
+ out = {k:[batch[i][k] for i in range(len(batch))] for k in batch[0]}
125
+ out = {k: torch.cat(o, dim=0) if isinstance(o[0], torch.Tensor) else o for k, o in out.items()}
126
+
127
+ if 'prompt_embed' in out:
128
+ # Loading the pre-extracted textual features
129
+ prompt_embeds = out['prompt_embed'].clone()
130
+ del out['prompt_embed']
131
+ prompt_attention_mask = out['prompt_attention_mask'].clone()
132
+ del out['prompt_attention_mask']
133
+ pooled_prompt_embeds = out['pooled_prompt_embed'].clone()
134
+ del out['pooled_prompt_embed']
135
+
136
+ out['text'] = {
137
+ 'prompt_embeds' : prompt_embeds,
138
+ 'prompt_attention_mask': prompt_attention_mask,
139
+ 'pooled_prompt_embeds': pooled_prompt_embeds,
140
+ }
141
+
142
+ return out
143
+
144
+ def __iter__(self):
145
+ return self
146
+
147
+ def __len__(self):
148
+ return len(self.iterator)
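
A minimal sketch of how Bucketeer is meant to be used: it wraps a DataLoader whose collate_fn returns the raw list of sample dicts, and only emits a batch once one aspect-ratio bucket has accumulated batch_size samples (this mirrors create_image_text_dataloaders in dataset/dataloaders.py; the annotation path is a placeholder):

from torch.utils.data import DataLoader
from dataset import ImageTextDataset
from dataset.bucket_loader import Bucketeer

# The dataset emits per-sample dicts with a 'video' tensor whose (width, height) matches one of `sizes`
dataset = ImageTextDataset(anno_file=["image_annotations.jsonl"])  # placeholder annotation file
loader = DataLoader(dataset, batch_size=4, collate_fn=lambda samples: samples, drop_last=True)
bucketed = Bucketeer(loader, sizes=[(1024, 1024), (768, 1280), (1280, 768)], is_infinite=True)

batch = next(bucketed)  # every sample stacked into batch['video'] shares the same (width, height)
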
dataset/dataloaders.py ADDED
@@ -0,0 +1,190 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import time
5
+ import random
6
+ from typing import Iterable
7
+
8
+ from collections import OrderedDict
9
+ from PIL import Image
10
+ from torch.utils.data import Dataset, DataLoader, ConcatDataset, IterableDataset, DistributedSampler, RandomSampler
11
+ from torch.utils.data.dataloader import default_collate
12
+ from torchvision import transforms
13
+ from torchvision.transforms.functional import InterpolationMode
14
+ from torchvision.transforms import functional as F
15
+ from .bucket_loader import Bucketeer, TemporalLengthBucketeer
16
+
17
+
18
+ class IterLoader:
19
+ """
20
+ A wrapper that turns a DataLoader into an infinite iterator.
21
+
22
+ Modified from:
23
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/iter_based_runner.py
24
+ """
25
+
26
+ def __init__(self, dataloader: DataLoader, use_distributed: bool = False, epoch: int = 0):
27
+ self._dataloader = dataloader
28
+ self.iter_loader = iter(self._dataloader)
29
+ self._use_distributed = use_distributed
30
+ self._epoch = epoch
31
+
32
+ @property
33
+ def epoch(self) -> int:
34
+ return self._epoch
35
+
36
+ def __next__(self):
37
+ try:
38
+ data = next(self.iter_loader)
39
+ except StopIteration:
40
+ self._epoch += 1
41
+ if hasattr(self._dataloader.sampler, "set_epoch") and self._use_distributed:
42
+ self._dataloader.sampler.set_epoch(self._epoch)
43
+ time.sleep(2) # Prevent possible deadlock during epoch transition
44
+ self.iter_loader = iter(self._dataloader)
45
+ data = next(self.iter_loader)
46
+
47
+ return data
48
+
49
+ def __iter__(self):
50
+ return self
51
+
52
+ def __len__(self):
53
+ return len(self._dataloader)
54
+
55
+
56
+ def identity(x):
57
+ return x
58
+
59
+
60
+ def create_image_text_dataloaders(dataset, batch_size, num_workers,
61
+ multi_aspect_ratio=True, epoch=0, sizes=[(512, 512), (384, 640), (640, 384)],
62
+ use_distributed=True, world_size=None, rank=None,
63
+ ):
64
+ """
65
+ The dataset has already been split across the different ranks
66
+ """
67
+ if use_distributed:
68
+ assert world_size is not None
69
+ assert rank is not None
70
+ sampler = DistributedSampler(
71
+ dataset,
72
+ shuffle=True,
73
+ num_replicas=world_size,
74
+ rank=rank,
75
+ seed=epoch,
76
+ )
77
+ else:
78
+ sampler = RandomSampler(dataset)
79
+
80
+ dataloader = DataLoader(
81
+ dataset,
82
+ batch_size=batch_size,
83
+ num_workers=num_workers,
84
+ pin_memory=True,
85
+ sampler=sampler,
86
+ collate_fn=identity if multi_aspect_ratio else default_collate,
87
+ drop_last=True,
88
+ )
89
+
90
+ if multi_aspect_ratio:
91
+ dataloader_iterator = Bucketeer(
92
+ dataloader,
93
+ sizes=sizes,
94
+ is_infinite=True, epoch=epoch,
95
+ )
96
+ else:
97
+ dataloader_iterator = iter(dataloader)
98
+
99
+ # To make it infinite
100
+ loader = IterLoader(dataloader_iterator, use_distributed=False, epoch=epoch)
101
+
102
+ return loader
103
+
104
+
105
+ def create_length_grouped_video_text_dataloader(dataset, batch_size, num_workers, max_frames,
106
+ world_size=None, rank=None, epoch=0, use_distributed=False):
107
+ if use_distributed:
108
+ assert world_size is not None
109
+ assert rank is not None
110
+ sampler = DistributedSampler(
111
+ dataset,
112
+ shuffle=True,
113
+ num_replicas=world_size,
114
+ rank=rank,
115
+ seed=epoch,
116
+ )
117
+ else:
118
+ sampler = RandomSampler(dataset)
119
+
120
+ dataloader = DataLoader(
121
+ dataset,
122
+ batch_size=batch_size,
123
+ num_workers=num_workers,
124
+ pin_memory=True,
125
+ sampler=sampler,
126
+ collate_fn=identity,
127
+ drop_last=True,
128
+ )
129
+
130
+ # make it infinite
131
+ dataloader_iterator = TemporalLengthBucketeer(
132
+ dataloader,
133
+ max_frames=max_frames,
134
+ epoch=epoch,
135
+ )
136
+
137
+ return dataloader_iterator
138
+
139
+
140
+ def create_mixed_dataloaders(
141
+ dataset, batch_size, num_workers, world_size=None, rank=None, epoch=0,
142
+ image_mix_ratio=0.1, use_image_video_mixed_training=True,
143
+ ):
144
+ """
145
+ The video & image mixed training dataloader builder
146
+ """
147
+
148
+ assert world_size is not None
149
+ assert rank is not None
150
+
151
+ image_gpus = max(1, int(world_size * image_mix_ratio))
152
+ if use_image_video_mixed_training:
153
+ video_gpus = world_size - image_gpus
154
+ else:
155
+ # only use video data
156
+ video_gpus = world_size
157
+ image_gpus = 0
158
+
159
+ print(f"{image_gpus} gpus for image, {video_gpus} gpus for video")
160
+
161
+ if rank < video_gpus:
162
+ sampler = DistributedSampler(
163
+ dataset,
164
+ shuffle=True,
165
+ num_replicas=video_gpus,
166
+ rank=rank,
167
+ seed=epoch,
168
+ )
169
+ else:
170
+ sampler = DistributedSampler(
171
+ dataset,
172
+ shuffle=True,
173
+ num_replicas=image_gpus,
174
+ rank=rank - video_gpus,
175
+ seed=epoch,
176
+ )
177
+
178
+ loader = DataLoader(
179
+ dataset,
180
+ batch_size=batch_size,
181
+ num_workers=num_workers,
182
+ pin_memory=True,
183
+ sampler=sampler,
184
+ collate_fn=default_collate,
185
+ drop_last=True,
186
+ )
187
+
188
+ # To make it infinite
189
+ loader = IterLoader(loader, use_distributed=True, epoch=epoch)
190
+ return loader
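
A minimal single-process sketch of the length-grouped video loader built above, assuming the annotation file (placeholder path) points at pre-extracted VAE latents and text features as LengthGroupedVideoTextDataset expects:

from dataset import LengthGroupedVideoTextDataset, create_length_grouped_video_text_dataloader

dataset = LengthGroupedVideoTextDataset(
    anno_file=["video_annotations.jsonl"],  # placeholder; entries carry 'latent', 'text_fea', 'text'
    max_frames=16,
    resolution="384p",
)
loader = create_length_grouped_video_text_dataloader(
    dataset,
    batch_size=2,
    num_workers=4,
    max_frames=16,
    use_distributed=False,
)
batch = next(loader)  # all latents in batch['video'] share the same temporal length
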
dataset/dataset_cls.py ADDED
@@ -0,0 +1,377 @@
1
+ import os
2
+ import json
3
+ import jsonlines
4
+ import torch
5
+ import math
6
+ import random
7
+ import cv2
8
+
9
+ from tqdm import tqdm
10
+ from collections import OrderedDict
11
+
12
+ from PIL import Image
13
+ from PIL import ImageFile
14
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
15
+
16
+ import numpy as np
17
+ import subprocess
18
+ from torch.utils.data import Dataset, DataLoader
19
+ from torchvision import transforms
20
+ from torchvision.transforms.functional import InterpolationMode
21
+ from torchvision.transforms import functional as F
22
+
23
+
24
+ class ImageTextDataset(Dataset):
25
+ """
26
+ Usage:
27
+ The dataset class for image-text pairs, used for image generation training
28
+ It supports multi-aspect ratio training
29
+ params:
30
+ anno_file: The annotation file list
31
+ add_normalize: whether to normalize the input image pixel to [-1, 1], default: True
32
+ ratios: The aspect ratios during training, format: width / height
33
+ sizes: The resolution of training images, format: (width, height)
34
+ """
35
+ def __init__(
36
+ self, anno_file, add_normalize=True,
37
+ ratios=[1/1, 3/5, 5/3],
38
+ sizes=[(1024, 1024), (768, 1280), (1280, 768)],
39
+ crop_mode='random', p_random_ratio=0.0,
40
+ ):
41
+ # Ratios and Sizes : (w h)
42
+ super().__init__()
43
+
44
+ self.image_annos = []
45
+ if not isinstance(anno_file, list):
46
+ anno_file = [anno_file]
47
+
48
+ for anno_file_ in anno_file:
49
+ print(f"Load image annotation files from {anno_file_}")
50
+ with jsonlines.open(anno_file_, 'r') as reader:
51
+ for item in reader:
52
+ self.image_annos.append(item)
53
+
54
+ print(f"Totally Remained {len(self.image_annos)} images")
55
+
56
+ transform_list = [
57
+ transforms.ToTensor(),
58
+ ]
59
+
60
+ if add_normalize:
61
+ transform_list.append(transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))
62
+
63
+ self.transform = transforms.Compose(transform_list)
64
+
65
+ print(f"Transform List is {transform_list}")
66
+
67
+ assert crop_mode in ['center', 'random']
68
+ self.crop_mode = crop_mode
69
+ self.ratios = ratios
70
+ self.sizes = sizes
71
+ self.p_random_ratio = p_random_ratio
72
+
73
+ def get_closest_size(self, x):
74
+ if self.p_random_ratio > 0 and np.random.rand() < self.p_random_ratio:
75
+ best_size_idx = np.random.randint(len(self.ratios))
76
+ else:
77
+ w, h = x.width, x.height
78
+ best_size_idx = np.argmin([abs(w/h-r) for r in self.ratios])
79
+ return self.sizes[best_size_idx]
80
+
81
+ def get_resize_size(self, orig_size, tgt_size):
82
+ if (tgt_size[1]/tgt_size[0] - 1) * (orig_size[1]/orig_size[0] - 1) >= 0:
83
+ alt_min = int(math.ceil(max(tgt_size)*min(orig_size)/max(orig_size)))
84
+ resize_size = max(alt_min, min(tgt_size))
85
+ else:
86
+ alt_max = int(math.ceil(min(tgt_size)*max(orig_size)/min(orig_size)))
87
+ resize_size = max(alt_max, max(tgt_size))
88
+ return resize_size
89
+
90
+ def __len__(self):
91
+ return len(self.image_annos)
92
+
93
+ def __getitem__(self, index):
94
+ image_anno = self.image_annos[index]
95
+
96
+ try:
97
+ img = Image.open(image_anno['image']).convert("RGB")
98
+ text = image_anno['text']
99
+
100
+ assert isinstance(text, str), "Text should be str"
101
+
102
+ size = self.get_closest_size(img)
103
+ resize_size = self.get_resize_size((img.width, img.height), size)
104
+
105
+ img = transforms.functional.resize(img, resize_size, interpolation=transforms.InterpolationMode.BICUBIC, antialias=True)
106
+
107
+ if self.crop_mode == 'center':
108
+ img = transforms.functional.center_crop(img, (size[1], size[0]))
109
+ elif self.crop_mode == 'random':
110
+ img = transforms.RandomCrop((size[1], size[0]))(img)
111
+ else:
112
+ img = transforms.functional.center_crop(img, (size[1], size[0]))
113
+
114
+ image_tensor = self.transform(img)
115
+
116
+ return {
117
+ "video": image_tensor, # using keyname `video`, to be compatible with video
118
+ "text" : text,
119
+ "identifier": 'image',
120
+ }
121
+
122
+ except Exception as e:
123
+ print(f'Load Image Error with {e}')
124
+ return self.__getitem__(random.randint(0, self.__len__() - 1))
125
+
126
+
127
+ class LengthGroupedVideoTextDataset(Dataset):
128
+ """
129
+ Usage:
130
+ The dataset class for video-text pairs, used for video generation training
131
+ It groups videos with the same number of frames together
132
+ Only fixed-resolution training is currently supported
133
+ params:
134
+ anno_file: The annotation file list
135
+ max_frames: The maximum temporal length (the VAE latent temporal length), e.g. 16 => (16 - 1) * 8 + 1 = 121 video frames
136
+ load_vae_latent: Load pre-extracted VAE latents during training; we recommend extracting the latents in advance
137
+ to reduce the per-batch time cost
138
+ load_text_fea: Load pre-extracted text features during training; we recommend extracting the prompt textual features
139
+ in advance, since the T5 encoder consumes a lot of GPU memory
140
+ """
141
+
142
+ def __init__(self, anno_file, max_frames=16, resolution='384p', load_vae_latent=True, load_text_fea=True):
143
+ super().__init__()
144
+
145
+ self.video_annos = []
146
+ self.max_frames = max_frames
147
+ self.load_vae_latent = load_vae_latent
148
+ self.load_text_fea = load_text_fea
149
+ self.resolution = resolution
150
+
151
+ assert load_vae_latent, "Only loading pre-extracted VAE latents is supported for now; loading raw video frames directly will be supported in the future"
152
+
153
+ if not isinstance(anno_file, list):
154
+ anno_file = [anno_file]
155
+
156
+ for anno_file_ in anno_file:
157
+ with jsonlines.open(anno_file_, 'r') as reader:
158
+ for item in tqdm(reader):
159
+ self.video_annos.append(item)
160
+
161
+ print(f"Totally Remained {len(self.video_annos)} videos")
162
+
163
+ def __len__(self):
164
+ return len(self.video_annos)
165
+
166
+ def __getitem__(self, index):
167
+ try:
168
+ video_anno = self.video_annos[index]
169
+ text = video_anno['text']
170
+ latent_path = video_anno['latent']
171
+ latent = torch.load(latent_path, map_location='cpu') # loading the pre-extracted video latents
172
+
173
+ # TODO: remove the hard-coded latent shape check
174
+ if self.resolution == '384p':
175
+ assert latent.shape[-1] == 640 // 8
176
+ assert latent.shape[-2] == 384 // 8
177
+ else:
178
+ assert self.resolution == '768p'
179
+ assert latent.shape[-1] == 1280 // 8
180
+ assert latent.shape[-2] == 768 // 8
181
+
182
+ cur_temp = latent.shape[2]
183
+ cur_temp = min(cur_temp, self.max_frames)
184
+
185
+ video_latent = latent[:,:,:cur_temp].float()
186
+ assert video_latent.shape[1] == 16
187
+
188
+ if self.load_text_fea:
189
+ text_fea_path = video_anno['text_fea']
190
+ text_fea = torch.load(text_fea_path, map_location='cpu')
191
+ return {
192
+ 'video': video_latent,
193
+ 'prompt_embed': text_fea['prompt_embed'],
194
+ 'prompt_attention_mask': text_fea['prompt_attention_mask'],
195
+ 'pooled_prompt_embed': text_fea['pooled_prompt_embed'],
196
+ "identifier": 'video',
197
+ }
198
+
199
+ else:
200
+ return {
201
+ 'video': video_latent,
202
+ 'text': text,
203
+ "identifier": 'video',
204
+ }
205
+
206
+ except Exception as e:
207
+ print(f'Load Video Error with {e}')
208
+ return self.__getitem__(random.randint(0, self.__len__() - 1))
209
+
210
+
211
+ class VideoFrameProcessor:
212
+ # load a video and transform
213
+ def __init__(self, resolution=256, num_frames=24, add_normalize=True, sample_fps=24):
214
+
215
+ image_size = resolution
216
+
217
+ transform_list = [
218
+ transforms.Resize(image_size, interpolation=InterpolationMode.BICUBIC, antialias=True),
219
+ transforms.CenterCrop(image_size),
220
+ ]
221
+
222
+ if add_normalize:
223
+ transform_list.append(transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))
224
+
225
+ print(f"Transform List is {transform_list}")
226
+ self.num_frames = num_frames
227
+ self.transform = transforms.Compose(transform_list)
228
+ self.sample_fps = sample_fps
229
+
230
+ def __call__(self, video_path):
231
+ try:
232
+ video_capture = cv2.VideoCapture(video_path)
233
+ fps = video_capture.get(cv2.CAP_PROP_FPS)
234
+ frames = []
235
+
236
+ while True:
237
+ flag, frame = video_capture.read()
238
+ if not flag:
239
+ break
240
+
241
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
242
+ frame = torch.from_numpy(frame)
243
+ frame = frame.permute(2, 0, 1)
244
+ frames.append(frame)
245
+
246
+ video_capture.release()
247
+ sample_fps = self.sample_fps
248
+ interval = max(int(fps / sample_fps), 1)
249
+ frames = frames[::interval]
250
+
251
+ if len(frames) < self.num_frames:
252
+ num_frame_to_pack = self.num_frames - len(frames)
253
+ recurrent_num = num_frame_to_pack // len(frames)
254
+ frames = frames + recurrent_num * frames + frames[:(num_frame_to_pack % len(frames))]
255
+ assert len(frames) >= self.num_frames, f'{len(frames)}'
256
+
257
+ start_indexs = list(range(0, max(0, len(frames) - self.num_frames + 1)))
258
+ start_index = random.choice(start_indexs)
259
+
260
+ filtered_frames = frames[start_index : start_index+self.num_frames]
261
+ assert len(filtered_frames) == self.num_frames, f"The number of sampled frames should equal {self.num_frames}"
262
+
263
+ filtered_frames = torch.stack(filtered_frames).float() / 255
264
+ filtered_frames = self.transform(filtered_frames)
265
+ filtered_frames = filtered_frames.permute(1, 0, 2, 3)
266
+
267
+ return filtered_frames, None
268
+
269
+ except Exception as e:
270
+ print(f"Load video: {video_path} Error, Exception {e}")
271
+ return None, None
272
+
273
+
274
+ class VideoDataset(Dataset):
275
+ def __init__(self, anno_file, resolution=256, max_frames=6, add_normalize=True):
276
+ super().__init__()
277
+
278
+ self.video_annos = []
279
+ self.max_frames = max_frames
280
+
281
+ if not isinstance(anno_file, list):
282
+ anno_file = [anno_file]
283
+
284
+ print(f"The training video clip frame number is {max_frames} ")
285
+
286
+ for anno_file_ in anno_file:
287
+ print(f"Load annotation file from {anno_file_}")
288
+
289
+ with jsonlines.open(anno_file_, 'r') as reader:
290
+ for item in tqdm(reader):
291
+ self.video_annos.append(item)
292
+
293
+ print(f"Totally Remained {len(self.video_annos)} videos")
294
+
295
+ self.video_processor = VideoFrameProcessor(resolution, max_frames, add_normalize)
296
+
297
+ def __len__(self):
298
+ return len(self.video_annos)
299
+
300
+ def __getitem__(self, index):
301
+ video_anno = self.video_annos[index]
302
+ video_path = video_anno['video']
303
+
304
+ try:
305
+ video_tensors, video_frames = self.video_processor(video_path)
306
+
307
+ assert video_tensors.shape[1] == self.max_frames
308
+
309
+ return {
310
+ "video": video_tensors,
311
+ "identifier": 'video',
312
+ }
313
+
314
+ except Exception as e:
315
+ print(f'Loading Video Error with {e}')
316
+ return self.__getitem__(random.randint(0, self.__len__() - 1))
317
+
318
+
319
+ class ImageDataset(Dataset):
320
+ def __init__(self, anno_file, resolution=256, max_frames=8, add_normalize=True):
321
+ super().__init__()
322
+
323
+ self.image_annos = []
324
+ self.max_frames = max_frames
325
+ image_paths = []
326
+
327
+ if not isinstance(anno_file, list):
328
+ anno_file = [anno_file]
329
+
330
+ for anno_file_ in anno_file:
331
+ print(f"Load annotation file from {anno_file_}")
332
+ with jsonlines.open(anno_file_, 'r') as reader:
333
+ for item in tqdm(reader):
334
+ image_paths.append(item['image'])
335
+
336
+ print(f"Totally Remained {len(image_paths)} images")
337
+
338
+ # pack multiple frames
339
+ for idx in range(0, len(image_paths), self.max_frames):
340
+ image_path_shard = image_paths[idx : idx + self.max_frames]
341
+ if len(image_path_shard) < self.max_frames:
342
+ image_path_shard = image_path_shard + image_paths[:self.max_frames - len(image_path_shard)]
343
+ assert len(image_path_shard) == self.max_frames
344
+ self.image_annos.append(image_path_shard)
345
+
346
+ image_size = resolution
347
+ transform_list = [
348
+ transforms.Resize(image_size, interpolation=InterpolationMode.BICUBIC, antialias=True),
349
+ transforms.CenterCrop(image_size),
350
+ transforms.ToTensor(),
351
+ ]
352
+ if add_normalize:
353
+ transform_list.append(transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))
354
+
355
+ print(f"Transform List is {transform_list}")
356
+ self.transform = transforms.Compose(transform_list)
357
+
358
+ def __len__(self):
359
+ return len(self.image_annos)
360
+
361
+ def __getitem__(self, index):
362
+ image_paths = self.image_annos[index]
363
+
364
+ try:
365
+ packed_pil_frames = [Image.open(image_path).convert("RGB") for image_path in image_paths]
366
+ filtered_frames = [self.transform(frame) for frame in packed_pil_frames]
367
+ filtered_frames = torch.stack(filtered_frames) # [t, c, h, w]
368
+ filtered_frames = filtered_frames.permute(1, 0, 2, 3) # [c, t, h, w]
369
+
370
+ return {
371
+ "video": filtered_frames,
372
+ "identifier": 'image',
373
+ }
374
+
375
+ except Exception as e:
376
+ print(f'Load Images Error with {e}')
377
+ return self.__getitem__(random.randint(0, self.__len__() - 1))
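
A minimal sketch of the jsonlines annotation format ImageTextDataset reads, with the keys inferred from __getitem__ above (file paths and the caption are placeholders):

import jsonlines
from dataset import ImageTextDataset

with jsonlines.open("image_annotations.jsonl", "w") as writer:  # placeholder annotation file
    writer.write({"image": "/data/images/000001.jpg", "text": "a red bicycle leaning against a brick wall"})

dataset = ImageTextDataset(anno_file=["image_annotations.jsonl"])
# With real image paths, dataset[0] returns {'video': <image tensor>, 'text': <caption>, 'identifier': 'image'}
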
diffusion_schedulers/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .scheduling_cosine_ddpm import DDPMCosineScheduler
2
+ from .scheduling_flow_matching import PyramidFlowMatchEulerDiscreteScheduler
diffusion_schedulers/scheduling_cosine_ddpm.py ADDED
@@ -0,0 +1,137 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch
6
+
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.utils import BaseOutput
9
+ from diffusers.utils.torch_utils import randn_tensor
10
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
11
+
12
+
13
+ @dataclass
14
+ class DDPMSchedulerOutput(BaseOutput):
15
+ """
16
+ Output class for the scheduler's step function output.
17
+
18
+ Args:
19
+ prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
20
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
21
+ denoising loop.
22
+ """
23
+
24
+ prev_sample: torch.Tensor
25
+
26
+
27
+ class DDPMCosineScheduler(SchedulerMixin, ConfigMixin):
28
+
29
+ @register_to_config
30
+ def __init__(
31
+ self,
32
+ scaler: float = 1.0,
33
+ s: float = 0.008,
34
+ ):
35
+ self.scaler = scaler
36
+ self.s = torch.tensor([s])
37
+ self._init_alpha_cumprod = torch.cos(self.s / (1 + self.s) * torch.pi * 0.5) ** 2
38
+
39
+ # standard deviation of the initial noise distribution
40
+ self.init_noise_sigma = 1.0
41
+
42
+ def _alpha_cumprod(self, t, device):
43
+ if self.scaler > 1:
44
+ t = 1 - (1 - t) ** self.scaler
45
+ elif self.scaler < 1:
46
+ t = t**self.scaler
47
+ alpha_cumprod = torch.cos(
48
+ (t + self.s.to(device)) / (1 + self.s.to(device)) * torch.pi * 0.5
49
+ ) ** 2 / self._init_alpha_cumprod.to(device)
50
+ return alpha_cumprod.clamp(0.0001, 0.9999)
51
+
52
+ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
53
+ """
54
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
55
+ current timestep.
56
+
57
+ Args:
58
+ sample (`torch.Tensor`): input sample
59
+ timestep (`int`, optional): current timestep
60
+
61
+ Returns:
62
+ `torch.Tensor`: scaled input sample
63
+ """
64
+ return sample
65
+
66
+ def set_timesteps(
67
+ self,
68
+ num_inference_steps: int = None,
69
+ timesteps: Optional[List[int]] = None,
70
+ device: Union[str, torch.device] = None,
71
+ ):
72
+ """
73
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
74
+
75
+ Args:
76
+ num_inference_steps (`int`):
77
+ the number of diffusion steps used when generating samples with a pre-trained model. If passed, then
78
+ `timesteps` must be `None`.
79
+ device (`str` or `torch.device`, optional):
80
+ the device to which the timesteps are moved.
81
+ """
82
+ if timesteps is None:
83
+ timesteps = torch.linspace(1.0, 0.0, num_inference_steps + 1, device=device)
84
+ if not isinstance(timesteps, torch.Tensor):
85
+ timesteps = torch.Tensor(timesteps).to(device)
86
+ self.timesteps = timesteps
87
+
88
+ def step(
89
+ self,
90
+ model_output: torch.Tensor,
91
+ timestep: int,
92
+ sample: torch.Tensor,
93
+ generator=None,
94
+ return_dict: bool = True,
95
+ ) -> Union[DDPMSchedulerOutput, Tuple]:
96
+ dtype = model_output.dtype
97
+ device = model_output.device
98
+ t = timestep
99
+
100
+ prev_t = self.previous_timestep(t)
101
+
102
+ alpha_cumprod = self._alpha_cumprod(t, device).view(t.size(0), *[1 for _ in sample.shape[1:]])
103
+ alpha_cumprod_prev = self._alpha_cumprod(prev_t, device).view(prev_t.size(0), *[1 for _ in sample.shape[1:]])
104
+ alpha = alpha_cumprod / alpha_cumprod_prev
105
+
106
+ mu = (1.0 / alpha).sqrt() * (sample - (1 - alpha) * model_output / (1 - alpha_cumprod).sqrt())
107
+
108
+ std_noise = randn_tensor(mu.shape, generator=generator, device=model_output.device, dtype=model_output.dtype)
109
+ std = ((1 - alpha) * (1.0 - alpha_cumprod_prev) / (1.0 - alpha_cumprod)).sqrt() * std_noise
110
+ pred = mu + std * (prev_t != 0).float().view(prev_t.size(0), *[1 for _ in sample.shape[1:]])
111
+
112
+ if not return_dict:
113
+ return (pred.to(dtype),)
114
+
115
+ return DDPMSchedulerOutput(prev_sample=pred.to(dtype))
116
+
117
+ def add_noise(
118
+ self,
119
+ original_samples: torch.Tensor,
120
+ noise: torch.Tensor,
121
+ timesteps: torch.Tensor,
122
+ ) -> torch.Tensor:
123
+ device = original_samples.device
124
+ dtype = original_samples.dtype
125
+ alpha_cumprod = self._alpha_cumprod(timesteps, device=device).view(
126
+ timesteps.size(0), *[1 for _ in original_samples.shape[1:]]
127
+ )
128
+ noisy_samples = alpha_cumprod.sqrt() * original_samples + (1 - alpha_cumprod).sqrt() * noise
129
+ return noisy_samples.to(dtype=dtype)
130
+
131
+ def __len__(self):
132
+ return self.config.num_train_timesteps
133
+
134
+ def previous_timestep(self, timestep):
135
+ index = (self.timesteps - timestep[0]).abs().argmin().item()
136
+ prev_t = self.timesteps[index + 1][None].expand(timestep.shape[0])
137
+ return prev_t
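
A minimal sketch of how DDPMCosineScheduler's continuous [0, 1] timesteps are used for noising and a single reverse step (the denoiser itself is hypothetical):

import torch
from diffusion_schedulers import DDPMCosineScheduler

scheduler = DDPMCosineScheduler()
scheduler.set_timesteps(num_inference_steps=20)      # timesteps run from 1.0 down to 0.0

latents = torch.randn(2, 4, 32, 32)                  # dummy batch of latents
noise = torch.randn_like(latents)
t = scheduler.timesteps[5][None].expand(2)           # per-sample timestep tensor, as add_noise()/step() expect

noisy = scheduler.add_noise(latents, noise, t)
# eps = denoiser(noisy, t)                           # hypothetical noise-prediction model
# prev = scheduler.step(eps, t, noisy).prev_sample   # one reverse step toward the next timestep
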
diffusion_schedulers/scheduling_flow_matching.py ADDED
@@ -0,0 +1,297 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple, Union, List
3
+ import math
4
+ import numpy as np
5
+ import torch
6
+
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.utils import BaseOutput, logging
9
+ from diffusers.utils.torch_utils import randn_tensor
10
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
11
+
12
+
13
+ @dataclass
14
+ class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
15
+ """
16
+ Output class for the scheduler's `step` function output.
17
+
18
+ Args:
19
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
20
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
21
+ denoising loop.
22
+ """
23
+
24
+ prev_sample: torch.FloatTensor
25
+
26
+
27
+ class PyramidFlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
28
+ """
29
+ Euler scheduler.
30
+
31
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
32
+ methods the library implements for all schedulers such as loading and saving.
33
+
34
+ Args:
35
+ num_train_timesteps (`int`, defaults to 1000):
36
+ The number of diffusion steps to train the model.
37
+ timestep_spacing (`str`, defaults to `"linspace"`):
38
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
39
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
40
+ shift (`float`, defaults to 1.0):
41
+ The shift value for the timestep schedule.
42
+ """
43
+
44
+ _compatibles = []
45
+ order = 1
46
+
47
+ @register_to_config
48
+ def __init__(
49
+ self,
50
+ num_train_timesteps: int = 1000,
51
+ shift: float = 1.0, # Following Stable Diffusion 3
52
+ stages: int = 3,
53
+ stage_range: List = [0, 1/3, 2/3, 1],
54
+ gamma: float = 1/3,
55
+ ):
56
+
57
+ self.timestep_ratios = {} # The timestep ratio for each stage
58
+ self.timesteps_per_stage = {} # The detailed timesteps per stage
59
+ self.sigmas_per_stage = {}
60
+ self.start_sigmas = {}
61
+ self.end_sigmas = {}
62
+ self.ori_start_sigmas = {}
63
+
64
+ # self.init_sigmas()
65
+ self.init_sigmas_for_each_stage()
66
+ self.sigma_min = self.sigmas[-1].item()
67
+ self.sigma_max = self.sigmas[0].item()
68
+ self.gamma = gamma
69
+
70
+ def init_sigmas(self):
71
+ """
72
+ initialize the global timesteps and sigmas
73
+ """
74
+ num_train_timesteps = self.config.num_train_timesteps
75
+ shift = self.config.shift
76
+
77
+ timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
78
+ timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
79
+
80
+ sigmas = timesteps / num_train_timesteps
81
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
82
+
83
+ self.timesteps = sigmas * num_train_timesteps
84
+
85
+ self._step_index = None
86
+ self._begin_index = None
87
+
88
+ self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication
89
+
90
+ def init_sigmas_for_each_stage(self):
91
+ """
92
+ Init the timesteps for each stage
93
+ """
94
+ self.init_sigmas()
95
+
96
+ stage_distance = []
97
+ stages = self.config.stages
98
+ training_steps = self.config.num_train_timesteps
99
+ stage_range = self.config.stage_range
100
+
101
+ # Init the start and end point of each stage
102
+ for i_s in range(stages):
103
+ # To decide the start and ends point
104
+ start_indice = int(stage_range[i_s] * training_steps)
105
+ start_indice = max(start_indice, 0)
106
+ end_indice = int(stage_range[i_s+1] * training_steps)
107
+ end_indice = min(end_indice, training_steps)
108
+ start_sigma = self.sigmas[start_indice].item()
109
+ end_sigma = self.sigmas[end_indice].item() if end_indice < training_steps else 0.0
110
+ self.ori_start_sigmas[i_s] = start_sigma
111
+
112
+ if i_s != 0:
113
+ ori_sigma = 1 - start_sigma
114
+ gamma = self.config.gamma
115
+ corrected_sigma = (1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma)) * ori_sigma
116
+ # corrected_sigma = 1 / (2 - ori_sigma) * ori_sigma
117
+ start_sigma = 1 - corrected_sigma
118
+
119
+ stage_distance.append(start_sigma - end_sigma)
120
+ self.start_sigmas[i_s] = start_sigma
121
+ self.end_sigmas[i_s] = end_sigma
122
+
123
+ # Determine the ratio of each stage according to flow length
124
+ tot_distance = sum(stage_distance)
125
+ for i_s in range(stages):
126
+ if i_s == 0:
127
+ start_ratio = 0.0
128
+ else:
129
+ start_ratio = sum(stage_distance[:i_s]) / tot_distance
130
+ if i_s == stages - 1:
131
+ end_ratio = 1.0
132
+ else:
133
+ end_ratio = sum(stage_distance[:i_s+1]) / tot_distance
134
+
135
+ self.timestep_ratios[i_s] = (start_ratio, end_ratio)
136
+
137
+ # Determine the timesteps and sigmas for each stage
138
+ for i_s in range(stages):
139
+ timestep_ratio = self.timestep_ratios[i_s]
140
+ timestep_max = self.timesteps[int(timestep_ratio[0] * training_steps)]
141
+ timestep_min = self.timesteps[min(int(timestep_ratio[1] * training_steps), training_steps - 1)]
142
+ timesteps = np.linspace(
143
+ timestep_max, timestep_min, training_steps + 1,
144
+ )
145
+ self.timesteps_per_stage[i_s] = timesteps[:-1] if isinstance(timesteps, torch.Tensor) else torch.from_numpy(timesteps[:-1])
146
+ stage_sigmas = np.linspace(
147
+ 1, 0, training_steps + 1,
148
+ )
149
+ self.sigmas_per_stage[i_s] = torch.from_numpy(stage_sigmas[:-1])
150
+
151
+ @property
152
+ def step_index(self):
153
+ """
154
+ The index counter for current timestep. It will increase 1 after each scheduler step.
155
+ """
156
+ return self._step_index
157
+
158
+ @property
159
+ def begin_index(self):
160
+ """
161
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
162
+ """
163
+ return self._begin_index
164
+
165
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
166
+ def set_begin_index(self, begin_index: int = 0):
167
+ """
168
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
169
+
170
+ Args:
171
+ begin_index (`int`):
172
+ The begin index for the scheduler.
173
+ """
174
+ self._begin_index = begin_index
175
+
176
+ def _sigma_to_t(self, sigma):
177
+ return sigma * self.config.num_train_timesteps
178
+
179
+ def set_timesteps(self, num_inference_steps: int, stage_index: int, device: Union[str, torch.device] = None):
180
+ """
181
+ Setting the timesteps and sigmas for each stage
182
+ """
183
+ self.num_inference_steps = num_inference_steps
184
+ training_steps = self.config.num_train_timesteps
185
+ self.init_sigmas()
186
+
187
+ stage_timesteps = self.timesteps_per_stage[stage_index]
188
+ timestep_max = stage_timesteps[0].item()
189
+ timestep_min = stage_timesteps[-1].item()
190
+
191
+ timesteps = np.linspace(
192
+ timestep_max, timestep_min, num_inference_steps,
193
+ )
194
+ self.timesteps = torch.from_numpy(timesteps).to(device=device)
195
+
196
+ stage_sigmas = self.sigmas_per_stage[stage_index]
197
+ sigma_max = stage_sigmas[0].item()
198
+ sigma_min = stage_sigmas[-1].item()
199
+
200
+ ratios = np.linspace(
201
+ sigma_max, sigma_min, num_inference_steps
202
+ )
203
+ sigmas = torch.from_numpy(ratios).to(device=device)
204
+ self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
205
+
206
+ self._step_index = None
207
+
208
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
209
+ if schedule_timesteps is None:
210
+ schedule_timesteps = self.timesteps
211
+
212
+ indices = (schedule_timesteps == timestep).nonzero()
213
+
214
+ # The sigma index that is taken for the **very** first `step`
215
+ # is always the second index (or the last index if there is only 1)
216
+ # This way we can ensure we don't accidentally skip a sigma in
217
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
218
+ pos = 1 if len(indices) > 1 else 0
219
+
220
+ return indices[pos].item()
221
+
222
+ def _init_step_index(self, timestep):
223
+ if self.begin_index is None:
224
+ if isinstance(timestep, torch.Tensor):
225
+ timestep = timestep.to(self.timesteps.device)
226
+ self._step_index = self.index_for_timestep(timestep)
227
+ else:
228
+ self._step_index = self._begin_index
229
+
230
+ def step(
231
+ self,
232
+ model_output: torch.FloatTensor,
233
+ timestep: Union[float, torch.FloatTensor],
234
+ sample: torch.FloatTensor,
235
+ generator: Optional[torch.Generator] = None,
236
+ return_dict: bool = True,
237
+ ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
238
+ """
239
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
240
+ process from the learned model outputs (most often the predicted noise).
241
+
242
+ Args:
243
+ model_output (`torch.FloatTensor`):
244
+ The direct output from learned diffusion model.
245
+ timestep (`float`):
246
+ The current discrete timestep in the diffusion chain.
247
+ sample (`torch.FloatTensor`):
248
+ A current instance of a sample created by the diffusion process.
249
+ generator (`torch.Generator`, *optional*):
250
+ A random number generator.
251
+ return_dict (`bool`):
252
+ Whether or not to return a [`FlowMatchEulerDiscreteSchedulerOutput`] or
253
+ tuple.
254
+
255
+ Returns:
256
+ [`FlowMatchEulerDiscreteSchedulerOutput`] or `tuple`:
257
+ If return_dict is `True`, [`FlowMatchEulerDiscreteSchedulerOutput`] is
258
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
259
+ """
260
+
261
+ if (
262
+ isinstance(timestep, int)
263
+ or isinstance(timestep, torch.IntTensor)
264
+ or isinstance(timestep, torch.LongTensor)
265
+ ):
266
+ raise ValueError(
267
+ (
268
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
269
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
270
+ " one of the `scheduler.timesteps` as a timestep."
271
+ ),
272
+ )
273
+
274
+ if self.step_index is None:
275
+ self._step_index = 0
276
+
277
+ # Upcast to avoid precision issues when computing prev_sample
278
+ sample = sample.to(torch.float32)
279
+
280
+ sigma = self.sigmas[self.step_index]
281
+ sigma_next = self.sigmas[self.step_index + 1]
282
+
283
+ prev_sample = sample + (sigma_next - sigma) * model_output
284
+
285
+ # Cast sample back to model compatible dtype
286
+ prev_sample = prev_sample.to(model_output.dtype)
287
+
288
+ # upon completion increase step index by one
289
+ self._step_index += 1
290
+
291
+ if not return_dict:
292
+ return (prev_sample,)
293
+
294
+ return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
295
+
296
+ def __len__(self):
297
+ return self.config.num_train_timesteps
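For orientation, here is a minimal usage sketch of the per-stage API defined above (`set_timesteps(num_inference_steps, stage_index, device)` followed by repeated `step()` calls). The `denoiser` callable and the latent shape below are hypothetical placeholders, and the real Pyramid Flow pipeline additionally renoises and upsamples the latents between stages; this is only a sketch, not the pipeline itself.

```python
import torch

from diffusion_schedulers.scheduling_flow_matching import PyramidFlowMatchEulerDiscreteScheduler


def denoiser(x, t):
    # Hypothetical placeholder for the DiT velocity prediction.
    return torch.zeros_like(x)


latents = torch.randn(1, 16, 8, 24, 24)  # hypothetical initial noisy latent

scheduler = PyramidFlowMatchEulerDiscreteScheduler(stages=3, stage_range=[0, 1/3, 2/3, 1])

for stage in range(scheduler.config.stages):
    # Each stage gets its own sub-schedule of timesteps and sigmas.
    scheduler.set_timesteps(num_inference_steps=20, stage_index=stage, device=latents.device)
    for t in scheduler.timesteps:
        model_output = denoiser(latents, t)
        # Euler update: x <- x + (sigma_next - sigma) * model_output
        latents = scheduler.step(model_output, t, latents).prev_sample
    # The actual pipeline renoises and upsamples `latents` here before the next stage.
```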
docs/DiT.md ADDED
@@ -0,0 +1,54 @@
1
+ # Pyramid Flow's DiT Finetuning Guide
2
+
3
+ This is the finetuning guide for the DiT in Pyramid Flow. We provide instructions for both the autoregressive and non-autoregressive versions. The former is more research-oriented, while the latter is more stable (but less efficient without the temporal pyramid). Please refer to [another document](https://github.com/jy0205/Pyramid-Flow/blob/main/docs/VAE) for VAE finetuning.
4
+
5
+ ## Hardware Requirements
6
+
7
+ + DiT finetuning: At least 8 A100 GPUs.
8
+
9
+
10
+ ## Prepare the Dataset
11
+
12
+ The training dataset should be arranged into a JSON file with `video` and `text` fields. Since video VAE latent extraction is very slow, we strongly recommend pre-extracting the video VAE latents to save training time. We provide a video VAE latent extraction script in the `tools` folder. You can run it with the following command:
13
+
14
+ ```bash
15
+ sh scripts/extract_vae_latent.sh
16
+ ```
17
+
18
+ (Optional) Since the T5 text encoder consumes a lot of GPU memory, pre-extracting the text features saves training memory. We also provide a text feature extraction script in the `tools` folder. You can run it with the following command:
19
+
20
+ ```bash
21
+ sh scripts/extract_text_feature.sh
22
+ ```
23
+
24
+ The final training annotation JSON file should look like the following:
25
+
26
+ ```
27
+ {"video": video_path, "text": text prompt, "latent": extracted video vae latent, "text_fea": extracted text feature}
28
+ ```
29
+
30
+ We provide example JSON annotation files for [video](https://github.com/jy0205/Pyramid-Flow/blob/main/annotation/video_text.jsonl) and [image](https://github.com/jy0205/Pyramid-Flow/blob/main/annotation/image_text.jsonl) training in the `annotation` folder. You can refer to them when preparing your training dataset.
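+ For reference, here is a minimal sketch of writing such a JSONL annotation file (the paths and prompts below are hypothetical placeholders):
+ 
+ ```python
+ import json
+ 
+ # Hypothetical samples; add "latent" / "text_fea" paths after running the extraction scripts.
+ samples = [
+     {"video": "videos/clip_0001.mp4", "text": "a dog running on the beach"},
+     {"video": "videos/clip_0002.mp4", "text": "timelapse of clouds over a city skyline"},
+ ]
+ 
+ with open("annotation/my_video_text.jsonl", "w") as f:
+     for sample in samples:
+         f.write(json.dumps(sample) + "\n")
+ ```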
31
+
32
+
33
+ ## Run Training
34
+ We provide two types of training scripts: (1) autoregressive video generation training with the temporal pyramid, and (2) full-sequence diffusion training with pyramid flow for both text-to-image and text-to-video. These correspond to the following two script files; run them with at least 8 GPUs:
35
+
36
+ + `scripts/train_pyramid_flow.sh`: The autoregressive video generation training with temporal pyramid.
37
+
38
+ ```bash
39
+ sh scripts/train_pyramid_flow.sh
40
+ ```
41
+
42
+ + `scripts/train_pyramid_flow_without_ar.sh`: Using pyramid-flow for full-sequence diffusion training.
43
+
44
+ ```bash
45
+ sh scripts/train_pyramid_flow_without_ar.sh
46
+ ```
47
+
48
+
49
+ ## Tips
50
+
51
+ + For the 768p version, make sure to add the arg `--gradient_checkpointing`
52
+ + Param `NUM_FRAMES` should be set to a multiple of 8
53
+ + The param `video_sync_group` indicates the number of processes that receive the same input video; it is used for temporal pyramid AR training. We recommend setting this value to 4, 8, or 16 (16 is better if you have more GPUs)
54
+ + Make sure that `NUM_FRAMES % VIDEO_SYNC_GROUP == 0`, `GPUS % VIDEO_SYNC_GROUP == 0`, and `BATCH_SIZE % 4 == 0` hold, as in the sanity-check sketch below
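+ 
+ A quick sanity-check sketch for these constraints (the values below are hypothetical; substitute your own configuration):
+ 
+ ```python
+ # Hypothetical training configuration.
+ NUM_FRAMES = 16        # must be a multiple of 8
+ VIDEO_SYNC_GROUP = 8   # recommended: 4, 8, or 16
+ GPUS = 8
+ BATCH_SIZE = 4
+ 
+ assert NUM_FRAMES % 8 == 0
+ assert NUM_FRAMES % VIDEO_SYNC_GROUP == 0
+ assert GPUS % VIDEO_SYNC_GROUP == 0
+ assert BATCH_SIZE % 4 == 0
+ ```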
docs/VAE.md ADDED
@@ -0,0 +1,42 @@
1
+ # Pyramid Flow's VAE Training Guide
2
+
3
+ This is the training guide for a [MAGVIT-v2](https://arxiv.org/abs/2310.05737)-like continuous 3D VAE, which should be quite flexible. Feel free to build your own video generative model on top of this VAE training code. Please refer to [another document](https://github.com/jy0205/Pyramid-Flow/blob/main/docs/DiT) for DiT finetuning.
4
+
5
+ ## Hardware Requirements
6
+
7
+ + VAE training: At least 8 A100 GPUs.
8
+
9
+
10
+ ## Prepare the Dataset
11
+
12
+ The training of our causal video VAE uses both image and video data. Both should be arranged into a JSON file with a `video` or `image` field. The final training annotation JSON file should look like the following:
13
+
14
+ ```
15
+ # For Video
16
+ {"video": video_path}
17
+
18
+ # For Image
19
+ {"image": image_path}
20
+ ```
21
+
22
+ ## Run Training
23
+
24
+ The causal video VAE is trained in two stages:
25
+ + Stage-1: mixed image and video training
26
+ + Stage-2: pure video training, using context parallel to load videos with more frames
27
+
28
+ The VAE training script is `scripts/train_causal_video_vae.sh`; run it as follows:
29
+
30
+ ```bash
31
+ sh scripts/train_causal_video_vae.sh
32
+ ```
33
+
34
+ We also provide a VAE demo `causal_video_vae_demo.ipynb` for image and video reconstruction.
35
+
36
+
37
+ ## Tips
38
+
39
+ + For stage-1, we use mixed image and video training. Add the param `--use_image_video_mixed_training` to enable the mixed training. We set the image ratio to 0.1 by default.
40
+ + Setting the `resolution` to 256 is enough for VAE training.
41
+ + For stage-1, `max_frames` is set to 17, meaning we use 17 sampled video frames for training.
42
+ + For stage-2, we enable the param `use_context_parallel` to distribute the frames of long videos across multiple GPUs. Make sure to set `GPUS % CONTEXT_SIZE == 0` and `NUM_FRAMES = 17 * CONTEXT_SIZE + 1`, as in the sketch below
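+ 
+ A small sketch of how these stage-2 values fit together (the `CONTEXT_SIZE` and `GPUS` values are hypothetical examples):
+ 
+ ```python
+ CONTEXT_SIZE = 4                    # hypothetical context-parallel group size
+ GPUS = 8                            # total number of training GPUs
+ NUM_FRAMES = 17 * CONTEXT_SIZE + 1  # = 69 frames per video in this example
+ 
+ assert GPUS % CONTEXT_SIZE == 0
+ ```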
image_generation_demo.ipynb ADDED
@@ -0,0 +1,123 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "import torch\n",
12
+ "import numpy as np\n",
13
+ "import PIL\n",
14
+ "from PIL import Image\n",
15
+ "from IPython.display import HTML\n",
16
+ "from pyramid_dit import PyramidDiTForVideoGeneration\n",
17
+ "from IPython.display import Image as ipython_image\n",
18
+ "from diffusers.utils import load_image, export_to_video, export_to_gif"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "variant='diffusion_transformer_image' # For low resolution\n",
28
+ "model_name = \"pyramid_flux\"\n",
29
+ "\n",
30
+ "model_path = \"/home/jinyang06/models/pyramid-flow-miniflux\" # The downloaded checkpoint dir\n",
31
+ "model_dtype = 'bf16'\n",
32
+ "\n",
33
+ "device_id = 0\n",
34
+ "torch.cuda.set_device(device_id)\n",
35
+ "\n",
36
+ "model = PyramidDiTForVideoGeneration(\n",
37
+ " model_path,\n",
38
+ " model_dtype,\n",
39
+ " model_name=model_name,\n",
40
+ " model_variant=variant,\n",
41
+ ")\n",
42
+ "\n",
43
+ "model.vae.to(\"cuda\")\n",
44
+ "model.dit.to(\"cuda\")\n",
45
+ "model.text_encoder.to(\"cuda\")\n",
46
+ "\n",
47
+ "model.vae.enable_tiling()\n",
48
+ "\n",
49
+ "if model_dtype == \"bf16\":\n",
50
+ " torch_dtype = torch.bfloat16 \n",
51
+ "elif model_dtype == \"fp16\":\n",
52
+ " torch_dtype = torch.float16\n",
53
+ "else:\n",
54
+ " torch_dtype = torch.float32"
55
+ ]
56
+ },
57
+ {
58
+ "attachments": {},
59
+ "cell_type": "markdown",
60
+ "metadata": {},
61
+ "source": [
62
+ "### Text-to-Image"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "prompt = \"shoulder and full head portrait of a beautiful 19 year old girl, brunette, smiling, stunning, highly detailed, glamour lighting, HDR, photorealistic, hyperrealism, octane render, unreal engine\"\n",
72
+ "\n",
73
+ "# now support 3 aspect ratios\n",
74
+ "resolution_dict = {\n",
75
+ " '1:1' : (1024, 1024),\n",
76
+ " '5:3' : (1280, 768),\n",
77
+ " '3:5' : (768, 1280),\n",
78
+ "}\n",
79
+ "\n",
80
+ "ratio = '1:1' # 1:1, 5:3, 3:5\n",
81
+ "\n",
82
+ "width, height = resolution_dict[ratio]\n",
83
+ "\n",
84
+ "\n",
85
+ "with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):\n",
86
+ " images = model.generate(\n",
87
+ " prompt=prompt,\n",
88
+ " num_inference_steps=[20, 20, 20],\n",
89
+ " height=height,\n",
90
+ " width=width,\n",
91
+ " temp=1,\n",
92
+ " guidance_scale=9.0, \n",
93
+ " output_type=\"pil\",\n",
94
+ " save_memory=False, \n",
95
+ " )\n",
96
+ "\n",
97
+ "display(images[0])"
98
+ ]
99
+ }
100
+ ],
101
+ "metadata": {
102
+ "kernelspec": {
103
+ "display_name": "Python 3",
104
+ "language": "python",
105
+ "name": "python3"
106
+ },
107
+ "language_info": {
108
+ "codemirror_mode": {
109
+ "name": "ipython",
110
+ "version": 3
111
+ },
112
+ "file_extension": ".py",
113
+ "mimetype": "text/x-python",
114
+ "name": "python",
115
+ "nbconvert_exporter": "python",
116
+ "pygments_lexer": "ipython3",
117
+ "version": "3.8.10"
118
+ },
119
+ "orig_nbformat": 4
120
+ },
121
+ "nbformat": 4,
122
+ "nbformat_minor": 2
123
+ }
inference_multigpu.py ADDED
@@ -0,0 +1,123 @@
1
+ import os
2
+ import torch
3
+ import sys
4
+ import argparse
5
+ import random
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from diffusers.utils import export_to_video
10
+ from pyramid_dit import PyramidDiTForVideoGeneration
11
+ from trainer_misc import init_distributed_mode, init_sequence_parallel_group
12
+ import PIL
13
+ from PIL import Image
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser('Pytorch Multi-process Script', add_help=False)
18
+ parser.add_argument('--model_name', default='pyramid_flux', type=str, help="The model name", choices=["pyramid_flux", "pyramid_mmdit"])
19
+ parser.add_argument('--model_dtype', default='bf16', type=str, help="The Model Dtype: bf16")
20
+ parser.add_argument('--model_path', default='/home/jinyang06/models/pyramid-flow', type=str, help='Set it to the downloaded checkpoint dir')
21
+ parser.add_argument('--variant', default='diffusion_transformer_768p', type=str,)
22
+ parser.add_argument('--task', default='t2v', type=str, choices=['i2v', 't2v'])
23
+ parser.add_argument('--temp', default=16, type=int, help='The generated latent num, num_frames = temp * 8 + 1')
24
+ parser.add_argument('--sp_group_size', default=2, type=int, help="The number of gpus used for inference, should be 2 or 4")
25
+ parser.add_argument('--sp_proc_num', default=-1, type=int, help="The number of process used for video training, default=-1 means using all process.")
26
+
27
+ return parser.parse_args()
28
+
29
+
30
+ def main():
31
+ args = get_args()
32
+
33
+ # setup DDP
34
+ init_distributed_mode(args)
35
+
36
+ assert args.world_size == args.sp_group_size, "The sequence parallel size should be DDP world size"
37
+
38
+ # Enable sequence parallel
39
+ init_sequence_parallel_group(args)
40
+
41
+ device = torch.device('cuda')
42
+ rank = args.rank
43
+ model_dtype = args.model_dtype
44
+
45
+ model = PyramidDiTForVideoGeneration(
46
+ args.model_path,
47
+ model_dtype,
48
+ model_name=args.model_name,
49
+ model_variant=args.variant,
50
+ )
51
+
52
+ model.vae.to(device)
53
+ model.dit.to(device)
54
+ model.text_encoder.to(device)
55
+ model.vae.enable_tiling()
56
+
57
+ if model_dtype == "bf16":
58
+ torch_dtype = torch.bfloat16
59
+ elif model_dtype == "fp16":
60
+ torch_dtype = torch.float16
61
+ else:
62
+ torch_dtype = torch.float32
63
+
64
+ # The video generation config
65
+ if args.variant == 'diffusion_transformer_768p':
66
+ width = 1280
67
+ height = 768
68
+ else:
69
+ assert args.variant == 'diffusion_transformer_384p'
70
+ width = 640
71
+ height = 384
72
+
73
+ if args.task == 't2v':
74
+ prompt = "A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors"
75
+
76
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):
77
+ frames = model.generate(
78
+ prompt=prompt,
79
+ num_inference_steps=[20, 20, 20],
80
+ video_num_inference_steps=[10, 10, 10],
81
+ height=height,
82
+ width=width,
83
+ temp=args.temp,
84
+ guidance_scale=7.0, # The guidance for the first frame, set it to 7 for 384p variant
85
+ video_guidance_scale=5.0, # The guidance for the other video latent
86
+ output_type="pil",
87
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
88
+ cpu_offloading=False, # If OOM, set it to True to reduce memory usage
89
+ inference_multigpu=True,
90
+ )
91
+ if rank == 0:
92
+ export_to_video(frames, "./text_to_video_sample.mp4", fps=24)
93
+
94
+ else:
95
+ assert args.task == 'i2v'
96
+
97
+ image_path = 'assets/the_great_wall.jpg'
98
+ image = Image.open(image_path).convert("RGB")
99
+ image = image.resize((width, height))
100
+
101
+ prompt = "FPV flying over the Great Wall"
102
+
103
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):
104
+ frames = model.generate_i2v(
105
+ prompt=prompt,
106
+ input_image=image,
107
+ num_inference_steps=[10, 10, 10],
108
+ temp=args.temp,
109
+ video_guidance_scale=4.0,
110
+ output_type="pil",
111
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
112
+ cpu_offloading=False, # If OOM, set it to True to reduce memory usage
113
+ inference_multigpu=True,
114
+ )
115
+
116
+ if rank == 0:
117
+ export_to_video(frames, "./image_to_video_sample.mp4", fps=24)
118
+
119
+ torch.distributed.barrier()
120
+
121
+
122
+ if __name__ == "__main__":
123
+ main()
pyramid_dit/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .pyramid_dit_for_video_gen_pipeline import PyramidDiTForVideoGeneration
2
+ from .flux_modules import FluxSingleTransformerBlock, FluxTransformerBlock, FluxTextEncoderWithMask
3
+ from .mmdit_modules import JointTransformerBlock, SD3TextEncoderWithMask
pyramid_dit/flux_modules/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .modeling_pyramid_flux import PyramidFluxTransformer
2
+ from .modeling_text_encoder import FluxTextEncoderWithMask
3
+ from .modeling_flux_block import FluxSingleTransformerBlock, FluxTransformerBlock
pyramid_dit/flux_modules/modeling_embedding.py ADDED
@@ -0,0 +1,201 @@
1
+ import math
2
+ from typing import List, Optional, Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+ from diffusers.models.activations import get_activation, FP32SiLU
10
+
11
+ def get_timestep_embedding(
12
+ timesteps: torch.Tensor,
13
+ embedding_dim: int,
14
+ flip_sin_to_cos: bool = False,
15
+ downscale_freq_shift: float = 1,
16
+ scale: float = 1,
17
+ max_period: int = 10000,
18
+ ):
19
+ """
20
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
21
+
22
+ Args:
23
+ timesteps (torch.Tensor):
24
+ a 1-D Tensor of N indices, one per batch element. These may be fractional.
25
+ embedding_dim (int):
26
+ the dimension of the output.
27
+ flip_sin_to_cos (bool):
28
+ Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
29
+ downscale_freq_shift (float):
30
+ Controls the delta between frequencies between dimensions
31
+ scale (float):
32
+ Scaling factor applied to the embeddings.
33
+ max_period (int):
34
+ Controls the maximum frequency of the embeddings
35
+ Returns:
36
+ torch.Tensor: an [N x dim] Tensor of positional embeddings.
37
+ """
38
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
39
+
40
+ half_dim = embedding_dim // 2
41
+ exponent = -math.log(max_period) * torch.arange(
42
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
43
+ )
44
+ exponent = exponent / (half_dim - downscale_freq_shift)
45
+
46
+ emb = torch.exp(exponent)
47
+ emb = timesteps[:, None].float() * emb[None, :]
48
+
49
+ # scale embeddings
50
+ emb = scale * emb
51
+
52
+ # concat sine and cosine embeddings
53
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
54
+
55
+ # flip sine and cosine embeddings
56
+ if flip_sin_to_cos:
57
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
58
+
59
+ # zero pad
60
+ if embedding_dim % 2 == 1:
61
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
62
+ return emb
63
+
64
+
65
+ class Timesteps(nn.Module):
66
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
67
+ super().__init__()
68
+ self.num_channels = num_channels
69
+ self.flip_sin_to_cos = flip_sin_to_cos
70
+ self.downscale_freq_shift = downscale_freq_shift
71
+ self.scale = scale
72
+
73
+ def forward(self, timesteps):
74
+ t_emb = get_timestep_embedding(
75
+ timesteps,
76
+ self.num_channels,
77
+ flip_sin_to_cos=self.flip_sin_to_cos,
78
+ downscale_freq_shift=self.downscale_freq_shift,
79
+ scale=self.scale,
80
+ )
81
+ return t_emb
82
+
83
+
84
+ class TimestepEmbedding(nn.Module):
85
+ def __init__(
86
+ self,
87
+ in_channels: int,
88
+ time_embed_dim: int,
89
+ act_fn: str = "silu",
90
+ out_dim: int = None,
91
+ post_act_fn: Optional[str] = None,
92
+ cond_proj_dim=None,
93
+ sample_proj_bias=True,
94
+ ):
95
+ super().__init__()
96
+
97
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
98
+
99
+ if cond_proj_dim is not None:
100
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
101
+ else:
102
+ self.cond_proj = None
103
+
104
+ self.act = get_activation(act_fn)
105
+
106
+ if out_dim is not None:
107
+ time_embed_dim_out = out_dim
108
+ else:
109
+ time_embed_dim_out = time_embed_dim
110
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
111
+
112
+ if post_act_fn is None:
113
+ self.post_act = None
114
+ else:
115
+ self.post_act = get_activation(post_act_fn)
116
+
117
+ def forward(self, sample, condition=None):
118
+ if condition is not None:
119
+ sample = sample + self.cond_proj(condition)
120
+ sample = self.linear_1(sample)
121
+
122
+ if self.act is not None:
123
+ sample = self.act(sample)
124
+
125
+ sample = self.linear_2(sample)
126
+
127
+ if self.post_act is not None:
128
+ sample = self.post_act(sample)
129
+ return sample
130
+
131
+
132
+ class PixArtAlphaTextProjection(nn.Module):
133
+ """
134
+ Projects caption embeddings. Also handles dropout for classifier-free guidance.
135
+
136
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
137
+ """
138
+
139
+ def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
140
+ super().__init__()
141
+ if out_features is None:
142
+ out_features = hidden_size
143
+ self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
144
+ if act_fn == "gelu_tanh":
145
+ self.act_1 = nn.GELU(approximate="tanh")
146
+ elif act_fn == "silu":
147
+ self.act_1 = nn.SiLU()
148
+ elif act_fn == "silu_fp32":
149
+ self.act_1 = FP32SiLU()
150
+ else:
151
+ raise ValueError(f"Unknown activation function: {act_fn}")
152
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
153
+
154
+ def forward(self, caption):
155
+ hidden_states = self.linear_1(caption)
156
+ hidden_states = self.act_1(hidden_states)
157
+ hidden_states = self.linear_2(hidden_states)
158
+ return hidden_states
159
+
160
+
161
+ class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
162
+ def __init__(self, embedding_dim, pooled_projection_dim):
163
+ super().__init__()
164
+
165
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
166
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
167
+ self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
168
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
169
+
170
+ def forward(self, timestep, guidance, pooled_projection):
171
+ timesteps_proj = self.time_proj(timestep)
172
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype)) # (N, D)
173
+
174
+ guidance_proj = self.time_proj(guidance)
175
+ guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype)) # (N, D)
176
+
177
+ time_guidance_emb = timesteps_emb + guidance_emb
178
+
179
+ pooled_projections = self.text_embedder(pooled_projection)
180
+ conditioning = time_guidance_emb + pooled_projections
181
+
182
+ return conditioning
183
+
184
+
185
+ class CombinedTimestepTextProjEmbeddings(nn.Module):
186
+ def __init__(self, embedding_dim, pooled_projection_dim):
187
+ super().__init__()
188
+
189
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
190
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
191
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
192
+
193
+ def forward(self, timestep, pooled_projection):
194
+ timesteps_proj = self.time_proj(timestep)
195
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype)) # (N, D)
196
+
197
+ pooled_projections = self.text_embedder(pooled_projection)
198
+
199
+ conditioning = timesteps_emb + pooled_projections
200
+
201
+ return conditioning
pyramid_dit/flux_modules/modeling_flux_block.py ADDED
@@ -0,0 +1,1044 @@
1
+ from typing import Any, Dict, List, Optional, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import inspect
7
+ from einops import rearrange
8
+
9
+ from diffusers.utils import deprecate
10
+ from diffusers.models.activations import GEGLU, GELU, ApproximateGELU, SwiGLU
11
+
12
+ from .modeling_normalization import (
13
+ AdaLayerNormContinuous, AdaLayerNormZero,
14
+ AdaLayerNormZeroSingle, FP32LayerNorm, RMSNorm
15
+ )
16
+
17
+ from trainer_misc import (
18
+ is_sequence_parallel_initialized,
19
+ get_sequence_parallel_group,
20
+ get_sequence_parallel_world_size,
21
+ all_to_all,
22
+ )
23
+
24
+ try:
25
+ from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
26
+ from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis
27
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
28
+ except Exception:
29
+ flash_attn_func = None
30
+ flash_attn_qkvpacked_func = None
31
+ flash_attn_varlen_func = None
32
+
33
+
34
+ def apply_rope(xq, xk, freqs_cis):
35
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
36
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
37
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
38
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
39
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
40
+
41
+
42
+ class FeedForward(nn.Module):
43
+ r"""
44
+ A feed-forward layer.
45
+
46
+ Parameters:
47
+ dim (`int`): The number of channels in the input.
48
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
49
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
50
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
51
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
52
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
53
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ dim: int,
59
+ dim_out: Optional[int] = None,
60
+ mult: int = 4,
61
+ dropout: float = 0.0,
62
+ activation_fn: str = "geglu",
63
+ final_dropout: bool = False,
64
+ inner_dim=None,
65
+ bias: bool = True,
66
+ ):
67
+ super().__init__()
68
+ if inner_dim is None:
69
+ inner_dim = int(dim * mult)
70
+ dim_out = dim_out if dim_out is not None else dim
71
+
72
+ if activation_fn == "gelu":
73
+ act_fn = GELU(dim, inner_dim, bias=bias)
74
+ if activation_fn == "gelu-approximate":
75
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
76
+ elif activation_fn == "geglu":
77
+ act_fn = GEGLU(dim, inner_dim, bias=bias)
78
+ elif activation_fn == "geglu-approximate":
79
+ act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
80
+ elif activation_fn == "swiglu":
81
+ act_fn = SwiGLU(dim, inner_dim, bias=bias)
82
+
83
+ self.net = nn.ModuleList([])
84
+ # project in
85
+ self.net.append(act_fn)
86
+ # project dropout
87
+ self.net.append(nn.Dropout(dropout))
88
+ # project out
89
+ self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
90
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
91
+ if final_dropout:
92
+ self.net.append(nn.Dropout(dropout))
93
+
94
+ def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
95
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
96
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
97
+ deprecate("scale", "1.0.0", deprecation_message)
98
+ for module in self.net:
99
+ hidden_states = module(hidden_states)
100
+ return hidden_states
101
+
102
+
103
+ class SequenceParallelVarlenFlashSelfAttentionWithT5Mask:
104
+
105
+ def __init__(self):
106
+ pass
107
+
108
+ def __call__(
109
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
110
+ heads, scale, hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
111
+ ):
112
+ assert encoder_attention_mask is not None, "The encoder-hidden mask needed to be set"
113
+
114
+ batch_size = query.shape[0]
115
+ qkv_list = []
116
+ num_stages = len(hidden_length)
117
+
118
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
119
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
120
+
121
+ # To sync the encoder query, key and values
122
+ sp_group = get_sequence_parallel_group()
123
+ sp_group_size = get_sequence_parallel_world_size()
124
+ encoder_qkv = all_to_all(encoder_qkv, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
125
+
126
+ output_hidden = torch.zeros_like(qkv[:,:,0])
127
+ output_encoder_hidden = torch.zeros_like(encoder_qkv[:,:,0])
128
+ encoder_length = encoder_qkv.shape[1]
129
+
130
+ i_sum = 0
131
+ for i_p, length in enumerate(hidden_length):
132
+ # get the query, key, value from padding sequence
133
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
134
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
135
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
136
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, pad_seq, 3, nhead, dim]
137
+
138
+ if image_rotary_emb is not None:
139
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
140
+
141
+ indices = encoder_attention_mask[i_p]['indices']
142
+ qkv_list.append(index_first_axis(rearrange(concat_qkv_tokens, "b s ... -> (b s) ..."), indices))
143
+ i_sum += length
144
+
145
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
146
+ qkv = torch.cat(qkv_list, dim=0)
147
+ query, key, value = qkv.unbind(1)
148
+
149
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
150
+ max_seqlen_q = cu_seqlens.max().item()
151
+ max_seqlen_k = max_seqlen_q
152
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
153
+ cu_seqlens_k = cu_seqlens_q.clone()
154
+
155
+ output = flash_attn_varlen_func(
156
+ query,
157
+ key,
158
+ value,
159
+ cu_seqlens_q=cu_seqlens_q,
160
+ cu_seqlens_k=cu_seqlens_k,
161
+ max_seqlen_q=max_seqlen_q,
162
+ max_seqlen_k=max_seqlen_k,
163
+ dropout_p=0.0,
164
+ causal=False,
165
+ softmax_scale=scale,
166
+ )
167
+
168
+ # To merge the tokens
169
+ i_sum, token_sum = 0, 0
170
+ for i_p, length in enumerate(hidden_length):
171
+ tot_token_num = token_lengths[i_p]
172
+ stage_output = output[token_sum : token_sum + tot_token_num]
173
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, encoder_length + length * sp_group_size)
174
+ stage_encoder_hidden_output = stage_output[:, :encoder_length]
175
+ stage_hidden_output = stage_output[:, encoder_length:]
176
+ stage_hidden_output = all_to_all(stage_hidden_output, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
177
+ output_hidden[:, i_sum:i_sum+length] = stage_hidden_output
178
+ output_encoder_hidden[i_p::num_stages] = stage_encoder_hidden_output
179
+ token_sum += tot_token_num
180
+ i_sum += length
181
+
182
+ output_encoder_hidden = all_to_all(output_encoder_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
183
+ output_hidden = output_hidden.flatten(2, 3)
184
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
185
+
186
+ return output_hidden, output_encoder_hidden
187
+
188
+
189
+ class VarlenFlashSelfAttentionWithT5Mask:
190
+
191
+ def __init__(self):
192
+ pass
193
+
194
+ def __call__(
195
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
196
+ heads, scale, hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
197
+ ):
198
+ assert encoder_attention_mask is not None, "The encoder-hidden mask needed to be set"
199
+
200
+ batch_size = query.shape[0]
201
+ output_hidden = torch.zeros_like(query)
202
+ output_encoder_hidden = torch.zeros_like(encoder_query)
203
+ encoder_length = encoder_query.shape[1]
204
+
205
+ qkv_list = []
206
+ num_stages = len(hidden_length)
207
+
208
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
209
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
210
+
211
+ i_sum = 0
212
+ for i_p, length in enumerate(hidden_length):
213
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
214
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
215
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
216
+
217
+ if image_rotary_emb is not None:
218
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
219
+
220
+ indices = encoder_attention_mask[i_p]['indices']
221
+ qkv_list.append(index_first_axis(rearrange(concat_qkv_tokens, "b s ... -> (b s) ..."), indices))
222
+ i_sum += length
223
+
224
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
225
+ qkv = torch.cat(qkv_list, dim=0)
226
+ query, key, value = qkv.unbind(1)
227
+
228
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
229
+ max_seqlen_q = cu_seqlens.max().item()
230
+ max_seqlen_k = max_seqlen_q
231
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
232
+ cu_seqlens_k = cu_seqlens_q.clone()
233
+
234
+ output = flash_attn_varlen_func(
235
+ query,
236
+ key,
237
+ value,
238
+ cu_seqlens_q=cu_seqlens_q,
239
+ cu_seqlens_k=cu_seqlens_k,
240
+ max_seqlen_q=max_seqlen_q,
241
+ max_seqlen_k=max_seqlen_k,
242
+ dropout_p=0.0,
243
+ causal=False,
244
+ softmax_scale=scale,
245
+ )
246
+
247
+ # To merge the tokens
248
+ i_sum, token_sum = 0, 0
249
+ for i_p, length in enumerate(hidden_length):
250
+ tot_token_num = token_lengths[i_p]
251
+ stage_output = output[token_sum : token_sum + tot_token_num]
252
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, encoder_length + length)
253
+ stage_encoder_hidden_output = stage_output[:, :encoder_length]
254
+ stage_hidden_output = stage_output[:, encoder_length:]
255
+ output_hidden[:, i_sum:i_sum+length] = stage_hidden_output
256
+ output_encoder_hidden[i_p::num_stages] = stage_encoder_hidden_output
257
+ token_sum += tot_token_num
258
+ i_sum += length
259
+
260
+ output_hidden = output_hidden.flatten(2, 3)
261
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
262
+
263
+ return output_hidden, output_encoder_hidden
264
+
265
+
266
+ class SequenceParallelVarlenSelfAttentionWithT5Mask:
267
+
268
+ def __init__(self):
269
+ pass
270
+
271
+ def __call__(
272
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
273
+ heads, scale, hidden_length=None, image_rotary_emb=None, attention_mask=None,
274
+ ):
275
+ assert attention_mask is not None, "The attention mask needed to be set"
276
+
277
+ num_stages = len(hidden_length)
278
+
279
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
280
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
281
+
282
+ # To sync the encoder query, key and values
283
+ sp_group = get_sequence_parallel_group()
284
+ sp_group_size = get_sequence_parallel_world_size()
285
+ encoder_qkv = all_to_all(encoder_qkv, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
286
+ encoder_length = encoder_qkv.shape[1]
287
+
288
+ i_sum = 0
289
+ output_encoder_hidden_list = []
290
+ output_hidden_list = []
291
+
292
+ for i_p, length in enumerate(hidden_length):
293
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
294
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
295
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
296
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
297
+
298
+ if image_rotary_emb is not None:
299
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
300
+
301
+ query, key, value = concat_qkv_tokens.unbind(2) # [bs, tot_seq, nhead, dim]
302
+ query = query.transpose(1, 2)
303
+ key = key.transpose(1, 2)
304
+ value = value.transpose(1, 2)
305
+
306
+ stage_hidden_states = F.scaled_dot_product_attention(
307
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
308
+ )
309
+ stage_hidden_states = stage_hidden_states.transpose(1, 2) # [bs, tot_seq, nhead, dim]
310
+
311
+ output_encoder_hidden_list.append(stage_hidden_states[:, :encoder_length])
312
+
313
+ output_hidden = stage_hidden_states[:, encoder_length:]
314
+ output_hidden = all_to_all(output_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
315
+ output_hidden_list.append(output_hidden)
316
+
317
+ i_sum += length
318
+
319
+ output_encoder_hidden = torch.stack(output_encoder_hidden_list, dim=1) # [b n s nhead d]
320
+ output_encoder_hidden = rearrange(output_encoder_hidden, 'b n s h d -> (b n) s h d')
321
+ output_encoder_hidden = all_to_all(output_encoder_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
322
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
323
+ output_hidden = torch.cat(output_hidden_list, dim=1).flatten(2, 3)
324
+
325
+ return output_hidden, output_encoder_hidden
326
+
327
+
328
+ class VarlenSelfAttentionWithT5Mask:
329
+
330
+ def __init__(self):
331
+ pass
332
+
333
+ def __call__(
334
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
335
+ heads, scale, hidden_length=None, image_rotary_emb=None, attention_mask=None,
336
+ ):
337
+ assert attention_mask is not None, "The attention mask needed to be set"
338
+
339
+ encoder_length = encoder_query.shape[1]
340
+ num_stages = len(hidden_length)
341
+
342
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
343
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
344
+
345
+ i_sum = 0
346
+ output_encoder_hidden_list = []
347
+ output_hidden_list = []
348
+
349
+ for i_p, length in enumerate(hidden_length):
350
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
351
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
352
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
353
+
354
+ if image_rotary_emb is not None:
355
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
356
+
357
+ query, key, value = concat_qkv_tokens.unbind(2) # [bs, tot_seq, nhead, dim]
358
+ query = query.transpose(1, 2)
359
+ key = key.transpose(1, 2)
360
+ value = value.transpose(1, 2)
361
+
362
+ # with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
363
+ stage_hidden_states = F.scaled_dot_product_attention(
364
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
365
+ )
366
+ stage_hidden_states = stage_hidden_states.transpose(1, 2).flatten(2, 3) # [bs, tot_seq, dim]
367
+
368
+ output_encoder_hidden_list.append(stage_hidden_states[:, :encoder_length])
369
+ output_hidden_list.append(stage_hidden_states[:, encoder_length:])
370
+ i_sum += length
371
+
372
+ output_encoder_hidden = torch.stack(output_encoder_hidden_list, dim=1) # [b n s d]
373
+ output_encoder_hidden = rearrange(output_encoder_hidden, 'b n s d -> (b n) s d')
374
+ output_hidden = torch.cat(output_hidden_list, dim=1)
375
+
376
+ return output_hidden, output_encoder_hidden
377
+
378
+
379
+ class SequenceParallelVarlenFlashAttnSingle:
380
+
381
+ def __init__(self):
382
+ pass
383
+
384
+ def __call__(
385
+ self, query, key, value, heads, scale,
386
+ hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
387
+ ):
388
+ assert encoder_attention_mask is not None, "The encoder-hidden mask needed to be set"
389
+
390
+ batch_size = query.shape[0]
391
+ qkv_list = []
392
+ num_stages = len(hidden_length)
393
+
394
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
395
+ output_hidden = torch.zeros_like(qkv[:,:,0])
396
+
397
+ sp_group = get_sequence_parallel_group()
398
+ sp_group_size = get_sequence_parallel_world_size()
399
+
400
+ i_sum = 0
401
+ for i_p, length in enumerate(hidden_length):
402
+ # get the query, key, value from padding sequence
403
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
404
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
405
+
406
+ if image_rotary_emb is not None:
407
+ qkv_tokens[:,:,0], qkv_tokens[:,:,1] = apply_rope(qkv_tokens[:,:,0], qkv_tokens[:,:,1], image_rotary_emb[i_p])
408
+
409
+ indices = encoder_attention_mask[i_p]['indices']
410
+ qkv_list.append(index_first_axis(rearrange(qkv_tokens, "b s ... -> (b s) ..."), indices))
411
+ i_sum += length
412
+
413
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
414
+ qkv = torch.cat(qkv_list, dim=0)
415
+ query, key, value = qkv.unbind(1)
416
+
417
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
418
+ max_seqlen_q = cu_seqlens.max().item()
419
+ max_seqlen_k = max_seqlen_q
420
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
421
+ cu_seqlens_k = cu_seqlens_q.clone()
422
+
423
+ output = flash_attn_varlen_func(
424
+ query,
425
+ key,
426
+ value,
427
+ cu_seqlens_q=cu_seqlens_q,
428
+ cu_seqlens_k=cu_seqlens_k,
429
+ max_seqlen_q=max_seqlen_q,
430
+ max_seqlen_k=max_seqlen_k,
431
+ dropout_p=0.0,
432
+ causal=False,
433
+ softmax_scale=scale,
434
+ )
435
+
436
+ # To merge the tokens
437
+ i_sum, token_sum = 0, 0
438
+ for i_p, length in enumerate(hidden_length):
439
+ tot_token_num = token_lengths[i_p]
440
+ stage_output = output[token_sum : token_sum + tot_token_num]
441
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, length * sp_group_size)
442
+ stage_hidden_output = all_to_all(stage_output, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
443
+ output_hidden[:, i_sum:i_sum+length] = stage_hidden_output
444
+ token_sum += tot_token_num
445
+ i_sum += length
446
+
447
+ output_hidden = output_hidden.flatten(2, 3)
448
+
449
+ return output_hidden
450
+
451
+
452
+ class VarlenFlashSelfAttnSingle:
453
+
454
+ def __init__(self):
455
+ pass
456
+
457
+ def __call__(
458
+ self, query, key, value, heads, scale,
459
+ hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
460
+ ):
461
+ assert encoder_attention_mask is not None, "The encoder-hidden mask needed to be set"
462
+
463
+ batch_size = query.shape[0]
464
+ output_hidden = torch.zeros_like(query)
465
+
466
+ qkv_list = []
467
+ num_stages = len(hidden_length)
468
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
469
+
470
+ i_sum = 0
471
+ for i_p, length in enumerate(hidden_length):
472
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
473
+
474
+ if image_rotary_emb is not None:
475
+ qkv_tokens[:,:,0], qkv_tokens[:,:,1] = apply_rope(qkv_tokens[:,:,0], qkv_tokens[:,:,1], image_rotary_emb[i_p])
476
+
477
+ indices = encoder_attention_mask[i_p]['indices']
478
+ qkv_list.append(index_first_axis(rearrange(qkv_tokens, "b s ... -> (b s) ..."), indices))
479
+ i_sum += length
480
+
481
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
482
+ qkv = torch.cat(qkv_list, dim=0)
483
+ query, key, value = qkv.unbind(1)
484
+
485
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
486
+ max_seqlen_q = cu_seqlens.max().item()
487
+ max_seqlen_k = max_seqlen_q
488
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
489
+ cu_seqlens_k = cu_seqlens_q.clone()
490
+
491
+ output = flash_attn_varlen_func(
492
+ query,
493
+ key,
494
+ value,
495
+ cu_seqlens_q=cu_seqlens_q,
496
+ cu_seqlens_k=cu_seqlens_k,
497
+ max_seqlen_q=max_seqlen_q,
498
+ max_seqlen_k=max_seqlen_k,
499
+ dropout_p=0.0,
500
+ causal=False,
501
+ softmax_scale=scale,
502
+ )
503
+
504
+ # To merge the tokens
505
+ i_sum, token_sum = 0, 0
506
+ for i_p, length in enumerate(hidden_length):
507
+ tot_token_num = token_lengths[i_p]
508
+ stage_output = output[token_sum : token_sum + tot_token_num]
509
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, length)
510
+ output_hidden[:, i_sum:i_sum+length] = stage_output
511
+ token_sum += tot_token_num
512
+ i_sum += length
513
+
514
+ output_hidden = output_hidden.flatten(2, 3)
515
+
516
+ return output_hidden
517
+
518
+
519
+ class SequenceParallelVarlenAttnSingle:
520
+
521
+ def __init__(self):
522
+ pass
523
+
524
+ def __call__(
525
+ self, query, key, value, heads, scale,
526
+ hidden_length=None, image_rotary_emb=None, attention_mask=None,
527
+ ):
528
+ assert attention_mask is not None, "The attention mask needed to be set"
529
+
530
+ num_stages = len(hidden_length)
531
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
532
+
533
+ # To sync the encoder query, key and values
534
+ sp_group = get_sequence_parallel_group()
535
+ sp_group_size = get_sequence_parallel_world_size()
536
+
537
+ i_sum = 0
538
+ output_hidden_list = []
539
+
540
+ for i_p, length in enumerate(hidden_length):
541
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
542
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
543
+
544
+ if image_rotary_emb is not None:
545
+ qkv_tokens[:,:,0], qkv_tokens[:,:,1] = apply_rope(qkv_tokens[:,:,0], qkv_tokens[:,:,1], image_rotary_emb[i_p])
546
+
547
+ query, key, value = qkv_tokens.unbind(2) # [bs, tot_seq, nhead, dim]
548
+ query = query.transpose(1, 2).contiguous()
549
+ key = key.transpose(1, 2).contiguous()
550
+ value = value.transpose(1, 2).contiguous()
551
+
552
+ stage_hidden_states = F.scaled_dot_product_attention(
553
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
554
+ )
555
+ stage_hidden_states = stage_hidden_states.transpose(1, 2) # [bs, tot_seq, nhead, dim]
556
+
557
+ output_hidden = stage_hidden_states
558
+ output_hidden = all_to_all(output_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
559
+ output_hidden_list.append(output_hidden)
560
+
561
+ i_sum += length
562
+
563
+ output_hidden = torch.cat(output_hidden_list, dim=1).flatten(2, 3)
564
+
565
+ return output_hidden
566
+
567
+
568
+ class VarlenSelfAttnSingle:
569
+
570
+ def __init__(self):
571
+ pass
572
+
573
+ def __call__(
574
+ self, query, key, value, heads, scale,
575
+ hidden_length=None, image_rotary_emb=None, attention_mask=None,
576
+ ):
577
+ assert attention_mask is not None, "The attention mask needs to be set"
578
+
579
+ num_stages = len(hidden_length)
580
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
581
+
582
+ i_sum = 0
583
+ output_hidden_list = []
584
+
585
+ for i_p, length in enumerate(hidden_length):
586
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
587
+
588
+ if image_rotary_emb is not None:
589
+ qkv_tokens[:,:,0], qkv_tokens[:,:,1] = apply_rope(qkv_tokens[:,:,0], qkv_tokens[:,:,1], image_rotary_emb[i_p])
590
+
591
+ query, key, value = qkv_tokens.unbind(2)
592
+ query = query.transpose(1, 2).contiguous()
593
+ key = key.transpose(1, 2).contiguous()
594
+ value = value.transpose(1, 2).contiguous()
595
+
596
+ stage_hidden_states = F.scaled_dot_product_attention(
597
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
598
+ )
599
+ stage_hidden_states = stage_hidden_states.transpose(1, 2).flatten(2, 3) # [bs, tot_seq, dim]
600
+
601
+ output_hidden_list.append(stage_hidden_states)
602
+ i_sum += length
603
+
604
+ output_hidden = torch.cat(output_hidden_list, dim=1)
605
+
606
+ return output_hidden
607
+
608
+
609
+ class Attention(nn.Module):
610
+
611
+ def __init__(
612
+ self,
613
+ query_dim: int,
614
+ cross_attention_dim: Optional[int] = None,
615
+ heads: int = 8,
616
+ dim_head: int = 64,
617
+ dropout: float = 0.0,
618
+ bias: bool = False,
619
+ qk_norm: Optional[str] = None,
620
+ added_kv_proj_dim: Optional[int] = None,
621
+ added_proj_bias: Optional[bool] = True,
622
+ out_bias: bool = True,
623
+ only_cross_attention: bool = False,
624
+ eps: float = 1e-5,
625
+ processor: Optional["AttnProcessor"] = None,
626
+ out_dim: int = None,
627
+ context_pre_only=None,
628
+ pre_only=False,
629
+ ):
630
+ super().__init__()
631
+
632
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
633
+ self.inner_kv_dim = self.inner_dim
634
+ self.query_dim = query_dim
635
+ self.use_bias = bias
636
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
637
+
638
+ self.dropout = dropout
639
+ self.out_dim = out_dim if out_dim is not None else query_dim
640
+ self.context_pre_only = context_pre_only
641
+ self.pre_only = pre_only
642
+
643
+ self.scale = dim_head**-0.5
644
+ self.heads = out_dim // dim_head if out_dim is not None else heads
645
+
646
+
647
+ self.added_kv_proj_dim = added_kv_proj_dim
648
+ self.only_cross_attention = only_cross_attention
649
+
650
+ if self.added_kv_proj_dim is None and self.only_cross_attention:
651
+ raise ValueError(
652
+ "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
653
+ )
654
+
655
+ if qk_norm is None:
656
+ self.norm_q = None
657
+ self.norm_k = None
658
+ elif qk_norm == "rms_norm":
659
+ self.norm_q = RMSNorm(dim_head, eps=eps)
660
+ self.norm_k = RMSNorm(dim_head, eps=eps)
661
+ else:
662
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None or 'rms_norm'")
663
+
664
+ self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
665
+
666
+ if not self.only_cross_attention:
667
+ # only relevant for the `AddedKVProcessor` classes
668
+ self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
669
+ self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
670
+ else:
671
+ self.to_k = None
672
+ self.to_v = None
673
+
674
+ self.added_proj_bias = added_proj_bias
675
+ if self.added_kv_proj_dim is not None:
676
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
677
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
678
+ if self.context_pre_only is not None:
679
+ self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
680
+
681
+ if not self.pre_only:
682
+ self.to_out = nn.ModuleList([])
683
+ self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
684
+ self.to_out.append(nn.Dropout(dropout))
685
+
686
+ if self.context_pre_only is not None and not self.context_pre_only:
687
+ self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)
688
+
689
+ if qk_norm is not None and added_kv_proj_dim is not None:
690
+ if qk_norm == "fp32_layer_norm":
691
+ self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
692
+ self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
693
+ elif qk_norm == "rms_norm":
694
+ self.norm_added_q = RMSNorm(dim_head, eps=eps)
695
+ self.norm_added_k = RMSNorm(dim_head, eps=eps)
696
+ else:
697
+ self.norm_added_q = None
698
+ self.norm_added_k = None
699
+
700
+ # set attention processor
701
+ self.set_processor(processor)
702
+
703
+ def set_processor(self, processor: "AttnProcessor") -> None:
704
+ self.processor = processor
705
+
706
+ def forward(
707
+ self,
708
+ hidden_states: torch.Tensor,
709
+ encoder_hidden_states: Optional[torch.Tensor] = None,
710
+ encoder_attention_mask: Optional[torch.Tensor] = None,
711
+ attention_mask: Optional[torch.Tensor] = None,
712
+ hidden_length: List = None,
713
+ image_rotary_emb: Optional[torch.Tensor] = None,
714
+ ) -> torch.Tensor:
715
+
716
+ return self.processor(
717
+ self,
718
+ hidden_states,
719
+ encoder_hidden_states=encoder_hidden_states,
720
+ encoder_attention_mask=encoder_attention_mask,
721
+ attention_mask=attention_mask,
722
+ hidden_length=hidden_length,
723
+ image_rotary_emb=image_rotary_emb,
724
+ )
725
+
726
+
727
+ class FluxSingleAttnProcessor2_0:
728
+ r"""
729
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
730
+ """
731
+ def __init__(self, use_flash_attn=False):
732
+ self.use_flash_attn = use_flash_attn
733
+
734
+ if self.use_flash_attn:
735
+ if is_sequence_parallel_initialized():
736
+ self.varlen_flash_attn = SequenceParallelVarlenFlashAttnSingle()
737
+ else:
738
+ self.varlen_flash_attn = VarlenFlashSelfAttnSingle()
739
+ else:
740
+ if is_sequence_parallel_initialized():
741
+ self.varlen_attn = SequenceParallelVarlenAttnSingle()
742
+ else:
743
+ self.varlen_attn = VarlenSelfAttnSingle()
744
+
745
+ def __call__(
746
+ self,
747
+ attn: Attention,
748
+ hidden_states: torch.Tensor,
749
+ encoder_hidden_states: Optional[torch.Tensor] = None,
750
+ encoder_attention_mask: Optional[torch.Tensor] = None,
751
+ attention_mask: Optional[torch.FloatTensor] = None,
752
+ hidden_length: List = None,
753
+ image_rotary_emb: Optional[torch.Tensor] = None,
754
+ ) -> torch.Tensor:
755
+
756
+ query = attn.to_q(hidden_states)
757
+ key = attn.to_k(hidden_states)
758
+ value = attn.to_v(hidden_states)
759
+
760
+ inner_dim = key.shape[-1]
761
+ head_dim = inner_dim // attn.heads
762
+
763
+ query = query.view(query.shape[0], -1, attn.heads, head_dim)
764
+ key = key.view(key.shape[0], -1, attn.heads, head_dim)
765
+ value = value.view(value.shape[0], -1, attn.heads, head_dim)
766
+
767
+ if attn.norm_q is not None:
768
+ query = attn.norm_q(query)
769
+ if attn.norm_k is not None:
770
+ key = attn.norm_k(key)
771
+
772
+ if self.use_flash_attn:
773
+ hidden_states = self.varlen_flash_attn(
774
+ query, key, value,
775
+ attn.heads, attn.scale, hidden_length,
776
+ image_rotary_emb, encoder_attention_mask,
777
+ )
778
+ else:
779
+ hidden_states = self.varlen_attn(
780
+ query, key, value,
781
+ attn.heads, attn.scale, hidden_length,
782
+ image_rotary_emb, attention_mask,
783
+ )
784
+
785
+ return hidden_states
786
+
787
+
788
+ class FluxAttnProcessor2_0:
789
+ """Attention processor used typically in processing the SD3-like self-attention projections."""
790
+
791
+ def __init__(self, use_flash_attn=False):
792
+ self.use_flash_attn = use_flash_attn
793
+
794
+ if self.use_flash_attn:
795
+ if is_sequence_parallel_initialized():
796
+ self.varlen_flash_attn = SequenceParallelVarlenFlashSelfAttentionWithT5Mask()
797
+ else:
798
+ self.varlen_flash_attn = VarlenFlashSelfAttentionWithT5Mask()
799
+ else:
800
+ if is_sequence_parallel_initialized():
801
+ self.varlen_attn = SequenceParallelVarlenSelfAttentionWithT5Mask()
802
+ else:
803
+ self.varlen_attn = VarlenSelfAttentionWithT5Mask()
804
+
805
+ def __call__(
806
+ self,
807
+ attn: Attention,
808
+ hidden_states: torch.FloatTensor,
809
+ encoder_hidden_states: torch.FloatTensor = None,
810
+ encoder_attention_mask: Optional[torch.Tensor] = None,
811
+ attention_mask: Optional[torch.FloatTensor] = None,
812
+ hidden_length: List = None,
813
+ image_rotary_emb: Optional[torch.Tensor] = None,
814
+ ) -> torch.FloatTensor:
815
+ # `sample` projections.
816
+ query = attn.to_q(hidden_states)
817
+ key = attn.to_k(hidden_states)
818
+ value = attn.to_v(hidden_states)
819
+
820
+ inner_dim = key.shape[-1]
821
+ head_dim = inner_dim // attn.heads
822
+
823
+ query = query.view(query.shape[0], -1, attn.heads, head_dim)
824
+ key = key.view(key.shape[0], -1, attn.heads, head_dim)
825
+ value = value.view(value.shape[0], -1, attn.heads, head_dim)
826
+
827
+ if attn.norm_q is not None:
828
+ query = attn.norm_q(query)
829
+ if attn.norm_k is not None:
830
+ key = attn.norm_k(key)
831
+
832
+ # `context` projections.
833
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
834
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
835
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
836
+
837
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
838
+ encoder_hidden_states_query_proj.shape[0], -1, attn.heads, head_dim
839
+ )
840
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
841
+ encoder_hidden_states_key_proj.shape[0], -1, attn.heads, head_dim
842
+ )
843
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
844
+ encoder_hidden_states_value_proj.shape[0], -1, attn.heads, head_dim
845
+ )
846
+
847
+ if attn.norm_added_q is not None:
848
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
849
+ if attn.norm_added_k is not None:
850
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
851
+
852
+ if self.use_flash_attn:
853
+ hidden_states, encoder_hidden_states = self.varlen_flash_attn(
854
+ query, key, value,
855
+ encoder_hidden_states_query_proj, encoder_hidden_states_key_proj,
856
+ encoder_hidden_states_value_proj, attn.heads, attn.scale, hidden_length,
857
+ image_rotary_emb, encoder_attention_mask,
858
+ )
859
+ else:
860
+ hidden_states, encoder_hidden_states = self.varlen_attn(
861
+ query, key, value,
862
+ encoder_hidden_states_query_proj, encoder_hidden_states_key_proj,
863
+ encoder_hidden_states_value_proj, attn.heads, attn.scale, hidden_length,
864
+ image_rotary_emb, attention_mask,
865
+ )
866
+
867
+ # linear proj
868
+ hidden_states = attn.to_out[0](hidden_states)
869
+ # dropout
870
+ hidden_states = attn.to_out[1](hidden_states)
871
+
872
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
873
+
874
+ return hidden_states, encoder_hidden_states
875
+
876
+
877
+ class FluxSingleTransformerBlock(nn.Module):
878
+ r"""
879
+ A single-stream Transformer block with parallel attention and MLP branches, as used in Flux.
880
+
881
+ Reference: https://arxiv.org/abs/2403.03206
882
+
883
+ Parameters:
884
+ dim (`int`): The number of channels in the input and output.
885
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
886
+ attention_head_dim (`int`): The number of channels in each head.
887
+ mlp_ratio (`float`): The ratio of the MLP hidden dimension to `dim`.
888
+ use_flash_attn (`bool`): Whether to use the flash-attention variable-length path.
889
+ """
890
+
891
+ def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0, use_flash_attn=False):
892
+ super().__init__()
893
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
894
+
895
+ self.norm = AdaLayerNormZeroSingle(dim)
896
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
897
+ self.act_mlp = nn.GELU(approximate="tanh")
898
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
899
+
900
+ processor = FluxSingleAttnProcessor2_0(use_flash_attn)
901
+ self.attn = Attention(
902
+ query_dim=dim,
903
+ cross_attention_dim=None,
904
+ dim_head=attention_head_dim,
905
+ heads=num_attention_heads,
906
+ out_dim=dim,
907
+ bias=True,
908
+ processor=processor,
909
+ qk_norm="rms_norm",
910
+ eps=1e-6,
911
+ pre_only=True,
912
+ )
913
+
914
+ def forward(
915
+ self,
916
+ hidden_states: torch.FloatTensor,
917
+ temb: torch.FloatTensor,
918
+ encoder_attention_mask=None,
919
+ attention_mask=None,
920
+ hidden_length=None,
921
+ image_rotary_emb=None,
922
+ ):
923
+ residual = hidden_states
924
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb, hidden_length=hidden_length)
925
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
926
+
927
+ attn_output = self.attn(
928
+ hidden_states=norm_hidden_states,
929
+ encoder_hidden_states=None,
930
+ encoder_attention_mask=encoder_attention_mask,
931
+ attention_mask=attention_mask,
932
+ hidden_length=hidden_length,
933
+ image_rotary_emb=image_rotary_emb,
934
+ )
935
+
936
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
937
+ hidden_states = gate * self.proj_out(hidden_states)
938
+ hidden_states = residual + hidden_states
939
+ if hidden_states.dtype == torch.float16:
940
+ hidden_states = hidden_states.clip(-65504, 65504)
941
+
942
+ return hidden_states
943
+
944
+
945
+ class FluxTransformerBlock(nn.Module):
946
+ r"""
947
+ A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
948
+
949
+ Reference: https://arxiv.org/abs/2403.03206
950
+
951
+ Parameters:
952
+ dim (`int`): The number of channels in the input and output.
953
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
954
+ attention_head_dim (`int`): The number of channels in each head.
955
+ qk_norm (`str`, *optional*): The normalization applied to the query/key projections, e.g. "rms_norm".
956
+ use_flash_attn (`bool`): Whether to use the flash-attention variable-length path.
957
+ """
958
+
959
+ def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6, use_flash_attn=False):
960
+ super().__init__()
961
+
962
+ self.norm1 = AdaLayerNormZero(dim)
963
+
964
+ self.norm1_context = AdaLayerNormZero(dim)
965
+
966
+ if hasattr(F, "scaled_dot_product_attention"):
967
+ processor = FluxAttnProcessor2_0(use_flash_attn)
968
+ else:
969
+ raise ValueError(
970
+ "The current PyTorch version does not support the `scaled_dot_product_attention` function."
971
+ )
972
+ self.attn = Attention(
973
+ query_dim=dim,
974
+ cross_attention_dim=None,
975
+ added_kv_proj_dim=dim,
976
+ dim_head=attention_head_dim,
977
+ heads=num_attention_heads,
978
+ out_dim=dim,
979
+ context_pre_only=False,
980
+ bias=True,
981
+ processor=processor,
982
+ qk_norm=qk_norm,
983
+ eps=eps,
984
+ )
985
+
986
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
987
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
988
+
989
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
990
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
991
+
992
+ def forward(
993
+ self,
994
+ hidden_states: torch.FloatTensor,
995
+ encoder_hidden_states: torch.FloatTensor,
996
+ encoder_attention_mask: torch.FloatTensor,
997
+ temb: torch.FloatTensor,
998
+ attention_mask: torch.FloatTensor = None,
999
+ hidden_length: List = None,
1000
+ image_rotary_emb=None,
1001
+ ):
1002
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb, hidden_length=hidden_length)
1003
+
1004
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
1005
+ encoder_hidden_states, emb=temb
1006
+ )
1007
+
1008
+ # Attention.
1009
+ attn_output, context_attn_output = self.attn(
1010
+ hidden_states=norm_hidden_states,
1011
+ encoder_hidden_states=norm_encoder_hidden_states,
1012
+ encoder_attention_mask=encoder_attention_mask,
1013
+ attention_mask=attention_mask,
1014
+ hidden_length=hidden_length,
1015
+ image_rotary_emb=image_rotary_emb,
1016
+ )
1017
+
1018
+ # Process attention outputs for the `hidden_states`.
1019
+ attn_output = gate_msa * attn_output
1020
+ hidden_states = hidden_states + attn_output
1021
+
1022
+ norm_hidden_states = self.norm2(hidden_states)
1023
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
1024
+
1025
+ ff_output = self.ff(norm_hidden_states)
1026
+ ff_output = gate_mlp * ff_output
1027
+
1028
+ hidden_states = hidden_states + ff_output
1029
+
1030
+ # Process attention outputs for the `encoder_hidden_states`.
1031
+
1032
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
1033
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
1034
+
1035
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
1036
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
1037
+
1038
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
1039
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
1040
+
1041
+ if encoder_hidden_states.dtype == torch.float16:
1042
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
1043
+
1044
+ return encoder_hidden_states, hidden_states
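A note on the variable-length attention path in this file: the flash-attention processors first drop padded tokens using the mask-derived `indices`, run attention on the packed sequence with cumulative sequence lengths, and then scatter the result back to the padded layout. Below is a minimal stand-in sketch in plain PyTorch (it does not call flash-attn and is not part of the uploaded file; the tensor sizes are made up for illustration).

import torch
import torch.nn.functional as F

# Toy padding mask: 1 marks a real token, 0 marks padding (assumed sizes).
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
bs, seq = mask.shape
x = torch.randn(bs, seq, 8)

# Per-sample lengths, flat indices of valid tokens, and cumulative sequence lengths.
seqlens_in_batch = mask.sum(dim=-1, dtype=torch.int32)                                # tensor([3, 2])
indices = torch.nonzero(mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 5])

# Pack: keep only valid tokens (the role of index_first_axis above).
packed = x.reshape(bs * seq, -1)[indices]                                             # shape [5, 8]
# ... flash_attn_varlen_func would run here on the packed q/k/v with cu_seqlens ...

# Unpack: scatter the packed output back to the padded layout (the role of pad_input above).
restored = torch.zeros(bs * seq, x.shape[-1])
restored[indices] = packed
restored = restored.reshape(bs, seq, -1)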
pyramid_dit/flux_modules/modeling_normalization.py ADDED
@@ -0,0 +1,249 @@
1
+ import numbers
2
+ from typing import Dict, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from diffusers.utils import is_torch_version
9
+
10
+
11
+ if is_torch_version(">=", "2.1.0"):
12
+ LayerNorm = nn.LayerNorm
13
+ else:
14
+ # Has optional bias parameter compared to torch layer norm
15
+ # TODO: replace with torch layernorm once min required torch version >= 2.1
16
+ class LayerNorm(nn.Module):
17
+ def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True):
18
+ super().__init__()
19
+
20
+ self.eps = eps
21
+
22
+ if isinstance(dim, numbers.Integral):
23
+ dim = (dim,)
24
+
25
+ self.dim = torch.Size(dim)
26
+
27
+ if elementwise_affine:
28
+ self.weight = nn.Parameter(torch.ones(dim))
29
+ self.bias = nn.Parameter(torch.zeros(dim)) if bias else None
30
+ else:
31
+ self.weight = None
32
+ self.bias = None
33
+
34
+ def forward(self, input):
35
+ return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps)
36
+
37
+
38
+ class FP32LayerNorm(nn.LayerNorm):
39
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
40
+ origin_dtype = inputs.dtype
41
+ return F.layer_norm(
42
+ inputs.float(),
43
+ self.normalized_shape,
44
+ self.weight.float() if self.weight is not None else None,
45
+ self.bias.float() if self.bias is not None else None,
46
+ self.eps,
47
+ ).to(origin_dtype)
48
+
49
+
50
+ class RMSNorm(nn.Module):
51
+ def __init__(self, dim, eps: float, elementwise_affine: bool = True):
52
+ super().__init__()
53
+
54
+ self.eps = eps
55
+
56
+ if isinstance(dim, numbers.Integral):
57
+ dim = (dim,)
58
+
59
+ self.dim = torch.Size(dim)
60
+
61
+ if elementwise_affine:
62
+ self.weight = nn.Parameter(torch.ones(dim))
63
+ else:
64
+ self.weight = None
65
+
66
+ def forward(self, hidden_states):
67
+ input_dtype = hidden_states.dtype
68
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
69
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
70
+
71
+ if self.weight is not None:
72
+ # convert into half-precision if necessary
73
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
74
+ hidden_states = hidden_states.to(self.weight.dtype)
75
+ hidden_states = hidden_states * self.weight
76
+ else:
77
+ hidden_states = hidden_states.to(input_dtype)
78
+
79
+ return hidden_states
80
+
81
+
82
+ class AdaLayerNormContinuous(nn.Module):
83
+ def __init__(
84
+ self,
85
+ embedding_dim: int,
86
+ conditioning_embedding_dim: int,
87
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
88
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
89
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
90
+ # However, this is how it was implemented in the original code, and it's rather likely you should
91
+ # set `elementwise_affine` to False.
92
+ elementwise_affine=True,
93
+ eps=1e-5,
94
+ bias=True,
95
+ norm_type="layer_norm",
96
+ ):
97
+ super().__init__()
98
+ self.silu = nn.SiLU()
99
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
100
+ if norm_type == "layer_norm":
101
+ self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
102
+ elif norm_type == "rms_norm":
103
+ self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
104
+ else:
105
+ raise ValueError(f"unknown norm_type {norm_type}")
106
+
107
+ def forward_with_pad(self, x: torch.Tensor, conditioning_embedding: torch.Tensor, hidden_length=None) -> torch.Tensor:
108
+ assert hidden_length is not None
109
+
110
+ emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
111
+ batch_emb = torch.zeros_like(x).repeat(1, 1, 2)
112
+
113
+ i_sum = 0
114
+ num_stages = len(hidden_length)
115
+ for i_p, length in enumerate(hidden_length):
116
+ batch_emb[:, i_sum:i_sum+length] = emb[i_p::num_stages][:,None]
117
+ i_sum += length
118
+
119
+ batch_scale, batch_shift = torch.chunk(batch_emb, 2, dim=2)
120
+ x = self.norm(x) * (1 + batch_scale) + batch_shift
121
+ return x
122
+
123
+ def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor, hidden_length=None) -> torch.Tensor:
124
+ # convert back to the original dtype in case `conditioning_embedding` is upcast to float32 (needed for HunyuanDiT)
125
+ if hidden_length is not None:
126
+ return self.forward_with_pad(x, conditioning_embedding, hidden_length)
127
+ emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
128
+ scale, shift = torch.chunk(emb, 2, dim=1)
129
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
130
+ return x
131
+
132
+
133
+ class AdaLayerNormZero(nn.Module):
134
+ r"""
135
+ Norm layer adaptive layer norm zero (adaLN-Zero).
136
+
137
+ Parameters:
138
+ embedding_dim (`int`): The size of each embedding vector.
139
+ num_embeddings (`int`): The size of the embeddings dictionary.
140
+ """
141
+
142
+ def __init__(self, embedding_dim: int, num_embeddings: Optional[int] = None):
143
+ super().__init__()
144
+ self.emb = None
145
+
146
+ self.silu = nn.SiLU()
147
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
148
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
149
+
150
+ def forward_with_pad(
151
+ self,
152
+ x: torch.Tensor,
153
+ timestep: Optional[torch.Tensor] = None,
154
+ class_labels: Optional[torch.LongTensor] = None,
155
+ hidden_dtype: Optional[torch.dtype] = None,
156
+ emb: Optional[torch.Tensor] = None,
157
+ hidden_length: Optional[torch.Tensor] = None,
158
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
159
+ # hidden_length: a list with the token count of each pyramid stage, e.g. [20, 30, 50]
160
+ # x: [bs, seq_len, dim]
161
+ if self.emb is not None:
162
+ emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
163
+
164
+ emb = self.linear(self.silu(emb))
165
+ batch_emb = torch.zeros_like(x).repeat(1, 1, 6)
166
+
167
+ i_sum = 0
168
+ num_stages = len(hidden_length)
169
+ for i_p, length in enumerate(hidden_length):
170
+ batch_emb[:, i_sum:i_sum+length] = emb[i_p::num_stages][:,None]
171
+ i_sum += length
172
+
173
+ batch_shift_msa, batch_scale_msa, batch_gate_msa, batch_shift_mlp, batch_scale_mlp, batch_gate_mlp = batch_emb.chunk(6, dim=2)
174
+ x = self.norm(x) * (1 + batch_scale_msa) + batch_shift_msa
175
+ return x, batch_gate_msa, batch_shift_mlp, batch_scale_mlp, batch_gate_mlp
176
+
177
+ def forward(
178
+ self,
179
+ x: torch.Tensor,
180
+ timestep: Optional[torch.Tensor] = None,
181
+ class_labels: Optional[torch.LongTensor] = None,
182
+ hidden_dtype: Optional[torch.dtype] = None,
183
+ emb: Optional[torch.Tensor] = None,
184
+ hidden_length: Optional[torch.Tensor] = None,
185
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
186
+ if hidden_length is not None:
187
+ return self.forward_with_pad(x, timestep, class_labels, hidden_dtype, emb, hidden_length)
188
+ if self.emb is not None:
189
+ emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
190
+ emb = self.linear(self.silu(emb))
191
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
192
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
193
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
194
+
195
+
196
+ class AdaLayerNormZeroSingle(nn.Module):
197
+ r"""
198
+ Norm layer adaptive layer norm zero (adaLN-Zero).
199
+
200
+ Parameters:
201
+ embedding_dim (`int`): The size of each embedding vector.
202
+ num_embeddings (`int`): The size of the embeddings dictionary.
203
+ """
204
+
205
+ def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
206
+ super().__init__()
207
+
208
+ self.silu = nn.SiLU()
209
+ self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
210
+ if norm_type == "layer_norm":
211
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
212
+ else:
213
+ raise ValueError(
214
+ f"Unsupported `norm_type` ({norm_type}) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'."
215
+ )
216
+
217
+ def forward_with_pad(
218
+ self,
219
+ x: torch.Tensor,
220
+ emb: Optional[torch.Tensor] = None,
221
+ hidden_length: Optional[torch.Tensor] = None,
222
+ ):
223
+ emb = self.linear(self.silu(emb))
224
+ batch_emb = torch.zeros_like(x).repeat(1, 1, 3)
225
+
226
+ i_sum = 0
227
+ num_stages = len(hidden_length)
228
+ for i_p, length in enumerate(hidden_length):
229
+ batch_emb[:, i_sum:i_sum+length] = emb[i_p::num_stages][:,None]
230
+ i_sum += length
231
+
232
+ batch_shift_msa, batch_scale_msa, batch_gate_msa = batch_emb.chunk(3, dim=2)
233
+
234
+ x = self.norm(x) * (1 + batch_scale_msa) + batch_shift_msa
235
+
236
+ return x, batch_gate_msa
237
+
238
+ def forward(
239
+ self,
240
+ x: torch.Tensor,
241
+ emb: Optional[torch.Tensor] = None,
242
+ hidden_length: Optional[torch.Tensor] = None,
243
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
244
+ if hidden_length is not None:
245
+ return self.forward_with_pad(x, emb, hidden_length)
246
+ emb = self.linear(self.silu(emb))
247
+ shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=1)
248
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
249
+ return x, gate_msa
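For orientation, the adaLN-Zero modulation implemented by the classes above boils down to projecting a conditioning embedding to shift/scale/gate vectors, modulating the normalized activations, and gating the residual branch. A minimal standalone sketch with toy sizes (not an import of this module):

import torch
import torch.nn as nn
import torch.nn.functional as F

dim = 16
x = torch.randn(2, 10, dim)     # [batch, seq, dim] hidden states
emb = torch.randn(2, dim)       # conditioning embedding (e.g. timestep + pooled text)

to_mod = nn.Linear(dim, 3 * dim)                               # projects to shift/scale/gate
norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

shift, scale, gate = to_mod(F.silu(emb)).chunk(3, dim=1)
x_mod = norm(x) * (1 + scale[:, None]) + shift[:, None]        # modulated input to attention/MLP
out = x + gate[:, None] * x_mod                                # gated residual update
print(out.shape)                                               # torch.Size([2, 10, 16])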
pyramid_dit/flux_modules/modeling_pyramid_flux.py ADDED
@@ -0,0 +1,543 @@
1
+ from typing import Any, Dict, List, Optional, Union
2
+
3
+ import torch
4
+ import os
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from tqdm import tqdm
9
+
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.models.modeling_utils import ModelMixin
13
+ from diffusers.utils import is_torch_version
14
+
15
+ from .modeling_normalization import AdaLayerNormContinuous
16
+ from .modeling_embedding import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings
17
+ from .modeling_flux_block import FluxTransformerBlock, FluxSingleTransformerBlock
18
+
19
+ from trainer_misc import (
20
+ is_sequence_parallel_initialized,
21
+ get_sequence_parallel_group,
22
+ get_sequence_parallel_world_size,
23
+ get_sequence_parallel_rank,
24
+ all_to_all,
25
+ )
26
+
27
+
28
+ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
29
+ assert dim % 2 == 0, "The dimension must be even."
30
+
31
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
32
+ omega = 1.0 / (theta**scale)
33
+
34
+ batch_size, seq_length = pos.shape
35
+ out = torch.einsum("...n,d->...nd", pos, omega)
36
+ cos_out = torch.cos(out)
37
+ sin_out = torch.sin(out)
38
+
39
+ stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
40
+ out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
41
+ return out.float()
42
+
43
+
44
+ class EmbedND(nn.Module):
45
+ def __init__(self, dim: int, theta: int, axes_dim: List[int]):
46
+ super().__init__()
47
+ self.dim = dim
48
+ self.theta = theta
49
+ self.axes_dim = axes_dim
50
+
51
+ def forward(self, ids: torch.Tensor) -> torch.Tensor:
52
+ n_axes = ids.shape[-1]
53
+ emb = torch.cat(
54
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
55
+ dim=-3,
56
+ )
57
+ return emb.unsqueeze(2)
58
+
59
+
60
+ class PyramidFluxTransformer(ModelMixin, ConfigMixin):
61
+ """
62
+ The Transformer model introduced in Flux.
63
+
64
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
65
+
66
+ Parameters:
67
+ patch_size (`int`): Patch size to turn the input data into small patches.
68
+ in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
69
+ num_layers (`int`, *optional*, defaults to 19): The number of layers of MMDiT blocks to use.
70
+ num_single_layers (`int`, *optional*, defaults to 38): The number of layers of single DiT blocks to use.
71
+ attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
72
+ num_attention_heads (`int`, *optional*, defaults to 24): The number of heads to use for multi-head attention.
73
+ joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
74
+ pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
75
+ """
76
+
77
+ _supports_gradient_checkpointing = True
78
+
79
+ @register_to_config
80
+ def __init__(
81
+ self,
82
+ patch_size: int = 1,
83
+ in_channels: int = 64,
84
+ num_layers: int = 19,
85
+ num_single_layers: int = 38,
86
+ attention_head_dim: int = 64,
87
+ num_attention_heads: int = 24,
88
+ joint_attention_dim: int = 4096,
89
+ pooled_projection_dim: int = 768,
90
+ axes_dims_rope: List[int] = [16, 24, 24],
91
+ use_flash_attn: bool = False,
92
+ use_temporal_causal: bool = True,
93
+ interp_condition_pos: bool = True,
94
+ use_gradient_checkpointing: bool = False,
95
+ gradient_checkpointing_ratio: float = 0.6,
96
+ ):
97
+ super().__init__()
98
+ self.out_channels = in_channels
99
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
100
+
101
+ self.pos_embed = EmbedND(dim=self.inner_dim, theta=10000, axes_dim=axes_dims_rope)
102
+ self.time_text_embed = CombinedTimestepTextProjEmbeddings(
103
+ embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
104
+ )
105
+
106
+ self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
107
+ self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim)
108
+
109
+ self.transformer_blocks = nn.ModuleList(
110
+ [
111
+ FluxTransformerBlock(
112
+ dim=self.inner_dim,
113
+ num_attention_heads=self.config.num_attention_heads,
114
+ attention_head_dim=self.config.attention_head_dim,
115
+ use_flash_attn=use_flash_attn,
116
+ )
117
+ for i in range(self.config.num_layers)
118
+ ]
119
+ )
120
+
121
+ self.single_transformer_blocks = nn.ModuleList(
122
+ [
123
+ FluxSingleTransformerBlock(
124
+ dim=self.inner_dim,
125
+ num_attention_heads=self.config.num_attention_heads,
126
+ attention_head_dim=self.config.attention_head_dim,
127
+ use_flash_attn=use_flash_attn,
128
+ )
129
+ for i in range(self.config.num_single_layers)
130
+ ]
131
+ )
132
+
133
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
134
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
135
+
136
+ self.gradient_checkpointing = use_gradient_checkpointing
137
+ self.gradient_checkpointing_ratio = gradient_checkpointing_ratio
138
+
139
+ self.use_temporal_causal = use_temporal_causal
140
+ if self.use_temporal_causal:
141
+ print("Using temporal causal attention")
142
+
143
+ self.use_flash_attn = use_flash_attn
144
+ if self.use_flash_attn:
145
+ print("Using Flash attention")
146
+
147
+ self.patch_size = 2 # hard-code for now
148
+
149
+ # init weights
150
+ self.initialize_weights()
151
+
152
+ def initialize_weights(self):
153
+ # Initialize transformer layers:
154
+ def _basic_init(module):
155
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv3d)):
156
+ torch.nn.init.xavier_uniform_(module.weight)
157
+ if module.bias is not None:
158
+ nn.init.constant_(module.bias, 0)
159
+ self.apply(_basic_init)
160
+
161
+ # Initialize all the conditioning to normal init
162
+ nn.init.normal_(self.time_text_embed.timestep_embedder.linear_1.weight, std=0.02)
163
+ nn.init.normal_(self.time_text_embed.timestep_embedder.linear_2.weight, std=0.02)
164
+ nn.init.normal_(self.time_text_embed.text_embedder.linear_1.weight, std=0.02)
165
+ nn.init.normal_(self.time_text_embed.text_embedder.linear_2.weight, std=0.02)
166
+ nn.init.normal_(self.context_embedder.weight, std=0.02)
167
+
168
+ # Zero-out adaLN modulation layers in DiT blocks:
169
+ for block in self.transformer_blocks:
170
+ nn.init.constant_(block.norm1.linear.weight, 0)
171
+ nn.init.constant_(block.norm1.linear.bias, 0)
172
+ nn.init.constant_(block.norm1_context.linear.weight, 0)
173
+ nn.init.constant_(block.norm1_context.linear.bias, 0)
174
+
175
+ for block in self.single_transformer_blocks:
176
+ nn.init.constant_(block.norm.linear.weight, 0)
177
+ nn.init.constant_(block.norm.linear.bias, 0)
178
+
179
+ # Zero-out output layers:
180
+ nn.init.constant_(self.norm_out.linear.weight, 0)
181
+ nn.init.constant_(self.norm_out.linear.bias, 0)
182
+ nn.init.constant_(self.proj_out.weight, 0)
183
+ nn.init.constant_(self.proj_out.bias, 0)
184
+
185
+ @torch.no_grad()
186
+ def _prepare_image_ids(self, batch_size, temp, height, width, train_height, train_width, device, start_time_stamp=0):
187
+ latent_image_ids = torch.zeros(temp, height, width, 3)
188
+
189
+ # Temporal Rope
190
+ latent_image_ids[..., 0] = latent_image_ids[..., 0] + torch.arange(start_time_stamp, start_time_stamp + temp)[:, None, None]
191
+
192
+ # height Rope
193
+ if height != train_height:
194
+ height_pos = F.interpolate(torch.arange(train_height)[None, None, :].float(), height, mode='linear').squeeze(0, 1)
195
+ else:
196
+ height_pos = torch.arange(train_height).float()
197
+
198
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + height_pos[None, :, None]
199
+
200
+ # width rope
201
+ if width != train_width:
202
+ width_pos = F.interpolate(torch.arange(train_width)[None, None, :].float(), width, mode='linear').squeeze(0, 1)
203
+ else:
204
+ width_pos = torch.arange(train_width).float()
205
+
206
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + width_pos[None, None, :]
207
+
208
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1, 1)
209
+ latent_image_ids = rearrange(latent_image_ids, 'b t h w c -> b (t h w) c')
210
+
211
+ return latent_image_ids.to(device=device)
212
+
213
+ @torch.no_grad()
214
+ def _prepare_pyramid_image_ids(self, sample, batch_size, device):
215
+ image_ids_list = []
216
+
217
+ for i_b, sample_ in enumerate(sample):
218
+ if not isinstance(sample_, list):
219
+ sample_ = [sample_]
220
+
221
+ cur_image_ids = []
222
+ start_time_stamp = 0
223
+
224
+ train_height = sample_[-1].shape[-2] // self.patch_size
225
+ train_width = sample_[-1].shape[-1] // self.patch_size
226
+
227
+ for clip_ in sample_:
228
+ _, _, temp, height, width = clip_.shape
229
+ height = height // self.patch_size
230
+ width = width // self.patch_size
231
+ cur_image_ids.append(self._prepare_image_ids(batch_size, temp, height, width, train_height, train_width, device, start_time_stamp=start_time_stamp))
232
+ start_time_stamp += temp
233
+
234
+ cur_image_ids = torch.cat(cur_image_ids, dim=1)
235
+ image_ids_list.append(cur_image_ids)
236
+
237
+ return image_ids_list
238
+
239
+ def merge_input(self, sample, encoder_hidden_length, encoder_attention_mask):
240
+ """
241
+ Merge the input latents at different resolutions into a single packed sequence.
242
+ `sample` is ordered from low resolution to high resolution.
243
+ """
244
+ if isinstance(sample[0], list):
245
+ device = sample[0][-1].device
246
+ pad_batch_size = sample[0][-1].shape[0]
247
+ else:
248
+ device = sample[0].device
249
+ pad_batch_size = sample[0].shape[0]
250
+
251
+ num_stages = len(sample)
252
+ height_list = []; width_list = []; temp_list = []
253
+ trainable_token_list = []
254
+
255
+ for i_b, sample_ in enumerate(sample):
256
+ if isinstance(sample_, list):
257
+ sample_ = sample_[-1]
258
+ _, _, temp, height, width = sample_.shape
259
+ height = height // self.patch_size
260
+ width = width // self.patch_size
261
+ temp_list.append(temp)
262
+ height_list.append(height)
263
+ width_list.append(width)
264
+ trainable_token_list.append(height * width * temp)
265
+
266
+ # prepare the RoPE ids
267
+ image_ids_list = self._prepare_pyramid_image_ids(sample, pad_batch_size, device)
268
+ text_ids = torch.zeros(pad_batch_size, encoder_attention_mask.shape[1], 3).to(device=device)
269
+ input_ids_list = [torch.cat([text_ids, image_ids], dim=1) for image_ids in image_ids_list]
270
+ image_rotary_emb = [self.pos_embed(input_ids) for input_ids in input_ids_list] # [bs, seq_len, 1, head_dim // 2, 2, 2]
271
+
272
+ if is_sequence_parallel_initialized():
273
+ sp_group = get_sequence_parallel_group()
274
+ sp_group_size = get_sequence_parallel_world_size()
275
+ concat_output = True if self.training else False
276
+ image_rotary_emb = [all_to_all(x_.repeat(1, 1, sp_group_size, 1, 1, 1), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output) for x_ in image_rotary_emb]
277
+ input_ids_list = [all_to_all(input_ids.repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output) for input_ids in input_ids_list]
278
+
279
+ hidden_states, hidden_length = [], []
280
+
281
+ for sample_ in sample:
282
+ video_tokens = []
283
+
284
+ for each_latent in sample_:
285
+ each_latent = rearrange(each_latent, 'b c t h w -> b t h w c')
286
+ each_latent = rearrange(each_latent, 'b t (h p1) (w p2) c -> b (t h w) (p1 p2 c)', p1=self.patch_size, p2=self.patch_size)
287
+ video_tokens.append(each_latent)
288
+
289
+ video_tokens = torch.cat(video_tokens, dim=1)
290
+ video_tokens = self.x_embedder(video_tokens)
291
+ hidden_states.append(video_tokens)
292
+ hidden_length.append(video_tokens.shape[1])
293
+
294
+ # prepare the attention mask
295
+ if self.use_flash_attn:
296
+ attention_mask = None
297
+ indices_list = []
298
+ for i_p, length in enumerate(hidden_length):
299
+ pad_attention_mask = torch.ones((pad_batch_size, length), dtype=encoder_attention_mask.dtype).to(device)
300
+ pad_attention_mask = torch.cat([encoder_attention_mask[i_p::num_stages], pad_attention_mask], dim=1)
301
+
302
+ if is_sequence_parallel_initialized():
303
+ sp_group = get_sequence_parallel_group()
304
+ sp_group_size = get_sequence_parallel_world_size()
305
+ pad_attention_mask = all_to_all(pad_attention_mask.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0)
306
+ pad_attention_mask = pad_attention_mask.squeeze(2)
307
+
308
+ seqlens_in_batch = pad_attention_mask.sum(dim=-1, dtype=torch.int32)
309
+ indices = torch.nonzero(pad_attention_mask.flatten(), as_tuple=False).flatten()
310
+
311
+ indices_list.append(
312
+ {
313
+ 'indices': indices,
314
+ 'seqlens_in_batch': seqlens_in_batch,
315
+ }
316
+ )
317
+ encoder_attention_mask = indices_list
318
+ else:
319
+ assert encoder_attention_mask.shape[1] == encoder_hidden_length
320
+ real_batch_size = encoder_attention_mask.shape[0]
321
+
322
+ # prepare text ids
323
+ text_ids = torch.arange(1, real_batch_size + 1, dtype=encoder_attention_mask.dtype).unsqueeze(1).repeat(1, encoder_hidden_length)
324
+ text_ids = text_ids.to(device)
325
+ text_ids[encoder_attention_mask == 0] = 0
326
+
327
+ # prepare image ids
328
+ image_ids = torch.arange(1, real_batch_size + 1, dtype=encoder_attention_mask.dtype).unsqueeze(1).repeat(1, max(hidden_length))
329
+ image_ids = image_ids.to(device)
330
+ image_ids_list = []
331
+ for i_p, length in enumerate(hidden_length):
332
+ image_ids_list.append(image_ids[i_p::num_stages][:, :length])
333
+
334
+ if is_sequence_parallel_initialized():
335
+ sp_group = get_sequence_parallel_group()
336
+ sp_group_size = get_sequence_parallel_world_size()
337
+ concat_output = True if self.training else False
338
+ text_ids = all_to_all(text_ids.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output).squeeze(2)
339
+ image_ids_list = [all_to_all(image_ids_.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output).squeeze(2) for image_ids_ in image_ids_list]
340
+
341
+ attention_mask = []
342
+ for i_p in range(len(hidden_length)):
343
+ image_ids = image_ids_list[i_p]
344
+ token_ids = torch.cat([text_ids[i_p::num_stages], image_ids], dim=1)
345
+ stage_attention_mask = rearrange(token_ids, 'b i -> b 1 i 1') == rearrange(token_ids, 'b j -> b 1 1 j') # [bs, 1, q_len, k_len]
346
+ if self.use_temporal_causal:
347
+ input_order_ids = input_ids_list[i_p][:,:,0]
348
+ temporal_causal_mask = rearrange(input_order_ids, 'b i -> b 1 i 1') >= rearrange(input_order_ids, 'b j -> b 1 1 j')
349
+ stage_attention_mask = stage_attention_mask & temporal_causal_mask
350
+ attention_mask.append(stage_attention_mask)
351
+
352
+ return hidden_states, hidden_length, temp_list, height_list, width_list, trainable_token_list, encoder_attention_mask, attention_mask, image_rotary_emb
353
+
354
+ def split_output(self, batch_hidden_states, hidden_length, temps, heights, widths, trainable_token_list):
355
+ # To split the hidden states
356
+ batch_size = batch_hidden_states.shape[0]
357
+ output_hidden_list = []
358
+ batch_hidden_states = torch.split(batch_hidden_states, hidden_length, dim=1)
359
+
360
+ if is_sequence_parallel_initialized():
361
+ sp_group_size = get_sequence_parallel_world_size()
362
+ if self.training:
363
+ batch_size = batch_size // sp_group_size
364
+
365
+ for i_p, length in enumerate(hidden_length):
366
+ width, height, temp = widths[i_p], heights[i_p], temps[i_p]
367
+ trainable_token_num = trainable_token_list[i_p]
368
+ hidden_states = batch_hidden_states[i_p]
369
+
370
+ if is_sequence_parallel_initialized():
371
+ sp_group = get_sequence_parallel_group()
372
+ sp_group_size = get_sequence_parallel_world_size()
373
+
374
+ if not self.training:
375
+ hidden_states = hidden_states.repeat(sp_group_size, 1, 1)
376
+
377
+ hidden_states = all_to_all(hidden_states, sp_group, sp_group_size, scatter_dim=0, gather_dim=1)
378
+
379
+ # only the trainable token are taking part in loss computation
380
+ hidden_states = hidden_states[:, -trainable_token_num:]
381
+
382
+ # unpatchify
383
+ hidden_states = hidden_states.reshape(
384
+ shape=(batch_size, temp, height, width, self.patch_size, self.patch_size, self.out_channels // 4)
385
+ )
386
+ hidden_states = rearrange(hidden_states, "b t h w p1 p2 c -> b t (h p1) (w p2) c")
387
+ hidden_states = rearrange(hidden_states, "b t h w c -> b c t h w")
388
+ output_hidden_list.append(hidden_states)
389
+
390
+ return output_hidden_list
391
+
392
+ def forward(
393
+ self,
394
+ sample: torch.FloatTensor, # [num_stages]
395
+ encoder_hidden_states: torch.Tensor = None,
396
+ encoder_attention_mask: torch.FloatTensor = None,
397
+ pooled_projections: torch.Tensor = None,
398
+ timestep_ratio: torch.LongTensor = None,
399
+ ):
400
+ temb = self.time_text_embed(timestep_ratio, pooled_projections)
401
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
402
+ encoder_hidden_length = encoder_hidden_states.shape[1]
403
+
404
+ # Get the input sequence
405
+ hidden_states, hidden_length, temps, heights, widths, trainable_token_list, encoder_attention_mask, attention_mask, \
406
+ image_rotary_emb = self.merge_input(sample, encoder_hidden_length, encoder_attention_mask)
407
+
408
+ # split the long latents if necessary
409
+ if is_sequence_parallel_initialized():
410
+ sp_group = get_sequence_parallel_group()
411
+ sp_group_size = get_sequence_parallel_world_size()
412
+ concat_output = True if self.training else False
413
+
414
+ # sync the input hidden states
415
+ batch_hidden_states = []
416
+ for i_p, hidden_states_ in enumerate(hidden_states):
417
+ assert hidden_states_.shape[1] % sp_group_size == 0, "The sequence length must be divisible by the sequence parallel size"
418
+ hidden_states_ = all_to_all(hidden_states_, sp_group, sp_group_size, scatter_dim=1, gather_dim=0, concat_output=concat_output)
419
+ hidden_length[i_p] = hidden_length[i_p] // sp_group_size
420
+ batch_hidden_states.append(hidden_states_)
421
+
422
+ # sync the encoder hidden states
423
+ hidden_states = torch.cat(batch_hidden_states, dim=1)
424
+ encoder_hidden_states = all_to_all(encoder_hidden_states, sp_group, sp_group_size, scatter_dim=1, gather_dim=0, concat_output=concat_output)
425
+ temb = all_to_all(temb.unsqueeze(1).repeat(1, sp_group_size, 1), sp_group, sp_group_size, scatter_dim=1, gather_dim=0, concat_output=concat_output)
426
+ temb = temb.squeeze(1)
427
+ else:
428
+ hidden_states = torch.cat(hidden_states, dim=1)
429
+
430
+ for index_block, block in enumerate(self.transformer_blocks):
431
+ if self.training and self.gradient_checkpointing and (index_block <= int(len(self.transformer_blocks) * self.gradient_checkpointing_ratio)):
432
+
433
+ def create_custom_forward(module):
434
+ def custom_forward(*inputs):
435
+ return module(*inputs)
436
+
437
+ return custom_forward
438
+
439
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
440
+ encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
441
+ create_custom_forward(block),
442
+ hidden_states,
443
+ encoder_hidden_states,
444
+ encoder_attention_mask,
445
+ temb,
446
+ attention_mask,
447
+ hidden_length,
448
+ image_rotary_emb,
449
+ **ckpt_kwargs,
450
+ )
451
+
452
+ else:
453
+ encoder_hidden_states, hidden_states = block(
454
+ hidden_states=hidden_states,
455
+ encoder_hidden_states=encoder_hidden_states,
456
+ encoder_attention_mask=encoder_attention_mask,
457
+ temb=temb,
458
+ attention_mask=attention_mask,
459
+ hidden_length=hidden_length,
460
+ image_rotary_emb=image_rotary_emb,
461
+ )
462
+
463
+ # re-merge text and video tokens for the single-stream transformer blocks
464
+ num_stages = len(hidden_length)
465
+ batch_hidden_states = list(torch.split(hidden_states, hidden_length, dim=1))
466
+ concat_hidden_length = []
467
+
468
+ if is_sequence_parallel_initialized():
469
+ sp_group = get_sequence_parallel_group()
470
+ sp_group_size = get_sequence_parallel_world_size()
471
+ encoder_hidden_states = all_to_all(encoder_hidden_states, sp_group, sp_group_size, scatter_dim=0, gather_dim=1)
472
+
473
+ for i_p in range(len(hidden_length)):
474
+
475
+ if is_sequence_parallel_initialized():
476
+ sp_group = get_sequence_parallel_group()
477
+ sp_group_size = get_sequence_parallel_world_size()
478
+ batch_hidden_states[i_p] = all_to_all(batch_hidden_states[i_p], sp_group, sp_group_size, scatter_dim=0, gather_dim=1)
479
+
480
+ batch_hidden_states[i_p] = torch.cat([encoder_hidden_states[i_p::num_stages], batch_hidden_states[i_p]], dim=1)
481
+
482
+ if is_sequence_parallel_initialized():
483
+ sp_group = get_sequence_parallel_group()
484
+ sp_group_size = get_sequence_parallel_world_size()
485
+ batch_hidden_states[i_p] = all_to_all(batch_hidden_states[i_p], sp_group, sp_group_size, scatter_dim=1, gather_dim=0)
486
+
487
+ concat_hidden_length.append(batch_hidden_states[i_p].shape[1])
488
+
489
+ hidden_states = torch.cat(batch_hidden_states, dim=1)
490
+
491
+ for index_block, block in enumerate(self.single_transformer_blocks):
492
+ if self.training and self.gradient_checkpointing and (index_block <= int(len(self.single_transformer_blocks) * self.gradient_checkpointing_ratio)):
493
+
494
+ def create_custom_forward(module):
495
+ def custom_forward(*inputs):
496
+ return module(*inputs)
497
+
498
+ return custom_forward
499
+
500
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
501
+ hidden_states = torch.utils.checkpoint.checkpoint(
502
+ create_custom_forward(block),
503
+ hidden_states,
504
+ temb,
505
+ encoder_attention_mask,
506
+ attention_mask,
507
+ concat_hidden_length,
508
+ image_rotary_emb,
509
+ **ckpt_kwargs,
510
+ )
511
+
512
+ else:
513
+ hidden_states = block(
514
+ hidden_states=hidden_states,
515
+ temb=temb,
516
+ encoder_attention_mask=encoder_attention_mask, # used by the flash-attention varlen path
517
+ attention_mask=attention_mask,
518
+ hidden_length=concat_hidden_length,
519
+ image_rotary_emb=image_rotary_emb,
520
+ )
521
+
522
+ batch_hidden_states = list(torch.split(hidden_states, concat_hidden_length, dim=1))
523
+
524
+ for i_p in range(len(concat_hidden_length)):
525
+ if is_sequence_parallel_initialized():
526
+ sp_group = get_sequence_parallel_group()
527
+ sp_group_size = get_sequence_parallel_world_size()
528
+ batch_hidden_states[i_p] = all_to_all(batch_hidden_states[i_p], sp_group, sp_group_size, scatter_dim=0, gather_dim=1)
529
+
530
+ batch_hidden_states[i_p] = batch_hidden_states[i_p][:, encoder_hidden_length :, ...]
531
+
532
+ if is_sequence_parallel_initialized():
533
+ sp_group = get_sequence_parallel_group()
534
+ sp_group_size = get_sequence_parallel_world_size()
535
+ batch_hidden_states[i_p] = all_to_all(batch_hidden_states[i_p], sp_group, sp_group_size, scatter_dim=1, gather_dim=0)
536
+
537
+ hidden_states = torch.cat(batch_hidden_states, dim=1)
538
+ hidden_states = self.norm_out(hidden_states, temb, hidden_length=hidden_length)
539
+ hidden_states = self.proj_out(hidden_states)
540
+
541
+ output = self.split_output(hidden_states, hidden_length, temps, heights, widths, trainable_token_list)
542
+
543
+ return output
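As a rough illustration of how `_prepare_image_ids` above assigns a (time, height, width) position to every latent token before the rotary embedding, here is a toy-sized sketch (the sizes are made up; it is not part of the uploaded file):

import torch
from einops import rearrange

temp, height, width = 2, 4, 4                        # latent frames and patch-grid size (toy values)
ids = torch.zeros(temp, height, width, 3)
ids[..., 0] += torch.arange(temp)[:, None, None]     # temporal axis
ids[..., 1] += torch.arange(height)[None, :, None]   # height axis
ids[..., 2] += torch.arange(width)[None, None, :]    # width axis

ids = rearrange(ids[None], 'b t h w c -> b (t h w) c')
print(ids.shape)                                      # torch.Size([1, 32, 3]); these ids feed EmbedND / rope()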
pyramid_dit/flux_modules/modeling_text_encoder.py ADDED
@@ -0,0 +1,134 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import os
4
+
5
+ from transformers import (
6
+ CLIPTextModel,
7
+ CLIPTokenizer,
8
+ T5EncoderModel,
9
+ T5TokenizerFast,
10
+ )
11
+
12
+ from typing import Any, Callable, Dict, List, Optional, Union
13
+
14
+
15
+ class FluxTextEncoderWithMask(nn.Module):
16
+ def __init__(self, model_path, torch_dtype):
17
+ super().__init__()
18
+ # CLIP text encoder (provides the pooled projection)
19
+ self.tokenizer = CLIPTokenizer.from_pretrained(os.path.join(model_path, 'tokenizer'), torch_dtype=torch_dtype)
20
+ self.tokenizer_max_length = (
21
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
22
+ )
23
+ self.text_encoder = CLIPTextModel.from_pretrained(os.path.join(model_path, 'text_encoder'), torch_dtype=torch_dtype)
24
+
25
+ # T5
26
+ self.tokenizer_2 = T5TokenizerFast.from_pretrained(os.path.join(model_path, 'tokenizer_2'))
27
+ self.text_encoder_2 = T5EncoderModel.from_pretrained(os.path.join(model_path, 'text_encoder_2'), torch_dtype=torch_dtype)
28
+
29
+ self._freeze()
30
+
31
+ def _freeze(self):
32
+ for param in self.parameters():
33
+ param.requires_grad = False
34
+
35
+ def _get_t5_prompt_embeds(
36
+ self,
37
+ prompt: Union[str, List[str]] = None,
38
+ num_images_per_prompt: int = 1,
39
+ max_sequence_length: int = 128,
40
+ device: Optional[torch.device] = None,
41
+ ):
42
+
43
+ prompt = [prompt] if isinstance(prompt, str) else prompt
44
+ batch_size = len(prompt)
45
+
46
+ text_inputs = self.tokenizer_2(
47
+ prompt,
48
+ padding="max_length",
49
+ max_length=max_sequence_length,
50
+ truncation=True,
51
+ return_length=False,
52
+ return_overflowing_tokens=False,
53
+ return_tensors="pt",
54
+ )
55
+ text_input_ids = text_inputs.input_ids
56
+ prompt_attention_mask = text_inputs.attention_mask
57
+ prompt_attention_mask = prompt_attention_mask.to(device)
58
+
59
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), attention_mask=prompt_attention_mask, output_hidden_states=False)[0]
60
+
61
+ dtype = self.text_encoder_2.dtype
62
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
63
+
64
+ _, seq_len, _ = prompt_embeds.shape
65
+
66
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
67
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
68
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
69
+ prompt_attention_mask = prompt_attention_mask.view(batch_size, -1)
70
+ prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
71
+
72
+ return prompt_embeds, prompt_attention_mask
73
+
74
+ def _get_clip_prompt_embeds(
75
+ self,
76
+ prompt: Union[str, List[str]],
77
+ num_images_per_prompt: int = 1,
78
+ device: Optional[torch.device] = None,
79
+ ):
80
+
81
+ prompt = [prompt] if isinstance(prompt, str) else prompt
82
+ batch_size = len(prompt)
83
+
84
+ text_inputs = self.tokenizer(
85
+ prompt,
86
+ padding="max_length",
87
+ max_length=self.tokenizer_max_length,
88
+ truncation=True,
89
+ return_overflowing_tokens=False,
90
+ return_length=False,
91
+ return_tensors="pt",
92
+ )
93
+
94
+ text_input_ids = text_inputs.input_ids
95
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
96
+
97
+ # Use pooled output of CLIPTextModel
98
+ prompt_embeds = prompt_embeds.pooler_output
99
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
100
+
101
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
102
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
103
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
104
+
105
+ return prompt_embeds
106
+
107
+ def encode_prompt(self,
108
+ prompt,
109
+ num_images_per_prompt=1,
110
+ device=None,
111
+ ):
112
+ prompt = [prompt] if isinstance(prompt, str) else prompt
113
+
114
+ batch_size = len(prompt)
115
+
116
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
117
+ prompt=prompt,
118
+ device=device,
119
+ num_images_per_prompt=num_images_per_prompt,
120
+ )
121
+
122
+ prompt_embeds, prompt_attention_mask = self._get_t5_prompt_embeds(
123
+ prompt=prompt,
124
+ num_images_per_prompt=num_images_per_prompt,
125
+ device=device,
126
+ )
127
+
128
+ return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
129
+
130
+ def forward(self, input_prompts, device):
131
+ with torch.no_grad():
132
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.encode_prompt(input_prompts, 1, device=device)
133
+
134
+ return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
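A minimal usage sketch for the text encoder above: it wires a CLIP encoder (pooled prompt embedding) and a T5 encoder (per-token embeddings plus attention mask) behind a single forward call. The sketch assumes the repo root is on PYTHONPATH, torch and transformers are installed, and that `model_path` points at a downloaded Pyramid Flow checkpoint containing the `tokenizer`, `text_encoder`, `tokenizer_2`, and `text_encoder_2` subfolders; the path and the shape comments are illustrative assumptions, not guaranteed by this diff.

# Illustrative sketch only: encode a prompt with FluxTextEncoderWithMask.
import torch
from pyramid_dit.flux_modules.modeling_text_encoder import FluxTextEncoderWithMask

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
text_encoder = FluxTextEncoderWithMask("pyramid_flow_model", torch_dtype=torch.bfloat16).to(device)

prompts = ["A movie trailer featuring the adventures of the 3D animated man"]
prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = text_encoder(prompts, device)
# For a Flux-style checkpoint (T5 + CLIP), the expected shapes would be roughly:
#   prompt_embeds:         [batch, 128, t5_hidden_dim]   (T5 token features, max_sequence_length=128)
#   prompt_attention_mask: [batch, 128]
#   pooled_prompt_embeds:  [batch, clip_hidden_dim]      (CLIP pooled output)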
pyramid_dit/mmdit_modules/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .modeling_text_encoder import SD3TextEncoderWithMask
2
+ from .modeling_pyramid_mmdit import PyramidDiffusionMMDiT
3
+ from .modeling_mmdit_block import JointTransformerBlock
pyramid_dit/mmdit_modules/modeling_embedding.py ADDED
@@ -0,0 +1,390 @@
1
+ from typing import Any, Dict, Optional, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ import math
7
+
8
+ from diffusers.models.activations import get_activation
9
+ from einops import rearrange
10
+
11
+
12
+ def get_1d_sincos_pos_embed(
13
+ embed_dim, num_frames, cls_token=False, extra_tokens=0,
14
+ ):
15
+ t = np.arange(num_frames, dtype=np.float32)
16
+ pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, t) # (T, D)
17
+ if cls_token and extra_tokens > 0:
18
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
19
+ return pos_embed
20
+
21
+
22
+ def get_2d_sincos_pos_embed(
23
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16
24
+ ):
25
+ """
26
+ grid_size: int of the grid height and width
27
+ return: pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
28
+ """
29
+ if isinstance(grid_size, int):
30
+ grid_size = (grid_size, grid_size)
31
+
32
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale
33
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale
34
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
35
+ grid = np.stack(grid, axis=0)
36
+
37
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
38
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
39
+ if cls_token and extra_tokens > 0:
40
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
41
+ return pos_embed
42
+
43
+
44
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
45
+ if embed_dim % 2 != 0:
46
+ raise ValueError("embed_dim must be divisible by 2")
47
+
48
+ # use half of dimensions to encode grid_h
49
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
50
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
51
+
52
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
53
+ return emb
54
+
55
+
56
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
57
+ """
58
+ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
59
+ """
60
+ if embed_dim % 2 != 0:
61
+ raise ValueError("embed_dim must be divisible by 2")
62
+
63
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
64
+ omega /= embed_dim / 2.0
65
+ omega = 1.0 / 10000**omega # (D/2,)
66
+
67
+ pos = pos.reshape(-1) # (M,)
68
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
69
+
70
+ emb_sin = np.sin(out) # (M, D/2)
71
+ emb_cos = np.cos(out) # (M, D/2)
72
+
73
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
74
+ return emb
75
+
76
+
77
+ def get_timestep_embedding(
78
+ timesteps: torch.Tensor,
79
+ embedding_dim: int,
80
+ flip_sin_to_cos: bool = False,
81
+ downscale_freq_shift: float = 1,
82
+ scale: float = 1,
83
+ max_period: int = 10000,
84
+ ):
85
+ """
86
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
87
+ :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
88
+ :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the embeddings.
90
+ :return: an [N x dim] Tensor of positional embeddings.
90
+ """
91
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
92
+
93
+ half_dim = embedding_dim // 2
94
+ exponent = -math.log(max_period) * torch.arange(
95
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
96
+ )
97
+ exponent = exponent / (half_dim - downscale_freq_shift)
98
+
99
+ emb = torch.exp(exponent)
100
+ emb = timesteps[:, None].float() * emb[None, :]
101
+
102
+ # scale embeddings
103
+ emb = scale * emb
104
+
105
+ # concat sine and cosine embeddings
106
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
107
+
108
+ # flip sine and cosine embeddings
109
+ if flip_sin_to_cos:
110
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
111
+
112
+ # zero pad
113
+ if embedding_dim % 2 == 1:
114
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
115
+ return emb
116
+
117
+
118
+ class Timesteps(nn.Module):
119
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
120
+ super().__init__()
121
+ self.num_channels = num_channels
122
+ self.flip_sin_to_cos = flip_sin_to_cos
123
+ self.downscale_freq_shift = downscale_freq_shift
124
+
125
+ def forward(self, timesteps):
126
+ t_emb = get_timestep_embedding(
127
+ timesteps,
128
+ self.num_channels,
129
+ flip_sin_to_cos=self.flip_sin_to_cos,
130
+ downscale_freq_shift=self.downscale_freq_shift,
131
+ )
132
+ return t_emb
133
+
134
+
135
+ class TimestepEmbedding(nn.Module):
136
+ def __init__(
137
+ self,
138
+ in_channels: int,
139
+ time_embed_dim: int,
140
+ act_fn: str = "silu",
141
+ out_dim: int = None,
142
+ post_act_fn: Optional[str] = None,
143
+ sample_proj_bias=True,
144
+ ):
145
+ super().__init__()
146
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
147
+ self.act = get_activation(act_fn)
148
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim, sample_proj_bias)
149
+
150
+ def forward(self, sample):
151
+ sample = self.linear_1(sample)
152
+ sample = self.act(sample)
153
+ sample = self.linear_2(sample)
154
+ return sample
155
+
156
+
157
+ class TextProjection(nn.Module):
158
+ def __init__(self, in_features, hidden_size, act_fn="silu"):
159
+ super().__init__()
160
+ self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
161
+ self.act_1 = get_activation(act_fn)
162
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
163
+
164
+ def forward(self, caption):
165
+ hidden_states = self.linear_1(caption)
166
+ hidden_states = self.act_1(hidden_states)
167
+ hidden_states = self.linear_2(hidden_states)
168
+ return hidden_states
169
+
170
+
171
+ class CombinedTimestepConditionEmbeddings(nn.Module):
172
+ def __init__(self, embedding_dim, pooled_projection_dim):
173
+ super().__init__()
174
+
175
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
176
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
177
+ self.text_embedder = TextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
178
+
179
+ def forward(self, timestep, pooled_projection):
180
+ timesteps_proj = self.time_proj(timestep)
181
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype)) # (N, D)
182
+ pooled_projections = self.text_embedder(pooled_projection)
183
+ conditioning = timesteps_emb + pooled_projections
184
+ return conditioning
185
+
186
+
187
+ class CombinedTimestepEmbeddings(nn.Module):
188
+ def __init__(self, embedding_dim):
189
+ super().__init__()
190
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
191
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
192
+
193
+ def forward(self, timestep):
194
+ timesteps_proj = self.time_proj(timestep)
195
+ timesteps_emb = self.timestep_embedder(timesteps_proj) # (N, D)
196
+ return timesteps_emb
197
+
198
+
199
+ class PatchEmbed3D(nn.Module):
200
+ """Support the 3D Tensor input"""
201
+
202
+ def __init__(
203
+ self,
204
+ height=128,
205
+ width=128,
206
+ patch_size=2,
207
+ in_channels=16,
208
+ embed_dim=1536,
209
+ layer_norm=False,
210
+ bias=True,
211
+ interpolation_scale=1,
212
+ pos_embed_type="sincos",
213
+ temp_pos_embed_type='rope',
214
+ pos_embed_max_size=192, # For SD3 cropping
215
+ max_num_frames=64,
216
+ add_temp_pos_embed=False,
217
+ interp_condition_pos=False,
218
+ ):
219
+ super().__init__()
220
+
221
+ num_patches = (height // patch_size) * (width // patch_size)
222
+ self.layer_norm = layer_norm
223
+ self.pos_embed_max_size = pos_embed_max_size
224
+
225
+ self.proj = nn.Conv2d(
226
+ in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
227
+ )
228
+ if layer_norm:
229
+ self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
230
+ else:
231
+ self.norm = None
232
+
233
+ self.patch_size = patch_size
234
+ self.height, self.width = height // patch_size, width // patch_size
235
+ self.base_size = height // patch_size
236
+ self.interpolation_scale = interpolation_scale
237
+ self.add_temp_pos_embed = add_temp_pos_embed
238
+
239
+ # Calculate positional embeddings based on max size or default
240
+ if pos_embed_max_size:
241
+ grid_size = pos_embed_max_size
242
+ else:
243
+ grid_size = int(num_patches**0.5)
244
+
245
+ if pos_embed_type is None:
246
+ self.pos_embed = None
247
+
248
+ elif pos_embed_type == "sincos":
249
+ pos_embed = get_2d_sincos_pos_embed(
250
+ embed_dim, grid_size, base_size=self.base_size, interpolation_scale=self.interpolation_scale
251
+ )
252
+ persistent = True if pos_embed_max_size else False
253
+ self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=persistent)
254
+
255
+ if add_temp_pos_embed and temp_pos_embed_type == 'sincos':
256
+ time_pos_embed = get_1d_sincos_pos_embed(embed_dim, max_num_frames)
257
+ self.register_buffer("temp_pos_embed", torch.from_numpy(time_pos_embed).float().unsqueeze(0), persistent=True)
258
+
259
+ elif pos_embed_type == "rope":
260
+ print("Using the rotary position embedding")
261
+
262
+ else:
263
+ raise ValueError(f"Unsupported pos_embed_type: {pos_embed_type}")
264
+
265
+ self.pos_embed_type = pos_embed_type
266
+ self.temp_pos_embed_type = temp_pos_embed_type
267
+ self.interp_condition_pos = interp_condition_pos
268
+
269
+ def cropped_pos_embed(self, height, width, ori_height, ori_width):
270
+ """Crops positional embeddings for SD3 compatibility."""
271
+ if self.pos_embed_max_size is None:
272
+ raise ValueError("`pos_embed_max_size` must be set for cropping.")
273
+
274
+ height = height // self.patch_size
275
+ width = width // self.patch_size
276
+ ori_height = ori_height // self.patch_size
277
+ ori_width = ori_width // self.patch_size
278
+
279
+ assert ori_height >= height, "ori_height must be >= height"
280
+ assert ori_width >= width, "ori_width must be >= width"
281
+
282
+ if height > self.pos_embed_max_size:
283
+ raise ValueError(
284
+ f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
285
+ )
286
+ if width > self.pos_embed_max_size:
287
+ raise ValueError(
288
+ f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
289
+ )
290
+
291
+ if self.interp_condition_pos:
292
+ top = (self.pos_embed_max_size - ori_height) // 2
293
+ left = (self.pos_embed_max_size - ori_width) // 2
294
+ spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
295
+ spatial_pos_embed = spatial_pos_embed[:, top : top + ori_height, left : left + ori_width, :] # [b h w c]
296
+ if ori_height != height or ori_width != width:
297
+ spatial_pos_embed = spatial_pos_embed.permute(0, 3, 1, 2)
298
+ spatial_pos_embed = torch.nn.functional.interpolate(spatial_pos_embed, size=(height, width), mode='bilinear')
299
+ spatial_pos_embed = spatial_pos_embed.permute(0, 2, 3, 1)
300
+ else:
301
+ top = (self.pos_embed_max_size - height) // 2
302
+ left = (self.pos_embed_max_size - width) // 2
303
+ spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
304
+ spatial_pos_embed = spatial_pos_embed[:, top : top + height, left : left + width, :]
305
+
306
+ spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
307
+
308
+ return spatial_pos_embed
309
+
310
+ def forward_func(self, latent, time_index=0, ori_height=None, ori_width=None):
311
+ if self.pos_embed_max_size is not None:
312
+ height, width = latent.shape[-2:]
313
+ else:
314
+ height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
315
+
316
+ bs = latent.shape[0]
317
+ temp = latent.shape[2]
318
+
319
+ latent = rearrange(latent, 'b c t h w -> (b t) c h w')
320
+ latent = self.proj(latent)
321
+ latent = latent.flatten(2).transpose(1, 2) # (BT)CHW -> (BT)NC
322
+
323
+ if self.layer_norm:
324
+ latent = self.norm(latent)
325
+
326
+ if self.pos_embed_type == 'sincos':
327
+ # Spatial position embedding, Interpolate or crop positional embeddings as needed
328
+ if self.pos_embed_max_size:
329
+ pos_embed = self.cropped_pos_embed(height, width, ori_height, ori_width)
330
+ else:
331
+ raise NotImplementedError("Not implemented sincos pos embed without sd3 max pos crop")
332
+ if self.height != height or self.width != width:
333
+ pos_embed = get_2d_sincos_pos_embed(
334
+ embed_dim=self.pos_embed.shape[-1],
335
+ grid_size=(height, width),
336
+ base_size=self.base_size,
337
+ interpolation_scale=self.interpolation_scale,
338
+ )
339
+ pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).to(latent.device)
340
+ else:
341
+ pos_embed = self.pos_embed
342
+
343
+ if self.add_temp_pos_embed and self.temp_pos_embed_type == 'sincos':
344
+ latent_dtype = latent.dtype
345
+ latent = latent + pos_embed
346
+ latent = rearrange(latent, '(b t) n c -> (b n) t c', t=temp)
347
+ latent = latent + self.temp_pos_embed[:, time_index:time_index + temp, :]
348
+ latent = latent.to(latent_dtype)
349
+ latent = rearrange(latent, '(b n) t c -> b t n c', b=bs)
350
+ else:
351
+ latent = (latent + pos_embed).to(latent.dtype)
352
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=bs, t=temp)
353
+
354
+ else:
355
+ assert self.pos_embed_type == "rope", "Only supporting the sincos and rope embedding"
356
+ latent = rearrange(latent, '(b t) n c -> b t n c', b=bs, t=temp)
357
+
358
+ return latent
359
+
360
+ def forward(self, latent):
361
+ """
362
+ Arguments:
363
+ latent (torch.FloatTensor or List): a single latent of shape [b, c, t, h, w], or a (nested) list of
365
+ per-stage latents (e.g. the past condition latents during generation); list inputs return a list of flattened token sequences
365
+ """
366
+
367
+ if isinstance(latent, list):
368
+ output_list = []
369
+
370
+ for latent_ in latent:
371
+ if not isinstance(latent_, list):
372
+ latent_ = [latent_]
373
+
374
+ output_latent = []
375
+ time_index = 0
376
+ ori_height, ori_width = latent_[-1].shape[-2:]
377
+ for each_latent in latent_:
378
+ hidden_state = self.forward_func(each_latent, time_index=time_index, ori_height=ori_height, ori_width=ori_width)
379
+ time_index += each_latent.shape[2]
380
+ hidden_state = rearrange(hidden_state, "b t n c -> b (t n) c")
381
+ output_latent.append(hidden_state)
382
+
383
+ output_latent = torch.cat(output_latent, dim=1)
384
+ output_list.append(output_latent)
385
+
386
+ return output_list
387
+ else:
388
+ hidden_states = self.forward_func(latent)
389
+ hidden_states = rearrange(hidden_states, "b t n c -> b (t n) c")
390
+ return hidden_states
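The sincos helpers and the timestep embedding above are pure tensor functions, so their behaviour is easiest to see from output shapes. The shape check below is an illustrative sketch, assuming the repo root is on PYTHONPATH and its dependencies (numpy, torch, diffusers, einops) are installed; no checkpoint is needed.

# Shape check for the embedding helpers defined above (illustrative sketch).
import torch
from pyramid_dit.mmdit_modules.modeling_embedding import (
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_timestep_embedding,
)

# 2D sincos table for a 24x24 patch grid with model width 1536
spatial = get_2d_sincos_pos_embed(embed_dim=1536, grid_size=24, base_size=16)
print(spatial.shape)        # (576, 1536) = (grid_size*grid_size, embed_dim)

# 1D sincos table over 16 frames, used as the optional temporal position embedding
temporal = get_1d_sincos_pos_embed(embed_dim=1536, num_frames=16)
print(temporal.shape)       # (16, 1536)

# Sinusoidal timestep features, matching Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
t_emb = get_timestep_embedding(torch.tensor([0.0, 500.0, 999.0]), embedding_dim=256,
                               flip_sin_to_cos=True, downscale_freq_shift=0)
print(t_emb.shape)          # torch.Size([3, 256])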
pyramid_dit/mmdit_modules/modeling_mmdit_block.py ADDED
@@ -0,0 +1,671 @@
1
+ from typing import Dict, Optional, Tuple, List
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+ from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
+ from diffusers.utils import deprecate
7
+
8
+ try:
9
+ from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
10
+ from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis
11
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
12
+ except Exception:
13
+ flash_attn_func = None
14
+ flash_attn_qkvpacked_func = None
15
+ flash_attn_varlen_func = None
16
+
17
+ from trainer_misc import (
18
+ is_sequence_parallel_initialized,
19
+ get_sequence_parallel_group,
20
+ get_sequence_parallel_world_size,
21
+ all_to_all,
22
+ )
23
+
24
+ from .modeling_normalization import AdaLayerNormZero, AdaLayerNormContinuous, RMSNorm
25
+
26
+
27
+ class FeedForward(nn.Module):
28
+ r"""
29
+ A feed-forward layer.
30
+
31
+ Parameters:
32
+ dim (`int`): The number of channels in the input.
33
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
34
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
35
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
36
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
37
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
38
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
39
+ """
40
+ def __init__(
41
+ self,
42
+ dim: int,
43
+ dim_out: Optional[int] = None,
44
+ mult: int = 4,
45
+ dropout: float = 0.0,
46
+ activation_fn: str = "geglu",
47
+ final_dropout: bool = False,
48
+ inner_dim=None,
49
+ bias: bool = True,
50
+ ):
51
+ super().__init__()
52
+ if inner_dim is None:
53
+ inner_dim = int(dim * mult)
54
+ dim_out = dim_out if dim_out is not None else dim
55
+
56
+ if activation_fn == "gelu":
57
+ act_fn = GELU(dim, inner_dim, bias=bias)
58
+ if activation_fn == "gelu-approximate":
59
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
60
+ elif activation_fn == "geglu":
61
+ act_fn = GEGLU(dim, inner_dim, bias=bias)
62
+ elif activation_fn == "geglu-approximate":
63
+ act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
64
+
65
+ self.net = nn.ModuleList([])
66
+ # project in
67
+ self.net.append(act_fn)
68
+ # project dropout
69
+ self.net.append(nn.Dropout(dropout))
70
+ # project out
71
+ self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
72
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
73
+ if final_dropout:
74
+ self.net.append(nn.Dropout(dropout))
75
+
76
+ def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
77
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
78
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
79
+ deprecate("scale", "1.0.0", deprecation_message)
80
+ for module in self.net:
81
+ hidden_states = module(hidden_states)
82
+ return hidden_states
83
+
84
+
85
+ class VarlenFlashSelfAttentionWithT5Mask:
86
+
87
+ def __init__(self):
88
+ pass
89
+
90
+ def apply_rope(self, xq, xk, freqs_cis):
91
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
92
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
93
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
94
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
95
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
96
+
97
+ def __call__(
98
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
99
+ heads, scale, hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
100
+ ):
101
+ assert encoder_attention_mask is not None, "The encoder attention mask needs to be set"
102
+
103
+ batch_size = query.shape[0]
104
+ output_hidden = torch.zeros_like(query)
105
+ output_encoder_hidden = torch.zeros_like(encoder_query)
106
+ encoder_length = encoder_query.shape[1]
107
+
108
+ qkv_list = []
109
+ num_stages = len(hidden_length)
110
+
111
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
112
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
113
+
114
+ i_sum = 0
115
+ for i_p, length in enumerate(hidden_length):
116
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
117
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
118
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
119
+
120
+ if image_rotary_emb is not None:
121
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
122
+
123
+ indices = encoder_attention_mask[i_p]['indices']
124
+ qkv_list.append(index_first_axis(rearrange(concat_qkv_tokens, "b s ... -> (b s) ..."), indices))
125
+ i_sum += length
126
+
127
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
128
+ qkv = torch.cat(qkv_list, dim=0)
129
+ query, key, value = qkv.unbind(1)
130
+
131
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
132
+ max_seqlen_q = cu_seqlens.max().item()
133
+ max_seqlen_k = max_seqlen_q
134
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
135
+ cu_seqlens_k = cu_seqlens_q.clone()
136
+
137
+ output = flash_attn_varlen_func(
138
+ query,
139
+ key,
140
+ value,
141
+ cu_seqlens_q=cu_seqlens_q,
142
+ cu_seqlens_k=cu_seqlens_k,
143
+ max_seqlen_q=max_seqlen_q,
144
+ max_seqlen_k=max_seqlen_k,
145
+ dropout_p=0.0,
146
+ causal=False,
147
+ softmax_scale=scale,
148
+ )
149
+
150
+ # To merge the tokens
151
+ i_sum = 0; token_sum = 0
152
+ for i_p, length in enumerate(hidden_length):
153
+ tot_token_num = token_lengths[i_p]
154
+ stage_output = output[token_sum : token_sum + tot_token_num]
155
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, encoder_length + length)
156
+ stage_encoder_hidden_output = stage_output[:, :encoder_length]
157
+ stage_hidden_output = stage_output[:, encoder_length:]
158
+ output_hidden[:, i_sum:i_sum+length] = stage_hidden_output
159
+ output_encoder_hidden[i_p::num_stages] = stage_encoder_hidden_output
160
+ token_sum += tot_token_num
161
+ i_sum += length
162
+
163
+ output_hidden = output_hidden.flatten(2, 3)
164
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
165
+
166
+ return output_hidden, output_encoder_hidden
167
+
168
+
169
+ class SequenceParallelVarlenFlashSelfAttentionWithT5Mask:
170
+
171
+ def __init__(self):
172
+ pass
173
+
174
+ def apply_rope(self, xq, xk, freqs_cis):
175
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
176
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
177
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
178
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
179
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
180
+
181
+ def __call__(
182
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
183
+ heads, scale, hidden_length=None, image_rotary_emb=None, encoder_attention_mask=None,
184
+ ):
185
+ assert encoder_attention_mask is not None, "The encoder attention mask needs to be set"
186
+
187
+ batch_size = query.shape[0]
188
+ qkv_list = []
189
+ num_stages = len(hidden_length)
190
+
191
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
192
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
193
+
194
+ # To sync the encoder query, key and values
195
+ sp_group = get_sequence_parallel_group()
196
+ sp_group_size = get_sequence_parallel_world_size()
197
+ encoder_qkv = all_to_all(encoder_qkv, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
198
+
199
+ output_hidden = torch.zeros_like(qkv[:,:,0])
200
+ output_encoder_hidden = torch.zeros_like(encoder_qkv[:,:,0])
201
+ encoder_length = encoder_qkv.shape[1]
202
+
203
+ i_sum = 0
204
+ for i_p, length in enumerate(hidden_length):
205
+ # get the query, key, value from padding sequence
206
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
207
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
208
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
209
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, pad_seq, 3, nhead, dim]
210
+
211
+ if image_rotary_emb is not None:
212
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
213
+
214
+ indices = encoder_attention_mask[i_p]['indices']
215
+ qkv_list.append(index_first_axis(rearrange(concat_qkv_tokens, "b s ... -> (b s) ..."), indices))
216
+ i_sum += length
217
+
218
+ token_lengths = [x_.shape[0] for x_ in qkv_list]
219
+ qkv = torch.cat(qkv_list, dim=0)
220
+ query, key, value = qkv.unbind(1)
221
+
222
+ cu_seqlens = torch.cat([x_['seqlens_in_batch'] for x_ in encoder_attention_mask], dim=0)
223
+ max_seqlen_q = cu_seqlens.max().item()
224
+ max_seqlen_k = max_seqlen_q
225
+ cu_seqlens_q = F.pad(torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32), (1, 0))
226
+ cu_seqlens_k = cu_seqlens_q.clone()
227
+
228
+ output = flash_attn_varlen_func(
229
+ query,
230
+ key,
231
+ value,
232
+ cu_seqlens_q=cu_seqlens_q,
233
+ cu_seqlens_k=cu_seqlens_k,
234
+ max_seqlen_q=max_seqlen_q,
235
+ max_seqlen_k=max_seqlen_k,
236
+ dropout_p=0.0,
237
+ causal=False,
238
+ softmax_scale=scale,
239
+ )
240
+
241
+ # To merge the tokens
242
+ i_sum = 0; token_sum = 0
243
+ for i_p, length in enumerate(hidden_length):
244
+ tot_token_num = token_lengths[i_p]
245
+ stage_output = output[token_sum : token_sum + tot_token_num]
246
+ stage_output = pad_input(stage_output, encoder_attention_mask[i_p]['indices'], batch_size, encoder_length + length * sp_group_size)
247
+ stage_encoder_hidden_output = stage_output[:, :encoder_length]
248
+ stage_hidden_output = stage_output[:, encoder_length:]
249
+ stage_hidden_output = all_to_all(stage_hidden_output, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
250
+ output_hidden[:, i_sum:i_sum+length] = stage_hidden_output
251
+ output_encoder_hidden[i_p::num_stages] = stage_encoder_hidden_output
252
+ token_sum += tot_token_num
253
+ i_sum += length
254
+
255
+ output_encoder_hidden = all_to_all(output_encoder_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
256
+ output_hidden = output_hidden.flatten(2, 3)
257
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
258
+
259
+ return output_hidden, output_encoder_hidden
260
+
261
+
262
+ class VarlenSelfAttentionWithT5Mask:
263
+
264
+ """
265
+ Per-stage (chunk) attention computed without flash attention
266
+ """
267
+
268
+ def __init__(self):
269
+ pass
270
+
271
+ def apply_rope(self, xq, xk, freqs_cis):
272
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
273
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
274
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
275
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
276
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
277
+
278
+ def __call__(
279
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
280
+ heads, scale, hidden_length=None, image_rotary_emb=None, attention_mask=None,
281
+ ):
282
+ assert attention_mask is not None, "The attention mask needs to be set"
283
+
284
+ encoder_length = encoder_query.shape[1]
285
+ num_stages = len(hidden_length)
286
+
287
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
288
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
289
+
290
+ i_sum = 0
291
+ output_encoder_hidden_list = []
292
+ output_hidden_list = []
293
+
294
+ for i_p, length in enumerate(hidden_length):
295
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
296
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
297
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
298
+
299
+ if image_rotary_emb is not None:
300
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
301
+
302
+ query, key, value = concat_qkv_tokens.unbind(2) # [bs, tot_seq, nhead, dim]
303
+ query = query.transpose(1, 2)
304
+ key = key.transpose(1, 2)
305
+ value = value.transpose(1, 2)
306
+
307
+ # with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True):
308
+ stage_hidden_states = F.scaled_dot_product_attention(
309
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
310
+ )
311
+ stage_hidden_states = stage_hidden_states.transpose(1, 2).flatten(2, 3) # [bs, tot_seq, dim]
312
+
313
+ output_encoder_hidden_list.append(stage_hidden_states[:, :encoder_length])
314
+ output_hidden_list.append(stage_hidden_states[:, encoder_length:])
315
+ i_sum += length
316
+
317
+ output_encoder_hidden = torch.stack(output_encoder_hidden_list, dim=1) # [b n s d]
318
+ output_encoder_hidden = rearrange(output_encoder_hidden, 'b n s d -> (b n) s d')
319
+ output_hidden = torch.cat(output_hidden_list, dim=1)
320
+
321
+ return output_hidden, output_encoder_hidden
322
+
323
+
324
+ class SequenceParallelVarlenSelfAttentionWithT5Mask:
325
+ """
326
+ Per-stage (chunk) attention computed without flash attention
327
+ """
328
+
329
+ def __init__(self):
330
+ pass
331
+
332
+ def apply_rope(self, xq, xk, freqs_cis):
333
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
334
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
335
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
336
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
337
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
338
+
339
+ def __call__(
340
+ self, query, key, value, encoder_query, encoder_key, encoder_value,
341
+ heads, scale, hidden_length=None, image_rotary_emb=None, attention_mask=None,
342
+ ):
343
+ assert attention_mask is not None, "The attention mask needs to be set"
344
+
345
+ num_stages = len(hidden_length)
346
+
347
+ encoder_qkv = torch.stack([encoder_query, encoder_key, encoder_value], dim=2) # [bs, sub_seq, 3, head, head_dim]
348
+ qkv = torch.stack([query, key, value], dim=2) # [bs, sub_seq, 3, head, head_dim]
349
+
350
+ # To sync the encoder query, key and values
351
+ sp_group = get_sequence_parallel_group()
352
+ sp_group_size = get_sequence_parallel_world_size()
353
+ encoder_qkv = all_to_all(encoder_qkv, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
354
+ encoder_length = encoder_qkv.shape[1]
355
+
356
+ i_sum = 0
357
+ output_encoder_hidden_list = []
358
+ output_hidden_list = []
359
+
360
+ for i_p, length in enumerate(hidden_length):
361
+ encoder_qkv_tokens = encoder_qkv[i_p::num_stages]
362
+ qkv_tokens = qkv[:, i_sum:i_sum+length]
363
+ qkv_tokens = all_to_all(qkv_tokens, sp_group, sp_group_size, scatter_dim=3, gather_dim=1) # [bs, seq, 3, sub_head, head_dim]
364
+ concat_qkv_tokens = torch.cat([encoder_qkv_tokens, qkv_tokens], dim=1) # [bs, tot_seq, 3, nhead, dim]
365
+
366
+ if image_rotary_emb is not None:
367
+ concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1] = self.apply_rope(concat_qkv_tokens[:,:,0], concat_qkv_tokens[:,:,1], image_rotary_emb[i_p])
368
+
369
+ query, key, value = concat_qkv_tokens.unbind(2) # [bs, tot_seq, nhead, dim]
370
+ query = query.transpose(1, 2)
371
+ key = key.transpose(1, 2)
372
+ value = value.transpose(1, 2)
373
+
374
+ stage_hidden_states = F.scaled_dot_product_attention(
375
+ query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask[i_p],
376
+ )
377
+ stage_hidden_states = stage_hidden_states.transpose(1, 2) # [bs, tot_seq, nhead, dim]
378
+
379
+ output_encoder_hidden_list.append(stage_hidden_states[:, :encoder_length])
380
+
381
+ output_hidden = stage_hidden_states[:, encoder_length:]
382
+ output_hidden = all_to_all(output_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
383
+ output_hidden_list.append(output_hidden)
384
+
385
+ i_sum += length
386
+
387
+ output_encoder_hidden = torch.stack(output_encoder_hidden_list, dim=1) # [b n s nhead d]
388
+ output_encoder_hidden = rearrange(output_encoder_hidden, 'b n s h d -> (b n) s h d')
389
+ output_encoder_hidden = all_to_all(output_encoder_hidden, sp_group, sp_group_size, scatter_dim=1, gather_dim=2)
390
+ output_encoder_hidden = output_encoder_hidden.flatten(2, 3)
391
+ output_hidden = torch.cat(output_hidden_list, dim=1).flatten(2, 3)
392
+
393
+ return output_hidden, output_encoder_hidden
394
+
395
+
396
+ class JointAttention(nn.Module):
397
+
398
+ def __init__(
399
+ self,
400
+ query_dim: int,
401
+ cross_attention_dim: Optional[int] = None,
402
+ heads: int = 8,
403
+ dim_head: int = 64,
404
+ dropout: float = 0.0,
405
+ bias: bool = False,
406
+ qk_norm: Optional[str] = None,
407
+ added_kv_proj_dim: Optional[int] = None,
408
+ out_bias: bool = True,
409
+ eps: float = 1e-5,
410
+ out_dim: int = None,
411
+ context_pre_only=None,
412
+ use_flash_attn=True,
413
+ ):
414
+ """
415
+ QK normalization follows Flux: the norm is applied over the head dimension.
416
+ """
417
+ super().__init__()
418
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
419
+ self.query_dim = query_dim
420
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
421
+ self.use_bias = bias
422
+ self.dropout = dropout
423
+
424
+ self.out_dim = out_dim if out_dim is not None else query_dim
425
+ self.context_pre_only = context_pre_only
426
+
427
+ self.scale = dim_head**-0.5
428
+ self.heads = out_dim // dim_head if out_dim is not None else heads
429
+ self.added_kv_proj_dim = added_kv_proj_dim
430
+
431
+ if qk_norm is None:
432
+ self.norm_q = None
433
+ self.norm_k = None
434
+ elif qk_norm == "layer_norm":
435
+ self.norm_q = nn.LayerNorm(dim_head, eps=eps)
436
+ self.norm_k = nn.LayerNorm(dim_head, eps=eps)
437
+ elif qk_norm == 'rms_norm':
438
+ self.norm_q = RMSNorm(dim_head, eps=eps)
439
+ self.norm_k = RMSNorm(dim_head, eps=eps)
440
+ else:
441
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None, 'layer_norm' or 'rms_norm'")
442
+
443
+ self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
444
+ self.to_k = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
445
+ self.to_v = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias)
446
+
447
+ if self.added_kv_proj_dim is not None:
448
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_dim)
449
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_dim)
450
+ self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim)
451
+
452
+ if qk_norm is None:
453
+ self.norm_add_q = None
454
+ self.norm_add_k = None
455
+ elif qk_norm == "layer_norm":
456
+ self.norm_add_q = nn.LayerNorm(dim_head, eps=eps)
457
+ self.norm_add_k = nn.LayerNorm(dim_head, eps=eps)
458
+ elif qk_norm == 'rms_norm':
459
+ self.norm_add_q = RMSNorm(dim_head, eps=eps)
460
+ self.norm_add_k = RMSNorm(dim_head, eps=eps)
461
+ else:
462
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None, 'layer_norm' or 'rms_norm'")
463
+
464
+ self.to_out = nn.ModuleList([])
465
+ self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
466
+ self.to_out.append(nn.Dropout(dropout))
467
+
468
+ if not self.context_pre_only:
469
+ self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)
470
+
471
+ self.use_flash_attn = use_flash_attn
472
+
473
+ if flash_attn_func is None:
474
+ self.use_flash_attn = False
475
+
476
+ # print(f"Using flash-attention: {self.use_flash_attn}")
477
+ if self.use_flash_attn:
478
+ if is_sequence_parallel_initialized():
479
+ self.var_flash_attn = SequenceParallelVarlenFlashSelfAttentionWithT5Mask()
480
+ else:
481
+ self.var_flash_attn = VarlenFlashSelfAttentionWithT5Mask()
482
+ else:
483
+ if is_sequence_parallel_initialized():
484
+ self.var_len_attn = SequenceParallelVarlenSelfAttentionWithT5Mask()
485
+ else:
486
+ self.var_len_attn = VarlenSelfAttentionWithT5Mask()
487
+
488
+
489
+ def forward(
490
+ self,
491
+ hidden_states: torch.FloatTensor,
492
+ encoder_hidden_states: torch.FloatTensor = None,
493
+ encoder_attention_mask: torch.FloatTensor = None,
494
+ attention_mask: torch.FloatTensor = None, # [B, L, S]
495
+ hidden_length: torch.Tensor = None,
496
+ image_rotary_emb: torch.Tensor = None,
497
+ **kwargs,
498
+ ) -> torch.FloatTensor:
499
+ # This function is only used during training
500
+ # `sample` projections.
501
+ query = self.to_q(hidden_states)
502
+ key = self.to_k(hidden_states)
503
+ value = self.to_v(hidden_states)
504
+
505
+ inner_dim = key.shape[-1]
506
+ head_dim = inner_dim // self.heads
507
+
508
+ query = query.view(query.shape[0], -1, self.heads, head_dim)
509
+ key = key.view(key.shape[0], -1, self.heads, head_dim)
510
+ value = value.view(value.shape[0], -1, self.heads, head_dim)
511
+
512
+ if self.norm_q is not None:
513
+ query = self.norm_q(query)
514
+
515
+ if self.norm_k is not None:
516
+ key = self.norm_k(key)
517
+
518
+ # `context` projections.
519
+ encoder_hidden_states_query_proj = self.add_q_proj(encoder_hidden_states)
520
+ encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
521
+ encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
522
+
523
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
524
+ encoder_hidden_states_query_proj.shape[0], -1, self.heads, head_dim
525
+ )
526
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
527
+ encoder_hidden_states_key_proj.shape[0], -1, self.heads, head_dim
528
+ )
529
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
530
+ encoder_hidden_states_value_proj.shape[0], -1, self.heads, head_dim
531
+ )
532
+
533
+ if self.norm_add_q is not None:
534
+ encoder_hidden_states_query_proj = self.norm_add_q(encoder_hidden_states_query_proj)
535
+
536
+ if self.norm_add_k is not None:
537
+ encoder_hidden_states_key_proj = self.norm_add_k(encoder_hidden_states_key_proj)
538
+
539
+ # Concatenate the hidden and encoder-hidden tokens, run the attention computation, then split them back
540
+ if self.use_flash_attn:
541
+ hidden_states, encoder_hidden_states = self.var_flash_attn(
542
+ query, key, value,
543
+ encoder_hidden_states_query_proj, encoder_hidden_states_key_proj,
544
+ encoder_hidden_states_value_proj, self.heads, self.scale, hidden_length,
545
+ image_rotary_emb, encoder_attention_mask,
546
+ )
547
+ else:
548
+ hidden_states, encoder_hidden_states = self.var_len_attn(
549
+ query, key, value,
550
+ encoder_hidden_states_query_proj, encoder_hidden_states_key_proj,
551
+ encoder_hidden_states_value_proj, self.heads, self.scale, hidden_length,
552
+ image_rotary_emb, attention_mask,
553
+ )
554
+
555
+ # linear proj
556
+ hidden_states = self.to_out[0](hidden_states)
557
+ # dropout
558
+ hidden_states = self.to_out[1](hidden_states)
559
+ if not self.context_pre_only:
560
+ encoder_hidden_states = self.to_add_out(encoder_hidden_states)
561
+
562
+ return hidden_states, encoder_hidden_states
563
+
564
+
565
+ class JointTransformerBlock(nn.Module):
566
+ r"""
567
+ A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
568
+
569
+ Reference: https://arxiv.org/abs/2403.03206
570
+
571
+ Parameters:
572
+ dim (`int`): The number of channels in the input and output.
573
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
574
+ attention_head_dim (`int`): The number of channels in each head.
575
+ context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
576
+ processing of `context` conditions.
577
+ """
578
+
579
+ def __init__(
580
+ self, dim, num_attention_heads, attention_head_dim, qk_norm=None,
581
+ context_pre_only=False, use_flash_attn=True,
582
+ ):
583
+ super().__init__()
584
+
585
+ self.context_pre_only = context_pre_only
586
+ context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero"
587
+
588
+ self.norm1 = AdaLayerNormZero(dim)
589
+
590
+ if context_norm_type == "ada_norm_continous":
591
+ self.norm1_context = AdaLayerNormContinuous(
592
+ dim, dim, elementwise_affine=False, eps=1e-6, bias=True, norm_type="layer_norm"
593
+ )
594
+ elif context_norm_type == "ada_norm_zero":
595
+ self.norm1_context = AdaLayerNormZero(dim)
596
+ else:
597
+ raise ValueError(
598
+ f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`, `ada_norm_zero`"
599
+ )
600
+
601
+ self.attn = JointAttention(
602
+ query_dim=dim,
603
+ cross_attention_dim=None,
604
+ added_kv_proj_dim=dim,
605
+ dim_head=attention_head_dim // num_attention_heads,
606
+ heads=num_attention_heads,
607
+ out_dim=attention_head_dim,
608
+ qk_norm=qk_norm,
609
+ context_pre_only=context_pre_only,
610
+ bias=True,
611
+ use_flash_attn=use_flash_attn,
612
+ )
613
+
614
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
615
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
616
+
617
+ if not context_pre_only:
618
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
619
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
620
+ else:
621
+ self.norm2_context = None
622
+ self.ff_context = None
623
+
624
+ def forward(
625
+ self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor,
626
+ encoder_attention_mask: torch.FloatTensor, temb: torch.FloatTensor,
627
+ attention_mask: torch.FloatTensor = None, hidden_length: List = None,
628
+ image_rotary_emb: torch.FloatTensor = None,
629
+ ):
630
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb, hidden_length=hidden_length)
631
+
632
+ if self.context_pre_only:
633
+ norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
634
+ else:
635
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
636
+ encoder_hidden_states, emb=temb,
637
+ )
638
+
639
+ # Attention
640
+ attn_output, context_attn_output = self.attn(
641
+ hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states,
642
+ encoder_attention_mask=encoder_attention_mask, attention_mask=attention_mask,
643
+ hidden_length=hidden_length, image_rotary_emb=image_rotary_emb,
644
+ )
645
+
646
+ # Process attention outputs for the `hidden_states`.
647
+ attn_output = gate_msa * attn_output
648
+ hidden_states = hidden_states + attn_output
649
+
650
+ norm_hidden_states = self.norm2(hidden_states)
651
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
652
+
653
+ ff_output = self.ff(norm_hidden_states)
654
+ ff_output = gate_mlp * ff_output
655
+
656
+ hidden_states = hidden_states + ff_output
657
+
658
+ # Process attention outputs for the `encoder_hidden_states`.
659
+ if self.context_pre_only:
660
+ encoder_hidden_states = None
661
+ else:
662
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
663
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
664
+
665
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
666
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
667
+
668
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
669
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
670
+
671
+ return encoder_hidden_states, hidden_states
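Most of the modules above need the full pyramid plumbing (per-stage lengths, packed attention masks, RoPE tables) to run, but the FeedForward block is self-contained and shows the gelu-approximate MLP used inside every JointTransformerBlock. The smoke test below is an illustrative sketch, assuming the repo root is on PYTHONPATH with torch, diffusers, einops, and the repo's trainer_misc package importable; flash-attn is optional and the attention path falls back automatically when it is missing.

# Illustrative sketch: the MLP used in each JointTransformerBlock.
import torch
from pyramid_dit.mmdit_modules.modeling_mmdit_block import FeedForward

ff = FeedForward(dim=1536, dim_out=1536, activation_fn="gelu-approximate")
x = torch.randn(2, 128, 1536)   # [batch, tokens, dim]
y = ff(x)
print(y.shape)                  # torch.Size([2, 128, 1536]); hidden width is dim * mult = 6144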
pyramid_dit/mmdit_modules/modeling_normalization.py ADDED
@@ -0,0 +1,179 @@
1
+ import numbers
2
+ from typing import Dict, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from diffusers.utils import is_torch_version
9
+
10
+
11
+ if is_torch_version(">=", "2.1.0"):
12
+ LayerNorm = nn.LayerNorm
13
+ else:
14
+ # Has optional bias parameter compared to torch layer norm
15
+ # TODO: replace with torch layernorm once min required torch version >= 2.1
16
+ class LayerNorm(nn.Module):
17
+ def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True):
18
+ super().__init__()
19
+
20
+ self.eps = eps
21
+
22
+ if isinstance(dim, numbers.Integral):
23
+ dim = (dim,)
24
+
25
+ self.dim = torch.Size(dim)
26
+
27
+ if elementwise_affine:
28
+ self.weight = nn.Parameter(torch.ones(dim))
29
+ self.bias = nn.Parameter(torch.zeros(dim)) if bias else None
30
+ else:
31
+ self.weight = None
32
+ self.bias = None
33
+
34
+ def forward(self, input):
35
+ return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps)
36
+
37
+
38
+ class RMSNorm(nn.Module):
39
+ def __init__(self, dim, eps: float, elementwise_affine: bool = True):
40
+ super().__init__()
41
+
42
+ self.eps = eps
43
+
44
+ if isinstance(dim, numbers.Integral):
45
+ dim = (dim,)
46
+
47
+ self.dim = torch.Size(dim)
48
+
49
+ if elementwise_affine:
50
+ self.weight = nn.Parameter(torch.ones(dim))
51
+ else:
52
+ self.weight = None
53
+
54
+ def forward(self, hidden_states):
55
+ input_dtype = hidden_states.dtype
56
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
57
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
58
+
59
+ if self.weight is not None:
60
+ # convert into half-precision if necessary
61
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
62
+ hidden_states = hidden_states.to(self.weight.dtype)
63
+ hidden_states = hidden_states * self.weight
64
+
65
+ hidden_states = hidden_states.to(input_dtype)
66
+
67
+ return hidden_states
68
+
69
+
70
+ class AdaLayerNormContinuous(nn.Module):
71
+ def __init__(
72
+ self,
73
+ embedding_dim: int,
74
+ conditioning_embedding_dim: int,
75
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
76
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
77
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
78
+ # However, this is how it was implemented in the original code, and it's rather likely you should
79
+ # set `elementwise_affine` to False.
80
+ elementwise_affine=True,
81
+ eps=1e-5,
82
+ bias=True,
83
+ norm_type="layer_norm",
84
+ ):
85
+ super().__init__()
86
+ self.silu = nn.SiLU()
87
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
88
+ if norm_type == "layer_norm":
89
+ self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
90
+ elif norm_type == "rms_norm":
91
+ self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
92
+ else:
93
+ raise ValueError(f"unknown norm_type {norm_type}")
94
+
95
+ def forward_with_pad(self, x: torch.Tensor, conditioning_embedding: torch.Tensor, hidden_length=None) -> torch.Tensor:
96
+ assert hidden_length is not None
97
+
98
+ emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
99
+ batch_emb = torch.zeros_like(x).repeat(1, 1, 2)
100
+
101
+ i_sum = 0
102
+ num_stages = len(hidden_length)
103
+ for i_p, length in enumerate(hidden_length):
104
+ batch_emb[:, i_sum:i_sum+length] = emb[i_p::num_stages][:,None]
105
+ i_sum += length
106
+
107
+ batch_scale, batch_shift = torch.chunk(batch_emb, 2, dim=2)
108
+ x = self.norm(x) * (1 + batch_scale) + batch_shift
109
+ return x
110
+
111
+ def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor, hidden_length=None) -> torch.Tensor:
112
+ # convert back to the original dtype in case `conditioning_embedding` is upcast to float32 (needed for HunyuanDiT)
113
+ if hidden_length is not None:
114
+ return self.forward_with_pad(x, conditioning_embedding, hidden_length)
115
+ emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
116
+ scale, shift = torch.chunk(emb, 2, dim=1)
117
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
118
+ return x
119
+
120
+
121
+ class AdaLayerNormZero(nn.Module):
122
+ r"""
123
+ Norm layer adaptive layer norm zero (adaLN-Zero).
124
+
125
+ Parameters:
126
+ embedding_dim (`int`): The size of each embedding vector.
127
+ num_embeddings (`int`): The size of the embeddings dictionary.
128
+ """
129
+
130
+ def __init__(self, embedding_dim: int, num_embeddings: Optional[int] = None):
131
+ super().__init__()
132
+ self.emb = None
133
+ self.silu = nn.SiLU()
134
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
135
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
136
+
137
+ def forward_with_pad(
138
+ self,
139
+ x: torch.Tensor,
140
+ timestep: Optional[torch.Tensor] = None,
141
+ class_labels: Optional[torch.LongTensor] = None,
142
+ hidden_dtype: Optional[torch.dtype] = None,
143
+ emb: Optional[torch.Tensor] = None,
144
+ hidden_length: Optional[torch.Tensor] = None,
145
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
146
+ # x: [bs, seq_len, dim]
147
+ if self.emb is not None:
148
+ emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
149
+
150
+ emb = self.linear(self.silu(emb))
151
+ batch_emb = torch.zeros_like(x).repeat(1, 1, 6)
152
+
153
+ i_sum = 0
154
+ num_stages = len(hidden_length)
155
+ for i_p, length in enumerate(hidden_length):
156
+ batch_emb[:, i_sum:i_sum+length] = emb[i_p::num_stages][:,None]
157
+ i_sum += length
158
+
159
+ batch_shift_msa, batch_scale_msa, batch_gate_msa, batch_shift_mlp, batch_scale_mlp, batch_gate_mlp = batch_emb.chunk(6, dim=2)
160
+ x = self.norm(x) * (1 + batch_scale_msa) + batch_shift_msa
161
+ return x, batch_gate_msa, batch_shift_mlp, batch_scale_mlp, batch_gate_mlp
162
+
163
+ def forward(
164
+ self,
165
+ x: torch.Tensor,
166
+ timestep: Optional[torch.Tensor] = None,
167
+ class_labels: Optional[torch.LongTensor] = None,
168
+ hidden_dtype: Optional[torch.dtype] = None,
169
+ emb: Optional[torch.Tensor] = None,
170
+ hidden_length: Optional[torch.Tensor] = None,
171
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
172
+ if hidden_length is not None:
173
+ return self.forward_with_pad(x, timestep, class_labels, hidden_dtype, emb, hidden_length)
174
+ if self.emb is not None:
175
+ emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
176
+ emb = self.linear(self.silu(emb))
177
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
178
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
179
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
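The adaptive norms above are what inject the timestep/text conditioning into each block: AdaLayerNormZero produces the shift/scale/gate terms consumed by JointTransformerBlock, and RMSNorm is used as the per-head QK normalization. The sketch below is illustrative, assuming the repo root is on PYTHONPATH with torch and diffusers installed; the tensor sizes are arbitrary examples.

# Illustrative sketch of the normalization layers defined above.
import torch
from pyramid_dit.mmdit_modules.modeling_normalization import RMSNorm, AdaLayerNormZero

# RMSNorm over the head dimension, as used for QK normalization
rms = RMSNorm(64, eps=1e-5)
q = torch.randn(2, 1024, 24, 64)        # [batch, tokens, heads, head_dim]
print(rms(q).shape)                     # torch.Size([2, 1024, 24, 64])

# adaLN-Zero: one conditioning vector modulates the whole token sequence
ada = AdaLayerNormZero(1536)
x = torch.randn(2, 256, 1536)           # [batch, tokens, dim]
temb = torch.randn(2, 1536)             # combined timestep + pooled-text embedding
x_mod, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada(x, emb=temb)
print(x_mod.shape, gate_msa.shape)      # torch.Size([2, 256, 1536]) torch.Size([2, 1536])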
pyramid_dit/mmdit_modules/modeling_pyramid_mmdit.py ADDED
@@ -0,0 +1,497 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import os
4
+ import torch.nn.functional as F
5
+
6
+ from einops import rearrange
7
+ from diffusers.utils.torch_utils import randn_tensor
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
10
+ from diffusers.utils import is_torch_version
11
+ from typing import Any, Callable, Dict, List, Optional, Union
12
+
13
+ from .modeling_embedding import PatchEmbed3D, CombinedTimestepConditionEmbeddings
14
+ from .modeling_normalization import AdaLayerNormContinuous
15
+ from .modeling_mmdit_block import JointTransformerBlock
16
+
17
+ from trainer_misc import (
18
+ is_sequence_parallel_initialized,
19
+ get_sequence_parallel_group,
20
+ get_sequence_parallel_world_size,
21
+ get_sequence_parallel_rank,
22
+ all_to_all,
23
+ )
24
+
25
+ from IPython import embed
26
+
27
+
28
+ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
29
+ assert dim % 2 == 0, "The dimension must be even."
30
+
31
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
32
+ omega = 1.0 / (theta**scale)
33
+
34
+ batch_size, seq_length = pos.shape
35
+ out = torch.einsum("...n,d->...nd", pos, omega)
36
+ cos_out = torch.cos(out)
37
+ sin_out = torch.sin(out)
38
+
39
+ stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
40
+ out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
41
+ return out.float()
42
+
43
+
44
+ class EmbedNDRoPE(nn.Module):
45
+ def __init__(self, dim: int, theta: int, axes_dim: List[int]):
46
+ super().__init__()
47
+ self.dim = dim
48
+ self.theta = theta
49
+ self.axes_dim = axes_dim
50
+
51
+ def forward(self, ids: torch.Tensor) -> torch.Tensor:
52
+ n_axes = ids.shape[-1]
53
+ emb = torch.cat(
54
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
55
+ dim=-3,
56
+ )
57
+ return emb.unsqueeze(2)
58
+
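A toy, self-contained sketch of the (t, h, w) position ids that feed EmbedNDRoPE above; it mirrors _prepare_latent_image_ids defined later in this file. The axes_dim split [16, 24, 24] then assigns 16 rotary dimensions to time and 24 each to height and width, summing to the 64-dim attention head.

import torch
from einops import rearrange

temp, height, width = 2, 4, 4
ids = torch.zeros(temp, height, width, 3)
ids[..., 0] += torch.arange(temp)[:, None, None]    # time index
ids[..., 1] += torch.arange(height)[None, :, None]  # height index
ids[..., 2] += torch.arange(width)[None, None, :]   # width index
ids = rearrange(ids[None], 'b t h w c -> b (t h w) c')
print(ids.shape)   # torch.Size([1, 32, 3]): one (t, h, w) triplet per latent token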
59
+
60
+ class PyramidDiffusionMMDiT(ModelMixin, ConfigMixin):
61
+ _supports_gradient_checkpointing = True
62
+
63
+ @register_to_config
64
+ def __init__(
65
+ self,
66
+ sample_size: int = 128,
67
+ patch_size: int = 2,
68
+ in_channels: int = 16,
69
+ num_layers: int = 24,
70
+ attention_head_dim: int = 64,
71
+ num_attention_heads: int = 24,
72
+ caption_projection_dim: int = 1152,
73
+ pooled_projection_dim: int = 2048,
74
+ pos_embed_max_size: int = 192,
75
+ max_num_frames: int = 200,
76
+ qk_norm: str = 'rms_norm',
77
+ pos_embed_type: str = 'rope',
78
+ temp_pos_embed_type: str = 'sincos',
79
+ joint_attention_dim: int = 4096,
80
+ use_gradient_checkpointing: bool = False,
81
+ use_flash_attn: bool = True,
82
+ use_temporal_causal: bool = False,
83
+ use_t5_mask: bool = False,
84
+ add_temp_pos_embed: bool = False,
85
+ interp_condition_pos: bool = False,
86
+ gradient_checkpointing_ratio: float = 0.6,
87
+ ):
88
+ super().__init__()
89
+
90
+ self.out_channels = in_channels
91
+ self.inner_dim = num_attention_heads * attention_head_dim
92
+ assert temp_pos_embed_type in ['rope', 'sincos']
93
+
94
+ # The input latent embedder; keep the name pos_embed to stay consistent with SD3
95
+ self.pos_embed = PatchEmbed3D(
96
+ height=sample_size,
97
+ width=sample_size,
98
+ patch_size=patch_size,
99
+ in_channels=in_channels,
100
+ embed_dim=self.inner_dim,
101
+ pos_embed_max_size=pos_embed_max_size, # hard-code for now.
102
+ max_num_frames=max_num_frames,
103
+ pos_embed_type=pos_embed_type,
104
+ temp_pos_embed_type=temp_pos_embed_type,
105
+ add_temp_pos_embed=add_temp_pos_embed,
106
+ interp_condition_pos=interp_condition_pos,
107
+ )
108
+
109
+ # The RoPE embedding
110
+ if pos_embed_type == 'rope':
111
+ self.rope_embed = EmbedNDRoPE(self.inner_dim, 10000, axes_dim=[16, 24, 24])
112
+ else:
113
+ self.rope_embed = None
114
+
115
+ if temp_pos_embed_type == 'rope':
116
+ self.temp_rope_embed = EmbedNDRoPE(self.inner_dim, 10000, axes_dim=[attention_head_dim])
117
+ else:
118
+ self.temp_rope_embed = None
119
+
120
+ self.time_text_embed = CombinedTimestepConditionEmbeddings(
121
+ embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim,
122
+ )
123
+ self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.config.caption_projection_dim)
124
+
125
+ self.transformer_blocks = nn.ModuleList(
126
+ [
127
+ JointTransformerBlock(
128
+ dim=self.inner_dim,
129
+ num_attention_heads=num_attention_heads,
130
+ attention_head_dim=self.inner_dim,
131
+ qk_norm=qk_norm,
132
+ context_pre_only=i == num_layers - 1,
133
+ use_flash_attn=use_flash_attn,
134
+ )
135
+ for i in range(num_layers)
136
+ ]
137
+ )
138
+
139
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
140
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
141
+ self.gradient_checkpointing = use_gradient_checkpointing
142
+ self.gradient_checkpointing_ratio = gradient_checkpointing_ratio
143
+
144
+ self.patch_size = patch_size
145
+ self.use_flash_attn = use_flash_attn
146
+ self.use_temporal_causal = use_temporal_causal
147
+ self.pos_embed_type = pos_embed_type
148
+ self.temp_pos_embed_type = temp_pos_embed_type
149
+ self.add_temp_pos_embed = add_temp_pos_embed
150
+
151
+ if self.use_temporal_causal:
152
+ print("Using temporal causal attention")
153
+ assert self.use_flash_attn is False, "Flash attention does not support temporal causal attention"
154
+
155
+ if interp_condition_pos:
156
+ print("Interpolating the position embedding of the condition latents")
157
+
158
+ # init weights
159
+ self.initialize_weights()
160
+
161
+ def initialize_weights(self):
162
+ # Initialize transformer layers:
163
+ def _basic_init(module):
164
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv3d)):
165
+ torch.nn.init.xavier_uniform_(module.weight)
166
+ if module.bias is not None:
167
+ nn.init.constant_(module.bias, 0)
168
+ self.apply(_basic_init)
169
+
170
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
171
+ w = self.pos_embed.proj.weight.data
172
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
173
+ nn.init.constant_(self.pos_embed.proj.bias, 0)
174
+
175
+ # Initialize all the conditioning to normal init
176
+ nn.init.normal_(self.time_text_embed.timestep_embedder.linear_1.weight, std=0.02)
177
+ nn.init.normal_(self.time_text_embed.timestep_embedder.linear_2.weight, std=0.02)
178
+ nn.init.normal_(self.time_text_embed.text_embedder.linear_1.weight, std=0.02)
179
+ nn.init.normal_(self.time_text_embed.text_embedder.linear_2.weight, std=0.02)
180
+ nn.init.normal_(self.context_embedder.weight, std=0.02)
181
+
182
+ # Zero-out adaLN modulation layers in DiT blocks:
183
+ for block in self.transformer_blocks:
184
+ nn.init.constant_(block.norm1.linear.weight, 0)
185
+ nn.init.constant_(block.norm1.linear.bias, 0)
186
+ nn.init.constant_(block.norm1_context.linear.weight, 0)
187
+ nn.init.constant_(block.norm1_context.linear.bias, 0)
188
+
189
+ # Zero-out output layers:
190
+ nn.init.constant_(self.norm_out.linear.weight, 0)
191
+ nn.init.constant_(self.norm_out.linear.bias, 0)
192
+ nn.init.constant_(self.proj_out.weight, 0)
193
+ nn.init.constant_(self.proj_out.bias, 0)
194
+
195
+ @torch.no_grad()
196
+ def _prepare_latent_image_ids(self, batch_size, temp, height, width, device):
197
+ latent_image_ids = torch.zeros(temp, height, width, 3)
198
+ latent_image_ids[..., 0] = latent_image_ids[..., 0] + torch.arange(temp)[:, None, None]
199
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[None, :, None]
200
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, None, :]
201
+
202
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1, 1)
203
+ latent_image_ids = rearrange(latent_image_ids, 'b t h w c -> b (t h w) c')
204
+ return latent_image_ids.to(device=device)
205
+
206
+ @torch.no_grad()
207
+ def _prepare_pyramid_latent_image_ids(self, batch_size, temp_list, height_list, width_list, device):
208
+ base_width = width_list[-1]; base_height = height_list[-1]
209
+ assert base_width == max(width_list)
210
+ assert base_height == max(height_list)
211
+
212
+ image_ids_list = []
213
+ for temp, height, width in zip(temp_list, height_list, width_list):
214
+ latent_image_ids = torch.zeros(temp, height, width, 3)
215
+
216
+ if height != base_height:
217
+ height_pos = F.interpolate(torch.arange(base_height)[None, None, :].float(), height, mode='linear').squeeze(0, 1)
218
+ else:
219
+ height_pos = torch.arange(base_height).float()
220
+ if width != base_width:
221
+ width_pos = F.interpolate(torch.arange(base_width)[None, None, :].float(), width, mode='linear').squeeze(0, 1)
222
+ else:
223
+ width_pos = torch.arange(base_width).float()
224
+
225
+ latent_image_ids[..., 0] = latent_image_ids[..., 0] + torch.arange(temp)[:, None, None]
226
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + height_pos[None, :, None]
227
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + width_pos[None, None, :]
228
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1, 1)
229
+ latent_image_ids = rearrange(latent_image_ids, 'b t h w c -> b (t h w) c').to(device)
230
+ image_ids_list.append(latent_image_ids)
231
+
232
+ return image_ids_list
233
+
234
+ @torch.no_grad()
235
+ def _prepare_temporal_rope_ids(self, batch_size, temp, height, width, device, start_time_stamp=0):
236
+ latent_image_ids = torch.zeros(temp, height, width, 1)
237
+ latent_image_ids[..., 0] = latent_image_ids[..., 0] + torch.arange(start_time_stamp, start_time_stamp + temp)[:, None, None]
238
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1, 1)
239
+ latent_image_ids = rearrange(latent_image_ids, 'b t h w c -> b (t h w) c')
240
+ return latent_image_ids.to(device=device)
241
+
242
+ @torch.no_grad()
243
+ def _prepare_pyramid_temporal_rope_ids(self, sample, batch_size, device):
244
+ image_ids_list = []
245
+
246
+ for i_b, sample_ in enumerate(sample):
247
+ if not isinstance(sample_, list):
248
+ sample_ = [sample_]
249
+
250
+ cur_image_ids = []
251
+ start_time_stamp = 0
252
+
253
+ for clip_ in sample_:
254
+ _, _, temp, height, width = clip_.shape
255
+ height = height // self.patch_size
256
+ width = width // self.patch_size
257
+ cur_image_ids.append(self._prepare_temporal_rope_ids(batch_size, temp, height, width, device, start_time_stamp=start_time_stamp))
258
+ start_time_stamp += temp
259
+
260
+ cur_image_ids = torch.cat(cur_image_ids, dim=1)
261
+ image_ids_list.append(cur_image_ids)
262
+
263
+ return image_ids_list
264
+
265
+ def merge_input(self, sample, encoder_hidden_length, encoder_attention_mask):
266
+ """
267
+ Merge the input latents of different resolutions into one sequence.
268
+ sample: ordered from low resolution to high resolution
269
+ """
270
+ if isinstance(sample[0], list):
271
+ device = sample[0][-1].device
272
+ pad_batch_size = sample[0][-1].shape[0]
273
+ else:
274
+ device = sample[0].device
275
+ pad_batch_size = sample[0].shape[0]
276
+
277
+ num_stages = len(sample)
278
+ height_list = [];width_list = [];temp_list = []
279
+ trainable_token_list = []
280
+
281
+ for i_b, sample_ in enumerate(sample):
282
+ if isinstance(sample_, list):
283
+ sample_ = sample_[-1]
284
+ _, _, temp, height, width = sample_.shape
285
+ height = height // self.patch_size
286
+ width = width // self.patch_size
287
+ temp_list.append(temp)
288
+ height_list.append(height)
289
+ width_list.append(width)
290
+ trainable_token_list.append(height * width * temp)
291
+
292
+ # prepare the RoPE embedding if needed
293
+ if self.pos_embed_type == 'rope':
294
+ # TODO: support the 3D Rope for video
295
+ raise NotImplementedError("Not compatible with video generation now")
296
+ text_ids = torch.zeros(pad_batch_size, encoder_hidden_length, 3).to(device=device)
297
+ image_ids_list = self._prepare_pyramid_latent_image_ids(pad_batch_size, temp_list, height_list, width_list, device)
298
+ input_ids_list = [torch.cat([text_ids, image_ids], dim=1) for image_ids in image_ids_list]
299
+ image_rotary_emb = [self.rope_embed(input_ids) for input_ids in input_ids_list] # [bs, seq_len, 1, head_dim // 2, 2, 2]
300
+ else:
301
+ if self.temp_pos_embed_type == 'rope' and self.add_temp_pos_embed:
302
+ image_ids_list = self._prepare_pyramid_temporal_rope_ids(sample, pad_batch_size, device)
303
+ text_ids = torch.zeros(pad_batch_size, encoder_attention_mask.shape[1], 1).to(device=device)
304
+ input_ids_list = [torch.cat([text_ids, image_ids], dim=1) for image_ids in image_ids_list]
305
+ image_rotary_emb = [self.temp_rope_embed(input_ids) for input_ids in input_ids_list] # [bs, seq_len, 1, head_dim // 2, 2, 2]
306
+
307
+ if is_sequence_parallel_initialized():
308
+ sp_group = get_sequence_parallel_group()
309
+ sp_group_size = get_sequence_parallel_world_size()
310
+ concat_output = True if self.training else False
311
+ image_rotary_emb = [all_to_all(x_.repeat(1, 1, sp_group_size, 1, 1, 1), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output) for x_ in image_rotary_emb]
312
+ input_ids_list = [all_to_all(input_ids.repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output) for input_ids in input_ids_list]
313
+
314
+ else:
315
+ image_rotary_emb = None
316
+
317
+ hidden_states = self.pos_embed(sample) # hidden states is a list of [b c t h w] b = real_b // num_stages
318
+ hidden_length = []
319
+
320
+ for i_b in range(num_stages):
321
+ hidden_length.append(hidden_states[i_b].shape[1])
322
+
323
+ # prepare the attention mask
324
+ if self.use_flash_attn:
325
+ attention_mask = None
326
+ indices_list = []
327
+ for i_p, length in enumerate(hidden_length):
328
+ pad_attention_mask = torch.ones((pad_batch_size, length), dtype=encoder_attention_mask.dtype).to(device)
329
+ pad_attention_mask = torch.cat([encoder_attention_mask[i_p::num_stages], pad_attention_mask], dim=1)
330
+
331
+ if is_sequence_parallel_initialized():
332
+ sp_group = get_sequence_parallel_group()
333
+ sp_group_size = get_sequence_parallel_world_size()
334
+ pad_attention_mask = all_to_all(pad_attention_mask.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0)
335
+ pad_attention_mask = pad_attention_mask.squeeze(2)
336
+
337
+ seqlens_in_batch = pad_attention_mask.sum(dim=-1, dtype=torch.int32)
338
+ indices = torch.nonzero(pad_attention_mask.flatten(), as_tuple=False).flatten()
339
+
340
+ indices_list.append(
341
+ {
342
+ 'indices': indices,
343
+ 'seqlens_in_batch': seqlens_in_batch,
344
+ }
345
+ )
346
+ encoder_attention_mask = indices_list
347
+ else:
348
+ assert encoder_attention_mask.shape[1] == encoder_hidden_length
349
+ real_batch_size = encoder_attention_mask.shape[0]
350
+ # prepare text ids
351
+ text_ids = torch.arange(1, real_batch_size + 1, dtype=encoder_attention_mask.dtype).unsqueeze(1).repeat(1, encoder_hidden_length)
352
+ text_ids = text_ids.to(device)
353
+ text_ids[encoder_attention_mask == 0] = 0
354
+
355
+ # prepare image ids
356
+ image_ids = torch.arange(1, real_batch_size + 1, dtype=encoder_attention_mask.dtype).unsqueeze(1).repeat(1, max(hidden_length))
357
+ image_ids = image_ids.to(device)
358
+ image_ids_list = []
359
+ for i_p, length in enumerate(hidden_length):
360
+ image_ids_list.append(image_ids[i_p::num_stages][:, :length])
361
+
362
+ if is_sequence_parallel_initialized():
363
+ sp_group = get_sequence_parallel_group()
364
+ sp_group_size = get_sequence_parallel_world_size()
365
+ concat_output = True if self.training else False
366
+ text_ids = all_to_all(text_ids.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output).squeeze(2)
367
+ image_ids_list = [all_to_all(image_ids_.unsqueeze(2).repeat(1, 1, sp_group_size), sp_group, sp_group_size, scatter_dim=2, gather_dim=0, concat_output=concat_output).squeeze(2) for image_ids_ in image_ids_list]
368
+
369
+ attention_mask = []
370
+ for i_p in range(len(hidden_length)):
371
+ image_ids = image_ids_list[i_p]
372
+ token_ids = torch.cat([text_ids[i_p::num_stages], image_ids], dim=1)
373
+ stage_attention_mask = rearrange(token_ids, 'b i -> b 1 i 1') == rearrange(token_ids, 'b j -> b 1 1 j') # [bs, 1, q_len, k_len]
374
+ if self.use_temporal_causal:
375
+ input_order_ids = input_ids_list[i_p].squeeze(2)
376
+ temporal_causal_mask = rearrange(input_order_ids, 'b i -> b 1 i 1') >= rearrange(input_order_ids, 'b j -> b 1 1 j')
377
+ stage_attention_mask = stage_attention_mask & temporal_causal_mask
378
+ attention_mask.append(stage_attention_mask)
379
+
380
+ return hidden_states, hidden_length, temp_list, height_list, width_list, trainable_token_list, encoder_attention_mask, attention_mask, image_rotary_emb
381
+
382
+ def split_output(self, batch_hidden_states, hidden_length, temps, heights, widths, trainable_token_list):
383
+ # To split the hidden states
384
+ batch_size = batch_hidden_states.shape[0]
385
+ output_hidden_list = []
386
+ batch_hidden_states = torch.split(batch_hidden_states, hidden_length, dim=1)
387
+
388
+ if is_sequence_parallel_initialized():
389
+ sp_group_size = get_sequence_parallel_world_size()
390
+ if self.training:
391
+ batch_size = batch_size // sp_group_size
392
+
393
+ for i_p, length in enumerate(hidden_length):
394
+ width, height, temp = widths[i_p], heights[i_p], temps[i_p]
395
+ trainable_token_num = trainable_token_list[i_p]
396
+ hidden_states = batch_hidden_states[i_p]
397
+
398
+ if is_sequence_parallel_initialized():
399
+ sp_group = get_sequence_parallel_group()
400
+ sp_group_size = get_sequence_parallel_world_size()
401
+
402
+ if not self.training:
403
+ hidden_states = hidden_states.repeat(sp_group_size, 1, 1)
404
+
405
+ hidden_states = all_to_all(hidden_states, sp_group, sp_group_size, scatter_dim=0, gather_dim=1)
406
+
407
+ # only the trainable tokens take part in the loss computation
408
+ hidden_states = hidden_states[:, -trainable_token_num:]
409
+
410
+ # unpatchify
411
+ hidden_states = hidden_states.reshape(
412
+ shape=(batch_size, temp, height, width, self.patch_size, self.patch_size, self.out_channels)
413
+ )
414
+ hidden_states = rearrange(hidden_states, "b t h w p1 p2 c -> b t (h p1) (w p2) c")
415
+ hidden_states = rearrange(hidden_states, "b t h w c -> b c t h w")
416
+ output_hidden_list.append(hidden_states)
417
+
418
+ return output_hidden_list
419
+
420
+ def forward(
421
+ self,
422
+ sample: torch.FloatTensor, # [num_stages]
423
+ encoder_hidden_states: torch.FloatTensor = None,
424
+ encoder_attention_mask: torch.FloatTensor = None,
425
+ pooled_projections: torch.FloatTensor = None,
426
+ timestep_ratio: torch.FloatTensor = None,
427
+ ):
428
+ # Get the timestep embedding
429
+ temb = self.time_text_embed(timestep_ratio, pooled_projections)
430
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
431
+ encoder_hidden_length = encoder_hidden_states.shape[1]
432
+
433
+ # Get the input sequence
434
+ hidden_states, hidden_length, temps, heights, widths, trainable_token_list, encoder_attention_mask, \
435
+ attention_mask, image_rotary_emb = self.merge_input(sample, encoder_hidden_length, encoder_attention_mask)
436
+
437
+ # split the long latents if necessary
438
+ if is_sequence_parallel_initialized():
439
+ sp_group = get_sequence_parallel_group()
440
+ sp_group_size = get_sequence_parallel_world_size()
441
+ concat_output = True if self.training else False
442
+
443
+ # sync the input hidden states
444
+ batch_hidden_states = []
445
+ for i_p, hidden_states_ in enumerate(hidden_states):
446
+ assert hidden_states_.shape[1] % sp_group_size == 0, "The sequence length should be divisible by the sequence parallel size"
447
+ hidden_states_ = all_to_all(hidden_states_, sp_group, sp_group_size, scatter_dim=1, gather_dim=0, concat_output=concat_output)
448
+ hidden_length[i_p] = hidden_length[i_p] // sp_group_size
449
+ batch_hidden_states.append(hidden_states_)
450
+
451
+ # sync the encoder hidden states
452
+ hidden_states = torch.cat(batch_hidden_states, dim=1)
453
+ encoder_hidden_states = all_to_all(encoder_hidden_states, sp_group, sp_group_size, scatter_dim=1, gather_dim=0, concat_output=concat_output)
454
+ temb = all_to_all(temb.unsqueeze(1).repeat(1, sp_group_size, 1), sp_group, sp_group_size, scatter_dim=1, gather_dim=0, concat_output=concat_output)
455
+ temb = temb.squeeze(1)
456
+ else:
457
+ hidden_states = torch.cat(hidden_states, dim=1)
458
+
459
+ # print(hidden_length)
460
+ for i_b, block in enumerate(self.transformer_blocks):
461
+ if self.training and self.gradient_checkpointing and (i_b >= int(len(self.transformer_blocks) * self.gradient_checkpointing_ratio)):
462
+ def create_custom_forward(module):
463
+ def custom_forward(*inputs):
464
+ return module(*inputs)
465
+
466
+ return custom_forward
467
+
468
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
469
+ encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
470
+ create_custom_forward(block),
471
+ hidden_states,
472
+ encoder_hidden_states,
473
+ encoder_attention_mask,
474
+ temb,
475
+ attention_mask,
476
+ hidden_length,
477
+ image_rotary_emb,
478
+ **ckpt_kwargs,
479
+ )
480
+
481
+ else:
482
+ encoder_hidden_states, hidden_states = block(
483
+ hidden_states=hidden_states,
484
+ encoder_hidden_states=encoder_hidden_states,
485
+ encoder_attention_mask=encoder_attention_mask,
486
+ temb=temb,
487
+ attention_mask=attention_mask,
488
+ hidden_length=hidden_length,
489
+ image_rotary_emb=image_rotary_emb,
490
+ )
491
+
492
+ hidden_states = self.norm_out(hidden_states, temb, hidden_length=hidden_length)
493
+ hidden_states = self.proj_out(hidden_states)
494
+
495
+ output = self.split_output(hidden_states, hidden_length, temps, heights, widths, trainable_token_list)
496
+
497
+ return output
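A toy sketch of the id-based attention mask that merge_input builds on the non-flash-attention path: real tokens carry their sample id, text padding carries id 0, and a query may attend to a key only when the two ids match.

import torch
from einops import rearrange

token_ids = torch.tensor([[1, 1, 0, 1, 1, 1],     # sample 1: [text, text, pad | image tokens]
                          [2, 2, 2, 2, 2, 2]])    # sample 2: no padding
mask = rearrange(token_ids, 'b i -> b 1 i 1') == rearrange(token_ids, 'b j -> b 1 1 j')
print(mask.shape)   # torch.Size([2, 1, 6, 6]); False wherever a pad (id 0) meets a real token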
pyramid_dit/mmdit_modules/modeling_text_encoder.py ADDED
@@ -0,0 +1,140 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import os
4
+
5
+ from transformers import (
6
+ CLIPTextModelWithProjection,
7
+ CLIPTokenizer,
8
+ T5EncoderModel,
9
+ T5TokenizerFast,
10
+ )
11
+
12
+ from typing import Any, Callable, Dict, List, Optional, Union
13
+
14
+
15
+ class SD3TextEncoderWithMask(nn.Module):
16
+ def __init__(self, model_path, torch_dtype):
17
+ super().__init__()
18
+ # CLIP-L
19
+ self.tokenizer = CLIPTokenizer.from_pretrained(os.path.join(model_path, 'tokenizer'))
20
+ self.tokenizer_max_length = self.tokenizer.model_max_length
21
+ self.text_encoder = CLIPTextModelWithProjection.from_pretrained(os.path.join(model_path, 'text_encoder'), torch_dtype=torch_dtype)
22
+
23
+ # CLIP-G
24
+ self.tokenizer_2 = CLIPTokenizer.from_pretrained(os.path.join(model_path, 'tokenizer_2'))
25
+ self.text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(os.path.join(model_path, 'text_encoder_2'), torch_dtype=torch_dtype)
26
+
27
+ # T5
28
+ self.tokenizer_3 = T5TokenizerFast.from_pretrained(os.path.join(model_path, 'tokenizer_3'))
29
+ self.text_encoder_3 = T5EncoderModel.from_pretrained(os.path.join(model_path, 'text_encoder_3'), torch_dtype=torch_dtype)
30
+
31
+ self._freeze()
32
+
33
+ def _freeze(self):
34
+ for param in self.parameters():
35
+ param.requires_grad = False
36
+
37
+ def _get_t5_prompt_embeds(
38
+ self,
39
+ prompt: Union[str, List[str]] = None,
40
+ num_images_per_prompt: int = 1,
41
+ device: Optional[torch.device] = None,
42
+ max_sequence_length: int = 128,
43
+ ):
44
+ prompt = [prompt] if isinstance(prompt, str) else prompt
45
+ batch_size = len(prompt)
46
+
47
+ text_inputs = self.tokenizer_3(
48
+ prompt,
49
+ padding="max_length",
50
+ max_length=max_sequence_length,
51
+ truncation=True,
52
+ add_special_tokens=True,
53
+ return_tensors="pt",
54
+ )
55
+ text_input_ids = text_inputs.input_ids
56
+ prompt_attention_mask = text_inputs.attention_mask
57
+ prompt_attention_mask = prompt_attention_mask.to(device)
58
+ prompt_embeds = self.text_encoder_3(text_input_ids.to(device), attention_mask=prompt_attention_mask)[0]
59
+ dtype = self.text_encoder_3.dtype
60
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
61
+
62
+ _, seq_len, _ = prompt_embeds.shape
63
+
64
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
65
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
66
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
67
+ prompt_attention_mask = prompt_attention_mask.view(batch_size, -1)
68
+ prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
69
+
70
+ return prompt_embeds, prompt_attention_mask
71
+
72
+ def _get_clip_prompt_embeds(
73
+ self,
74
+ prompt: Union[str, List[str]],
75
+ num_images_per_prompt: int = 1,
76
+ device: Optional[torch.device] = None,
77
+ clip_skip: Optional[int] = None,
78
+ clip_model_index: int = 0,
79
+ ):
80
+
81
+ clip_tokenizers = [self.tokenizer, self.tokenizer_2]
82
+ clip_text_encoders = [self.text_encoder, self.text_encoder_2]
83
+
84
+ tokenizer = clip_tokenizers[clip_model_index]
85
+ text_encoder = clip_text_encoders[clip_model_index]
86
+
87
+ batch_size = len(prompt)
88
+
89
+ text_inputs = tokenizer(
90
+ prompt,
91
+ padding="max_length",
92
+ max_length=self.tokenizer_max_length,
93
+ truncation=True,
94
+ return_tensors="pt",
95
+ )
96
+
97
+ text_input_ids = text_inputs.input_ids
98
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
99
+ pooled_prompt_embeds = prompt_embeds[0]
100
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
101
+ pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
102
+
103
+ return pooled_prompt_embeds
104
+
105
+ def encode_prompt(self,
106
+ prompt,
107
+ num_images_per_prompt=1,
108
+ clip_skip: Optional[int] = None,
109
+ device=None,
110
+ ):
111
+ prompt = [prompt] if isinstance(prompt, str) else prompt
112
+
113
+ pooled_prompt_embed = self._get_clip_prompt_embeds(
114
+ prompt=prompt,
115
+ device=device,
116
+ num_images_per_prompt=num_images_per_prompt,
117
+ clip_skip=clip_skip,
118
+ clip_model_index=0,
119
+ )
120
+ pooled_prompt_2_embed = self._get_clip_prompt_embeds(
121
+ prompt=prompt,
122
+ device=device,
123
+ num_images_per_prompt=num_images_per_prompt,
124
+ clip_skip=clip_skip,
125
+ clip_model_index=1,
126
+ )
127
+ pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
128
+
129
+ prompt_embeds, prompt_attention_mask = self._get_t5_prompt_embeds(
130
+ prompt=prompt,
131
+ num_images_per_prompt=num_images_per_prompt,
132
+ device=device,
133
+ )
134
+ return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
135
+
136
+ def forward(self, input_prompts, device):
137
+ with torch.no_grad():
138
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.encode_prompt(input_prompts, 1, clip_skip=None, device=device)
139
+
140
+ return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
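A shape-only sketch of what SD3TextEncoderWithMask.encode_prompt returns, using the hidden sizes assumed by the MMDiT config above (pooled_projection_dim 2048, joint_attention_dim 4096); no checkpoints are loaded here, all tensors are random placeholders.

import torch

pooled_clip_l = torch.randn(1, 768)       # CLIP-L projection dim (assumed)
pooled_clip_g = torch.randn(1, 1280)      # CLIP-G projection dim (assumed)
pooled_prompt_embeds = torch.cat([pooled_clip_l, pooled_clip_g], dim=-1)   # [1, 2048]

prompt_embeds = torch.randn(1, 128, 4096)              # T5 tokens, max_sequence_length=128
prompt_attention_mask = torch.ones(1, 128, dtype=torch.long)
print(pooled_prompt_embeds.shape, prompt_embeds.shape, prompt_attention_mask.shape)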
pyramid_dit/pyramid_dit_for_video_gen_pipeline.py ADDED
@@ -0,0 +1,1279 @@
1
+ import torch
2
+ import os
3
+ import gc
4
+ import sys
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from collections import OrderedDict
9
+ from einops import rearrange
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ import numpy as np
12
+ import math
13
+ import random
14
+ import PIL
15
+ from PIL import Image
16
+ from tqdm import tqdm
17
+ from torchvision import transforms
18
+ from copy import deepcopy
19
+ from typing import Any, Callable, Dict, List, Optional, Union
20
+ from accelerate import Accelerator, cpu_offload
21
+ from diffusion_schedulers import PyramidFlowMatchEulerDiscreteScheduler
22
+ from video_vae.modeling_causal_vae import CausalVideoVAE
23
+
24
+ from trainer_misc import (
25
+ all_to_all,
26
+ is_sequence_parallel_initialized,
27
+ get_sequence_parallel_group,
28
+ get_sequence_parallel_group_rank,
29
+ get_sequence_parallel_rank,
30
+ get_sequence_parallel_world_size,
31
+ get_rank,
32
+ )
33
+
34
+ from .mmdit_modules import (
35
+ PyramidDiffusionMMDiT,
36
+ SD3TextEncoderWithMask,
37
+ )
38
+
39
+ from .flux_modules import (
40
+ PyramidFluxTransformer,
41
+ FluxTextEncoderWithMask,
42
+ )
43
+
44
+
45
+ def compute_density_for_timestep_sampling(
46
+ weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
47
+ ):
48
+ if weighting_scheme == "logit_normal":
49
+ # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
50
+ u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
51
+ u = torch.nn.functional.sigmoid(u)
52
+ elif weighting_scheme == "mode":
53
+ u = torch.rand(size=(batch_size,), device="cpu")
54
+ u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
55
+ else:
56
+ u = torch.rand(size=(batch_size,), device="cpu")
57
+ return u
58
+
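A quick illustration of the "logit_normal" branch above: u is the sigmoid of a Gaussian sample, which keeps the sampled density away from the extremes of (0, 1).

import torch

u = torch.sigmoid(torch.normal(mean=0.0, std=1.0, size=(4,)))
print(u)   # four values strictly inside (0, 1), concentrated around 0.5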
59
+
60
+ def build_pyramid_dit(
61
+ model_name : str,
62
+ model_path : str,
63
+ torch_dtype,
64
+ use_flash_attn : bool,
65
+ use_mixed_training: bool,
66
+ interp_condition_pos: bool = True,
67
+ use_gradient_checkpointing: bool = False,
68
+ use_temporal_causal: bool = True,
69
+ gradient_checkpointing_ratio: float = 0.6,
70
+ ):
71
+ model_dtype = torch.float32 if use_mixed_training else torch_dtype
72
+ if model_name == "pyramid_flux":
73
+ dit = PyramidFluxTransformer.from_pretrained(
74
+ model_path, torch_dtype=model_dtype,
75
+ use_gradient_checkpointing=use_gradient_checkpointing,
76
+ gradient_checkpointing_ratio=gradient_checkpointing_ratio,
77
+ use_flash_attn=use_flash_attn, use_temporal_causal=use_temporal_causal,
78
+ interp_condition_pos=interp_condition_pos, axes_dims_rope=[16, 24, 24],
79
+ )
80
+ elif model_name == "pyramid_mmdit":
81
+ dit = PyramidDiffusionMMDiT.from_pretrained(
82
+ model_path, torch_dtype=model_dtype, use_gradient_checkpointing=use_gradient_checkpointing,
83
+ gradient_checkpointing_ratio=gradient_checkpointing_ratio,
84
+ use_flash_attn=use_flash_attn, use_t5_mask=True,
85
+ add_temp_pos_embed=True, temp_pos_embed_type='rope',
86
+ use_temporal_causal=use_temporal_causal, interp_condition_pos=interp_condition_pos,
87
+ )
88
+ else:
89
+ raise NotImplementedError(f"Unsupported DiT architecture, please set the model_name to `pyramid_flux` or `pyramid_mmdit`")
90
+
91
+ return dit
92
+
93
+
94
+ def build_text_encoder(
95
+ model_name : str,
96
+ model_path : str,
97
+ torch_dtype,
98
+ load_text_encoder: bool = True,
99
+ ):
100
+ # The text encoder
101
+ if load_text_encoder:
102
+ if model_name == "pyramid_flux":
103
+ text_encoder = FluxTextEncoderWithMask(model_path, torch_dtype=torch_dtype)
104
+ elif model_name == "pyramid_mmdit":
105
+ text_encoder = SD3TextEncoderWithMask(model_path, torch_dtype=torch_dtype)
106
+ else:
107
+ raise NotImplementedError(f"Unsupported Text Encoder architecture, please set the model_name to `pyramid_flux` or `pyramid_mmdit`")
108
+ else:
109
+ text_encoder = None
110
+
111
+ return text_encoder
112
+
113
+
114
+ class PyramidDiTForVideoGeneration:
115
+ """
116
+ The Pyramid-DiT wrapper for both image and video generation.
117
+ This class mainly implements the fixed-unit setting: 1 + n + n + n
118
+ """
119
+ def __init__(self, model_path, model_dtype='bf16', model_name='pyramid_mmdit', use_gradient_checkpointing=False,
120
+ return_log=True, model_variant="diffusion_transformer_768p", timestep_shift=1.0, stage_range=[0, 1/3, 2/3, 1],
121
+ sample_ratios=[1, 1, 1], scheduler_gamma=1/3, use_mixed_training=False, use_flash_attn=False,
122
+ load_text_encoder=True, load_vae=True, max_temporal_length=31, frame_per_unit=1, use_temporal_causal=True,
123
+ corrupt_ratio=1/3, interp_condition_pos=True, stages=[1, 2, 4], video_sync_group=8, gradient_checkpointing_ratio=0.6, **kwargs,
124
+ ):
125
+ super().__init__()
126
+
127
+ if model_dtype == 'bf16':
128
+ torch_dtype = torch.bfloat16
129
+ elif model_dtype == 'fp16':
130
+ torch_dtype = torch.float16
131
+ else:
132
+ torch_dtype = torch.float32
133
+
134
+ self.stages = stages
135
+ self.sample_ratios = sample_ratios
136
+ self.corrupt_ratio = corrupt_ratio
137
+
138
+ dit_path = os.path.join(model_path, model_variant)
139
+
140
+ # The dit
141
+ self.dit = build_pyramid_dit(
142
+ model_name, dit_path, torch_dtype,
143
+ use_flash_attn=use_flash_attn, use_mixed_training=use_mixed_training,
144
+ interp_condition_pos=interp_condition_pos, use_gradient_checkpointing=use_gradient_checkpointing,
145
+ use_temporal_causal=use_temporal_causal, gradient_checkpointing_ratio=gradient_checkpointing_ratio,
146
+ )
147
+
148
+ # The text encoder
149
+ self.text_encoder = build_text_encoder(
150
+ model_name, model_path, torch_dtype, load_text_encoder=load_text_encoder,
151
+ )
152
+ self.load_text_encoder = load_text_encoder
153
+
154
+ # The base video vae decoder
155
+ if load_vae:
156
+ self.vae = CausalVideoVAE.from_pretrained(os.path.join(model_path, 'causal_video_vae'), torch_dtype=torch_dtype, interpolate=False)
157
+ # Freeze vae
158
+ for parameter in self.vae.parameters():
159
+ parameter.requires_grad = False
160
+ else:
161
+ self.vae = None
162
+ self.load_vae = load_vae
163
+
164
+ # For the image latent
165
+ if model_name == "pyramid_flux":
166
+ self.vae_shift_factor = -0.04
167
+ self.vae_scale_factor = 1 / 1.8726
168
+ elif model_name == "pyramid_mmdit":
169
+ self.vae_shift_factor = 0.1490
170
+ self.vae_scale_factor = 1 / 1.8415
171
+ else:
172
+ raise NotImplementedError(f"Unsupported model name : {model_name}")
173
+
174
+ # For the video latent
175
+ self.vae_video_shift_factor = -0.2343
176
+ self.vae_video_scale_factor = 1 / 3.0986
177
+
178
+ self.downsample = 8
179
+
180
+ # Configure the video training hyper-parameters
181
+ # The video sequence: one frame + N * unit
182
+ self.frame_per_unit = frame_per_unit
183
+ self.max_temporal_length = max_temporal_length
184
+ assert (max_temporal_length - 1) % frame_per_unit == 0, "The frame number should be divisible by the number of frames per unit"
185
+ self.num_units_per_video = 1 + ((max_temporal_length - 1) // frame_per_unit) + int(sum(sample_ratios))
186
+
187
+ self.scheduler = PyramidFlowMatchEulerDiscreteScheduler(
188
+ shift=timestep_shift, stages=len(self.stages),
189
+ stage_range=stage_range, gamma=scheduler_gamma,
190
+ )
191
+ print(f"The start and end sigmas of each stage are Start: {self.scheduler.start_sigmas}, End: {self.scheduler.end_sigmas}, Ori_start: {self.scheduler.ori_start_sigmas}")
192
+
193
+ self.cfg_rate = 0.1
194
+ self.return_log = return_log
195
+ self.use_flash_attn = use_flash_attn
196
+ self.model_name = model_name
197
+ self.sequential_offload_enabled = False
198
+ self.accumulate_steps = 0
199
+ self.video_sync_group = video_sync_group
200
+
201
+ def _enable_sequential_cpu_offload(self, model):
202
+ self.sequential_offload_enabled = True
203
+ torch_device = torch.device("cuda")
204
+ device_type = torch_device.type
205
+ device = torch.device(f"{device_type}:0")
206
+ offload_buffers = len(model._parameters) > 0
207
+ cpu_offload(model, device, offload_buffers=offload_buffers)
208
+
209
+ def enable_sequential_cpu_offload(self):
210
+ self._enable_sequential_cpu_offload(self.text_encoder)
211
+ self._enable_sequential_cpu_offload(self.dit)
212
+
213
+ def load_checkpoint(self, checkpoint_path, model_key='model', **kwargs):
214
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
215
+ dit_checkpoint = OrderedDict()
216
+ for key in checkpoint:
217
+ if key.startswith('vae') or key.startswith('text_encoder'):
218
+ continue
219
+ if key.startswith('dit'):
220
+ new_key = key.split('.')
221
+ new_key = '.'.join(new_key[1:])
222
+ dit_checkpoint[new_key] = checkpoint[key]
223
+ else:
224
+ dit_checkpoint[key] = checkpoint[key]
225
+
226
+ load_result = self.dit.load_state_dict(dit_checkpoint, strict=True)
227
+ print(f"Load checkpoint from {checkpoint_path}, load result: {load_result}")
228
+
229
+ def load_vae_checkpoint(self, vae_checkpoint_path, model_key='model'):
230
+ checkpoint = torch.load(vae_checkpoint_path, map_location='cpu')
231
+ checkpoint = checkpoint[model_key]
232
+ loaded_checkpoint = OrderedDict()
233
+
234
+ for key in checkpoint.keys():
235
+ if key.startswith('vae.'):
236
+ new_key = key.split('.')
237
+ new_key = '.'.join(new_key[1:])
238
+ loaded_checkpoint[new_key] = checkpoint[key]
239
+
240
+ load_result = self.vae.load_state_dict(loaded_checkpoint)
241
+ print(f"Load the VAE from {vae_checkpoint_path}, load result: {load_result}")
242
+
243
+ @torch.no_grad()
244
+ def add_pyramid_noise(
245
+ self,
246
+ latents_list,
247
+ sample_ratios=[1, 1, 1],
248
+ ):
249
+ """
250
+ Add the noise for each pyramid stage.
251
+ Note that this is a general strategy for pyramid-flow; it
252
+ can be used for both image and video training.
253
+ It can also be used to train pyramid-flow with full-sequence
254
+ diffusion for video generation (without the temporal pyramid and autoregressive modeling).
255
+
256
+ Params:
257
+ latents_list: [low_res, mid_res, high_res] The VAE latents of all stages
258
+ sample_ratios: The proportion of each stage in the training batch
259
+ """
260
+ noise = torch.randn_like(latents_list[-1])
261
+ device = noise.device
262
+ dtype = latents_list[-1].dtype
263
+ t = noise.shape[2]
264
+
265
+ stages = len(self.stages)
266
+ tot_samples = noise.shape[0]
267
+ assert tot_samples % (int(sum(sample_ratios))) == 0
268
+ assert stages == len(sample_ratios)
269
+
270
+ height, width = noise.shape[-2], noise.shape[-1]
271
+ noise_list = [noise]
272
+ cur_noise = noise
273
+ for i_s in range(stages-1):
274
+ height //= 2;width //= 2
275
+ cur_noise = rearrange(cur_noise, 'b c t h w -> (b t) c h w')
276
+ cur_noise = F.interpolate(cur_noise, size=(height, width), mode='bilinear') * 2
277
+ cur_noise = rearrange(cur_noise, '(b t) c h w -> b c t h w', t=t)
278
+ noise_list.append(cur_noise)
279
+
280
+ noise_list = list(reversed(noise_list)) # make sure from low res to high res
281
+
282
+ # To calculate the padding batchsize and column size
283
+ batch_size = tot_samples // int(sum(sample_ratios))
284
+ column_size = int(sum(sample_ratios))
285
+
286
+ column_to_stage = {}
287
+ i_sum = 0
288
+ for i_s, column_num in enumerate(sample_ratios):
289
+ for index in range(i_sum, i_sum + column_num):
290
+ column_to_stage[index] = i_s
291
+ i_sum += column_num
292
+
293
+ noisy_latents_list = []
294
+ ratios_list = []
295
+ targets_list = []
296
+ timesteps_list = []
297
+ training_steps = self.scheduler.config.num_train_timesteps
298
+
299
+ # from low resolution to high resolution
300
+ for index in range(column_size):
301
+ i_s = column_to_stage[index]
302
+ clean_latent = latents_list[i_s][index::column_size] # [bs, c, t, h, w]
303
+ last_clean_latent = None if i_s == 0 else latents_list[i_s-1][index::column_size]
304
+ start_sigma = self.scheduler.start_sigmas[i_s]
305
+ end_sigma = self.scheduler.end_sigmas[i_s]
306
+
307
+ if i_s == 0:
308
+ start_point = noise_list[i_s][index::column_size]
309
+ else:
310
+ # Get the upsampled latent
311
+ last_clean_latent = rearrange(last_clean_latent, 'b c t h w -> (b t) c h w')
312
+ last_clean_latent = F.interpolate(last_clean_latent, size=(last_clean_latent.shape[-2] * 2, last_clean_latent.shape[-1] * 2), mode='nearest')
313
+ last_clean_latent = rearrange(last_clean_latent, '(b t) c h w -> b c t h w', t=t)
314
+ start_point = start_sigma * noise_list[i_s][index::column_size] + (1 - start_sigma) * last_clean_latent
315
+
316
+ if i_s == stages - 1:
317
+ end_point = clean_latent
318
+ else:
319
+ end_point = end_sigma * noise_list[i_s][index::column_size] + (1 - end_sigma) * clean_latent
320
+
321
+ # To sample a timestep
322
+ u = compute_density_for_timestep_sampling(
323
+ weighting_scheme='random',
324
+ batch_size=batch_size,
325
+ logit_mean=0.0,
326
+ logit_std=1.0,
327
+ mode_scale=1.29,
328
+ )
329
+
330
+ indices = (u * training_steps).long() # Totally 1000 training steps per stage
331
+ indices = indices.clamp(0, training_steps-1)
332
+ timesteps = self.scheduler.timesteps_per_stage[i_s][indices].to(device=device)
333
+ ratios = self.scheduler.sigmas_per_stage[i_s][indices].to(device=device)
334
+
335
+ while len(ratios.shape) < start_point.ndim:
336
+ ratios = ratios.unsqueeze(-1)
337
+
338
+ # interpolate the latent
339
+ noisy_latents = ratios * start_point + (1 - ratios) * end_point
340
+
341
+ last_cond_noisy_sigma = torch.rand(size=(batch_size,), device=device) * self.corrupt_ratio
342
+
343
+ # [stage1_latent, stage2_latent, ..., stagen_latent], which will be concat after patching
344
+ noisy_latents_list.append([noisy_latents.to(dtype)])
345
+ ratios_list.append(ratios.to(dtype))
346
+ timesteps_list.append(timesteps.to(dtype))
347
+ targets_list.append(start_point - end_point) # The standard rectified flow matching objective
348
+
349
+ return noisy_latents_list, ratios_list, timesteps_list, targets_list
350
+
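A toy sketch of the per-stage rectified-flow construction used in add_pyramid_noise above: the noisy latent interpolates linearly between the stage's start point and end point, and the regression target is their difference (random tensors only, illustrative shapes).

import torch

start_point = torch.randn(1, 16, 1, 8, 8)   # re-noised / upsampled latent at the stage start
end_point = torch.randn(1, 16, 1, 8, 8)     # (almost) clean latent at the stage end
ratio = torch.tensor(0.3).view(1, 1, 1, 1, 1)

noisy_latents = ratio * start_point + (1 - ratio) * end_point
target = start_point - end_point            # what the DiT is trained to predict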
351
+ def sample_stage_length(self, num_stages, max_units=None):
352
+ max_units_in_training = 1 + ((self.max_temporal_length - 1) // self.frame_per_unit)
353
+ cur_rank = get_rank()
354
+
355
+ self.accumulate_steps = self.accumulate_steps + 1
356
+ total_turns = max_units_in_training // self.video_sync_group
357
+ update_turn = self.accumulate_steps % total_turns
358
+
359
+ # uniformly sample each position
360
+ cur_highres_unit = max(int((cur_rank % self.video_sync_group + 1) + update_turn * self.video_sync_group), 1)
361
+ cur_mid_res_unit = max(1 + max_units_in_training - cur_highres_unit, 1)
362
+ cur_low_res_unit = cur_mid_res_unit
363
+
364
+ if max_units is not None:
365
+ cur_highres_unit = min(cur_highres_unit, max_units)
366
+ cur_mid_res_unit = min(cur_mid_res_unit, max_units)
367
+ cur_low_res_unit = min(cur_low_res_unit, max_units)
368
+
369
+ length_list = [cur_low_res_unit, cur_mid_res_unit, cur_highres_unit]
370
+
371
+ assert len(length_list) == num_stages
372
+
373
+ return length_list
374
+
375
+ @torch.no_grad()
376
+ def add_pyramid_noise_with_temporal_pyramid(
377
+ self,
378
+ latents_list,
379
+ sample_ratios=[1, 1, 1],
380
+ ):
381
+ """
382
+ Add the noise for each pyramid stage; used for AR video training with the temporal pyramid.
383
+ Params:
384
+ latents_list: [low_res, mid_res, high_res] The VAE latents of all stages
385
+ sample_ratios: The proportion of each stage in the training batch
386
+ """
387
+ stages = len(self.stages)
388
+ tot_samples = latents_list[0].shape[0]
389
+ device = latents_list[0].device
390
+ dtype = latents_list[0].dtype
391
+
392
+ assert tot_samples % (int(sum(sample_ratios))) == 0
393
+ assert stages == len(sample_ratios)
394
+
395
+ noise = torch.randn_like(latents_list[-1])
396
+ t = noise.shape[2]
397
+
398
+ # To allocate the temporal length of each stage, ensuring the sum == constant
399
+ max_units = 1 + (t - 1) // self.frame_per_unit
400
+
401
+ if is_sequence_parallel_initialized():
402
+ max_units_per_sample = torch.LongTensor([max_units]).to(device)
403
+ sp_group = get_sequence_parallel_group()
404
+ sp_group_size = get_sequence_parallel_world_size()
405
+ max_units_per_sample = all_to_all(max_units_per_sample.unsqueeze(1).repeat(1, sp_group_size), sp_group, sp_group_size, scatter_dim=1, gather_dim=0).squeeze(1)
406
+ max_units = min(max_units_per_sample.cpu().tolist())
407
+
408
+ num_units_per_stage = self.sample_stage_length(stages, max_units=max_units) # [The unit number of each stage]
409
+
410
+ # we need to sync the length allocation across each sequence parallel group
411
+ if is_sequence_parallel_initialized():
412
+ num_units_per_stage = torch.LongTensor(num_units_per_stage).to(device)
413
+ sp_group_rank = get_sequence_parallel_group_rank()
414
+ global_src_rank = sp_group_rank * get_sequence_parallel_world_size()
415
+ torch.distributed.broadcast(num_units_per_stage, global_src_rank, group=get_sequence_parallel_group())
416
+ num_units_per_stage = num_units_per_stage.tolist()
417
+
418
+ height, width = noise.shape[-2], noise.shape[-1]
419
+ noise_list = [noise]
420
+ cur_noise = noise
421
+ for i_s in range(stages-1):
422
+ height //= 2;width //= 2
423
+ cur_noise = rearrange(cur_noise, 'b c t h w -> (b t) c h w')
424
+ cur_noise = F.interpolate(cur_noise, size=(height, width), mode='bilinear') * 2
425
+ cur_noise = rearrange(cur_noise, '(b t) c h w -> b c t h w', t=t)
426
+ noise_list.append(cur_noise)
427
+
428
+ noise_list = list(reversed(noise_list)) # make sure from low res to high res
429
+
430
+ # To calculate the batchsize and column size
431
+ batch_size = tot_samples // int(sum(sample_ratios))
432
+ column_size = int(sum(sample_ratios))
433
+
434
+ column_to_stage = {}
435
+ i_sum = 0
436
+ for i_s, column_num in enumerate(sample_ratios):
437
+ for index in range(i_sum, i_sum + column_num):
438
+ column_to_stage[index] = i_s
439
+ i_sum += column_num
440
+
441
+ noisy_latents_list = []
442
+ ratios_list = []
443
+ targets_list = []
444
+ timesteps_list = []
445
+ training_steps = self.scheduler.config.num_train_timesteps
446
+
447
+ # from low resolution to high resolution
448
+ for index in range(column_size):
449
+ # First prepare the trainable latent construction
450
+ i_s = column_to_stage[index]
451
+ clean_latent = latents_list[i_s][index::column_size] # [bs, c, t, h, w]
452
+ last_clean_latent = None if i_s == 0 else latents_list[i_s-1][index::column_size]
453
+ start_sigma = self.scheduler.start_sigmas[i_s]
454
+ end_sigma = self.scheduler.end_sigmas[i_s]
455
+
456
+ if i_s == 0:
457
+ start_point = noise_list[i_s][index::column_size]
458
+ else:
459
+ # Get the upsampled latent
460
+ last_clean_latent = rearrange(last_clean_latent, 'b c t h w -> (b t) c h w')
461
+ last_clean_latent = F.interpolate(last_clean_latent, size=(last_clean_latent.shape[-2] * 2, last_clean_latent.shape[-1] * 2), mode='nearest')
462
+ last_clean_latent = rearrange(last_clean_latent, '(b t) c h w -> b c t h w', t=t)
463
+ start_point = start_sigma * noise_list[i_s][index::column_size] + (1 - start_sigma) * last_clean_latent
464
+
465
+ if i_s == stages - 1:
466
+ end_point = clean_latent
467
+ else:
468
+ end_point = end_sigma * noise_list[i_s][index::column_size] + (1 - end_sigma) * clean_latent
469
+
470
+ # To sample a timestep
471
+ u = compute_density_for_timestep_sampling(
472
+ weighting_scheme='random',
473
+ batch_size=batch_size,
474
+ logit_mean=0.0,
475
+ logit_std=1.0,
476
+ mode_scale=1.29,
477
+ )
478
+
479
+ indices = (u * training_steps).long() # Totally 1000 training steps per stage
480
+ indices = indices.clamp(0, training_steps-1)
481
+ timesteps = self.scheduler.timesteps_per_stage[i_s][indices].to(device=device)
482
+ ratios = self.scheduler.sigmas_per_stage[i_s][indices].to(device=device)
483
+ noise_ratios = ratios * start_sigma + (1 - ratios) * end_sigma
484
+
485
+ while len(ratios.shape) < start_point.ndim:
486
+ ratios = ratios.unsqueeze(-1)
487
+
488
+ # interpolate the latent
489
+ noisy_latents = ratios * start_point + (1 - ratios) * end_point
490
+
491
+ # The flow matching object
492
+ target_latents = start_point - end_point
493
+
494
+ # pad the noisy previous
495
+ num_units = num_units_per_stage[i_s]
496
+ num_units = min(num_units, 1 + (t - 1) // self.frame_per_unit)
497
+ actual_frames = 1 + (num_units - 1) * self.frame_per_unit
498
+
499
+ noisy_latents = noisy_latents[:, :, :actual_frames]
500
+ target_latents = target_latents[:, :, :actual_frames]
501
+
502
+ clean_latent = clean_latent[:, :, :actual_frames]
503
+ stage_noise = noise_list[i_s][index::column_size][:, :, :actual_frames]
504
+
505
+ # only the last latent takes part in training
506
+ noisy_latents = noisy_latents[:, :, -self.frame_per_unit:]
507
+ target_latents = target_latents[:, :, -self.frame_per_unit:]
508
+
509
+ last_cond_noisy_sigma = torch.rand(size=(batch_size,), device=device) * self.corrupt_ratio
510
+
511
+ if num_units == 1:
512
+ stage_input = [noisy_latents.to(dtype)]
513
+ else:
514
+ # add the random noise for the last cond clip
515
+ last_cond_latent = clean_latent[:, :, -(2*self.frame_per_unit):-self.frame_per_unit]
516
+
517
+ while len(last_cond_noisy_sigma.shape) < last_cond_latent.ndim:
518
+ last_cond_noisy_sigma = last_cond_noisy_sigma.unsqueeze(-1)
519
+
520
+ # Add some noise to corrupt the clean condition
521
+ last_cond_latent = last_cond_noisy_sigma * torch.randn_like(last_cond_latent) + (1 - last_cond_noisy_sigma) * last_cond_latent
522
+
523
+ # concat the corrupted condition and the input noisy latents
524
+ stage_input = [noisy_latents.to(dtype), last_cond_latent.to(dtype)]
525
+
526
+ cur_unit_num = 2
527
+ cur_stage = i_s
528
+
529
+ while cur_unit_num < num_units:
530
+ cur_stage = max(cur_stage - 1, 0)
531
+ if cur_stage == 0:
532
+ break
533
+ cur_unit_num += 1
534
+ cond_latents = latents_list[cur_stage][index::column_size][:, :, :actual_frames]
535
+ cond_latents = cond_latents[:, :, -(cur_unit_num * self.frame_per_unit) : -((cur_unit_num - 1) * self.frame_per_unit)]
536
+ cond_latents = last_cond_noisy_sigma * torch.randn_like(cond_latents) + (1 - last_cond_noisy_sigma) * cond_latents
537
+ stage_input.append(cond_latents.to(dtype))
538
+
539
+ if cur_stage == 0 and cur_unit_num < num_units:
540
+ cond_latents = latents_list[0][index::column_size][:, :, :actual_frames]
541
+ cond_latents = cond_latents[:, :, :-(cur_unit_num * self.frame_per_unit)]
542
+
543
+ cond_latents = last_cond_noisy_sigma * torch.randn_like(cond_latents) + (1 - last_cond_noisy_sigma) * cond_latents
544
+ stage_input.append(cond_latents.to(dtype))
545
+
546
+ stage_input = list(reversed(stage_input))
547
+ noisy_latents_list.append(stage_input)
548
+ ratios_list.append(ratios.to(dtype))
549
+ timesteps_list.append(timesteps.to(dtype))
550
+ targets_list.append(target_latents) # The standard rectified flow matching objective
551
+
552
+ return noisy_latents_list, ratios_list, timesteps_list, targets_list
553
+
554
+ @torch.no_grad()
555
+ def get_pyramid_latent(self, x, stage_num):
556
+ # x is the origin vae latent
557
+ vae_latent_list = []
558
+ vae_latent_list.append(x)
559
+
560
+ temp, height, width = x.shape[-3], x.shape[-2], x.shape[-1]
561
+ for _ in range(stage_num):
562
+ height //= 2
563
+ width //= 2
564
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
565
+ x = torch.nn.functional.interpolate(x, size=(height, width), mode='bilinear')
566
+ x = rearrange(x, '(b t) c h w -> b c t h w', t=temp)
567
+ vae_latent_list.append(x)
568
+
569
+ vae_latent_list = list(reversed(vae_latent_list))
570
+ return vae_latent_list
571
+
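A toy sketch of the spatial pyramid built by get_pyramid_latent above: repeated 2x bilinear downsampling of a [b, c, t, h, w] latent, with the list returned from low to high resolution (random tensor, two downsampling steps).

import torch
import torch.nn.functional as F
from einops import rearrange

x = torch.randn(1, 16, 4, 32, 32)
pyramid = [x]
for _ in range(2):
    h, w = pyramid[-1].shape[-2] // 2, pyramid[-1].shape[-1] // 2
    y = rearrange(pyramid[-1], 'b c t h w -> (b t) c h w')
    y = F.interpolate(y, size=(h, w), mode='bilinear')
    y = rearrange(y, '(b t) c h w -> b c t h w', t=4)
    pyramid.append(y)
pyramid = list(reversed(pyramid))
print([p.shape[-1] for p in pyramid])   # [8, 16, 32]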
572
+ @torch.no_grad()
573
+ def get_vae_latent(self, video, use_temporal_pyramid=True):
574
+ if self.load_vae:
575
+ assert video.shape[1] == 3, "The vae is loaded, the input should be raw pixels"
576
+ video = self.vae.encode(video).latent_dist.sample() # [b c t h w]
577
+
578
+ if video.shape[2] == 1:
579
+ # is image
580
+ video = (video - self.vae_shift_factor) * self.vae_scale_factor
581
+ else:
582
+ # is video
583
+ video[:, :, :1] = (video[:, :, :1] - self.vae_shift_factor) * self.vae_scale_factor
584
+ video[:, :, 1:] = (video[:, :, 1:] - self.vae_video_shift_factor) * self.vae_video_scale_factor
585
+
586
+ # Get the pyramidal stages
587
+ vae_latent_list = self.get_pyramid_latent(video, len(self.stages) - 1)
588
+
589
+ if use_temporal_pyramid:
590
+ noisy_latents_list, ratios_list, timesteps_list, targets_list = self.add_pyramid_noise_with_temporal_pyramid(vae_latent_list, self.sample_ratios)
591
+ else:
592
+ # Only use the spatial pyramid (without temporal AR)
593
+ noisy_latents_list, ratios_list, timesteps_list, targets_list = self.add_pyramid_noise(vae_latent_list, self.sample_ratios)
594
+
595
+ return noisy_latents_list, ratios_list, timesteps_list, targets_list
596
+
597
+ @torch.no_grad()
598
+ def get_text_embeddings(self, text, rand_idx, device):
599
+ if self.load_text_encoder:
600
+ batch_size = len(text) # Text is a str list
601
+ for idx in range(batch_size):
602
+ if rand_idx[idx].item():
603
+ text[idx] = ''
604
+ return self.text_encoder(text, device) # [b s c]
605
+ else:
606
+ batch_size = len(text['prompt_embeds'])
607
+
608
+ for idx in range(batch_size):
609
+ if rand_idx[idx].item():
610
+ text['prompt_embeds'][idx] = self.null_text_embeds['prompt_embed'].to(device)
611
+ text['prompt_attention_mask'][idx] = self.null_text_embeds['prompt_attention_mask'].to(device)
612
+ text['pooled_prompt_embeds'][idx] = self.null_text_embeds['pooled_prompt_embed'].to(device)
613
+
614
+ return text['prompt_embeds'], text['prompt_attention_mask'], text['pooled_prompt_embeds']
615
+
616
+ def calculate_loss(self, model_preds_list, targets_list):
617
+ loss_list = []
618
+
619
+ for model_pred, target in zip(model_preds_list, targets_list):
620
+ # Compute the loss.
621
+ loss_weight = torch.ones_like(target)
622
+
623
+ loss = torch.mean(
624
+ (loss_weight.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
625
+ 1,
626
+ )
627
+ loss_list.append(loss)
628
+
629
+ diffusion_loss = torch.cat(loss_list, dim=0).mean()
630
+
631
+ if self.return_log:
632
+ log = {}
633
+ split="train"
634
+ log[f'{split}/loss'] = diffusion_loss.detach()
635
+ return diffusion_loss, log
636
+ else:
637
+ return diffusion_loss, {}
638
+
639
+ def __call__(self, video, text, identifier=['video'], use_temporal_pyramid=True, accelerator: Accelerator=None):
640
+ xdim = video.ndim
641
+ device = video.device
642
+
643
+ if 'video' in identifier:
644
+ assert 'image' not in identifier
645
+ is_image = False
646
+ else:
647
+ assert 'video' not in identifier
648
+ video = video.unsqueeze(2) # 'b c h w -> b c 1 h w'
649
+ is_image = True
650
+
651
+ # TODO: now have 3 stages, firstly get the vae latents
652
+ with torch.no_grad(), accelerator.autocast():
653
+ # 10% prob drop the text
654
+ batch_size = len(video)
655
+ rand_idx = torch.rand((batch_size,)) <= self.cfg_rate
656
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.get_text_embeddings(text, rand_idx, device)
657
+ noisy_latents_list, ratios_list, timesteps_list, targets_list = self.get_vae_latent(video, use_temporal_pyramid=use_temporal_pyramid)
658
+
659
+ timesteps = torch.cat([timestep.unsqueeze(-1) for timestep in timesteps_list], dim=-1)
660
+ timesteps = timesteps.reshape(-1)
661
+
662
+ assert timesteps.shape[0] == prompt_embeds.shape[0]
663
+
664
+ # DiT forward
665
+ model_preds_list = self.dit(
666
+ sample=noisy_latents_list,
667
+ timestep_ratio=timesteps,
668
+ encoder_hidden_states=prompt_embeds,
669
+ encoder_attention_mask=prompt_attention_mask,
670
+ pooled_projections=pooled_prompt_embeds,
671
+ )
672
+
673
+ # calculate the loss
674
+ return self.calculate_loss(model_preds_list, targets_list)
675
+
676
+ def prepare_latents(
677
+ self,
678
+ batch_size,
679
+ num_channels_latents,
680
+ temp,
681
+ height,
682
+ width,
683
+ dtype,
684
+ device,
685
+ generator,
686
+ ):
687
+ shape = (
688
+ batch_size,
689
+ num_channels_latents,
690
+ int(temp),
691
+ int(height) // self.downsample,
692
+ int(width) // self.downsample,
693
+ )
694
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
695
+ return latents
696
+
697
+ def sample_block_noise(self, bs, ch, temp, height, width):
698
+ gamma = self.scheduler.config.gamma
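+ # Descriptive note: the covariance below is eye(4)*(1+gamma) - ones(4,4)*gamma, so each entry of a 2x2 block is marginally standard normal with pairwise covariance -gamma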
699
+ dist = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(4), torch.eye(4) * (1 + gamma) - torch.ones(4, 4) * gamma)
700
+ block_number = bs * ch * temp * (height // 2) * (width // 2)
701
+ noise = torch.stack([dist.sample() for _ in range(block_number)]) # [block number, 4]
702
+ noise = rearrange(noise, '(b c t h w) (p q) -> b c t (h p) (w q)',b=bs,c=ch,t=temp,h=height//2,w=width//2,p=2,q=2)
703
+ return noise
704
+
705
+ @torch.no_grad()
706
+ def generate_one_unit(
707
+ self,
708
+ latents,
709
+ past_conditions, # List of past conditions, one entry per stage
710
+ prompt_embeds,
711
+ prompt_attention_mask,
712
+ pooled_prompt_embeds,
713
+ num_inference_steps,
714
+ height,
715
+ width,
716
+ temp,
717
+ device,
718
+ dtype,
719
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
720
+ is_first_frame: bool = False,
721
+ ):
722
+ stages = self.stages
723
+ intermed_latents = []
724
+
725
+ for i_s in range(len(stages)):
726
+ self.scheduler.set_timesteps(num_inference_steps[i_s], i_s, device=device)
727
+ timesteps = self.scheduler.timesteps
728
+
729
+ if i_s > 0:
730
+ height *= 2; width *= 2
731
+ latents = rearrange(latents, 'b c t h w -> (b t) c h w')
732
+ latents = F.interpolate(latents, size=(height, width), mode='nearest')
733
+ latents = rearrange(latents, '(b t) c h w -> b c t h w', t=temp)
734
+ # Renoise the upsampled latent to match this stage's starting noise level
735
+ ori_sigma = 1 - self.scheduler.ori_start_sigmas[i_s] # the original coeff of signal
736
+ gamma = self.scheduler.config.gamma
737
+ alpha = 1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma)
738
+ beta = alpha * (1 - ori_sigma) / math.sqrt(gamma)
739
+
740
+ bs, ch, temp, height, width = latents.shape
741
+ noise = self.sample_block_noise(bs, ch, temp, height, width)
742
+ noise = noise.to(device=device, dtype=dtype)
743
+ latents = alpha * latents + beta * noise # To fix the block artifact
744
+
745
+ for idx, t in enumerate(timesteps):
746
+ # expand the latents if we are doing classifier free guidance
747
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
748
+
749
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
750
+ timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
751
+
752
+ if is_sequence_parallel_initialized():
753
+ # sync the input latent
754
+ sp_group_rank = get_sequence_parallel_group_rank()
755
+ global_src_rank = sp_group_rank * get_sequence_parallel_world_size()
756
+ torch.distributed.broadcast(latent_model_input, global_src_rank, group=get_sequence_parallel_group())
757
+
758
+ latent_model_input = past_conditions[i_s] + [latent_model_input]
759
+
760
+ noise_pred = self.dit(
761
+ sample=[latent_model_input],
762
+ timestep_ratio=timestep,
763
+ encoder_hidden_states=prompt_embeds,
764
+ encoder_attention_mask=prompt_attention_mask,
765
+ pooled_projections=pooled_prompt_embeds,
766
+ )
767
+
768
+ noise_pred = noise_pred[0]
769
+
770
+ # perform guidance
771
+ if self.do_classifier_free_guidance:
772
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
773
+ if is_first_frame:
774
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
775
+ else:
776
+ noise_pred = noise_pred_uncond + self.video_guidance_scale * (noise_pred_text - noise_pred_uncond)
777
+
778
+ # compute the previous noisy sample x_t -> x_t-1
779
+ latents = self.scheduler.step(
780
+ model_output=noise_pred,
781
+ timestep=timestep,
782
+ sample=latents,
783
+ generator=generator,
784
+ ).prev_sample
785
+
786
+ intermed_latents.append(latents)
787
+
788
+ return intermed_latents
789
+
790
+ @torch.no_grad()
791
+ def generate_i2v(
792
+ self,
793
+ prompt: Union[str, List[str]] = '',
794
+ input_image: PIL.Image = None,
795
+ temp: int = 1,
796
+ num_inference_steps: Optional[Union[int, List[int]]] = 28,
797
+ guidance_scale: float = 7.0,
798
+ video_guidance_scale: float = 4.0,
799
+ min_guidance_scale: float = 2.0,
800
+ use_linear_guidance: bool = False,
801
+ alpha: float = 0.5,
802
+ negative_prompt: Optional[Union[str, List[str]]]="cartoon style, worst quality, low quality, blurry, absolute black, absolute white, low res, extra limbs, extra digits, misplaced objects, mutated anatomy, monochrome, horror",
803
+ num_images_per_prompt: Optional[int] = 1,
804
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
805
+ output_type: Optional[str] = "pil",
806
+ save_memory: bool = True,
807
+ cpu_offloading: bool = False, # If true, reload device will be cuda.
808
+ inference_multigpu: bool = False,
809
+ callback: Optional[Callable[[int, int, Dict], None]] = None,
810
+ ):
811
+ if self.sequential_offload_enabled and not cpu_offloading:
812
+ print("Warning: overriding cpu_offloading set to false, as it's needed for sequential cpu offload")
813
+ cpu_offloading=True
814
+ device = self.device if not cpu_offloading else torch.device("cuda")
815
+ dtype = self.dtype
816
+ if cpu_offloading:
817
+ # leave the text encoder where it is, since it's about to be used anyway.
818
+ if not self.sequential_offload_enabled:
819
+ if str(self.dit.device) != "cpu":
820
+ print("(dit) Warning: Do not preload pipeline components (i.e. to cuda) with cpu offloading enabled! Otherwise, a second transfer will occur needlessly taking up time.")
821
+ self.dit.to("cpu")
822
+ torch.cuda.empty_cache()
823
+ if str(self.vae.device) != "cpu":
824
+ print("(vae) Warning: Do not preload pipeline components (i.e. to cuda) with cpu offloading enabled! Otherwise, a second transfer will occur needlessly taking up time.")
825
+ self.vae.to("cpu")
826
+ torch.cuda.empty_cache()
827
+
828
+ width = input_image.width
829
+ height = input_image.height
830
+
831
+ assert temp % self.frame_per_unit == 0, "The number of frames must be divisible by frame_per_unit"
832
+
833
+ if isinstance(prompt, str):
834
+ batch_size = 1
835
+ prompt = prompt + ", hyper quality, Ultra HD, 8K" # append quality tags to improve aesthetics
836
+ else:
837
+ assert isinstance(prompt, list)
838
+ batch_size = len(prompt)
839
+ prompt = [_ + ", hyper quality, Ultra HD, 8K" for _ in prompt]
840
+
841
+ if isinstance(num_inference_steps, int):
842
+ num_inference_steps = [num_inference_steps] * len(self.stages)
843
+
844
+ negative_prompt = negative_prompt or ""
845
+
846
+ # Get the text embeddings
847
+ if cpu_offloading and not self.sequential_offload_enabled:
848
+ self.text_encoder.to("cuda")
849
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.text_encoder(prompt, device)
850
+ negative_prompt_embeds, negative_prompt_attention_mask, negative_pooled_prompt_embeds = self.text_encoder(negative_prompt, device)
851
+
852
+ if cpu_offloading:
853
+ if not self.sequential_offload_enabled:
854
+ self.text_encoder.to("cpu")
855
+ self.vae.to("cuda")
856
+ torch.cuda.empty_cache()
857
+
858
+ if use_linear_guidance:
859
+ max_guidance_scale = guidance_scale
860
+ guidance_scale_list = [max(max_guidance_scale - alpha * t_, min_guidance_scale) for t_ in range(temp+1)]
861
+ print(guidance_scale_list)
862
+
863
+ self._guidance_scale = guidance_scale
864
+ self._video_guidance_scale = video_guidance_scale
865
+
866
+ if self.do_classifier_free_guidance:
867
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
868
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
869
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
870
+
871
+ if is_sequence_parallel_initialized():
872
+ # sync the prompt embedding across multiple GPUs
873
+ sp_group_rank = get_sequence_parallel_group_rank()
874
+ global_src_rank = sp_group_rank * get_sequence_parallel_world_size()
875
+ torch.distributed.broadcast(prompt_embeds, global_src_rank, group=get_sequence_parallel_group())
876
+ torch.distributed.broadcast(pooled_prompt_embeds, global_src_rank, group=get_sequence_parallel_group())
877
+ torch.distributed.broadcast(prompt_attention_mask, global_src_rank, group=get_sequence_parallel_group())
878
+
879
+ # Create the initial random noise
880
+ num_channels_latents = (self.dit.config.in_channels // 4) if self.model_name == "pyramid_flux" else self.dit.config.in_channels
881
+ latents = self.prepare_latents(
882
+ batch_size * num_images_per_prompt,
883
+ num_channels_latents,
884
+ temp,
885
+ height,
886
+ width,
887
+ prompt_embeds.dtype,
888
+ device,
889
+ generator,
890
+ )
891
+
892
+ temp, height, width = latents.shape[-3], latents.shape[-2], latents.shape[-1]
893
+
894
+ latents = rearrange(latents, 'b c t h w -> (b t) c h w')
895
+ # by default, we need to start from the block noise
896
+ for _ in range(len(self.stages)-1):
897
+ height //= 2;width //= 2
898
+ latents = F.interpolate(latents, size=(height, width), mode='bilinear') * 2
899
+
900
+ latents = rearrange(latents, '(b t) c h w -> b c t h w', t=temp)
901
+
902
+ num_units = temp // self.frame_per_unit
903
+ stages = self.stages
904
+
905
+ # encode the image latents
906
+ image_transform = transforms.Compose([
907
+ transforms.ToTensor(),
908
+ transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
909
+ ])
910
+ input_image_tensor = image_transform(input_image).unsqueeze(0).unsqueeze(2) # [b c 1 h w]
911
+ input_image_latent = (self.vae.encode(input_image_tensor.to(self.vae.device, dtype=self.vae.dtype)).latent_dist.sample() - self.vae_shift_factor) * self.vae_scale_factor # [b c 1 h w]
912
+
913
+ if is_sequence_parallel_initialized():
914
+ # sync the image latent across multiple GPUs
915
+ sp_group_rank = get_sequence_parallel_group_rank()
916
+ global_src_rank = sp_group_rank * get_sequence_parallel_world_size()
917
+ torch.distributed.broadcast(input_image_latent, global_src_rank, group=get_sequence_parallel_group())
918
+
919
+ generated_latents_list = [input_image_latent] # The generated results
920
+ last_generated_latents = input_image_latent
921
+
922
+ if cpu_offloading:
923
+ self.vae.to("cpu")
924
+ if not self.sequential_offload_enabled:
925
+ self.dit.to("cuda")
926
+ torch.cuda.empty_cache()
927
+
928
+ for unit_index in tqdm(range(1, num_units)):
929
+ gc.collect()
930
+ torch.cuda.empty_cache()
931
+
932
+ if callback:
933
+ callback(unit_index, num_units)
934
+
935
+ if use_linear_guidance:
936
+ self._guidance_scale = guidance_scale_list[unit_index]
937
+ self._video_guidance_scale = guidance_scale_list[unit_index]
938
+
939
+ # prepare the condition latents
940
+ past_condition_latents = []
941
+ clean_latents_list = self.get_pyramid_latent(torch.cat(generated_latents_list, dim=2), len(stages) - 1)
942
+
943
+ for i_s in range(len(stages)):
944
+ last_cond_latent = clean_latents_list[i_s][:,:,-self.frame_per_unit:]
945
+
946
+ stage_input = [torch.cat([last_cond_latent] * 2) if self.do_classifier_free_guidance else last_cond_latent]
947
+
948
+ # pad the past clean latents
949
+ cur_unit_num = unit_index
950
+ cur_stage = i_s
951
+ cur_unit_ptx = 1
952
+
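+ # Walk back through earlier units: older history is conditioned at progressively coarser pyramid stages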
953
+ while cur_unit_ptx < cur_unit_num:
954
+ cur_stage = max(cur_stage - 1, 0)
955
+ if cur_stage == 0:
956
+ break
957
+ cur_unit_ptx += 1
958
+ cond_latents = clean_latents_list[cur_stage][:, :, -(cur_unit_ptx * self.frame_per_unit) : -((cur_unit_ptx - 1) * self.frame_per_unit)]
959
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
960
+
961
+ if cur_stage == 0 and cur_unit_ptx < cur_unit_num:
962
+ cond_latents = clean_latents_list[0][:, :, :-(cur_unit_ptx * self.frame_per_unit)]
963
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
964
+
965
+ stage_input = list(reversed(stage_input))
966
+ past_condition_latents.append(stage_input)
967
+
968
+ intermed_latents = self.generate_one_unit(
969
+ latents[:,:,(unit_index - 1) * self.frame_per_unit:unit_index * self.frame_per_unit],
970
+ past_condition_latents,
971
+ prompt_embeds,
972
+ prompt_attention_mask,
973
+ pooled_prompt_embeds,
974
+ num_inference_steps,
975
+ height,
976
+ width,
977
+ self.frame_per_unit,
978
+ device,
979
+ dtype,
980
+ generator,
981
+ is_first_frame=False,
982
+ )
983
+
984
+ generated_latents_list.append(intermed_latents[-1])
985
+ last_generated_latents = intermed_latents
986
+
987
+ generated_latents = torch.cat(generated_latents_list, dim=2)
988
+
989
+ if output_type == "latent":
990
+ image = generated_latents
991
+ else:
992
+ if cpu_offloading:
993
+ if not self.sequential_offload_enabled:
994
+ self.dit.to("cpu")
995
+ self.vae.to("cuda")
996
+ torch.cuda.empty_cache()
997
+ image = self.decode_latent(generated_latents, save_memory=save_memory, inference_multigpu=inference_multigpu)
998
+ if cpu_offloading:
999
+ self.vae.to("cpu")
1000
+ torch.cuda.empty_cache()
1001
+ # not technically necessary, but returns the pipeline to its original state
1002
+
1003
+ return image
1004
+
1005
+ @torch.no_grad()
1006
+ def generate(
1007
+ self,
1008
+ prompt: Union[str, List[str]] = None,
1009
+ height: Optional[int] = None,
1010
+ width: Optional[int] = None,
1011
+ temp: int = 1,
1012
+ num_inference_steps: Optional[Union[int, List[int]]] = 28,
1013
+ video_num_inference_steps: Optional[Union[int, List[int]]] = 28,
1014
+ guidance_scale: float = 7.0,
1015
+ video_guidance_scale: float = 7.0,
1016
+ min_guidance_scale: float = 2.0,
1017
+ use_linear_guidance: bool = False,
1018
+ alpha: float = 0.5,
1019
+ negative_prompt: Optional[Union[str, List[str]]]="cartoon style, worst quality, low quality, blurry, absolute black, absolute white, low res, extra limbs, extra digits, misplaced objects, mutated anatomy, monochrome, horror",
1020
+ num_images_per_prompt: Optional[int] = 1,
1021
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
1022
+ output_type: Optional[str] = "pil",
1023
+ save_memory: bool = True,
1024
+ cpu_offloading: bool = False, # If true, reload device will be cuda.
1025
+ inference_multigpu: bool = False,
1026
+ callback: Optional[Callable[[int, int, Dict], None]] = None,
1027
+ ):
1028
+ if self.sequential_offload_enabled and not cpu_offloading:
1029
+ print("Warning: overriding cpu_offloading set to false, as it's needed for sequential cpu offload")
1030
+ cpu_offloading=True
1031
+ device = self.device if not cpu_offloading else torch.device("cuda")
1032
+ dtype = self.dtype
1033
+ if cpu_offloading:
1034
+ # leave the text encoder where it is, since it's about to be used anyway.
1035
+ if not self.sequential_offload_enabled:
1036
+ if str(self.dit.device) != "cpu":
1037
+ print("(dit) Warning: Do not preload pipeline components (i.e. to cuda) with cpu offloading enabled! Otherwise, a second transfer will occur needlessly taking up time.")
1038
+ self.dit.to("cpu")
1039
+ torch.cuda.empty_cache()
1040
+ if str(self.vae.device) != "cpu":
1041
+ print("(vae) Warning: Do not preload pipeline components (i.e. to cuda) with cpu offloading enabled! Otherwise, a second transfer will occur needlessly taking up time.")
1042
+ self.vae.to("cpu")
1043
+ torch.cuda.empty_cache()
1044
+
1045
+
1046
+ assert (temp - 1) % self.frame_per_unit == 0, "The number of frames minus one must be divisible by frame_per_unit"
1047
+
1048
+ if isinstance(prompt, str):
1049
+ batch_size = 1
1050
+ prompt = prompt + ", hyper quality, Ultra HD, 8K" # append quality tags to improve aesthetics
1051
+ else:
1052
+ assert isinstance(prompt, list)
1053
+ batch_size = len(prompt)
1054
+ prompt = [_ + ", hyper quality, Ultra HD, 8K" for _ in prompt]
1055
+
1056
+ if isinstance(num_inference_steps, int):
1057
+ num_inference_steps = [num_inference_steps] * len(self.stages)
1058
+
1059
+ if isinstance(video_num_inference_steps, int):
1060
+ video_num_inference_steps = [video_num_inference_steps] * len(self.stages)
1061
+
1062
+ negative_prompt = negative_prompt or ""
1063
+
1064
+ # Get the text embeddings
1065
+ if cpu_offloading and not self.sequential_offload_enabled:
1066
+ self.text_encoder.to("cuda")
1067
+ prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self.text_encoder(prompt, device)
1068
+ negative_prompt_embeds, negative_prompt_attention_mask, negative_pooled_prompt_embeds = self.text_encoder(negative_prompt, device)
1069
+ if cpu_offloading:
1070
+ if not self.sequential_offload_enabled:
1071
+ self.text_encoder.to("cpu")
1072
+ self.dit.to("cuda")
1073
+ torch.cuda.empty_cache()
1074
+
1075
+ if use_linear_guidance:
1076
+ max_guidance_scale = guidance_scale
1077
+ # guidance_scale_list = torch.linspace(max_guidance_scale, min_guidance_scale, temp).tolist()
1078
+ guidance_scale_list = [max(max_guidance_scale - alpha * t_, min_guidance_scale) for t_ in range(temp)]
1079
+ print(guidance_scale_list)
1080
+
1081
+ self._guidance_scale = guidance_scale
1082
+ self._video_guidance_scale = video_guidance_scale
1083
+
1084
+ if self.do_classifier_free_guidance:
1085
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
1086
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
1087
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
1088
+
1089
+ if is_sequence_parallel_initialized():
1090
+ # sync the prompt embedding across multiple GPUs
1091
+ sp_group_rank = get_sequence_parallel_group_rank()
1092
+ global_src_rank = sp_group_rank * get_sequence_parallel_world_size()
1093
+ torch.distributed.broadcast(prompt_embeds, global_src_rank, group=get_sequence_parallel_group())
1094
+ torch.distributed.broadcast(pooled_prompt_embeds, global_src_rank, group=get_sequence_parallel_group())
1095
+ torch.distributed.broadcast(prompt_attention_mask, global_src_rank, group=get_sequence_parallel_group())
1096
+
1097
+ # Create the initial random noise
1098
+ num_channels_latents = (self.dit.config.in_channels // 4) if self.model_name == "pyramid_flux" else self.dit.config.in_channels
1099
+ latents = self.prepare_latents(
1100
+ batch_size * num_images_per_prompt,
1101
+ num_channels_latents,
1102
+ temp,
1103
+ height,
1104
+ width,
1105
+ prompt_embeds.dtype,
1106
+ device,
1107
+ generator,
1108
+ )
1109
+
1110
+ temp, height, width = latents.shape[-3], latents.shape[-2], latents.shape[-1]
1111
+
1112
+ latents = rearrange(latents, 'b c t h w -> (b t) c h w')
1113
+ # by default, we need to start from the block noise
1114
+ for _ in range(len(self.stages)-1):
1115
+ height //= 2;width //= 2
1116
+ latents = F.interpolate(latents, size=(height, width), mode='bilinear') * 2
1117
+
1118
+ latents = rearrange(latents, '(b t) c h w -> b c t h w', t=temp)
1119
+
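+ # The first unit is a single (image) latent frame; each subsequent unit adds frame_per_unit frames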
1120
+ num_units = 1 + (temp - 1) // self.frame_per_unit
1121
+ stages = self.stages
1122
+
1123
+ generated_latents_list = [] # The generated results
1124
+ last_generated_latents = None
1125
+
1126
+ for unit_index in tqdm(range(num_units)):
1127
+ gc.collect()
1128
+ torch.cuda.empty_cache()
1129
+
1130
+ if callback:
1131
+ callback(unit_index, num_units)
1132
+
1133
+ if use_linear_guidance:
1134
+ self._guidance_scale = guidance_scale_list[unit_index]
1135
+ self._video_guidance_scale = guidance_scale_list[unit_index]
1136
+
1137
+ if unit_index == 0:
1138
+ past_condition_latents = [[] for _ in range(len(stages))]
1139
+ intermed_latents = self.generate_one_unit(
1140
+ latents[:,:,:1],
1141
+ past_condition_latents,
1142
+ prompt_embeds,
1143
+ prompt_attention_mask,
1144
+ pooled_prompt_embeds,
1145
+ num_inference_steps,
1146
+ height,
1147
+ width,
1148
+ 1,
1149
+ device,
1150
+ dtype,
1151
+ generator,
1152
+ is_first_frame=True,
1153
+ )
1154
+ else:
1155
+ # prepare the condition latents
1156
+ past_condition_latents = []
1157
+ clean_latents_list = self.get_pyramid_latent(torch.cat(generated_latents_list, dim=2), len(stages) - 1)
1158
+
1159
+ for i_s in range(len(stages)):
1160
+ last_cond_latent = clean_latents_list[i_s][:,:,-(self.frame_per_unit):]
1161
+
1162
+ stage_input = [torch.cat([last_cond_latent] * 2) if self.do_classifier_free_guidance else last_cond_latent]
1163
+
1164
+ # pad the past clean latents
1165
+ cur_unit_num = unit_index
1166
+ cur_stage = i_s
1167
+ cur_unit_ptx = 1
1168
+
1169
+ while cur_unit_ptx < cur_unit_num:
1170
+ cur_stage = max(cur_stage - 1, 0)
1171
+ if cur_stage == 0:
1172
+ break
1173
+ cur_unit_ptx += 1
1174
+ cond_latents = clean_latents_list[cur_stage][:, :, -(cur_unit_ptx * self.frame_per_unit) : -((cur_unit_ptx - 1) * self.frame_per_unit)]
1175
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
1176
+
1177
+ if cur_stage == 0 and cur_unit_ptx < cur_unit_num:
1178
+ cond_latents = clean_latents_list[0][:, :, :-(cur_unit_ptx * self.frame_per_unit)]
1179
+ stage_input.append(torch.cat([cond_latents] * 2) if self.do_classifier_free_guidance else cond_latents)
1180
+
1181
+ stage_input = list(reversed(stage_input))
1182
+ past_condition_latents.append(stage_input)
1183
+
1184
+ intermed_latents = self.generate_one_unit(
1185
+ latents[:,:, 1 + (unit_index - 1) * self.frame_per_unit:1 + unit_index * self.frame_per_unit],
1186
+ past_condition_latents,
1187
+ prompt_embeds,
1188
+ prompt_attention_mask,
1189
+ pooled_prompt_embeds,
1190
+ video_num_inference_steps,
1191
+ height,
1192
+ width,
1193
+ self.frame_per_unit,
1194
+ device,
1195
+ dtype,
1196
+ generator,
1197
+ is_first_frame=False,
1198
+ )
1199
+
1200
+ generated_latents_list.append(intermed_latents[-1])
1201
+ last_generated_latents = intermed_latents
1202
+
1203
+ generated_latents = torch.cat(generated_latents_list, dim=2)
1204
+
1205
+ if output_type == "latent":
1206
+ image = generated_latents
1207
+ else:
1208
+ if cpu_offloading:
1209
+ if not self.sequential_offload_enabled:
1210
+ self.dit.to("cpu")
1211
+ self.vae.to("cuda")
1212
+ torch.cuda.empty_cache()
1213
+ image = self.decode_latent(generated_latents, save_memory=save_memory, inference_multigpu=inference_multigpu)
1214
+ if cpu_offloading:
1215
+ self.vae.to("cpu")
1216
+ torch.cuda.empty_cache()
1217
+ # not technically necessary, but returns the pipeline to its original state
1218
+
1219
+ return image
1220
+
1221
+ def decode_latent(self, latents, save_memory=True, inference_multigpu=False):
1222
+ # only the main process needs vae decoding
1223
+ if inference_multigpu and get_rank() != 0:
1224
+ return None
1225
+
1226
+ if latents.shape[2] == 1:
1227
+ latents = (latents / self.vae_scale_factor) + self.vae_shift_factor
1228
+ else:
1229
+ latents[:, :, :1] = (latents[:, :, :1] / self.vae_scale_factor) + self.vae_shift_factor
1230
+ latents[:, :, 1:] = (latents[:, :, 1:] / self.vae_video_scale_factor) + self.vae_video_shift_factor
1231
+
1232
+ if save_memory:
1233
+ # reduce the tile size and temporal chunk window size to save memory
1234
+ image = self.vae.decode(latents, temporal_chunk=True, window_size=1, tile_sample_min_size=256).sample
1235
+ else:
1236
+ image = self.vae.decode(latents, temporal_chunk=True, window_size=2, tile_sample_min_size=512).sample
1237
+
1238
+ image = image.mul(127.5).add(127.5).clamp(0, 255).byte()
1239
+ image = rearrange(image, "B C T H W -> (B T) H W C")
1240
+ image = image.cpu().numpy()
1241
+ image = self.numpy_to_pil(image)
1242
+
1243
+ return image
1244
+
1245
+ @staticmethod
1246
+ def numpy_to_pil(images):
1247
+ """
1248
+ Convert a numpy image or a batch of images to a PIL image.
1249
+ """
1250
+ if images.ndim == 3:
1251
+ images = images[None, ...]
1252
+
1253
+ if images.shape[-1] == 1:
1254
+ # special case for grayscale (single channel) images
1255
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
1256
+ else:
1257
+ pil_images = [Image.fromarray(image) for image in images]
1258
+
1259
+ return pil_images
1260
+
1261
+ @property
1262
+ def device(self):
1263
+ return next(self.dit.parameters()).device
1264
+
1265
+ @property
1266
+ def dtype(self):
1267
+ return next(self.dit.parameters()).dtype
1268
+
1269
+ @property
1270
+ def guidance_scale(self):
1271
+ return self._guidance_scale
1272
+
1273
+ @property
1274
+ def video_guidance_scale(self):
1275
+ return self._video_guidance_scale
1276
+
1277
+ @property
1278
+ def do_classifier_free_guidance(self):
1279
+ return self._guidance_scale > 0
pyramid_flow_model.lnk ADDED
Binary file (982 Bytes). View file
 
pyramid_flow_model/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
pyramid_flow_model/README.md ADDED
@@ -0,0 +1,191 @@
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: text-to-video
4
+ tags:
5
+ - text-to-image
6
+ - image-to-video
7
+ - flux
8
+ ---
9
+
10
+ # ⚡️Pyramid Flow miniFLUX⚡️
11
+
12
+ [[Paper]](https://arxiv.org/abs/2410.05954) [[Project Page ✨]](https://pyramid-flow.github.io) [[Code 🚀]](https://github.com/jy0205/Pyramid-Flow) [[SD3 Model ⚡️]](https://huggingface.co/rain1011/pyramid-flow-sd3) [[demo 🤗](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow)]
13
+
14
+ This is the model repository for Pyramid Flow, a training-efficient **Autoregressive Video Generation** method based on **Flow Matching**. By training only on open-source datasets, it generates high-quality 10-second videos at 768p resolution and 24 FPS, and naturally supports image-to-video generation.
15
+
16
+ <table class="center" border="0" style="width: 100%; text-align: left;">
17
+ <tr>
18
+ <th>10s, 768p, 24fps</th>
19
+ <th>5s, 768p, 24fps</th>
20
+ <th>Image-to-video</th>
21
+ </tr>
22
+ <tr>
23
+ <td><video src="https://pyramid-flow.github.io/static/videos/t2v_10s/fireworks.mp4" autoplay muted loop playsinline></video></td>
24
+ <td><video src="https://pyramid-flow.github.io/static/videos/t2v/trailer.mp4" autoplay muted loop playsinline></video></td>
25
+ <td><video src="https://pyramid-flow.github.io/static/videos/i2v/sunday.mp4" autoplay muted loop playsinline></video></td>
26
+ </tr>
27
+ </table>
28
+
29
+ ## News
30
+
31
+ * `2024.11.13` 🚀🚀🚀 We release the [768p miniFLUX checkpoint](https://huggingface.co/rain1011/pyramid-flow-miniflux) (up to 10s).
32
+
33
+ > We have switched the model structure from SD3 to a mini FLUX to fix human structure issues. Please try our 1024p image checkpoint, 384p video checkpoint (up to 5s) and 768p video checkpoint (up to 10s). The new miniFLUX model shows great improvements in human structure and motion stability.
34
+ * `2024.10.29` ⚡️⚡️⚡️ We release [training code](https://github.com/jy0205/Pyramid-Flow?tab=readme-ov-file#training) and [new model checkpoints](https://huggingface.co/rain1011/pyramid-flow-miniflux) with FLUX structure trained from scratch.
35
+ * `2024.10.11` 🤗🤗🤗 [Hugging Face demo](https://huggingface.co/spaces/Pyramid-Flow/pyramid-flow) is available. Thanks [@multimodalart](https://huggingface.co/multimodalart) for the commit!
36
+ * `2024.10.10` 🚀🚀🚀 We release the [technical report](https://arxiv.org/abs/2410.05954), [project page](https://pyramid-flow.github.io) and [model checkpoint](https://huggingface.co/rain1011/pyramid-flow-sd3) of Pyramid Flow.
37
+
38
+ ## Installation
39
+
40
+ We recommend setting up the environment with conda. The codebase currently uses Python 3.8.10 and PyTorch 2.1.2 ([guide](https://pytorch.org/get-started/previous-versions/#v212)), and we are actively working to support a wider range of versions.
41
+
42
+ ```bash
43
+ git clone https://github.com/jy0205/Pyramid-Flow
44
+ cd Pyramid-Flow
45
+
46
+ # create env using conda
47
+ conda create -n pyramid python==3.8.10
48
+ conda activate pyramid
49
+ pip install -r requirements.txt
50
+ ```
51
+
52
+ Then, download the model from [Huggingface](https://huggingface.co/rain1011) (there are two variants: [miniFLUX](https://huggingface.co/rain1011/pyramid-flow-miniflux) or [SD3](https://huggingface.co/rain1011/pyramid-flow-sd3)). The miniFLUX models support 1024p image generation as well as 384p and 768p video generation, and the SD3-based models support 384p and 768p video generation. The 384p checkpoint generates 5-second videos at 24 FPS, while the 768p checkpoint generates videos of up to 10 seconds at 24 FPS.
53
+
54
+ ```python
55
+ from huggingface_hub import snapshot_download
56
+
57
+ model_path = 'PATH' # The local directory to save downloaded checkpoint
58
+ snapshot_download("rain1011/pyramid-flow-miniflux", local_dir=model_path, local_dir_use_symlinks=False, repo_type='model')
59
+ ```
60
+
61
+ ## Usage
62
+
63
+ For inference, we provide a Gradio demo, single-GPU, multi-GPU, and Apple Silicon inference code, as well as VRAM-efficient features such as CPU offloading. Please check our [code repository](https://github.com/jy0205/Pyramid-Flow?tab=readme-ov-file#inference) for usage.
64
+
65
+ Below is a simplified two-step usage procedure. First, load the downloaded model:
66
+
67
+ ```python
68
+ import torch
69
+ from PIL import Image
70
+ from pyramid_dit import PyramidDiTForVideoGeneration
71
+ from diffusers.utils import load_image, export_to_video
72
+
73
+ torch.cuda.set_device(0)
74
+ model_dtype, torch_dtype = 'bf16', torch.bfloat16 # Use bf16 (fp16 is not supported yet)
75
+
76
+ model = PyramidDiTForVideoGeneration(
77
+ 'PATH', # The downloaded checkpoint dir
78
+ model_name="pyramid_flux",
79
+ model_dtype=model_dtype,
80
+ model_variant='diffusion_transformer_768p',
81
+ )
82
+
83
+ model.vae.enable_tiling()
84
+ # model.vae.to("cuda")
85
+ # model.dit.to("cuda")
86
+ # model.text_encoder.to("cuda")
87
+
88
+ # if you're not using the sequential offloading below, uncomment the lines above ^
89
+ model.enable_sequential_cpu_offload()
90
+ ```
91
+
92
+ Then, you can try text-to-video generation on your own prompts:
93
+
94
+ ```python
95
+ prompt = "A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors"
96
+
97
+ # used for 384p model variant
98
+ # width = 640
99
+ # height = 384
100
+
101
+ # used for 768p model variant
102
+ width = 1280
103
+ height = 768
104
+
105
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
106
+ frames = model.generate(
107
+ prompt=prompt,
108
+ num_inference_steps=[20, 20, 20],
109
+ video_num_inference_steps=[10, 10, 10],
110
+ height=height,
111
+ width=width,
112
+ temp=16, # temp=16: 5s, temp=31: 10s
113
+ guidance_scale=7.0, # The guidance for the first frame, set it to 7 for 384p variant
114
+ video_guidance_scale=5.0, # The guidance for the other video latent
115
+ output_type="pil",
116
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
117
+ )
118
+
119
+ export_to_video(frames, "./text_to_video_sample.mp4", fps=24)
120
+ ```
121
+
122
+ As an autoregressive model, our model also supports (text-conditioned) image-to-video generation:
123
+
124
+ ```python
125
+ # used for 384p model variant
126
+ # width = 640
127
+ # height = 384
128
+
129
+ # used for 768p model variant
130
+ width = 1280
131
+ height = 768
132
+
133
+ image = Image.open('assets/the_great_wall.jpg').convert("RGB").resize((width, height))
134
+ prompt = "FPV flying over the Great Wall"
135
+
136
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
137
+ frames = model.generate_i2v(
138
+ prompt=prompt,
139
+ input_image=image,
140
+ num_inference_steps=[10, 10, 10],
141
+ temp=16,
142
+ video_guidance_scale=4.0,
143
+ output_type="pil",
144
+ save_memory=True, # If you have enough GPU memory, set it to `False` to improve vae decoding speed
145
+ )
146
+
147
+ export_to_video(frames, "./image_to_video_sample.mp4", fps=24)
148
+ ```
149
+
150
+ ## Usage tips
151
+
152
+ * The `guidance_scale` parameter controls the visual quality. We suggest using a guidance scale within [7, 9] for the 768p checkpoint during text-to-video generation, and 7 for the 384p checkpoint.
153
+ * The `video_guidance_scale` parameter controls the motion. A larger value increases the degree of motion and mitigates the degradation of autoregressive generation, while a smaller value stabilizes the video.
154
+ * For 10-second video generation, we recommend using a guidance scale of 7 and a video guidance scale of 5, as in the sketch below.
155
+
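+ As a minimal sketch of these recommendations for a 10-second, 768p clip, reusing `model` and `torch_dtype` from the loading snippet and `width`/`height` from the text-to-video example above (the prompt below is only an example):
+
+ ```python
+ with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
+     frames = model.generate(
+         prompt="A drone shot of waves crashing against rugged cliffs at sunset",
+         num_inference_steps=[20, 20, 20],
+         video_num_inference_steps=[10, 10, 10],
+         height=height,
+         width=width,
+         temp=31,                    # temp=31 corresponds to roughly 10 seconds at 24 FPS
+         guidance_scale=7.0,         # recommended guidance for the first frame
+         video_guidance_scale=5.0,   # recommended guidance for the remaining video latents
+         output_type="pil",
+         save_memory=True,
+     )
+
+ export_to_video(frames, "./text_to_video_10s.mp4", fps=24)
+ ```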
156
+ ## Gallery
157
+
158
+ The following video examples are generated at 5s, 768p, 24fps. For more results, please visit our [project page](https://pyramid-flow.github.io).
159
+
160
+ <table class="center" border="0" style="width: 100%; text-align: left;">
161
+ <tr>
162
+ <td><video src="https://pyramid-flow.github.io/static/videos/t2v/tokyo.mp4" autoplay muted loop playsinline></video></td>
163
+ <td><video src="https://pyramid-flow.github.io/static/videos/t2v/eiffel.mp4" autoplay muted loop playsinline></video></td>
164
+ </tr>
165
+ <tr>
166
+ <td><video src="https://pyramid-flow.github.io/static/videos/t2v/waves.mp4" autoplay muted loop playsinline></video></td>
167
+ <td><video src="https://pyramid-flow.github.io/static/videos/t2v/rail.mp4" autoplay muted loop playsinline></video></td>
168
+ </tr>
169
+ </table>
170
+
171
+ ## Acknowledgement
172
+
173
+ We are grateful for the following awesome projects when implementing Pyramid Flow:
174
+
175
+ * [SD3 Medium](https://huggingface.co/stabilityai/stable-diffusion-3-medium) and [Flux 1.0](https://huggingface.co/black-forest-labs/FLUX.1-dev): State-of-the-art image generation models based on flow matching.
176
+ * [Diffusion Forcing](https://boyuan.space/diffusion-forcing) and [GameNGen](https://gamengen.github.io): Next-token prediction meets full-sequence diffusion.
177
+ * [WebVid-10M](https://github.com/m-bain/webvid), [OpenVid-1M](https://github.com/NJU-PCALab/OpenVid-1M) and [Open-Sora Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan): Large-scale datasets for text-to-video generation.
178
+ * [CogVideoX](https://github.com/THUDM/CogVideo): An open-source text-to-video generation model that shares many training details.
179
+ * [Video-LLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2): An open-source video LLM for our video recaptioning.
180
+
181
+ ## Citation
182
+
183
+ Consider giving this repository a star and citing Pyramid Flow in your publications if it helps your research.
184
+ ```
185
+ @article{jin2024pyramidal,
186
+ title={Pyramidal Flow Matching for Efficient Video Generative Modeling},
187
+ author={Jin, Yang and Sun, Zhicheng and Li, Ningyuan and Xu, Kun and Xu, Kun and Jiang, Hao and Zhuang, Nan and Huang, Quzhe and Song, Yang and Mu, Yadong and Lin, Zhouchen},
188
+ journal={arXiv preprint arXiv:2410.05954},
189
+ year={2024}
190
+ }
191
+ ```
pyramid_flow_model/causal_video_vae/config.json ADDED
@@ -0,0 +1,92 @@
1
+ {
2
+ "_class_name": "CausalVideoVAE",
3
+ "_diffusers_version": "0.29.2",
4
+ "add_post_quant_conv": true,
5
+ "decoder_act_fn": "silu",
6
+ "decoder_block_dropout": [
7
+ 0.0,
8
+ 0.0,
9
+ 0.0,
10
+ 0.0
11
+ ],
12
+ "decoder_block_out_channels": [
13
+ 128,
14
+ 256,
15
+ 512,
16
+ 512
17
+ ],
18
+ "decoder_in_channels": 16,
19
+ "decoder_layers_per_block": [
20
+ 3,
21
+ 3,
22
+ 3,
23
+ 3
24
+ ],
25
+ "decoder_norm_num_groups": 32,
26
+ "decoder_out_channels": 3,
27
+ "decoder_spatial_up_sample": [
28
+ true,
29
+ true,
30
+ true,
31
+ false
32
+ ],
33
+ "decoder_temporal_up_sample": [
34
+ true,
35
+ true,
36
+ true,
37
+ false
38
+ ],
39
+ "decoder_type": "causal_vae_conv",
40
+ "decoder_up_block_types": [
41
+ "UpDecoderBlockCausal3D",
42
+ "UpDecoderBlockCausal3D",
43
+ "UpDecoderBlockCausal3D",
44
+ "UpDecoderBlockCausal3D"
45
+ ],
46
+ "downsample_scale": 8,
47
+ "encoder_act_fn": "silu",
48
+ "encoder_block_dropout": [
49
+ 0.0,
50
+ 0.0,
51
+ 0.0,
52
+ 0.0
53
+ ],
54
+ "encoder_block_out_channels": [
55
+ 128,
56
+ 256,
57
+ 512,
58
+ 512
59
+ ],
60
+ "encoder_double_z": true,
61
+ "encoder_down_block_types": [
62
+ "DownEncoderBlockCausal3D",
63
+ "DownEncoderBlockCausal3D",
64
+ "DownEncoderBlockCausal3D",
65
+ "DownEncoderBlockCausal3D"
66
+ ],
67
+ "encoder_in_channels": 3,
68
+ "encoder_layers_per_block": [
69
+ 2,
70
+ 2,
71
+ 2,
72
+ 2
73
+ ],
74
+ "encoder_norm_num_groups": 32,
75
+ "encoder_out_channels": 16,
76
+ "encoder_spatial_down_sample": [
77
+ true,
78
+ true,
79
+ true,
80
+ false
81
+ ],
82
+ "encoder_temporal_down_sample": [
83
+ true,
84
+ true,
85
+ true,
86
+ false
87
+ ],
88
+ "encoder_type": "causal_vae_conv",
89
+ "interpolate": false,
90
+ "sample_size": 256,
91
+ "scaling_factor": 0.13025
92
+ }
pyramid_flow_model/causal_video_vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8385177ef6dc62f9e0665213c1509f750a59b496ddf573b4524d7a641b21d260
3
+ size 1341696682
pyramid_flow_model/diffusion_transformer_384p/config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "_class_name": "PyramidFluxTransformer",
3
+ "_diffusers_version": "0.30.3",
4
+ "attention_head_dim": 64,
5
+ "axes_dims_rope": [
6
+ 16,
7
+ 24,
8
+ 24
9
+ ],
10
+ "in_channels": 64,
11
+ "interp_condition_pos": true,
12
+ "joint_attention_dim": 4096,
13
+ "num_attention_heads": 30,
14
+ "num_layers": 8,
15
+ "num_single_layers": 16,
16
+ "patch_size": 1,
17
+ "pooled_projection_dim": 768,
18
+ "use_flash_attn": false,
19
+ "use_gradient_checkpointing": false,
20
+ "use_temporal_causal": true
21
+ }
pyramid_flow_model/diffusion_transformer_384p/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ab1a5f81f5c69285ad8040e8282a6260dae5ca601d7f614bd9de38a46316b5
3
+ size 7888294568
pyramid_flow_model/diffusion_transformer_768p/config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "_class_name": "PyramidFluxTransformer",
3
+ "_diffusers_version": "0.30.3",
4
+ "attention_head_dim": 64,
5
+ "axes_dims_rope": [
6
+ 16,
7
+ 24,
8
+ 24
9
+ ],
10
+ "in_channels": 64,
11
+ "interp_condition_pos": true,
12
+ "joint_attention_dim": 4096,
13
+ "num_attention_heads": 30,
14
+ "num_layers": 8,
15
+ "num_single_layers": 16,
16
+ "patch_size": 1,
17
+ "pooled_projection_dim": 768,
18
+ "use_flash_attn": false,
19
+ "use_gradient_checkpointing": false,
20
+ "use_temporal_causal": true
21
+ }
pyramid_flow_model/diffusion_transformer_768p/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:864de0e1afd9dd2c373d957ac2c54346f5006036dc7aa8ec7605db80eea2272c
3
+ size 7888294568
pyramid_flow_model/diffusion_transformer_image/config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "_class_name": "PyramidFluxTransformer",
3
+ "_diffusers_version": "0.30.3",
4
+ "attention_head_dim": 64,
5
+ "axes_dims_rope": [
6
+ 16,
7
+ 24,
8
+ 24
9
+ ],
10
+ "in_channels": 64,
11
+ "interp_condition_pos": true,
12
+ "joint_attention_dim": 4096,
13
+ "num_attention_heads": 30,
14
+ "num_layers": 8,
15
+ "num_single_layers": 16,
16
+ "patch_size": 1,
17
+ "pooled_projection_dim": 768,
18
+ "use_flash_attn": false,
19
+ "use_gradient_checkpointing": false,
20
+ "use_temporal_causal": true
21
+ }
pyramid_flow_model/diffusion_transformer_image/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a16bcc2f50fe52de93d6a7aa13a31dde384dda13a98007e3a5b17e02257697e
3
+ size 7888294568