Spaces: CameraCtrl Svd Xt (status: Runtime error)

Commit 308c973 (parent: 88b231a), committed by hao he

Add Gradio code for CameraCtrl with the SVD-xt model
Changed files:
- LICENSE.txt +201 -0
- README.md +1 -1
- app.py +579 -0
- assets/example_condition_images/A_beautiful_fluffy_domestic_hen_sitting_on_white_eggs_in_a_brown_nest,_eggs_are_under_the_hen..png +0 -0
- assets/example_condition_images/A_car_running_on_Mars..png +0 -0
- assets/example_condition_images/A_lion_standing_on_a_surfboard_in_the_ocean..png +0 -0
- assets/example_condition_images/A_serene_mountain_lake_at_sunrise,_with_mist_hovering_over_the_water..png +0 -0
- assets/example_condition_images/A_tiny_finch_on_a_branch_with_spring_flowers_on_background..png +0 -0
- assets/example_condition_images/An_exploding_cheese_house..png +0 -0
- assets/example_condition_images/Dolphins_leaping_out_of_the_ocean_at_sunset..png +0 -0
- assets/example_condition_images/Fireworks_display_illuminating_the_night_sky..png +0 -0
- assets/example_condition_images/Leaves_are_falling_from_trees..png +0 -0
- assets/example_condition_images/Rocky_coastline_with_crashing_waves..png +0 -0
- assets/pose_files/0bf152ef84195293.txt +26 -0
- assets/pose_files/0c11dbe781b1c11c.txt +26 -0
- assets/pose_files/0c9b371cc6225682.txt +26 -0
- assets/pose_files/0f47577ab3441480.txt +26 -0
- assets/pose_files/0f68374b76390082.txt +26 -0
- assets/pose_files/2c80f9eb0d3b2bb4.txt +26 -0
- assets/pose_files/2f25826f0d0ef09a.txt +26 -0
- assets/pose_files/3f79dc32d575bcdc.txt +26 -0
- assets/pose_files/4a2d6753676df096.txt +26 -0
- assets/reference_videos/0bf152ef84195293.mp4 +0 -0
- assets/reference_videos/0c11dbe781b1c11c.mp4 +0 -0
- assets/reference_videos/0c9b371cc6225682.mp4 +0 -0
- assets/reference_videos/0f47577ab3441480.mp4 +0 -0
- assets/reference_videos/0f68374b76390082.mp4 +0 -0
- assets/reference_videos/2c80f9eb0d3b2bb4.mp4 +0 -0
- assets/reference_videos/2f25826f0d0ef09a.mp4 +0 -0
- assets/reference_videos/3f79dc32d575bcdc.mp4 +0 -0
- assets/reference_videos/4a2d6753676df096.mp4 +0 -0
- cameractrl/data/dataset.py +355 -0
- cameractrl/models/attention.py +65 -0
- cameractrl/models/attention_processor.py +591 -0
- cameractrl/models/motion_module.py +399 -0
- cameractrl/models/pose_adaptor.py +240 -0
- cameractrl/models/transformer_temporal.py +191 -0
- cameractrl/models/unet.py +587 -0
- cameractrl/models/unet_3d_blocks.py +461 -0
- cameractrl/pipelines/pipeline_animation.py +523 -0
- cameractrl/utils/convert_from_ckpt.py +556 -0
- cameractrl/utils/convert_lora_safetensor_to_diffusers.py +154 -0
- cameractrl/utils/util.py +148 -0
- configs/train_cameractrl/svd_320_576_cameractrl.yaml +87 -0
- configs/train_cameractrl/svdxt_320_576_cameractrl.yaml +88 -0
- inference_cameractrl.py +255 -0
- requirements.txt +20 -0
LICENSE.txt
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: CameraCtrl Svd Xt
-emoji:
+emoji: 🎥
 colorFrom: gray
 colorTo: indigo
 sdk: gradio
app.py
ADDED
@@ -0,0 +1,579 @@
import spaces
import argparse
import torch
import tempfile
import os
import cv2

import numpy as np
import gradio as gr
import torchvision.transforms.functional as F
import matplotlib.pyplot as plt
import matplotlib as mpl


from omegaconf import OmegaConf
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from inference_cameractrl import get_relative_pose, ray_condition, get_pipeline
from cameractrl.utils.util import save_videos_grid

cv2.setNumThreads(1)
mpl.use('agg')

#### Description ####
title = r"""<h1 align="center">CameraCtrl: Enabling Camera Control for Text-to-Video Generation</h1>"""
subtitle = r"""<h2 align="center">CameraCtrl Image2Video with <a href='https://arxiv.org/abs/2311.15127' target='_blank'> <b>Stable Video Diffusion (SVD)</b> </a>-xt <a href='https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt' target='_blank'> <b> model </b> </a> </h2>"""
description = r"""
<b>Official Gradio demo</b> for <a href='https://github.com/hehao13/CameraCtrl' target='_blank'><b>CameraCtrl: Enabling Camera Control for Text-to-Video Generation</b></a>.<br>
CameraCtrl is capable of precisely controlling the camera trajectory during the video generation process.<br>
Note that, with SVD-xt, CameraCtrl currently supports Image2Video only.<br>
"""

closing_words = r"""

---

If you find this demo interesting or CameraCtrl helpful, please give the <a href='https://github.com/hehao13/CameraCtrl' target='_blank'>CameraCtrl</a> GitHub repo a ⭐!
[![GitHub Stars](https://img.shields.io/github/stars/hehao13/CameraCtrl)](https://github.com/hehao13/CameraCtrl)

---

📝 **Citation**
<br>
If you find our paper or code useful for your research, please consider citing:
```bibtex
@article{he2024cameractrl,
  title={CameraCtrl: Enabling Camera Control for Text-to-Video Generation},
  author={Hao He and Yinghao Xu and Yuwei Guo and Gordon Wetzstein and Bo Dai and Hongsheng Li and Ceyuan Yang},
  journal={arXiv preprint arXiv:2404.02101},
  year={2024}
}
```

📧 **Contact**
<br>
If you have any questions, please feel free to contact me at <b>[email protected]</b>.

**Acknowledgement**
<br>
We thank <a href='https://wzhouxiff.github.io/projects/MotionCtrl/' target='_blank'><b>MotionCtrl</b></a> and <a href='https://huggingface.co/spaces/lllyasviel/IC-Light' target='_blank'><b>IC-Light</b></a> for their Gradio code.<br>
"""


RESIZE_MODES = ['Resize then Center Crop', 'Directly resize']
CAMERA_TRAJECTORY_MODES = ["Provided Camera Trajectories", "Custom Camera Trajectories"]
height = 320
width = 576
num_frames = 25
device = "cuda" if torch.cuda.is_available() else "cpu"

config = "configs/train_cameractrl/svdxt_320_576_cameractrl.yaml"
model_id = "stabilityai/stable-video-diffusion-img2vid-xt"
ckpt = "checkpoints/CameraCtrl_svdxt.ckpt"
if not os.path.exists(ckpt):
    os.makedirs("checkpoints", exist_ok=True)
    os.system("wget -c https://huggingface.co/hehao13/CameraCtrl_svd/resolve/main/CameraCtrl_svdxt.ckpt?download=true")
    os.system("mv CameraCtrl_svdxt.ckpt?download=true checkpoints/CameraCtrl_svdxt.ckpt")
model_config = OmegaConf.load(config)


pipeline = get_pipeline(model_id, "unet", model_config['down_block_types'], model_config['up_block_types'],
                        model_config['pose_encoder_kwargs'], model_config['attention_processor_kwargs'],
                        ckpt, True, device)


examples = [
    [
        "assets/example_condition_images/A_tiny_finch_on_a_branch_with_spring_flowers_on_background..png",
        "assets/pose_files/0bf152ef84195293.txt",
        "Trajectory 1"
    ],
    [
        "assets/example_condition_images/A_beautiful_fluffy_domestic_hen_sitting_on_white_eggs_in_a_brown_nest,_eggs_are_under_the_hen..png",
        "assets/pose_files/0c9b371cc6225682.txt",
        "Trajectory 2"
    ],
    [
        "assets/example_condition_images/Rocky_coastline_with_crashing_waves..png",
        "assets/pose_files/0c11dbe781b1c11c.txt",
        "Trajectory 3"
    ],
    [
        "assets/example_condition_images/A_lion_standing_on_a_surfboard_in_the_ocean..png",
        "assets/pose_files/0f47577ab3441480.txt",
        "Trajectory 4"
    ],
    [
        "assets/example_condition_images/An_exploding_cheese_house..png",
        "assets/pose_files/0f47577ab3441480.txt",
        "Trajectory 4"
    ],
    [
        "assets/example_condition_images/Dolphins_leaping_out_of_the_ocean_at_sunset..png",
        "assets/pose_files/0f68374b76390082.txt",
        "Trajectory 5"
    ],
    [
        "assets/example_condition_images/Leaves_are_falling_from_trees..png",
        "assets/pose_files/2c80f9eb0d3b2bb4.txt",
        "Trajectory 6"
    ],
    [
        "assets/example_condition_images/A_serene_mountain_lake_at_sunrise,_with_mist_hovering_over_the_water..png",
        "assets/pose_files/2f25826f0d0ef09a.txt",
        "Trajectory 7"
    ],
    [
        "assets/example_condition_images/Fireworks_display_illuminating_the_night_sky..png",
        "assets/pose_files/3f79dc32d575bcdc.txt",
        "Trajectory 8"
    ],
    [
        "assets/example_condition_images/A_car_running_on_Mars..png",
        "assets/pose_files/4a2d6753676df096.txt",
        "Trajectory 9"
    ],
]


class Camera(object):
    # One pose-file line: timestamp, fx, fy, cx, cy, 0, 0, then the row-major 3x4 world-to-camera matrix.
    def __init__(self, entry):
        fx, fy, cx, cy = entry[1:5]
        self.fx = fx
        self.fy = fy
        self.cx = cx
        self.cy = cy
        w2c_mat = np.array(entry[7:]).reshape(3, 4)
        w2c_mat_4x4 = np.eye(4)
        w2c_mat_4x4[:3, :] = w2c_mat
        self.w2c_mat = w2c_mat_4x4
        self.c2w_mat = np.linalg.inv(w2c_mat_4x4)


class CameraPoseVisualizer:
    def __init__(self, xlim, ylim, zlim):
        self.fig = plt.figure(figsize=(18, 7))
        self.ax = self.fig.add_subplot(projection='3d')
        self.plotly_data = None  # plotly data traces
        self.ax.set_aspect("auto")
        self.ax.set_xlim(xlim)
        self.ax.set_ylim(ylim)
        self.ax.set_zlim(zlim)
        self.ax.set_xlabel('x')
        self.ax.set_ylabel('y')
        self.ax.set_zlabel('z')

    def extrinsic2pyramid(self, extrinsic, color_map='red', hw_ratio=9 / 16, base_xval=1, zval=3):
        # Draw one camera as a pyramid whose apex is the camera center.
        vertex_std = np.array([[0, 0, 0, 1],
                               [base_xval, -base_xval * hw_ratio, zval, 1],
                               [base_xval, base_xval * hw_ratio, zval, 1],
                               [-base_xval, base_xval * hw_ratio, zval, 1],
                               [-base_xval, -base_xval * hw_ratio, zval, 1]])
        vertex_transformed = vertex_std @ extrinsic.T
        meshes = [[vertex_transformed[0, :-1], vertex_transformed[1][:-1], vertex_transformed[2, :-1]],
                  [vertex_transformed[0, :-1], vertex_transformed[2, :-1], vertex_transformed[3, :-1]],
                  [vertex_transformed[0, :-1], vertex_transformed[3, :-1], vertex_transformed[4, :-1]],
                  [vertex_transformed[0, :-1], vertex_transformed[4, :-1], vertex_transformed[1, :-1]],
                  [vertex_transformed[1, :-1], vertex_transformed[2, :-1], vertex_transformed[3, :-1],
                   vertex_transformed[4, :-1]]]

        color = color_map if isinstance(color_map, str) else plt.cm.rainbow(color_map)

        self.ax.add_collection3d(
            Poly3DCollection(meshes, facecolors=color, linewidths=0.3, edgecolors=color, alpha=0.35))

    def colorbar(self, max_frame_length):
        cmap = mpl.cm.rainbow
        norm = mpl.colors.Normalize(vmin=0, vmax=max_frame_length)
        self.fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap), ax=self.ax, orientation='vertical',
                          label='Frame Indexes')

    def show(self):
        plt.title('Camera Trajectory')
        plt.show()


def get_c2w(w2cs):
    # Convert absolute w2c matrices into camera-to-world poses relative to the first frame,
    # rescaled so the whole trajectory fits a unit-sized box for visualization.
    target_cam_c2w = np.array([
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1]
    ])
    abs2rel = target_cam_c2w @ w2cs[0]
    ret_poses = [target_cam_c2w, ] + [abs2rel @ np.linalg.inv(w2c) for w2c in w2cs[1:]]
    camera_positions = np.asarray([c2w[:3, 3] for c2w in ret_poses])  # [n_frame, 3]
    position_distances = [camera_positions[i] - camera_positions[i - 1] for i in range(1, len(camera_positions))]
    xyz_max = np.max(camera_positions, axis=0)
    xyz_min = np.min(camera_positions, axis=0)
    xyz_ranges = xyz_max - xyz_min  # [3, ]
    max_range = np.max(xyz_ranges)
    expected_xyz_ranges = 1
    scale_ratio = expected_xyz_ranges / max_range
    scaled_position_distances = [dis * scale_ratio for dis in position_distances]  # [n_frame - 1]
    scaled_camera_positions = [camera_positions[0], ]
    scaled_camera_positions.extend([camera_positions[0] + np.sum(np.asarray(scaled_position_distances[:i]), axis=0)
                                    for i in range(1, len(camera_positions))])
    ret_poses = [np.concatenate(
        (np.concatenate((ori_pose[:3, :3], cam_position[:, None]), axis=1), np.asarray([0, 0, 0, 1])[None]), axis=0)
        for ori_pose, cam_position in zip(ret_poses, scaled_camera_positions)]
    transform_matrix = np.asarray([[1, 0, 0, 0], [0, 0, 1, 0], [0, -1, 0, 0], [0, 0, 0, 1]]).reshape(4, 4)
    ret_poses = [transform_matrix @ x for x in ret_poses]
    return np.array(ret_poses, dtype=np.float32)


def visualize_trajectory(trajectory_file):
    with open(trajectory_file, 'r') as f:
        poses = f.readlines()
    w2cs = [np.asarray([float(p) for p in pose.strip().split(' ')[7:]]).reshape(3, 4) for pose in poses[1:]]
    num_frames = len(w2cs)
    last_row = np.zeros((1, 4))
    last_row[0, -1] = 1.0
    w2cs = [np.concatenate((w2c, last_row), axis=0) for w2c in w2cs]
    c2ws = get_c2w(w2cs)
    visualizer = CameraPoseVisualizer([-1.2, 1.2], [-1.2, 1.2], [-1.2, 1.2])
    for frame_idx, c2w in enumerate(c2ws):
        visualizer.extrinsic2pyramid(c2w, frame_idx / num_frames, hw_ratio=9 / 16, base_xval=0.02, zval=0.1)
    visualizer.colorbar(num_frames)
    return visualizer.fig


vis_traj = visualize_trajectory('assets/pose_files/0bf152ef84195293.txt')


@torch.inference_mode()
def process_input_image(input_image, resize_mode):
    global height, width
    expected_hw_ratio = height / width
    inp_w, inp_h = input_image.size
    inp_hw_ratio = inp_h / inp_w

    if inp_hw_ratio > expected_hw_ratio:
        resized_height = inp_hw_ratio * width
        resized_width = width
    else:
        resized_height = height
        resized_width = height / inp_hw_ratio
    resized_image = F.resize(input_image, size=[int(resized_height), int(resized_width)])

    if resize_mode == RESIZE_MODES[0]:
        return_image = F.center_crop(resized_image, output_size=[height, width])
    else:
        return_image = resized_image

    return gr.update(visible=True, value=return_image, height=height, width=width), gr.update(visible=True), gr.update(
        visible=True), gr.update(visible=True), gr.update(visible=True)


def update_camera_trajectories(trajectory_mode):
    if trajectory_mode == CAMERA_TRAJECTORY_MODES[0]:
        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), \
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
            gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
    elif trajectory_mode == CAMERA_TRAJECTORY_MODES[1]:
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
            gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), \
            gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)


def update_camera_args(trajectory_mode, provided_camera_trajectory, customized_trajectory_file):
    if trajectory_mode == CAMERA_TRAJECTORY_MODES[0]:
        res = "Provided " + str(provided_camera_trajectory)
    else:
        if customized_trajectory_file is None:
            res = " "
        else:
            res = f"Customized trajectory file {customized_trajectory_file.name.split('/')[-1]}"
    return res


def update_camera_args_reset():
    return " "


def update_trajectory_vis_plot(camera_trajectory_args, provided_camera_trajectory, customized_trajectory_file):
    if 'Provided' in camera_trajectory_args:
        if provided_camera_trajectory == "Trajectory 1":
            trajectory_file_path = "assets/pose_files/0bf152ef84195293.txt"
        elif provided_camera_trajectory == "Trajectory 2":
            trajectory_file_path = "assets/pose_files/0c9b371cc6225682.txt"
        elif provided_camera_trajectory == "Trajectory 3":
            trajectory_file_path = "assets/pose_files/0c11dbe781b1c11c.txt"
        elif provided_camera_trajectory == "Trajectory 4":
            trajectory_file_path = "assets/pose_files/0f47577ab3441480.txt"
        elif provided_camera_trajectory == "Trajectory 5":
            trajectory_file_path = "assets/pose_files/0f68374b76390082.txt"
        elif provided_camera_trajectory == "Trajectory 6":
            trajectory_file_path = "assets/pose_files/2c80f9eb0d3b2bb4.txt"
        elif provided_camera_trajectory == "Trajectory 7":
            trajectory_file_path = "assets/pose_files/2f25826f0d0ef09a.txt"
        elif provided_camera_trajectory == "Trajectory 8":
            trajectory_file_path = "assets/pose_files/3f79dc32d575bcdc.txt"
        else:
            trajectory_file_path = "assets/pose_files/4a2d6753676df096.txt"
    else:
        trajectory_file_path = customized_trajectory_file.name
    vis_traj = visualize_trajectory(trajectory_file_path)
    return gr.update(visible=True), vis_traj, gr.update(visible=True), gr.update(visible=True), \
        gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), \
        gr.update(visible=True), gr.update(visible=True), trajectory_file_path


def update_set_button():
    return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)


def update_buttons_for_example(example_image, example_traj_path, provided_traj_name):
    global height, width
    return_image = example_image
    return gr.update(visible=True, value=return_image, height=height, width=width), gr.update(visible=True), \
        gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), \
        gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), \
        gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), \
        gr.update(visible=True)


@spaces.GPU
@torch.inference_mode()
def sample_video(condition_image, trajectory_file, num_inference_step, min_guidance_scale, max_guidance_scale, fps_id, seed):
    global height, width, num_frames, device, pipeline
    with open(trajectory_file, 'r') as f:
        poses = f.readlines()
    poses = [pose.strip().split(' ') for pose in poses[1:]]
    cam_params = [[float(x) for x in pose] for pose in poses]
    cam_params = [Camera(cam_param) for cam_param in cam_params]
    # Adapt the normalized intrinsics to the 320 x 576 sampling aspect ratio.
    sample_wh_ratio = width / height
    pose_wh_ratio = cam_params[0].fy / cam_params[0].fx
    if pose_wh_ratio > sample_wh_ratio:
        resized_ori_w = height * pose_wh_ratio
        for cam_param in cam_params:
            cam_param.fx = resized_ori_w * cam_param.fx / width
    else:
        resized_ori_h = width / pose_wh_ratio
        for cam_param in cam_params:
            cam_param.fy = resized_ori_h * cam_param.fy / height
    intrinsic = np.asarray([[cam_param.fx * width,
                             cam_param.fy * height,
                             cam_param.cx * width,
                             cam_param.cy * height]
                            for cam_param in cam_params], dtype=np.float32)
    K = torch.as_tensor(intrinsic)[None]  # [1, n_frame, 4]
    c2ws = get_relative_pose(cam_params, zero_first_frame_scale=True)
    c2ws = torch.as_tensor(c2ws)[None]  # [1, n_frame, 4, 4]
    plucker_embedding = ray_condition(K, c2ws, height, width, device='cpu')  # b f h w 6
    plucker_embedding = plucker_embedding.permute(0, 1, 4, 2, 3).contiguous().to(device=device)

    generator = torch.Generator(device=device)
    generator.manual_seed(int(seed))

    with torch.no_grad():
        sample = pipeline(
            image=condition_image,
            pose_embedding=plucker_embedding,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_step,
            min_guidance_scale=min_guidance_scale,
            max_guidance_scale=max_guidance_scale,
            fps=fps_id,
            do_image_process=True,
            generator=generator,
            output_type='pt'
        ).frames[0].transpose(0, 1).cpu()

    temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
    save_videos_grid(sample[None], temporal_video_path, rescale=False)

    return temporal_video_path


def main(args):
    demo = gr.Blocks().queue()
    with demo:
        gr.Markdown(title)
        gr.Markdown(subtitle)
        gr.Markdown(description)

        with gr.Column():
            # step 1: Input condition image
            step1_title = gr.Markdown("---\n## Step 1: Input an Image", show_label=False, visible=True)
            step1_dec = gr.Markdown(f"\n 1. Upload an image by dragging it in or clicking `Upload Image`; \
                                    \n 2. Click `{RESIZE_MODES[0]}` or `{RESIZE_MODES[1]}` to select the image resize mode. \
                                    \n - `{RESIZE_MODES[0]}`: first resize the input image, then center crop it to a resolution of 320 x 576. \
                                    \n - `{RESIZE_MODES[1]}`: only resize the input image, keeping its original aspect ratio.",
                                    show_label=False, visible=True)
            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    input_image = gr.Image(type='pil', interactive=True, elem_id='condition_image',
                                           elem_classes='image', visible=True)
                    with gr.Row():
                        resize_crop_button = gr.Button(RESIZE_MODES[0], visible=True)
                        directly_resize_button = gr.Button(RESIZE_MODES[1], visible=True)
                with gr.Column(scale=2):
                    processed_image = gr.Image(type='pil', interactive=False, elem_id='processed_image',
                                               elem_classes='image', visible=False)

            # step 2: Select camera trajectory
            step2_camera_trajectory = gr.Markdown("---\n## Step 2: Select the camera trajectory", show_label=False,
                                                  visible=False)
            step2_camera_trajectory_des = gr.Markdown(f"\n - `{CAMERA_TRAJECTORY_MODES[0]}`: nine camera trajectories extracted from the test set of the RealEstate10K dataset, each with 25 frames. \
                                                      \n - `{CAMERA_TRAJECTORY_MODES[1]}`: provide your own camera trajectory in a txt file.",
                                                      show_label=False, visible=False)
            with gr.Row(equal_height=True):
                provide_trajectory_button = gr.Button(CAMERA_TRAJECTORY_MODES[0], visible=False)
                customized_trajectory_button = gr.Button(CAMERA_TRAJECTORY_MODES[1], visible=False)
            with gr.Row():
                with gr.Column():
                    provided_camera_trajectory = gr.Markdown(f"---\n### {CAMERA_TRAJECTORY_MODES[0]}", show_label=False,
                                                             visible=False)
                    provided_camera_trajectory_des = gr.Markdown(f"\n 1. Click one of the provided camera trajectories, such as `Trajectory 1`; \
                                                                 \n 2. Click `Visualize Trajectory` to visualize the camera trajectory; \
                                                                 \n 3. Click `Reset Trajectory` to reset the camera trajectory. ",
                                                                 show_label=False, visible=False)

                    customized_camera_trajectory = gr.Markdown(f"---\n### {CAMERA_TRAJECTORY_MODES[1]}",
                                                               show_label=False, visible=False)
                    customized_run_status = gr.Markdown(f"\n 1. Upload the txt file containing the camera trajectory; \
                                                        \n 2. Click `Visualize Trajectory` to visualize the camera trajectory; \
                                                        \n 3. Click `Reset Trajectory` to reset the camera trajectory. ",
                                                        show_label=False, visible=False)

                    with gr.Row():
                        provided_trajectories = gr.Dropdown(
                            ["Trajectory 1", "Trajectory 2", "Trajectory 3", "Trajectory 4", "Trajectory 5",
                             "Trajectory 6", "Trajectory 7", "Trajectory 8", "Trajectory 9"],
                            label="Provided Trajectories", interactive=True, visible=False)
                    with gr.Row():
                        customized_camera_trajectory_file = gr.File(
                            label="Upload customized camera trajectory (in .txt format).", visible=False, interactive=True)

                    with gr.Row():
                        camera_args = gr.Textbox(value=" ", label="Camera Trajectory Name", visible=False)
                        camera_trajectory_path = gr.Textbox(value=" ", visible=False)

                    with gr.Row():
                        camera_trajectory_vis = gr.Button(value="Visualize Camera Trajectory", visible=False)
                        camera_trajectory_reset = gr.Button(value="Reset Camera Trajectory", visible=False)
                with gr.Column():
                    vis_camera_trajectory = gr.Plot(vis_traj, label='Camera Trajectory', visible=False)

            # step 3: Set inference parameters
            with gr.Row():
                with gr.Column():
                    step3_title = gr.Markdown("---\n## Step 3: Set the inference hyper-parameters", visible=False)
                    step3_des = gr.Markdown(
                        f"\n 1. Set the number of inference steps; \
                        \n 2. Set the seed; \
                        \n 3. Set the minimum guidance scale and the maximum guidance scale; \
                        \n 4. Set the fps; \
                        \n - Please refer to the SVD paper for the meaning of the last three parameters",
                        visible=False)
            with gr.Row():
                with gr.Column():
                    num_inference_steps = gr.Number(value=25, label='Number of Inference Steps', step=1, interactive=True,
                                                    visible=False)
                with gr.Column():
                    seed = gr.Number(value=42, label='Seed', minimum=1, interactive=True, visible=False, step=1)
                with gr.Column():
                    min_guidance_scale = gr.Number(value=1.0, label='Minimum Guidance Scale', minimum=1.0, step=0.5,
                                                   interactive=True, visible=False)
                with gr.Column():
                    max_guidance_scale = gr.Number(value=3.0, label='Maximum Guidance Scale', minimum=1.0, step=0.5,
                                                   interactive=True, visible=False)
                with gr.Column():
                    fps = gr.Number(value=7, label='FPS', minimum=1, step=1, interactive=True, visible=False)
                # invisible placeholder buttons
                with gr.Column():
                    _ = gr.Button("Seed", visible=False)
                with gr.Column():
                    _ = gr.Button("Seed", visible=False)
                with gr.Column():
                    _ = gr.Button("Seed", visible=False)
            with gr.Row():
                with gr.Column():
                    _ = gr.Button("Set", visible=False)
                with gr.Column():
                    set_button = gr.Button("Set", visible=False)
                with gr.Column():
                    _ = gr.Button("Set", visible=False)

            # step 4: Generate video
            with gr.Row():
                with gr.Column():
                    step4_title = gr.Markdown("---\n## Step 4: Generate the video", show_label=False, visible=False)
                    step4_des = gr.Markdown(f"\n - Click the `Start generation !` button to generate the video; \
                                            \n - If the content of the generated video is not well aligned with the condition image, try increasing the `Minimum Guidance Scale` and `Maximum Guidance Scale`. \
                                            \n - If the generated videos are distorted, try increasing the `FPS`.",
                                            visible=False)
                    start_button = gr.Button(value="Start generation !", visible=False)
                with gr.Column():
                    generate_video = gr.Video(value=None, label="Generated Video", visible=False)

        resize_crop_button.click(fn=process_input_image, inputs=[input_image, resize_crop_button],
                                 outputs=[processed_image, step2_camera_trajectory, step2_camera_trajectory_des,
                                          provide_trajectory_button, customized_trajectory_button])
        directly_resize_button.click(fn=process_input_image, inputs=[input_image, directly_resize_button],
                                     outputs=[processed_image, step2_camera_trajectory, step2_camera_trajectory_des,
                                              provide_trajectory_button, customized_trajectory_button])
        provide_trajectory_button.click(fn=update_camera_trajectories, inputs=[provide_trajectory_button],
                                        outputs=[provided_camera_trajectory, provided_camera_trajectory_des,
                                                 provided_trajectories,
                                                 customized_camera_trajectory, customized_run_status,
                                                 customized_camera_trajectory_file,
                                                 camera_args, camera_trajectory_vis, camera_trajectory_reset])
        customized_trajectory_button.click(fn=update_camera_trajectories, inputs=[customized_trajectory_button],
                                           outputs=[provided_camera_trajectory, provided_camera_trajectory_des,
                                                    provided_trajectories,
                                                    customized_camera_trajectory, customized_run_status,
                                                    customized_camera_trajectory_file,
                                                    camera_args, camera_trajectory_vis, camera_trajectory_reset])

        provided_trajectories.change(fn=update_camera_args,
                                     inputs=[provide_trajectory_button, provided_trajectories, customized_camera_trajectory_file],
                                     outputs=[camera_args])
        customized_camera_trajectory_file.change(fn=update_camera_args,
                                                 inputs=[customized_trajectory_button, provided_trajectories, customized_camera_trajectory_file],
                                                 outputs=[camera_args])
        camera_trajectory_reset.click(fn=update_camera_args_reset, inputs=None, outputs=[camera_args])
        camera_trajectory_vis.click(fn=update_trajectory_vis_plot,
                                    inputs=[camera_args, provided_trajectories, customized_camera_trajectory_file],
                                    outputs=[vis_camera_trajectory, vis_camera_trajectory, step3_title, step3_des,
                                             num_inference_steps, min_guidance_scale, max_guidance_scale, fps,
                                             seed, set_button, camera_trajectory_path])
        set_button.click(fn=update_set_button, inputs=None, outputs=[step4_title, step4_des, start_button, generate_video])
        start_button.click(fn=sample_video, inputs=[processed_image, camera_trajectory_path, num_inference_steps,
                                                    min_guidance_scale, max_guidance_scale, fps, seed],
                           outputs=[generate_video])

        # set examples
        gr.Markdown("## Examples")
        gr.Markdown("\n Choose one of the following examples for a quick start. Selecting an example "
                    "sets the condition image and camera trajectory automatically. "
                    "Then, you can click the `Visualize Camera Trajectory` button to visualize the camera trajectory.")
        gr.Examples(
            fn=update_buttons_for_example,
            run_on_click=True,
            cache_examples=False,
            examples=examples,
            inputs=[input_image, camera_args, provided_trajectories],
            outputs=[processed_image, step2_camera_trajectory, step2_camera_trajectory_des, provide_trajectory_button,
                     customized_trajectory_button,
                     provided_camera_trajectory, provided_camera_trajectory_des, provided_trajectories,
                     customized_camera_trajectory, customized_run_status, customized_camera_trajectory_file,
                     camera_args, camera_trajectory_vis, camera_trajectory_reset]
        )
        with gr.Row():
            gr.Markdown(closing_words)

    demo.launch(**args)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--listen', default='0.0.0.0')
    parser.add_argument('--browser', action='store_true')
    parser.add_argument('--share', action='store_true')
    args = parser.parse_args()

    launch_kwargs = {'server_name': args.listen,
                     'inbrowser': args.browser,
                     'share': args.share}
    main(launch_kwargs)
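Outside of the hosted Space, the demo would be started by running app.py directly (for example `python app.py --share`), assuming the packages listed in requirements.txt are installed and the checkpoint download at the top of app.py succeeds. The fragment below is a hypothetical sketch of reusing the trajectory visualizer defined in app.py headlessly; note that importing app also triggers the module-level checkpoint download and SVD-xt pipeline build, so it is illustrative rather than a lightweight utility.

```python
# Hypothetical headless reuse of the helpers defined in app.py above.
# Importing app runs its module-level setup (checkpoint download, pipeline build),
# so this only makes sense in an environment where the demo itself can run.
from app import visualize_trajectory

# Render one of the provided RealEstate10K trajectories to an image file
# instead of a Gradio Plot component; fig is a plain matplotlib Figure.
fig = visualize_trajectory('assets/pose_files/0bf152ef84195293.txt')
fig.savefig('trajectory_0bf152ef84195293.png', dpi=150)
```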
assets/example_condition_images/A_beautiful_fluffy_domestic_hen_sitting_on_white_eggs_in_a_brown_nest,_eggs_are_under_the_hen..png
ADDED
assets/example_condition_images/A_car_running_on_Mars..png
ADDED
assets/example_condition_images/A_lion_standing_on_a_surfboard_in_the_ocean..png
ADDED
assets/example_condition_images/A_serene_mountain_lake_at_sunrise,_with_mist_hovering_over_the_water..png
ADDED
assets/example_condition_images/A_tiny_finch_on_a_branch_with_spring_flowers_on_background..png
ADDED
assets/example_condition_images/An_exploding_cheese_house..png
ADDED
assets/example_condition_images/Dolphins_leaping_out_of_the_ocean_at_sunset..png
ADDED
assets/example_condition_images/Fireworks_display_illuminating_the_night_sky..png
ADDED
assets/example_condition_images/Leaves_are_falling_from_trees..png
ADDED
assets/example_condition_images/Rocky_coastline_with_crashing_waves..png
ADDED
assets/pose_files/0bf152ef84195293.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=QShWPZxTDoE
157323991 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.950234294 0.023969267 -0.310612619 -0.058392330 -0.025083920 0.999685287 0.000406042 0.179560758 0.310524613 0.007405547 0.950536489 -0.411621285
157490824 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.932122767 0.029219138 -0.360961705 0.019157260 -0.030671693 0.999528050 0.001705339 0.195243598 0.360841185 0.009481722 0.932579100 -0.489249695
157657658 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.912891090 0.034948215 -0.406704396 0.093606521 -0.036429971 0.999327779 0.004101569 0.203909523 0.406574339 0.011071944 0.913550615 -0.570709379
157824491 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.892021954 0.039648205 -0.450249761 0.174752186 -0.041337918 0.999126673 0.006083843 0.206605029 0.450097769 0.013185467 0.892881930 -0.657519766
157991325 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.870897233 0.043891508 -0.489501357 0.266759117 -0.046065997 0.998909414 0.007609563 0.208293300 0.489301533 0.015922222 0.871969342 -0.739918788
158158158 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.850054264 0.048434701 -0.524463415 0.371990684 -0.051002879 0.998652756 0.009560689 0.215520371 0.524219871 0.018622037 0.851379335 -0.814489669
158358358 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.823578537 0.052956820 -0.564724684 0.498689894 -0.055313200 0.998385012 0.012955925 0.224118528 0.564498782 0.020566508 0.825177670 -0.889946292
158525192 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.801249743 0.056292553 -0.595676124 0.608660065 -0.058902908 0.998149574 0.015096202 0.223320416 0.595423639 0.022991227 0.803082883 -0.943733076
158692025 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.780003667 0.059620168 -0.622928321 0.726968666 -0.062449891 0.997897983 0.017311305 0.217967188 0.622651041 0.025398925 0.782087326 -1.002211444
158858859 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.758300304 0.062706254 -0.648882568 0.862137737 -0.066019125 0.997632504 0.019256916 0.210050766 0.648553908 0.028236136 0.760644853 -1.055941415
159025692 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.733386099 0.066376433 -0.676564097 1.014642875 -0.069476441 0.997329056 0.022534581 0.204417168 0.676252782 0.030478716 0.736038864 -1.100931176
159192526 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.703763664 0.069719747 -0.707004845 1.176046236 -0.073094003 0.996997535 0.025557835 0.198280199 0.706663966 0.033691138 0.706746757 -1.127059555
159392726 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.662238955 0.074904546 -0.745539367 1.361111617 -0.076822795 0.996534884 0.031882886 0.176548885 0.745344102 0.036160327 0.665698588 -1.136046987
159559560 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.629729092 0.078562595 -0.772831917 1.488353738 -0.081706621 0.996052980 0.034676969 0.152860218 0.772505820 0.041308392 0.633662641 -1.137729720
159726393 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.602676034 0.081962064 -0.793765604 1.594849532 -0.083513811 0.995727122 0.039407197 0.137400253 0.793603837 0.042540617 0.606945813 -1.154423412
159893227 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.580205023 0.084535435 -0.810071528 1.693660542 -0.086233690 0.995384574 0.042109925 0.134338657 0.809892535 0.045423068 0.584816933 -1.189997045
160060060 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.559533417 0.086548090 -0.824276507 1.785956560 -0.089039005 0.995054126 0.044038296 0.143407250 0.824011147 0.048751865 0.564472198 -1.233530509
160226894 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.539407372 0.088928543 -0.837335885 1.876278781 -0.091476299 0.994710982 0.046713885 0.159821683 0.837061405 0.051398575 0.544689238 -1.287939732
160427094 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.515480161 0.090795092 -0.852077723 1.979818582 -0.093906343 0.994367242 0.049146701 0.181896137 0.851740420 0.054681353 0.521102846 -1.359775674
160593927 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.497423410 0.091656610 -0.862652302 2.062552118 -0.095314465 0.994156837 0.050668620 0.194005458 0.862255812 0.057019483 0.503253102 -1.415121326
160760761 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.484208912 0.092620693 -0.870036304 2.136687359 -0.096262053 0.993984103 0.052242137 0.204385655 0.869640946 0.058455370 0.490211815 -1.477987717
160927594 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.475284129 0.093297184 -0.874871790 2.200792438 -0.096743606 0.993874133 0.053430639 0.209217395 0.874497354 0.059243519 0.481398523 -1.547068315
161094428 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.468848795 0.093707815 -0.878293574 2.268227083 -0.097071946 0.993799806 0.054212786 0.208793720 0.877928138 0.059840068 0.475038230 -1.634971335
161261261 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.463450164 0.093811318 -0.881143212 2.339123750 -0.097783640 0.993721604 0.054366294 0.224513862 0.880711257 0.060965322 0.469713658 -1.732136350
161461461 0.474812461 0.844111024 0.500000000 0.500000000 0.000000000 0.000000000 0.458983690 0.093715429 -0.883488178 2.426412787 -0.098171033 0.993681431 0.054402962 0.253829726 0.883004189 0.061762877 0.465283692 -1.863571195
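Each provided pose file follows the RealEstate10K convention that the `Camera` class in app.py expects: the first line records the source video URL, and every subsequent line describes one frame as a timestamp, the normalized intrinsics `fx fy cx cy`, two zeros, and the row-major 3x4 world-to-camera matrix (25 frames per provided trajectory). A minimal parsing sketch follows; the `load_pose_file` helper is hypothetical and simply mirrors the parsing done in app.py.

```python
# Minimal sketch of reading one pose file in the format shown above,
# mirroring app.py's Camera class and visualize_trajectory().
import numpy as np

def load_pose_file(path):
    with open(path, 'r') as f:
        lines = f.read().splitlines()
    url = lines[0]  # first line: source video URL
    frames = []
    for line in lines[1:]:
        values = [float(x) for x in line.strip().split(' ')]
        fx, fy, cx, cy = values[1:5]            # normalized intrinsics
        w2c = np.eye(4)
        w2c[:3, :] = np.array(values[7:]).reshape(3, 4)  # 3x4 world-to-camera extrinsic
        frames.append({'intrinsics': (fx, fy, cx, cy),
                       'w2c': w2c,
                       'c2w': np.linalg.inv(w2c)})
    return url, frames

# Example (hypothetical local run):
# url, frames = load_pose_file('assets/pose_files/0bf152ef84195293.txt')
# print(url, len(frames))  # 25 frames for the provided trajectories
```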
assets/pose_files/0c11dbe781b1c11c.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=a-Unpcomk5k
90023267 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.949961841 -0.054589756 0.307558835 0.363597957 0.049778115 0.998484373 0.023474237 0.122943811 -0.308374137 -0.006989930 0.951239467 -0.411649725
90190100 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.936324358 -0.058613990 0.346209586 0.384438270 0.053212658 0.998267829 0.025095066 0.136336848 -0.347080797 -0.005074390 0.937821507 -0.495378251
90356933 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.920415699 -0.061646536 0.386050045 0.392760735 0.055660341 0.998093307 0.026676189 0.148407963 -0.386958480 -0.003065505 0.922092021 -0.584840288
90523767 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.902696550 -0.065121211 0.425321281 0.393740251 0.058245987 0.997876167 0.029164905 0.157571476 -0.426317245 -0.001553800 0.904572368 -0.683591501
90690600 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.883015513 -0.069272175 0.464203626 0.383146000 0.061874375 0.997597098 0.031171000 0.171538756 -0.465247482 0.001197834 0.885179818 -0.798848920
90857433 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.863586664 -0.074230894 0.498706162 0.378236544 0.067191981 0.997224212 0.032080498 0.194804574 -0.499703199 0.005804764 0.866177261 -0.912604869
91057633 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.837207139 -0.080319084 0.540955663 0.348125228 0.072502285 0.996726155 0.035782419 0.216269091 -0.542058706 0.009263224 0.840289593 -1.067256689
|
9 |
+
91224467 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.814787984 -0.085085154 0.573481560 0.311799242 0.076606996 0.996299267 0.038975649 0.234736581 -0.574675500 0.012175811 0.818290770 -1.196836664
|
10 |
+
91391300 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.792760789 -0.089765556 0.602886796 0.270539226 0.081507996 0.995825171 0.041093048 0.259814929 -0.604058623 0.016563140 0.796767771 -1.328140863
|
11 |
+
91558133 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.771091938 -0.093814306 0.629774630 0.223948432 0.087357447 0.995320201 0.041307874 0.293608807 -0.630702674 0.023163332 0.775678813 -1.459775674
|
12 |
+
91724967 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.751535058 -0.096625380 0.652578413 0.178494515 0.089747138 0.994993508 0.043969437 0.308307880 -0.653559864 0.025522472 0.756444395 -1.587897834
|
13 |
+
91891800 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.735025227 -0.099746063 0.670662820 0.138219010 0.093737528 0.994570971 0.045186777 0.333516116 -0.671528995 0.029652854 0.740384698 -1.712296424
|
14 |
+
92092000 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.719043434 -0.101635426 0.687493145 0.093557351 0.098243877 0.994179368 0.044221871 0.373143031 -0.687985957 0.035744540 0.724843204 -1.860791364
|
15 |
+
92258833 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.709150374 -0.102171108 0.697615087 0.059169738 0.099845670 0.994025767 0.044086087 0.400782920 -0.697951734 0.038390182 0.715115070 -1.981529677
|
16 |
+
92425667 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.703360021 -0.101482928 0.703552365 0.039205180 0.098760851 0.994108558 0.044659954 0.417778776 -0.703939617 0.038071405 0.709238708 -2.106152155
|
17 |
+
92592500 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.700221658 -0.101235874 0.706711292 0.029170036 0.096752122 0.994219005 0.046557475 0.427528111 -0.707339108 0.035775267 0.705968499 -2.234683370
|
18 |
+
92759333 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.698873043 -0.100907177 0.708091974 0.024634507 0.096064955 0.994270742 0.046875048 0.444746524 -0.708765149 0.035263114 0.704562664 -2.365965080
|
19 |
+
92926167 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.698221087 -0.101446368 0.708657861 0.017460176 0.096007936 0.994235396 0.047733612 0.465147684 -0.709415078 0.034708157 0.703935742 -2.489595036
|
20 |
+
93126367 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.701540232 -0.100508377 0.705506444 0.030925799 0.096309878 0.994293392 0.045881305 0.500429136 -0.706091821 0.035759658 0.707216740 -2.635113223
|
21 |
+
93293200 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.706040561 -0.099173397 0.701192796 0.055235645 0.095376909 0.994441032 0.044612732 0.522969947 -0.701719284 0.035379205 0.711574554 -2.748741222
|
22 |
+
93460033 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.711469471 -0.101191826 0.695392966 0.092386154 0.097211346 0.994235933 0.045219962 0.550373275 -0.695960581 0.035427466 0.717205524 -2.869640023
|
23 |
+
93626867 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.715287089 -0.106710054 0.690635443 0.110879759 0.101620771 0.993650913 0.048280932 0.574557114 -0.691402614 0.035648178 0.721589625 -3.003606281
|
24 |
+
93793700 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.717362285 -0.111333445 0.687747240 0.117481763 0.104680635 0.993167043 0.051586930 0.589310163 -0.688791215 0.034987301 0.724115014 -3.119467820
|
25 |
+
93960533 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.717443645 -0.117016122 0.686718166 0.111165224 0.109845228 0.992461383 0.054354500 0.614623166 -0.687901616 0.036436427 0.724888742 -3.219243898
|
26 |
+
94160733 0.485388169 0.862912326 0.500000000 0.500000000 0.000000000 0.000000000 0.715022981 -0.122569911 0.688272297 0.080960594 0.115116455 0.991714299 0.057017289 0.647785934 -0.689558089 0.038462799 0.723208308 -3.337481340
|
assets/pose_files/0c9b371cc6225682.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=_ca03xP_KUU
[25 camera poses, one line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 3x4 world-to-camera matrix flattened row-major]
assets/pose_files/0f47577ab3441480.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=in69BD2eZqg
[25 camera poses, one line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 3x4 world-to-camera matrix flattened row-major]
assets/pose_files/0f68374b76390082.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=-aldZQifF2U
[25 camera poses, one line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 3x4 world-to-camera matrix flattened row-major]
assets/pose_files/2c80f9eb0d3b2bb4.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=sLIFyXD2ujI
[25 camera poses, one line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 3x4 world-to-camera matrix flattened row-major]
assets/pose_files/2f25826f0d0ef09a.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=t-mlAKnESzQ
[25 camera poses, one line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 3x4 world-to-camera matrix flattened row-major]
assets/pose_files/3f79dc32d575bcdc.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=1qVpRlWxam4
[25 camera poses, one line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 3x4 world-to-camera matrix flattened row-major]
assets/pose_files/4a2d6753676df096.txt
ADDED
@@ -0,0 +1,26 @@
https://www.youtube.com/watch?v=mGFQkgadzRQ
[25 camera poses, one line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 3x4 world-to-camera matrix flattened row-major]
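All of the pose files above share the same layout: the first line records the source YouTube video, and each remaining line describes one frame as a timestamp, the camera intrinsics fx, fy, cx, cy (apparently normalized by image width and height, followed by two zero placeholders), and the 3x4 world-to-camera matrix flattened row-major. A minimal parsing sketch, mirroring the Camera class in cameractrl/data/dataset.py further down this commit (parse_pose_line is an illustrative helper name, not part of the repository):

import numpy as np

def parse_pose_line(line: str):
    # One line per frame: timestamp, fx, fy, cx, cy, 0, 0, then the 12 values
    # of the 3x4 world-to-camera matrix in row-major order.
    entry = [float(x) for x in line.strip().split(' ')]
    fx, fy, cx, cy = entry[1:5]
    w2c = np.eye(4)
    w2c[:3, :] = np.array(entry[7:]).reshape(3, 4)
    c2w = np.linalg.inv(w2c)  # camera-to-world pose used for ray construction
    return (fx, fy, cx, cy), w2c, c2w

The Camera class below performs the same split: entry[1:5] for the intrinsics and entry[7:] for the extrinsic matrix.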
assets/reference_videos/0bf152ef84195293.mp4
ADDED
Binary file (231 kB).
assets/reference_videos/0c11dbe781b1c11c.mp4
ADDED
Binary file (219 kB).
assets/reference_videos/0c9b371cc6225682.mp4
ADDED
Binary file (195 kB).
assets/reference_videos/0f47577ab3441480.mp4
ADDED
Binary file (161 kB).
assets/reference_videos/0f68374b76390082.mp4
ADDED
Binary file (299 kB).
assets/reference_videos/2c80f9eb0d3b2bb4.mp4
ADDED
Binary file (173 kB).
assets/reference_videos/2f25826f0d0ef09a.mp4
ADDED
Binary file (195 kB).
assets/reference_videos/3f79dc32d575bcdc.mp4
ADDED
Binary file (148 kB).
assets/reference_videos/4a2d6753676df096.mp4
ADDED
Binary file (229 kB).
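The dataset module added below converts such per-frame poses into Plücker embeddings through its ray_condition function. A rough usage sketch, assuming the Camera class and ray_condition defined in cameractrl/data/dataset.py below, and assuming the normalized intrinsics from the pose files are rescaled to pixels before building the K tensor (the released training and inference scripts may organize this preprocessing differently):

import numpy as np
import torch

from cameractrl.data.dataset import Camera, ray_condition  # defined in the file added below

def plucker_from_cameras(cams, H, W, device='cpu'):
    # cams: list of Camera objects for the sampled frames, e.g. from load_cameras().
    # Intrinsics K: [B, V, 4] holding fx, fy, cx, cy in pixels.
    K = torch.tensor([[c.fx * W, c.fy * H, c.cx * W, c.cy * H] for c in cams],
                     dtype=torch.float32, device=device)[None]
    # Camera-to-world extrinsics: [B, V, 4, 4].
    c2w = torch.tensor(np.stack([c.c2w_mat for c in cams]),
                       dtype=torch.float32, device=device)[None]
    # Returns [B, V, H, W, 6] Plücker coordinates (o x d, d) per pixel.
    return ray_condition(K, c2w, H, W, device=device)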
cameractrl/data/dataset.py
ADDED
@@ -0,0 +1,355 @@
import os
import random
import json
import torch

import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
import numpy as np

from decord import VideoReader
from torch.utils.data.dataset import Dataset
from packaging import version as pver


class RandomHorizontalFlipWithPose(nn.Module):
    def __init__(self, p=0.5):
        super(RandomHorizontalFlipWithPose, self).__init__()
        self.p = p

    def get_flip_flag(self, n_image):
        return torch.rand(n_image) < self.p

    def forward(self, image, flip_flag=None):
        n_image = image.shape[0]
        if flip_flag is not None:
            assert n_image == flip_flag.shape[0]
        else:
            flip_flag = self.get_flip_flag(n_image)

        ret_images = []
        for fflag, img in zip(flip_flag, image):
            if fflag:
                ret_images.append(F.hflip(img))
            else:
                ret_images.append(img)
        return torch.stack(ret_images, dim=0)


class Camera(object):
    def __init__(self, entry):
        fx, fy, cx, cy = entry[1:5]
        self.fx = fx
        self.fy = fy
        self.cx = cx
        self.cy = cy
        w2c_mat = np.array(entry[7:]).reshape(3, 4)
        w2c_mat_4x4 = np.eye(4)
        w2c_mat_4x4[:3, :] = w2c_mat
        self.w2c_mat = w2c_mat_4x4
        self.c2w_mat = np.linalg.inv(w2c_mat_4x4)


def custom_meshgrid(*args):
    # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
    if pver.parse(torch.__version__) < pver.parse('1.10'):
        return torch.meshgrid(*args)
    else:
        return torch.meshgrid(*args, indexing='ij')


def ray_condition(K, c2w, H, W, device, flip_flag=None):
    # c2w: B, V, 4, 4
    # K: B, V, 4

    B, V = K.shape[:2]

    j, i = custom_meshgrid(
        torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
        torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
    )
    i = i.reshape([1, 1, H * W]).expand([B, V, H * W]) + 0.5  # [B, V, HxW]
    j = j.reshape([1, 1, H * W]).expand([B, V, H * W]) + 0.5  # [B, V, HxW]

    n_flip = torch.sum(flip_flag).item() if flip_flag is not None else 0
    if n_flip > 0:
        j_flip, i_flip = custom_meshgrid(
            torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
            torch.linspace(W - 1, 0, W, device=device, dtype=c2w.dtype)
        )
        i_flip = i_flip.reshape([1, 1, H * W]).expand(B, 1, H * W) + 0.5
        j_flip = j_flip.reshape([1, 1, H * W]).expand(B, 1, H * W) + 0.5
        i[:, flip_flag, ...] = i_flip
        j[:, flip_flag, ...] = j_flip

    fx, fy, cx, cy = K.chunk(4, dim=-1)  # B, V, 1

    zs = torch.ones_like(i)  # [B, V, HxW]
    xs = (i - cx) / fx * zs
    ys = (j - cy) / fy * zs
    zs = zs.expand_as(ys)

    directions = torch.stack((xs, ys, zs), dim=-1)  # B, V, HW, 3
    directions = directions / directions.norm(dim=-1, keepdim=True)  # B, V, HW, 3

    rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2)  # B, V, HW, 3
    rays_o = c2w[..., :3, 3]  # B, V, 3
    rays_o = rays_o[:, :, None].expand_as(rays_d)  # B, V, HW, 3
    # c2w @ directions
    rays_dxo = torch.linalg.cross(rays_o, rays_d)  # B, V, HW, 3
    plucker = torch.cat([rays_dxo, rays_d], dim=-1)
    plucker = plucker.reshape(B, c2w.shape[1], H, W, 6)  # B, V, H, W, 6
    # plucker = plucker.permute(0, 1, 4, 2, 3)
    return plucker


class RealEstate10K(Dataset):
    def __init__(
            self,
            root_path,
            annotation_json,
            sample_stride=4,
            sample_n_frames=16,
            sample_size=[256, 384],
            is_image=False,
    ):
        self.root_path = root_path
        self.sample_stride = sample_stride
        self.sample_n_frames = sample_n_frames
        self.is_image = is_image

        self.dataset = json.load(open(os.path.join(root_path, annotation_json), 'r'))
        self.length = len(self.dataset)

        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
        pixel_transforms = [transforms.Resize(sample_size),
                            transforms.RandomHorizontalFlip(),
                            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)]

        self.pixel_transforms = transforms.Compose(pixel_transforms)

    def load_video_reader(self, idx):
        video_dict = self.dataset[idx]

        video_path = os.path.join(self.root_path, video_dict['clip_path'])
        video_reader = VideoReader(video_path)
        return video_reader, video_dict['caption']

    def get_batch(self, idx):
        video_reader, video_caption = self.load_video_reader(idx)
        total_frames = len(video_reader)

        if self.is_image:
            frame_indice = [random.randint(0, total_frames - 1)]
        else:
            if isinstance(self.sample_stride, int):
                current_sample_stride = self.sample_stride
            else:
                assert len(self.sample_stride) == 2
                assert (self.sample_stride[0] >= 1) and (self.sample_stride[1] >= self.sample_stride[0])
                current_sample_stride = random.randint(self.sample_stride[0], self.sample_stride[1])

            cropped_length = self.sample_n_frames * current_sample_stride
            start_frame_ind = random.randint(0, max(0, total_frames - cropped_length - 1))
            end_frame_ind = min(start_frame_ind + cropped_length, total_frames)

            assert end_frame_ind - start_frame_ind >= self.sample_n_frames
            frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.sample_n_frames, dtype=int)

        pixel_values = torch.from_numpy(video_reader.get_batch(frame_indice).asnumpy()).permute(0, 3, 1, 2).contiguous()
        pixel_values = pixel_values / 255.

        if self.is_image:
            pixel_values = pixel_values[0]

        return pixel_values, video_caption

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        while True:
            try:
                video, video_caption = self.get_batch(idx)
                break

            except Exception as e:
                idx = random.randint(0, self.length - 1)

        video = self.pixel_transforms(video)
        sample = dict(pixel_values=video, caption=video_caption)

        return sample


class RealEstate10KPose(Dataset):
    def __init__(
            self,
            root_path,
            annotation_json,
            sample_stride=4,
            minimum_sample_stride=1,
            sample_n_frames=16,
            relative_pose=False,
            zero_t_first_frame=False,
            sample_size=[256, 384],
            rescale_fxy=False,
            shuffle_frames=False,
            use_flip=False,
            return_clip_name=False,
    ):
        self.root_path = root_path
        self.relative_pose = relative_pose
        self.zero_t_first_frame = zero_t_first_frame
        self.sample_stride = sample_stride
        self.minimum_sample_stride = minimum_sample_stride
        self.sample_n_frames = sample_n_frames
        self.return_clip_name = return_clip_name

        self.dataset = json.load(open(os.path.join(root_path, annotation_json), 'r'))
        self.length = len(self.dataset)

        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
        self.sample_size = sample_size
        if use_flip:
            pixel_transforms = [transforms.Resize(sample_size),
                                RandomHorizontalFlipWithPose(),
                                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)]
        else:
            pixel_transforms = [transforms.Resize(sample_size),
                                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)]
        self.rescale_fxy = rescale_fxy
        self.sample_wh_ratio = sample_size[1] / sample_size[0]

        self.pixel_transforms = pixel_transforms
        self.shuffle_frames = shuffle_frames
        self.use_flip = use_flip

    def get_relative_pose(self, cam_params):
        abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
        abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
        source_cam_c2w = abs_c2ws[0]
        if self.zero_t_first_frame:
            cam_to_origin = 0
        else:
            cam_to_origin = np.linalg.norm(source_cam_c2w[:3, 3])
        target_cam_c2w = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, -cam_to_origin],
            [0, 0, 1, 0],
            [0, 0, 0, 1]
        ])
        abs2rel = target_cam_c2w @ abs_w2cs[0]
        ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
        ret_poses = np.array(ret_poses, dtype=np.float32)
        return ret_poses

    def load_video_reader(self, idx):
        video_dict = self.dataset[idx]

        video_path = os.path.join(self.root_path, video_dict['clip_path'])
        video_reader = VideoReader(video_path)
        return video_dict['clip_name'], video_reader, video_dict['caption']

    def load_cameras(self, idx):
        video_dict = self.dataset[idx]
        pose_file = os.path.join(self.root_path, video_dict['pose_file'])
        with open(pose_file, 'r') as f:
            poses = f.readlines()
        poses = [pose.strip().split(' ') for pose in poses[1:]]
        cam_params = [[float(x) for x in pose] for pose in poses]
        cam_params = [Camera(cam_param) for cam_param in cam_params]
        return cam_params

    def get_batch(self, idx):
        clip_name, video_reader, video_caption = self.load_video_reader(idx)
        cam_params = self.load_cameras(idx)
        assert len(cam_params) >= self.sample_n_frames
        total_frames = len(cam_params)
|
270 |
+
|
271 |
+
current_sample_stride = self.sample_stride
|
272 |
+
|
273 |
+
if total_frames < self.sample_n_frames * current_sample_stride:
|
274 |
+
maximum_sample_stride = int(total_frames // self.sample_n_frames)
|
275 |
+
current_sample_stride = random.randint(self.minimum_sample_stride, maximum_sample_stride)
|
276 |
+
|
277 |
+
cropped_length = self.sample_n_frames * current_sample_stride
|
278 |
+
start_frame_ind = random.randint(0, max(0, total_frames - cropped_length - 1))
|
279 |
+
end_frame_ind = min(start_frame_ind + cropped_length, total_frames)
|
280 |
+
|
281 |
+
assert end_frame_ind - start_frame_ind >= self.sample_n_frames
|
282 |
+
frame_indices = np.linspace(start_frame_ind, end_frame_ind - 1, self.sample_n_frames, dtype=int)
|
283 |
+
|
284 |
+
condition_image_ind = random.sample(list(set(range(total_frames)) - set(frame_indices.tolist())), 1)
|
285 |
+
condition_image = torch.from_numpy(video_reader.get_batch(condition_image_ind).asnumpy()).permute(0, 3, 1, 2).contiguous()
|
286 |
+
condition_image = condition_image / 255.
|
287 |
+
|
288 |
+
if self.shuffle_frames:
|
289 |
+
perm = np.random.permutation(self.sample_n_frames)
|
290 |
+
frame_indices = frame_indices[perm]
|
291 |
+
|
292 |
+
pixel_values = torch.from_numpy(video_reader.get_batch(frame_indices).asnumpy()).permute(0, 3, 1, 2).contiguous()
|
293 |
+
pixel_values = pixel_values / 255.
|
294 |
+
|
295 |
+
cam_params = [cam_params[indice] for indice in frame_indices]
|
296 |
+
if self.rescale_fxy:
|
297 |
+
ori_h, ori_w = pixel_values.shape[-2:]
|
298 |
+
ori_wh_ratio = ori_w / ori_h
|
299 |
+
if ori_wh_ratio > self.sample_wh_ratio: # rescale fx
|
300 |
+
resized_ori_w = self.sample_size[0] * ori_wh_ratio
|
301 |
+
for cam_param in cam_params:
|
302 |
+
cam_param.fx = resized_ori_w * cam_param.fx / self.sample_size[1]
|
303 |
+
else: # rescale fy
|
304 |
+
resized_ori_h = self.sample_size[1] / ori_wh_ratio
|
305 |
+
for cam_param in cam_params:
|
306 |
+
cam_param.fy = resized_ori_h * cam_param.fy / self.sample_size[0]
|
307 |
+
intrinsics = np.asarray([[cam_param.fx * self.sample_size[1],
|
308 |
+
cam_param.fy * self.sample_size[0],
|
309 |
+
cam_param.cx * self.sample_size[1],
|
310 |
+
cam_param.cy * self.sample_size[0]]
|
311 |
+
for cam_param in cam_params], dtype=np.float32)
|
312 |
+
intrinsics = torch.as_tensor(intrinsics)[None] # [1, n_frame, 4]
|
313 |
+
if self.relative_pose:
|
314 |
+
c2w_poses = self.get_relative_pose(cam_params)
|
315 |
+
else:
|
316 |
+
c2w_poses = np.array([cam_param.c2w_mat for cam_param in cam_params], dtype=np.float32)
|
317 |
+
c2w = torch.as_tensor(c2w_poses)[None] # [1, n_frame, 4, 4]
|
318 |
+
if self.use_flip:
|
319 |
+
flip_flag = self.pixel_transforms[1].get_flip_flag(self.sample_n_frames)
|
320 |
+
else:
|
321 |
+
flip_flag = torch.zeros(self.sample_n_frames, dtype=torch.bool, device=c2w.device)
|
322 |
+
plucker_embedding = ray_condition(intrinsics, c2w, self.sample_size[0], self.sample_size[1], device='cpu',
|
323 |
+
flip_flag=flip_flag)[0].permute(0, 3, 1, 2).contiguous()
|
324 |
+
|
325 |
+
return pixel_values, condition_image, plucker_embedding, video_caption, flip_flag, clip_name
|
326 |
+
|
327 |
+
def __len__(self):
|
328 |
+
return self.length
|
329 |
+
|
330 |
+
def __getitem__(self, idx):
|
331 |
+
while True:
|
332 |
+
try:
|
333 |
+
video, condition_image, plucker_embedding, video_caption, flip_flag, clip_name = self.get_batch(idx)
|
334 |
+
break
|
335 |
+
|
336 |
+
except Exception as e:
|
337 |
+
idx = random.randint(0, self.length - 1)
|
338 |
+
|
339 |
+
if self.use_flip:
|
340 |
+
video = self.pixel_transforms[0](video)
|
341 |
+
video = self.pixel_transforms[1](video, flip_flag)
|
342 |
+
for transform in self.pixel_transforms[2:]:
|
343 |
+
video = transform(video)
|
344 |
+
else:
|
345 |
+
for transform in self.pixel_transforms:
|
346 |
+
video = transform(video)
|
347 |
+
for transform in self.pixel_transforms:
|
348 |
+
condition_image = transform(condition_image)
|
349 |
+
if self.return_clip_name:
|
350 |
+
sample = dict(pixel_values=video, condition_image=condition_image, plucker_embedding=plucker_embedding, video_caption=video_caption, clip_name=clip_name)
|
351 |
+
else:
|
352 |
+
sample = dict(pixel_values=video, condition_image=condition_image, plucker_embedding=plucker_embedding, video_caption=video_caption)
|
353 |
+
|
354 |
+
return sample
|
355 |
+
|
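A minimal usage sketch of the pose-conditioned dataset above (not part of this commit): it wraps RealEstate10KPose in a standard PyTorch DataLoader and inspects the tensors each sample carries. The root path, annotation file name, and the 14-frame / 320x576 settings are assumed placeholder values, not the project's actual configuration.

# --- illustrative sketch only, assumed paths and sizes ---
import torch
from torch.utils.data import DataLoader
from cameractrl.data.dataset import RealEstate10KPose

dataset = RealEstate10KPose(
    root_path='/path/to/RealEstate10K',          # hypothetical dataset location
    annotation_json='annotations/train.json',    # hypothetical annotation file
    sample_stride=4,
    sample_n_frames=14,
    relative_pose=True,
    zero_t_first_frame=True,
    sample_size=[320, 576],
    rescale_fxy=True,
    use_flip=False,
)
loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4)
batch = next(iter(loader))
# Expected shapes with these settings:
#   pixel_values:      [1, 14, 3, 320, 576]
#   condition_image:   [1, 1, 3, 320, 576]
#   plucker_embedding: [1, 14, 6, 320, 576]  (6-D Plucker ray per pixel per frame)
print({k: v.shape for k, v in batch.items() if torch.is_tensor(v)})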
cameractrl/models/attention.py
ADDED
@@ -0,0 +1,65 @@
import torch
from typing import Optional
from diffusers.models.attention import TemporalBasicTransformerBlock, _chunked_feed_forward
from diffusers.utils.torch_utils import maybe_allow_in_graph


@maybe_allow_in_graph
class TemporalPoseCondTransformerBlock(TemporalBasicTransformerBlock):
    def forward(
        self,
        hidden_states: torch.FloatTensor,  # [bs * num_frame, h * w, c]
        num_frames: int,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,  # [bs * h * w, 1, c]
        pose_feature: Optional[torch.FloatTensor] = None,  # [bs, c, n_frame, h, w]
    ) -> torch.FloatTensor:
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention

        batch_frames, seq_length, channels = hidden_states.shape
        batch_size = batch_frames // num_frames

        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
        hidden_states = hidden_states.permute(0, 2, 1, 3)
        hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)  # [bs * h * w, frame, c]

        residual = hidden_states
        hidden_states = self.norm_in(hidden_states)

        if self._chunk_size is not None:
            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
        else:
            hidden_states = self.ff_in(hidden_states)

        if self.is_res:
            hidden_states = hidden_states + residual

        norm_hidden_states = self.norm1(hidden_states)
        pose_feature = pose_feature.permute(0, 3, 4, 2, 1).reshape(batch_size * seq_length, num_frames, -1)
        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None, pose_feature=pose_feature)
        hidden_states = attn_output + hidden_states

        # 3. Cross-Attention
        if self.attn2 is not None:
            norm_hidden_states = self.norm2(hidden_states)
            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states, pose_feature=pose_feature)
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)

        if self._chunk_size is not None:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        if self.is_res:
            hidden_states = ff_output + hidden_states
        else:
            hidden_states = ff_output

        hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
        hidden_states = hidden_states.permute(0, 2, 1, 3)
        hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)

        return hidden_states
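To clarify the tensor bookkeeping in the block above: spatial positions are moved into the batch dimension and frames become the sequence dimension, and the pose feature is flattened the same way so it can be added token-by-token. The sketch below (not part of this commit, dummy sizes assumed) checks that round trip on random tensors.

# --- illustrative sketch only, hypothetical sizes ---
import torch

bs, num_frames, h, w, c = 2, 14, 8, 12, 320
hidden_states = torch.randn(bs * num_frames, h * w, c)   # [bs*frames, h*w, c]
pose_feature = torch.randn(bs, c, num_frames, h, w)      # [bs, c, frames, h, w]

# spatial tokens become the batch, frames become the sequence
x = hidden_states.reshape(bs, num_frames, h * w, c).permute(0, 2, 1, 3).reshape(bs * h * w, num_frames, c)
p = pose_feature.permute(0, 3, 4, 2, 1).reshape(bs * h * w, num_frames, c)
assert x.shape == p.shape == (bs * h * w, num_frames, c)

# inverse mapping back to [bs*frames, h*w, c], as done at the end of forward()
y = x.reshape(bs, h * w, num_frames, c).permute(0, 2, 1, 3).reshape(bs * num_frames, h * w, c)
assert y.shape == hidden_states.shape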
cameractrl/models/attention_processor.py
ADDED
@@ -0,0 +1,591 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import logging
from diffusers.models.attention import Attention
from diffusers.utils import USE_PEFT_BACKEND, is_xformers_available
from typing import Optional, Callable

from einops import rearrange

if is_xformers_available():
    import xformers
    import xformers.ops
else:
    xformers = None

logger = logging.getLogger(__name__)


class AttnProcessor:
    r"""
    Default processor for performing attention-related computations.
    """

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        temb: Optional[torch.FloatTensor] = None,
        scale: float = 1.0,
        pose_feature=None
    ) -> torch.Tensor:
        residual = hidden_states

        args = () if USE_PEFT_BACKEND else (scale,)

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states, *args)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states, *args)
        value = attn.to_v(encoder_hidden_states, *args)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states, *args)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class AttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        temb: Optional[torch.FloatTensor] = None,
        scale: float = 1.0,
        pose_feature=None
    ) -> torch.FloatTensor:
        residual = hidden_states

        args = () if USE_PEFT_BACKEND else (scale,)

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        args = () if USE_PEFT_BACKEND else (scale,)
        query = attn.to_q(hidden_states, *args)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states, *args)
        value = attn.to_v(encoder_hidden_states, *args)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states, *args)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class XFormersAttnProcessor:
    r"""
    Processor for implementing memory efficient attention using xFormers.

    Args:
        attention_op (`Callable`, *optional*, defaults to `None`):
            The base
            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
            use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
            operator.
    """

    def __init__(self, attention_op: Optional[Callable] = None):
        self.attention_op = attention_op

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        temb: Optional[torch.FloatTensor] = None,
        scale: float = 1.0,
        pose_feature=None
    ) -> torch.FloatTensor:
        residual = hidden_states

        args = () if USE_PEFT_BACKEND else (scale,)

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, key_tokens, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
        if attention_mask is not None:
            # expand our mask's singleton query_tokens dimension:
            #   [batch*heads,            1, key_tokens] ->
            #   [batch*heads, query_tokens, key_tokens]
            # so that it can be added as a bias onto the attention scores that xformers computes:
            #   [batch*heads, query_tokens, key_tokens]
            # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
            _, query_tokens, _ = hidden_states.shape
            attention_mask = attention_mask.expand(-1, query_tokens, -1)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states, *args)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states, *args)
        value = attn.to_v(encoder_hidden_states, *args)

        query = attn.head_to_batch_dim(query).contiguous()
        key = attn.head_to_batch_dim(key).contiguous()
        value = attn.head_to_batch_dim(value).contiguous()

        hidden_states = xformers.ops.memory_efficient_attention(
            query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
        )
        hidden_states = hidden_states.to(query.dtype)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states, *args)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class PoseAdaptorAttnProcessor(nn.Module):
    def __init__(self,
                 hidden_size,  # dimension of hidden state
                 pose_feature_dim=None,  # dimension of the pose feature
                 cross_attention_dim=None,  # dimension of the text embedding
                 query_condition=False,
                 key_value_condition=False,
                 scale=1.0):
        super().__init__()

        self.hidden_size = hidden_size
        self.pose_feature_dim = pose_feature_dim
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.query_condition = query_condition
        self.key_value_condition = key_value_condition
        assert hidden_size == pose_feature_dim
        if self.query_condition and self.key_value_condition:
            self.qkv_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.qkv_merge.weight)
            init.zeros_(self.qkv_merge.bias)
        elif self.query_condition:
            self.q_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.q_merge.weight)
            init.zeros_(self.q_merge.bias)
        else:
            self.kv_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.kv_merge.weight)
            init.zeros_(self.kv_merge.bias)

    def forward(self,
                attn,
                hidden_states,
                pose_feature,
                encoder_hidden_states=None,
                attention_mask=None,
                temb=None,
                scale=None,):
        assert pose_feature is not None
        pose_embedding_scale = (scale or self.scale)

        residual = hidden_states
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        assert hidden_states.ndim == 3 and pose_feature.ndim == 3

        if self.query_condition and self.key_value_condition:
            assert encoder_hidden_states is None

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        assert encoder_hidden_states.ndim == 3

        batch_size, ehs_sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, ehs_sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        if attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        if self.query_condition and self.key_value_condition:  # only self attention
            query_hidden_state = self.qkv_merge(hidden_states + pose_feature) * pose_embedding_scale + hidden_states
            key_value_hidden_state = query_hidden_state
        elif self.query_condition:
            query_hidden_state = self.q_merge(hidden_states + pose_feature) * pose_embedding_scale + hidden_states
            key_value_hidden_state = encoder_hidden_states
        else:
            key_value_hidden_state = self.kv_merge(encoder_hidden_states + pose_feature) * pose_embedding_scale + encoder_hidden_states
            query_hidden_state = hidden_states

        # original attention
        query = attn.to_q(query_hidden_state)
        key = attn.to_k(key_value_hidden_state)
        value = attn.to_v(key_value_hidden_state)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class PoseAdaptorAttnProcessor2_0(nn.Module):
    def __init__(self,
                 hidden_size,  # dimension of hidden state
                 pose_feature_dim=None,  # dimension of the pose feature
                 cross_attention_dim=None,  # dimension of the text embedding
                 query_condition=False,
                 key_value_condition=False,
                 scale=1.0):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.hidden_size = hidden_size
        self.pose_feature_dim = pose_feature_dim
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.query_condition = query_condition
        self.key_value_condition = key_value_condition
        assert hidden_size == pose_feature_dim
        if self.query_condition and self.key_value_condition:
            self.qkv_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.qkv_merge.weight)
            init.zeros_(self.qkv_merge.bias)
        elif self.query_condition:
            self.q_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.q_merge.weight)
            init.zeros_(self.q_merge.bias)
        else:
            self.kv_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.kv_merge.weight)
            init.zeros_(self.kv_merge.bias)

    def forward(self,
                attn,
                hidden_states,
                pose_feature,
                encoder_hidden_states=None,
                attention_mask=None,
                temb=None,
                scale=None,):
        assert pose_feature is not None
        pose_embedding_scale = (scale or self.scale)

        residual = hidden_states
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        assert hidden_states.ndim == 3 and pose_feature.ndim == 3

        if self.query_condition and self.key_value_condition:
            assert encoder_hidden_states is None

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        assert encoder_hidden_states.ndim == 3

        batch_size, ehs_sequence_length, _ = encoder_hidden_states.shape
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, ehs_sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        if attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        if self.query_condition and self.key_value_condition:  # only self attention
            query_hidden_state = self.qkv_merge(hidden_states + pose_feature) * pose_embedding_scale + hidden_states
            key_value_hidden_state = query_hidden_state
        elif self.query_condition:
            query_hidden_state = self.q_merge(hidden_states + pose_feature) * pose_embedding_scale + hidden_states
            key_value_hidden_state = encoder_hidden_states
        else:
            key_value_hidden_state = self.kv_merge(encoder_hidden_states + pose_feature) * pose_embedding_scale + encoder_hidden_states
            query_hidden_state = hidden_states

        # original attention
        query = attn.to_q(query_hidden_state)
        key = attn.to_k(key_value_hidden_state)
        value = attn.to_v(key_value_hidden_state)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)  # [bs, seq_len, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)  # [bs, nhead, seq_len, head_dim]
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)  # [bs, seq_len, dim]
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class PoseAdaptorXFormersAttnProcessor(nn.Module):
    def __init__(self,
                 hidden_size,  # dimension of hidden state
                 pose_feature_dim=None,  # dimension of the pose feature
                 cross_attention_dim=None,  # dimension of the text embedding
                 query_condition=False,
                 key_value_condition=False,
                 scale=1.0,
                 attention_op: Optional[Callable] = None):
        super().__init__()

        self.hidden_size = hidden_size
        self.pose_feature_dim = pose_feature_dim
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.query_condition = query_condition
        self.key_value_condition = key_value_condition
        self.attention_op = attention_op
        assert hidden_size == pose_feature_dim
        if self.query_condition and self.key_value_condition:
            self.qkv_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.qkv_merge.weight)
            init.zeros_(self.qkv_merge.bias)
        elif self.query_condition:
            self.q_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.q_merge.weight)
            init.zeros_(self.q_merge.bias)
        else:
            self.kv_merge = nn.Linear(hidden_size, hidden_size)
            init.zeros_(self.kv_merge.weight)
            init.zeros_(self.kv_merge.bias)

    def forward(self,
                attn,
                hidden_states,
                pose_feature,
                encoder_hidden_states=None,
                attention_mask=None,
                temb=None,
                scale=None,):
        assert pose_feature is not None
        pose_embedding_scale = (scale or self.scale)

        residual = hidden_states
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        assert hidden_states.ndim == 3 and pose_feature.ndim == 3

        if self.query_condition and self.key_value_condition:
            assert encoder_hidden_states is None

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        assert encoder_hidden_states.ndim == 3

        batch_size, ehs_sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, ehs_sequence_length, batch_size)
        if attention_mask is not None:
            # expand our mask's singleton query_tokens dimension:
            #   [batch*heads,            1, key_tokens] ->
            #   [batch*heads, query_tokens, key_tokens]
            # so that it can be added as a bias onto the attention scores that xformers computes:
            #   [batch*heads, query_tokens, key_tokens]
            # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
            _, query_tokens, _ = hidden_states.shape
            attention_mask = attention_mask.expand(-1, query_tokens, -1)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        if attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        if self.query_condition and self.key_value_condition:  # only self attention
            query_hidden_state = self.qkv_merge(hidden_states + pose_feature) * pose_embedding_scale + hidden_states
            key_value_hidden_state = query_hidden_state
        elif self.query_condition:
            query_hidden_state = self.q_merge(hidden_states + pose_feature) * pose_embedding_scale + hidden_states
            key_value_hidden_state = encoder_hidden_states
        else:
            key_value_hidden_state = self.kv_merge(encoder_hidden_states + pose_feature) * pose_embedding_scale + encoder_hidden_states
            query_hidden_state = hidden_states

        # original attention
        query = attn.to_q(query_hidden_state)
        key = attn.to_k(key_value_hidden_state)
        value = attn.to_v(key_value_hidden_state)

        query = attn.head_to_batch_dim(query).contiguous()
        key = attn.head_to_batch_dim(key).contiguous()
        value = attn.head_to_batch_dim(value).contiguous()

        hidden_states = xformers.ops.memory_efficient_attention(
            query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
        )
        hidden_states = hidden_states.to(query.dtype)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
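A short sketch of how the pose-adaptor processor above can be wired into a diffusers Attention layer and exercised with dummy tensors (not part of this commit; the dimensions and the direct processor call are illustrative assumptions rather than how the pipeline invokes it).

# --- illustrative sketch only, hypothetical sizes ---
import torch
from diffusers.models.attention_processor import Attention
from cameractrl.models.attention_processor import PoseAdaptorAttnProcessor

dim, frames, tokens = 320, 14, 96
attn = Attention(query_dim=dim, heads=8, dim_head=dim // 8)
attn.set_processor(PoseAdaptorAttnProcessor(hidden_size=dim,
                                            pose_feature_dim=dim,
                                            query_condition=True,
                                            key_value_condition=True,
                                            scale=1.0))

hidden_states = torch.randn(tokens, frames, dim)  # [bs*h*w, frames, c]
pose_feature = torch.randn(tokens, frames, dim)   # pose feature flattened to the same layout
out = attn.processor(attn, hidden_states, pose_feature)
assert out.shape == hidden_states.shape
# The merge layers are zero-initialised, so the pose branch starts as a no-op and only
# begins to modulate queries/keys once its weights are trained.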
cameractrl/models/motion_module.py
ADDED
@@ -0,0 +1,399 @@
from dataclasses import dataclass
from typing import Callable, Optional

import torch
from torch import nn

from diffusers.utils import BaseOutput
from diffusers.models.attention_processor import Attention
from diffusers.models.attention import FeedForward

from typing import Dict, Any
from cameractrl.models.attention_processor import PoseAdaptorAttnProcessor

from einops import rearrange
import math


class InflatedGroupNorm(nn.GroupNorm):
    def forward(self, x):
        # return super().forward(x)

        video_length = x.shape[2]

        x = rearrange(x, "b c f h w -> (b f) c h w")
        x = super().forward(x)
        x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)

        return x


def zero_module(module):
    # Zero out the parameters of a module and return it.
    for p in module.parameters():
        p.detach().zero_()
    return module


@dataclass
class TemporalTransformer3DModelOutput(BaseOutput):
    sample: torch.FloatTensor


def get_motion_module(
    in_channels,
    motion_module_type: str,
    motion_module_kwargs: dict
):
    if motion_module_type == "Vanilla":
        return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs)
    else:
        raise ValueError


class VanillaTemporalModule(nn.Module):
    def __init__(
        self,
        in_channels,
        num_attention_heads=8,
        num_transformer_block=2,
        attention_block_types=("Temporal_Self",),
        temporal_position_encoding=True,
        temporal_position_encoding_max_len=32,
        temporal_attention_dim_div=1,
        cross_attention_dim=320,
        zero_initialize=True,
        encoder_hidden_states_query=(False, False),
        attention_activation_scale=1.0,
        attention_processor_kwargs: Dict = {},
        causal_temporal_attention=False,
        causal_temporal_attention_mask_type="",
        rescale_output_factor=1.0
    ):
        super().__init__()

        self.temporal_transformer = TemporalTransformer3DModel(
            in_channels=in_channels,
            num_attention_heads=num_attention_heads,
            attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
            num_layers=num_transformer_block,
            attention_block_types=attention_block_types,
            cross_attention_dim=cross_attention_dim,
            temporal_position_encoding=temporal_position_encoding,
            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
            encoder_hidden_states_query=encoder_hidden_states_query,
            attention_activation_scale=attention_activation_scale,
            attention_processor_kwargs=attention_processor_kwargs,
            causal_temporal_attention=causal_temporal_attention,
            causal_temporal_attention_mask_type=causal_temporal_attention_mask_type,
            rescale_output_factor=rescale_output_factor
        )

        if zero_initialize:
            self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)

    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None,
                cross_attention_kwargs: Dict[str, Any] = {}):
        hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask, cross_attention_kwargs=cross_attention_kwargs)

        output = hidden_states
        return output


class TemporalTransformer3DModel(nn.Module):
    def __init__(
        self,
        in_channels,
        num_attention_heads,
        attention_head_dim,
        num_layers,
        attention_block_types=("Temporal_Self", "Temporal_Self",),
        dropout=0.0,
        norm_num_groups=32,
        cross_attention_dim=320,
        activation_fn="geglu",
        attention_bias=False,
        upcast_attention=False,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=32,
        encoder_hidden_states_query=(False, False),
        attention_activation_scale=1.0,
        attention_processor_kwargs: Dict = {},
        causal_temporal_attention=None,
        causal_temporal_attention_mask_type="",
        rescale_output_factor=1.0
    ):
        super().__init__()
        assert causal_temporal_attention is not None
        self.causal_temporal_attention = causal_temporal_attention

        assert (not causal_temporal_attention) or (causal_temporal_attention_mask_type != "")
        self.causal_temporal_attention_mask_type = causal_temporal_attention_mask_type
        self.causal_temporal_attention_mask = None

        inner_dim = num_attention_heads * attention_head_dim

        self.norm = InflatedGroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
        self.proj_in = nn.Linear(in_channels, inner_dim)

        self.transformer_blocks = nn.ModuleList(
            [
                TemporalTransformerBlock(
                    dim=inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    attention_block_types=attention_block_types,
                    dropout=dropout,
                    norm_num_groups=norm_num_groups,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    upcast_attention=upcast_attention,
                    temporal_position_encoding=temporal_position_encoding,
                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                    encoder_hidden_states_query=encoder_hidden_states_query,
                    attention_activation_scale=attention_activation_scale,
                    attention_processor_kwargs=attention_processor_kwargs,
                    rescale_output_factor=rescale_output_factor,
                )
                for d in range(num_layers)
            ]
        )
        self.proj_out = nn.Linear(inner_dim, in_channels)

    def get_causal_temporal_attention_mask(self, hidden_states):
        batch_size, sequence_length, dim = hidden_states.shape

        if self.causal_temporal_attention_mask is None or self.causal_temporal_attention_mask.shape != (
                batch_size, sequence_length, sequence_length):
            if self.causal_temporal_attention_mask_type == "causal":
                # 1. vanilla causal mask
                mask = torch.tril(torch.ones(sequence_length, sequence_length))

            elif self.causal_temporal_attention_mask_type == "2-seq":
                # 2. 2-seq
                mask = torch.zeros(sequence_length, sequence_length)
                mask[:sequence_length // 2, :sequence_length // 2] = 1
                mask[-sequence_length // 2:, -sequence_length // 2:] = 1

            elif self.causal_temporal_attention_mask_type == "0-prev":
                # attn to the previous frame
                indices = torch.arange(sequence_length)
                indices_prev = indices - 1
                indices_prev[0] = 0
                mask = torch.zeros(sequence_length, sequence_length)
                mask[:, 0] = 1.
                mask[indices, indices_prev] = 1.

            elif self.causal_temporal_attention_mask_type == "0":
                # only attn to first frame
                mask = torch.zeros(sequence_length, sequence_length)
                mask[:, 0] = 1

            elif self.causal_temporal_attention_mask_type == "wo-self":
                indices = torch.arange(sequence_length)
                mask = torch.ones(sequence_length, sequence_length)
                mask[indices, indices] = 0

            elif self.causal_temporal_attention_mask_type == "circle":
                indices = torch.arange(sequence_length)
                indices_prev = indices - 1
                indices_prev[0] = 0

                mask = torch.eye(sequence_length)
                mask[indices, indices_prev] = 1
                mask[0, -1] = 1

            else:
                raise ValueError

            # generate attention mask from binary values
            mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
            mask = mask.unsqueeze(0)
            mask = mask.repeat(batch_size, 1, 1)

            self.causal_temporal_attention_mask = mask.to(hidden_states.device)

        return self.causal_temporal_attention_mask

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None,
                cross_attention_kwargs: Dict[str, Any] = {},):
        residual = hidden_states

        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
        height, width = hidden_states.shape[-2:]

        hidden_states = self.norm(hidden_states)
        hidden_states = rearrange(hidden_states, "b c f h w -> (b h w) f c")
        hidden_states = self.proj_in(hidden_states)

        attention_mask = self.get_causal_temporal_attention_mask(
            hidden_states) if self.causal_temporal_attention else attention_mask

        # Transformer Blocks
        for block in self.transformer_blocks:
            hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states,
                                  attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs)
        hidden_states = self.proj_out(hidden_states)

        hidden_states = rearrange(hidden_states, "(b h w) f c -> b c f h w", h=height, w=width)

        output = hidden_states + residual

        return output


class TemporalTransformerBlock(nn.Module):
    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        attention_block_types=("Temporal_Self", "Temporal_Self",),
        dropout=0.0,
        norm_num_groups=32,
        cross_attention_dim=768,
        activation_fn="geglu",
        attention_bias=False,
        upcast_attention=False,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=32,
        encoder_hidden_states_query=(False, False),
        attention_activation_scale=1.0,
        attention_processor_kwargs: Dict = {},
        rescale_output_factor=1.0
    ):
        super().__init__()

        attention_blocks = []
        norms = []
        self.attention_block_types = attention_block_types

        for block_idx, block_name in enumerate(attention_block_types):
            attention_blocks.append(
                TemporalSelfAttention(
                    attention_mode=block_name,
                    cross_attention_dim=cross_attention_dim if block_name in ['Temporal_Cross', 'Temporal_Pose_Adaptor'] else None,
                    query_dim=dim,
                    heads=num_attention_heads,
                    dim_head=attention_head_dim,
                    dropout=dropout,
                    bias=attention_bias,
                    upcast_attention=upcast_attention,
                    temporal_position_encoding=temporal_position_encoding,
                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                    rescale_output_factor=rescale_output_factor,
                )
            )
            norms.append(nn.LayerNorm(dim))

        self.attention_blocks = nn.ModuleList(attention_blocks)
        self.norms = nn.ModuleList(norms)

        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.ff_norm = nn.LayerNorm(dim)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs: Dict[str, Any] = {}):
        for attention_block, norm, attention_block_type in zip(self.attention_blocks, self.norms, self.attention_block_types):
            norm_hidden_states = norm(hidden_states)
            hidden_states = attention_block(
                norm_hidden_states,
                encoder_hidden_states=norm_hidden_states if attention_block_type == 'Temporal_Self' else encoder_hidden_states,
                attention_mask=attention_mask,
                **cross_attention_kwargs
            ) + hidden_states

        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states

        output = hidden_states
        return output


class PositionalEncoding(nn.Module):
    def __init__(
        self,
        d_model,
        dropout=0.,
        max_len=32,
    ):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


class TemporalSelfAttention(Attention):
    def __init__(
        self,
        attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=32,
        rescale_output_factor=1.0,
        *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        assert attention_mode == "Temporal_Self"

        self.pos_encoder = PositionalEncoding(
            kwargs["query_dim"],
            max_len=temporal_position_encoding_max_len
        ) if temporal_position_encoding else None
        self.rescale_output_factor = rescale_output_factor

    def set_use_memory_efficient_attention_xformers(
            self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
    ):
        # disable motion module efficient xformers to avoid bad results, don't know why
        # TODO: fix this bug
        pass

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
        # The `Attention` class can call different attention processors / attention functions
        # here we simply pass along all tensors to the selected processor class
        # For standard processors that are defined here, `**cross_attention_kwargs` is empty

        # add position encoding
        if self.pos_encoder is not None:
            hidden_states = self.pos_encoder(hidden_states)
        if "pose_feature" in cross_attention_kwargs:
            pose_feature = cross_attention_kwargs["pose_feature"]
            if pose_feature.ndim == 5:
                pose_feature = rearrange(pose_feature, "b c f h w -> (b h w) f c")
            else:
                assert pose_feature.ndim == 3
            cross_attention_kwargs["pose_feature"] = pose_feature

        if isinstance(self.processor, PoseAdaptorAttnProcessor):
            return self.processor(
                self,
                hidden_states,
                cross_attention_kwargs.pop('pose_feature'),
                encoder_hidden_states=None,
                attention_mask=attention_mask,
                **cross_attention_kwargs,
            )
        elif hasattr(self.processor, "__call__"):
            return self.processor.__call__(
                self,
                hidden_states,
                encoder_hidden_states=None,
                attention_mask=attention_mask,
                **cross_attention_kwargs,
            )
        else:
            return self.processor(
                self,
                hidden_states,
                encoder_hidden_states=None,
                attention_mask=attention_mask,
                **cross_attention_kwargs,
            )
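A minimal sketch of how the factory above can be used standalone (not part of this commit): it builds a VanillaTemporalModule through get_motion_module and runs it on a dummy 5-D feature map. The kwargs are assumed illustrative values, not the project's training configuration.

# --- illustrative sketch only, assumed kwargs and sizes ---
import torch
from cameractrl.models.motion_module import get_motion_module

motion_module = get_motion_module(
    in_channels=320,
    motion_module_type="Vanilla",
    motion_module_kwargs=dict(
        num_attention_heads=8,
        num_transformer_block=1,
        attention_block_types=("Temporal_Self",),
        temporal_position_encoding=True,
        temporal_position_encoding_max_len=32,
        causal_temporal_attention=False,
    ),
)

feat = torch.randn(1, 320, 14, 8, 12)  # b c f h w
out = motion_module(feat, encoder_hidden_states=None)
# proj_out is zero-initialised, so the untrained module acts as an identity mapping
assert out.shape == feat.shape
assert torch.allclose(out, feat)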
cameractrl/models/pose_adaptor.py
ADDED
@@ -0,0 +1,240 @@
import math
import torch
import torch.nn as nn
from einops import rearrange
from typing import List, Tuple
from cameractrl.models.motion_module import TemporalTransformerBlock


def get_parameter_dtype(parameter: torch.nn.Module):
    try:
        params = tuple(parameter.parameters())
        if len(params) > 0:
            return params[0].dtype

        buffers = tuple(parameter.buffers())
        if len(buffers) > 0:
            return buffers[0].dtype

    except StopIteration:
        # For torch.nn.DataParallel compatibility in PyTorch 1.5

        def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, torch.Tensor]]:
            tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
            return tuples

        gen = parameter._named_members(get_members_fn=find_tensor_attributes)
        first_tuple = next(gen)
        return first_tuple[1].dtype


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


class PoseAdaptor(nn.Module):
    def __init__(self, unet, pose_encoder):
        super().__init__()
        self.unet = unet
        self.pose_encoder = pose_encoder

    def forward(self, noisy_latents, c_noise, encoder_hidden_states, added_time_ids, pose_embedding):
        assert pose_embedding.ndim == 5
        pose_embedding_features = self.pose_encoder(pose_embedding)  # b c f h w
        noise_pred = self.unet(noisy_latents,
                               c_noise,
                               encoder_hidden_states,
                               added_time_ids=added_time_ids,
                               pose_features=pose_embedding_features).sample
        return noise_pred


class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        stride = 2 if dims != 3 else (1, 2, 2)
        if use_conv:
            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=padding)
        else:
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)


class ResnetBlock(nn.Module):

    def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
        super().__init__()
        ps = ksize // 2
        if in_c != out_c or sk == False:
            self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
        else:
            self.in_conv = None
        self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
        self.act = nn.ReLU()
        self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
        if sk == False:
            self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps)
        else:
            self.skep = None

        self.down = down
        if self.down == True:
            self.down_opt = Downsample(in_c, use_conv=use_conv)

    def forward(self, x):
        if self.down == True:
            x = self.down_opt(x)
        if self.in_conv is not None:  # edit
            x = self.in_conv(x)

        h = self.block1(x)
        h = self.act(h)
        h = self.block2(h)
        if self.skep is not None:
            return h + self.skep(x)
        else:
            return h + x


class PositionalEncoding(nn.Module):
    def __init__(
        self,
        d_model,
        dropout=0.,
        max_len=32,
    ):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2, ...] = torch.sin(position * div_term)
        pe[0, :, 1::2, ...] = torch.cos(position * div_term)
        pe.unsqueeze_(-1).unsqueeze_(-1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), ...]
        return self.dropout(x)


class CameraPoseEncoder(nn.Module):

    def __init__(self,
                 downscale_factor,
                 channels=[320, 640, 1280, 1280],
                 nums_rb=3,
                 cin=64,
                 ksize=3,
                 sk=False,
                 use_conv=True,
                 compression_factor=1,
                 temporal_attention_nhead=8,
                 attention_block_types=("Temporal_Self", ),
                 temporal_position_encoding=False,
                 temporal_position_encoding_max_len=16,
                 rescale_output_factor=1.0):
        super(CameraPoseEncoder, self).__init__()
        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
        self.channels = channels
        self.nums_rb = nums_rb
        self.encoder_down_conv_blocks = nn.ModuleList()
        self.encoder_down_attention_blocks = nn.ModuleList()
        for i in range(len(channels)):
            conv_layers = nn.ModuleList()
            temporal_attention_layers = nn.ModuleList()
            for j in range(nums_rb):
                if j == 0 and i != 0:
                    in_dim = channels[i - 1]
                    out_dim = int(channels[i] / compression_factor)
                    conv_layer = ResnetBlock(in_dim, out_dim, down=True, ksize=ksize, sk=sk, use_conv=use_conv)
                elif j == 0:
                    in_dim = channels[0]
                    out_dim = int(channels[i] / compression_factor)
                    conv_layer = ResnetBlock(in_dim, out_dim, down=False, ksize=ksize, sk=sk, use_conv=use_conv)
                elif j == nums_rb - 1:
                    in_dim = int(channels[i] / compression_factor)
                    out_dim = channels[i]
                    conv_layer = ResnetBlock(in_dim, out_dim, down=False, ksize=ksize, sk=sk, use_conv=use_conv)
                else:
                    in_dim = int(channels[i] / compression_factor)
                    out_dim = int(channels[i] / compression_factor)
                    conv_layer = ResnetBlock(in_dim, out_dim, down=False, ksize=ksize, sk=sk, use_conv=use_conv)
                temporal_attention_layer = TemporalTransformerBlock(dim=out_dim,
                                                                    num_attention_heads=temporal_attention_nhead,
                                                                    attention_head_dim=int(out_dim / temporal_attention_nhead),
                                                                    attention_block_types=attention_block_types,
                                                                    dropout=0.0,
                                                                    cross_attention_dim=None,
                                                                    temporal_position_encoding=temporal_position_encoding,
                                                                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                                                                    rescale_output_factor=rescale_output_factor)
                conv_layers.append(conv_layer)
                temporal_attention_layers.append(temporal_attention_layer)
            self.encoder_down_conv_blocks.append(conv_layers)
            self.encoder_down_attention_blocks.append(temporal_attention_layers)

        self.encoder_conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1)

    @property
    def dtype(self) -> torch.dtype:
        """
        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
        """
        return get_parameter_dtype(self)

    def forward(self, x):
        # unshuffle
        bs = x.shape[0]
        x = rearrange(x, "b f c h w -> (b f) c h w")
        x = self.unshuffle(x)
        # extract features
        features = []
        x = self.encoder_conv_in(x)
        for res_block, attention_block in zip(self.encoder_down_conv_blocks, self.encoder_down_attention_blocks):
            for res_layer, attention_layer in zip(res_block, attention_block):
                x = res_layer(x)
                h, w = x.shape[-2:]
                x = rearrange(x, '(b f) c h w -> (b h w) f c', b=bs)
                x = attention_layer(x)
                x = rearrange(x, '(b h w) f c -> (b f) c h w', h=h, w=w)
            features.append(rearrange(x, '(b f) c h w -> b c f h w', b=bs))
        return features
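As a rough usage sketch of the encoder above (a sketch only: the resolution, frame count, downscale_factor, cin and max_len values here are assumptions chosen to be mutually consistent, not values read from this repository's configs): a 6-channel per-pixel Plücker embedding is pixel-unshuffled, so cin has to equal 6 * downscale_factor**2, and forward returns one feature map per UNet resolution with the frame axis kept as the third dimension.

import torch
from cameractrl.models.pose_adaptor import CameraPoseEncoder

# Hypothetical shape check; the values are assumptions, not the shipped config.
encoder = CameraPoseEncoder(
    downscale_factor=8,                      # PixelUnshuffle factor
    channels=[320, 640, 1280, 1280],
    cin=384,                                 # 6 * 8 * 8 channels after unshuffling the Plücker map
    temporal_position_encoding=True,
    temporal_position_encoding_max_len=25,   # must cover the number of frames
)
plucker = torch.randn(1, 25, 6, 320, 576)    # [b, f, c, h, w]
features = encoder(plucker)
for feat in features:
    print(feat.shape)                        # [b, c_i, f, h_i, w_i], one tensor per UNet block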
cameractrl/models/transformer_temporal.py
ADDED
@@ -0,0 +1,191 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
from typing import Optional
|
5 |
+
from diffusers.models.transformer_temporal import TransformerTemporalModelOutput
|
6 |
+
from diffusers.models.attention import BasicTransformerBlock
|
7 |
+
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
|
8 |
+
from diffusers.models.resnet import AlphaBlender
|
9 |
+
from cameractrl.models.attention import TemporalPoseCondTransformerBlock
|
10 |
+
|
11 |
+
|
12 |
+
class TransformerSpatioTemporalModelPoseCond(nn.Module):
|
13 |
+
"""
|
14 |
+
A Transformer model for video-like data.
|
15 |
+
|
16 |
+
Parameters:
|
17 |
+
num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
|
18 |
+
attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
|
19 |
+
in_channels (`int`, *optional*):
|
20 |
+
The number of channels in the input and output (specify if the input is **continuous**).
|
21 |
+
out_channels (`int`, *optional*):
|
22 |
+
The number of channels in the output (specify if the input is **continuous**).
|
23 |
+
num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
|
24 |
+
cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
|
25 |
+
"""
|
26 |
+
|
27 |
+
def __init__(
|
28 |
+
self,
|
29 |
+
num_attention_heads: int = 16,
|
30 |
+
attention_head_dim: int = 88,
|
31 |
+
in_channels: int = 320,
|
32 |
+
out_channels: Optional[int] = None,
|
33 |
+
num_layers: int = 1,
|
34 |
+
cross_attention_dim: Optional[int] = None,
|
35 |
+
):
|
36 |
+
super().__init__()
|
37 |
+
self.num_attention_heads = num_attention_heads
|
38 |
+
self.attention_head_dim = attention_head_dim
|
39 |
+
|
40 |
+
inner_dim = num_attention_heads * attention_head_dim
|
41 |
+
self.inner_dim = inner_dim
|
42 |
+
|
43 |
+
# 2. Define input layers
|
44 |
+
self.in_channels = in_channels
|
45 |
+
self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6)
|
46 |
+
self.proj_in = nn.Linear(in_channels, inner_dim)
|
47 |
+
|
48 |
+
# 3. Define transformers blocks
|
49 |
+
self.transformer_blocks = nn.ModuleList(
|
50 |
+
[
|
51 |
+
BasicTransformerBlock(
|
52 |
+
inner_dim,
|
53 |
+
num_attention_heads,
|
54 |
+
attention_head_dim,
|
55 |
+
cross_attention_dim=cross_attention_dim,
|
56 |
+
)
|
57 |
+
for d in range(num_layers)
|
58 |
+
]
|
59 |
+
)
|
60 |
+
|
61 |
+
time_mix_inner_dim = inner_dim
|
62 |
+
self.temporal_transformer_blocks = nn.ModuleList(
|
63 |
+
[
|
64 |
+
TemporalPoseCondTransformerBlock(
|
65 |
+
inner_dim,
|
66 |
+
time_mix_inner_dim,
|
67 |
+
num_attention_heads,
|
68 |
+
attention_head_dim,
|
69 |
+
cross_attention_dim=cross_attention_dim,
|
70 |
+
)
|
71 |
+
for _ in range(num_layers)
|
72 |
+
]
|
73 |
+
)
|
74 |
+
|
75 |
+
time_embed_dim = in_channels * 4
|
76 |
+
self.time_pos_embed = TimestepEmbedding(in_channels, time_embed_dim, out_dim=in_channels)
|
77 |
+
self.time_proj = Timesteps(in_channels, True, 0)
|
78 |
+
self.time_mixer = AlphaBlender(alpha=0.5, merge_strategy="learned_with_images")
|
79 |
+
|
80 |
+
# 4. Define output layers
|
81 |
+
self.out_channels = in_channels if out_channels is None else out_channels
|
82 |
+
# TODO: should use out_channels for continuous projections
|
83 |
+
self.proj_out = nn.Linear(inner_dim, in_channels)
|
84 |
+
|
85 |
+
self.gradient_checkpointing = False
|
86 |
+
|
87 |
+
def forward(
|
88 |
+
self,
|
89 |
+
hidden_states: torch.Tensor, # [bs * frame, c, h, w]
|
90 |
+
encoder_hidden_states: Optional[torch.Tensor] = None, # [bs * frame, 1, c]
|
91 |
+
image_only_indicator: Optional[torch.Tensor] = None, # [bs, frame]
|
92 |
+
pose_feature: Optional[torch.Tensor] = None, # [bs, c, frame, h, w]
|
93 |
+
return_dict: bool = True,
|
94 |
+
):
|
95 |
+
"""
|
96 |
+
Args:
|
97 |
+
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
98 |
+
Input hidden_states.
|
99 |
+
num_frames (`int`):
|
100 |
+
The number of frames to be processed per batch. This is used to reshape the hidden states.
|
101 |
+
encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
|
102 |
+
Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
|
103 |
+
self-attention.
|
104 |
+
image_only_indicator (`torch.LongTensor` of shape `(batch size, num_frames)`, *optional*):
|
105 |
+
A tensor indicating whether the input contains only images. 1 indicates that the input contains only
|
106 |
+
images, 0 indicates that the input contains video frames.
|
107 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
108 |
+
Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a
|
109 |
+
plain tuple.
|
110 |
+
|
111 |
+
Returns:
|
112 |
+
[`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
|
113 |
+
If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
|
114 |
+
returned, otherwise a `tuple` where the first element is the sample tensor.
|
115 |
+
"""
|
116 |
+
# 1. Input
|
117 |
+
batch_frames, _, height, width = hidden_states.shape
|
118 |
+
num_frames = image_only_indicator.shape[-1]
|
119 |
+
batch_size = batch_frames // num_frames
|
120 |
+
|
121 |
+
time_context = encoder_hidden_states # [bs * frame, 1, c]
|
122 |
+
time_context_first_timestep = time_context[None, :].reshape(
|
123 |
+
batch_size, num_frames, -1, time_context.shape[-1]
|
124 |
+
)[:, 0] # [bs, frame, c]
|
125 |
+
time_context = time_context_first_timestep[:, None].broadcast_to(
|
126 |
+
batch_size, height * width, time_context.shape[-2], time_context.shape[-1]
|
127 |
+
) # [bs, h*w, 1, c]
|
128 |
+
time_context = time_context.reshape(batch_size * height * width, -1, time_context.shape[-1]) # [bs * h * w, 1, c]
|
129 |
+
|
130 |
+
residual = hidden_states
|
131 |
+
|
132 |
+
hidden_states = self.norm(hidden_states) # [bs * frame, c, h, w]
|
133 |
+
inner_dim = hidden_states.shape[1]
|
134 |
+
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_frames, height * width, inner_dim) # [bs * frame, h * w, c]
|
135 |
+
hidden_states = self.proj_in(hidden_states) # [bs * frame, h * w, c]
|
136 |
+
|
137 |
+
num_frames_emb = torch.arange(num_frames, device=hidden_states.device)
|
138 |
+
num_frames_emb = num_frames_emb.repeat(batch_size, 1) # [bs, frame]
|
139 |
+
num_frames_emb = num_frames_emb.reshape(-1) # [bs * frame]
|
140 |
+
t_emb = self.time_proj(num_frames_emb) # [bs * frame, c]
|
141 |
+
|
142 |
+
# `Timesteps` does not contain any weights and will always return f32 tensors
|
143 |
+
# but time_embedding might actually be running in fp16. so we need to cast here.
|
144 |
+
# there might be better ways to encapsulate this.
|
145 |
+
t_emb = t_emb.to(dtype=hidden_states.dtype)
|
146 |
+
|
147 |
+
emb = self.time_pos_embed(t_emb)
|
148 |
+
emb = emb[:, None, :] # [bs * frame, 1, c]
|
149 |
+
|
150 |
+
# 2. Blocks
|
151 |
+
for block, temporal_block in zip(self.transformer_blocks, self.temporal_transformer_blocks):
|
152 |
+
if self.training and self.gradient_checkpointing:
|
153 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
154 |
+
block,
|
155 |
+
hidden_states,
|
156 |
+
None,
|
157 |
+
encoder_hidden_states,
|
158 |
+
None,
|
159 |
+
use_reentrant=False,
|
160 |
+
)
|
161 |
+
else:
|
162 |
+
hidden_states = block(
|
163 |
+
hidden_states, # [bs * frame, h * w, c]
|
164 |
+
encoder_hidden_states=encoder_hidden_states, # [bs * frame, 1, c]
|
165 |
+
) # [bs * frame, h * w, c]
|
166 |
+
|
167 |
+
hidden_states_mix = hidden_states
|
168 |
+
hidden_states_mix = hidden_states_mix + emb
|
169 |
+
|
170 |
+
hidden_states_mix = temporal_block(
|
171 |
+
hidden_states_mix, # [bs * frame, h * w, c]
|
172 |
+
num_frames=num_frames,
|
173 |
+
encoder_hidden_states=time_context, # [bs * h * w, 1, c]
|
174 |
+
pose_feature=pose_feature
|
175 |
+
)
|
176 |
+
hidden_states = self.time_mixer(
|
177 |
+
x_spatial=hidden_states,
|
178 |
+
x_temporal=hidden_states_mix,
|
179 |
+
image_only_indicator=image_only_indicator,
|
180 |
+
)
|
181 |
+
|
182 |
+
# 3. Output
|
183 |
+
hidden_states = self.proj_out(hidden_states)
|
184 |
+
hidden_states = hidden_states.reshape(batch_frames, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
|
185 |
+
|
186 |
+
output = hidden_states + residual
|
187 |
+
|
188 |
+
if not return_dict:
|
189 |
+
return (output,)
|
190 |
+
|
191 |
+
return TransformerTemporalModelOutput(sample=output)
|
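To make the tensor layouts above concrete: the spatial transformer blocks attend over h*w tokens per frame, the temporal blocks attend over f tokens per spatial location, and the pose feature is rearranged into that same temporal layout before it is injected. A standalone layout sketch (sizes are illustrative assumptions):

import torch
from einops import rearrange

b, f, c, h, w = 1, 25, 320, 40, 72
hidden = torch.randn(b * f, h * w, c)                 # spatial blocks: sequence length = h*w
temporal = rearrange(hidden, '(b f) (h w) c -> (b h w) f c', b=b, f=f, h=h, w=w)
print(temporal.shape)                                  # (b*h*w, f, c) = (2880, 25, 320)

pose_feature = torch.randn(b, c, f, h, w)              # [bs, c, frame, h, w], as documented above
pose_tokens = rearrange(pose_feature, 'b c f h w -> (b h w) f c')
print(pose_tokens.shape)                               # matches the temporal token layout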
cameractrl/models/unet.py
ADDED
@@ -0,0 +1,587 @@
1 |
+
# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
import torch.nn.functional as F
|
5 |
+
import torch.utils.checkpoint
|
6 |
+
|
7 |
+
from typing import List, Optional, Tuple, Union, Dict
|
8 |
+
|
9 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
10 |
+
from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor, CROSS_ATTENTION_PROCESSORS
|
11 |
+
from diffusers.models.modeling_utils import ModelMixin
|
12 |
+
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
|
13 |
+
from diffusers.loaders import UNet2DConditionLoadersMixin
|
14 |
+
from diffusers.models.unet_spatio_temporal_condition import UNetSpatioTemporalConditionOutput
|
15 |
+
|
16 |
+
from cameractrl.models.unet_3d_blocks import (
|
17 |
+
get_down_block,
|
18 |
+
get_up_block,
|
19 |
+
UNetMidBlockSpatioTemporalPoseCond
|
20 |
+
)
|
21 |
+
from cameractrl.models.attention_processor import XFormersAttnProcessor as CustomizedXFormerAttnProcessor
|
22 |
+
from cameractrl.models.attention_processor import PoseAdaptorXFormersAttnProcessor
|
23 |
+
|
24 |
+
if hasattr(F, "scaled_dot_product_attention"):
|
25 |
+
from cameractrl.models.attention_processor import PoseAdaptorAttnProcessor2_0 as PoseAdaptorAttnProcessor
|
26 |
+
from cameractrl.models.attention_processor import AttnProcessor2_0 as CustomizedAttnProcessor
|
27 |
+
else:
|
28 |
+
from cameractrl.models.attention_processor import PoseAdaptorAttnProcessor
|
29 |
+
from cameractrl.models.attention_processor import AttnProcessor as CustomizedAttnProcessor
|
30 |
+
|
31 |
+
class UNetSpatioTemporalConditionModelPoseCond(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
|
32 |
+
r"""
|
33 |
+
A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and returns a sample
|
34 |
+
shaped output.
|
35 |
+
|
36 |
+
This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
|
37 |
+
for all models (such as downloading or saving).
|
38 |
+
|
39 |
+
Parameters:
|
40 |
+
sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
|
41 |
+
Height and width of input/output sample.
|
42 |
+
in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
|
43 |
+
out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
|
44 |
+
down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
|
45 |
+
The tuple of downsample blocks to use.
|
46 |
+
up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
|
47 |
+
The tuple of upsample blocks to use.
|
48 |
+
block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
|
49 |
+
The tuple of output channels for each block.
|
50 |
+
addition_time_embed_dim: (`int`, defaults to 256):
|
51 |
+
Dimension to to encode the additional time ids.
|
52 |
+
projection_class_embeddings_input_dim (`int`, defaults to 768):
|
53 |
+
The dimension of the projection of encoded `added_time_ids`.
|
54 |
+
layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
|
55 |
+
cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
|
56 |
+
The dimension of the cross attention features.
|
57 |
+
transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
|
58 |
+
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
|
59 |
+
[`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
|
60 |
+
[`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
|
61 |
+
num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
|
62 |
+
The number of attention heads.
|
63 |
+
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
64 |
+
"""
|
65 |
+
|
66 |
+
_supports_gradient_checkpointing = True
|
67 |
+
|
68 |
+
@register_to_config
|
69 |
+
def __init__(
|
70 |
+
self,
|
71 |
+
sample_size: Optional[int] = None,
|
72 |
+
in_channels: int = 8,
|
73 |
+
out_channels: int = 4,
|
74 |
+
down_block_types: Tuple[str] = (
|
75 |
+
"CrossAttnDownBlockSpatioTemporalPoseCond",
|
76 |
+
"CrossAttnDownBlockSpatioTemporalPoseCond",
|
77 |
+
"CrossAttnDownBlockSpatioTemporalPoseCond",
|
78 |
+
"DownBlockSpatioTemporal",
|
79 |
+
),
|
80 |
+
up_block_types: Tuple[str] = (
|
81 |
+
"UpBlockSpatioTemporal",
|
82 |
+
"CrossAttnUpBlockSpatioTemporalPoseCond",
|
83 |
+
"CrossAttnUpBlockSpatioTemporalPoseCond",
|
84 |
+
"CrossAttnUpBlockSpatioTemporalPoseCond",
|
85 |
+
),
|
86 |
+
block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
|
87 |
+
addition_time_embed_dim: int = 256,
|
88 |
+
projection_class_embeddings_input_dim: int = 768,
|
89 |
+
layers_per_block: Union[int, Tuple[int]] = 2,
|
90 |
+
cross_attention_dim: Union[int, Tuple[int]] = 1024,
|
91 |
+
transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
|
92 |
+
num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
|
93 |
+
num_frames: int = 25,
|
94 |
+
):
|
95 |
+
super().__init__()
|
96 |
+
|
97 |
+
self.sample_size = sample_size
|
98 |
+
|
99 |
+
# Check inputs
|
100 |
+
if len(down_block_types) != len(up_block_types):
|
101 |
+
raise ValueError(
|
102 |
+
f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
|
103 |
+
)
|
104 |
+
|
105 |
+
if len(block_out_channels) != len(down_block_types):
|
106 |
+
raise ValueError(
|
107 |
+
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
|
108 |
+
)
|
109 |
+
|
110 |
+
if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
|
111 |
+
raise ValueError(
|
112 |
+
f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
|
113 |
+
)
|
114 |
+
|
115 |
+
if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
|
116 |
+
raise ValueError(
|
117 |
+
f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
|
118 |
+
)
|
119 |
+
|
120 |
+
if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
|
121 |
+
raise ValueError(
|
122 |
+
f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
|
123 |
+
)
|
124 |
+
|
125 |
+
# input
|
126 |
+
self.conv_in = nn.Conv2d(
|
127 |
+
in_channels,
|
128 |
+
block_out_channels[0],
|
129 |
+
kernel_size=3,
|
130 |
+
padding=1,
|
131 |
+
)
|
132 |
+
|
133 |
+
# time
|
134 |
+
time_embed_dim = block_out_channels[0] * 4
|
135 |
+
|
136 |
+
self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
|
137 |
+
timestep_input_dim = block_out_channels[0]
|
138 |
+
|
139 |
+
self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
|
140 |
+
|
141 |
+
self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
|
142 |
+
self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
|
143 |
+
|
144 |
+
self.down_blocks = nn.ModuleList([])
|
145 |
+
self.up_blocks = nn.ModuleList([])
|
146 |
+
|
147 |
+
if isinstance(num_attention_heads, int):
|
148 |
+
num_attention_heads = (num_attention_heads,) * len(down_block_types)
|
149 |
+
|
150 |
+
if isinstance(cross_attention_dim, int):
|
151 |
+
cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
|
152 |
+
|
153 |
+
if isinstance(layers_per_block, int):
|
154 |
+
layers_per_block = [layers_per_block] * len(down_block_types)
|
155 |
+
|
156 |
+
if isinstance(transformer_layers_per_block, int):
|
157 |
+
transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
|
158 |
+
|
159 |
+
blocks_time_embed_dim = time_embed_dim
|
160 |
+
|
161 |
+
# down
|
162 |
+
output_channel = block_out_channels[0]
|
163 |
+
for i, down_block_type in enumerate(down_block_types):
|
164 |
+
input_channel = output_channel
|
165 |
+
output_channel = block_out_channels[i]
|
166 |
+
is_final_block = i == len(block_out_channels) - 1
|
167 |
+
|
168 |
+
down_block = get_down_block(
|
169 |
+
down_block_type,
|
170 |
+
num_layers=layers_per_block[i],
|
171 |
+
transformer_layers_per_block=transformer_layers_per_block[i],
|
172 |
+
in_channels=input_channel,
|
173 |
+
out_channels=output_channel,
|
174 |
+
temb_channels=blocks_time_embed_dim,
|
175 |
+
add_downsample=not is_final_block,
|
176 |
+
resnet_eps=1e-5,
|
177 |
+
cross_attention_dim=cross_attention_dim[i],
|
178 |
+
num_attention_heads=num_attention_heads[i],
|
179 |
+
resnet_act_fn="silu",
|
180 |
+
)
|
181 |
+
self.down_blocks.append(down_block)
|
182 |
+
|
183 |
+
# mid
|
184 |
+
self.mid_block = UNetMidBlockSpatioTemporalPoseCond(
|
185 |
+
block_out_channels[-1],
|
186 |
+
temb_channels=blocks_time_embed_dim,
|
187 |
+
transformer_layers_per_block=transformer_layers_per_block[-1],
|
188 |
+
cross_attention_dim=cross_attention_dim[-1],
|
189 |
+
num_attention_heads=num_attention_heads[-1],
|
190 |
+
)
|
191 |
+
|
192 |
+
# count how many layers upsample the images
|
193 |
+
self.num_upsamplers = 0
|
194 |
+
|
195 |
+
# up
|
196 |
+
reversed_block_out_channels = list(reversed(block_out_channels))
|
197 |
+
reversed_num_attention_heads = list(reversed(num_attention_heads))
|
198 |
+
reversed_layers_per_block = list(reversed(layers_per_block))
|
199 |
+
reversed_cross_attention_dim = list(reversed(cross_attention_dim))
|
200 |
+
reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
|
201 |
+
|
202 |
+
output_channel = reversed_block_out_channels[0]
|
203 |
+
for i, up_block_type in enumerate(up_block_types):
|
204 |
+
is_final_block = i == len(block_out_channels) - 1
|
205 |
+
|
206 |
+
prev_output_channel = output_channel
|
207 |
+
output_channel = reversed_block_out_channels[i]
|
208 |
+
input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
|
209 |
+
|
210 |
+
# add upsample block for all BUT final layer
|
211 |
+
if not is_final_block:
|
212 |
+
add_upsample = True
|
213 |
+
self.num_upsamplers += 1
|
214 |
+
else:
|
215 |
+
add_upsample = False
|
216 |
+
|
217 |
+
up_block = get_up_block(
|
218 |
+
up_block_type,
|
219 |
+
num_layers=reversed_layers_per_block[i] + 1,
|
220 |
+
transformer_layers_per_block=reversed_transformer_layers_per_block[i],
|
221 |
+
in_channels=input_channel,
|
222 |
+
out_channels=output_channel,
|
223 |
+
prev_output_channel=prev_output_channel,
|
224 |
+
temb_channels=blocks_time_embed_dim,
|
225 |
+
add_upsample=add_upsample,
|
226 |
+
resnet_eps=1e-5,
|
227 |
+
resolution_idx=i,
|
228 |
+
cross_attention_dim=reversed_cross_attention_dim[i],
|
229 |
+
num_attention_heads=reversed_num_attention_heads[i],
|
230 |
+
resnet_act_fn="silu",
|
231 |
+
)
|
232 |
+
self.up_blocks.append(up_block)
|
233 |
+
prev_output_channel = output_channel
|
234 |
+
|
235 |
+
# out
|
236 |
+
self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
|
237 |
+
self.conv_act = nn.SiLU()
|
238 |
+
|
239 |
+
self.conv_out = nn.Conv2d(
|
240 |
+
block_out_channels[0],
|
241 |
+
out_channels,
|
242 |
+
kernel_size=3,
|
243 |
+
padding=1,
|
244 |
+
)
|
245 |
+
|
246 |
+
@property
|
247 |
+
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
248 |
+
r"""
|
249 |
+
Returns:
|
250 |
+
`dict` of attention processors: A dictionary containing all attention processors used in the model with
|
251 |
+
indexed by its weight name.
|
252 |
+
"""
|
253 |
+
# set recursively
|
254 |
+
processors = {}
|
255 |
+
|
256 |
+
def fn_recursive_add_processors(
|
257 |
+
name: str,
|
258 |
+
module: torch.nn.Module,
|
259 |
+
processors: Dict[str, AttentionProcessor],
|
260 |
+
):
|
261 |
+
if hasattr(module, "get_processor"):
|
262 |
+
processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
|
263 |
+
|
264 |
+
for sub_name, child in module.named_children():
|
265 |
+
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
266 |
+
|
267 |
+
return processors
|
268 |
+
|
269 |
+
for name, module in self.named_children():
|
270 |
+
fn_recursive_add_processors(name, module, processors)
|
271 |
+
|
272 |
+
return processors
|
273 |
+
|
274 |
+
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
|
275 |
+
r"""
|
276 |
+
Sets the attention processor to use to compute attention.
|
277 |
+
|
278 |
+
Parameters:
|
279 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
280 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
281 |
+
for **all** `Attention` layers.
|
282 |
+
|
283 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
284 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
285 |
+
|
286 |
+
"""
|
287 |
+
count = len(self.attn_processors.keys())
|
288 |
+
|
289 |
+
if isinstance(processor, dict) and len(processor) != count:
|
290 |
+
raise ValueError(
|
291 |
+
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
292 |
+
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
293 |
+
)
|
294 |
+
|
295 |
+
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
296 |
+
if hasattr(module, "set_processor"):
|
297 |
+
if not isinstance(processor, dict):
|
298 |
+
module.set_processor(processor)
|
299 |
+
else:
|
300 |
+
module.set_processor(processor.pop(f"{name}.processor"))
|
301 |
+
|
302 |
+
for sub_name, child in module.named_children():
|
303 |
+
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
304 |
+
|
305 |
+
for name, module in self.named_children():
|
306 |
+
fn_recursive_attn_processor(name, module, processor)
|
307 |
+
|
308 |
+
def set_default_attn_processor(self):
|
309 |
+
"""
|
310 |
+
Disables custom attention processors and sets the default attention implementation.
|
311 |
+
"""
|
312 |
+
if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
|
313 |
+
processor = AttnProcessor()
|
314 |
+
else:
|
315 |
+
raise ValueError(
|
316 |
+
f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
|
317 |
+
)
|
318 |
+
|
319 |
+
self.set_attn_processor(processor)
|
320 |
+
|
321 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
322 |
+
if hasattr(module, "gradient_checkpointing"):
|
323 |
+
module.gradient_checkpointing = value
|
324 |
+
|
325 |
+
# Copied from diffusers.models.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
|
326 |
+
def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
|
327 |
+
"""
|
328 |
+
Sets the attention processor to use [feed forward
|
329 |
+
chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
|
330 |
+
|
331 |
+
Parameters:
|
332 |
+
chunk_size (`int`, *optional*):
|
333 |
+
The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
|
334 |
+
over each tensor of dim=`dim`.
|
335 |
+
dim (`int`, *optional*, defaults to `0`):
|
336 |
+
The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
|
337 |
+
or dim=1 (sequence length).
|
338 |
+
"""
|
339 |
+
if dim not in [0, 1]:
|
340 |
+
raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
|
341 |
+
|
342 |
+
# By default chunk size is 1
|
343 |
+
chunk_size = chunk_size or 1
|
344 |
+
|
345 |
+
def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
|
346 |
+
if hasattr(module, "set_chunk_feed_forward"):
|
347 |
+
module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
|
348 |
+
|
349 |
+
for child in module.children():
|
350 |
+
fn_recursive_feed_forward(child, chunk_size, dim)
|
351 |
+
|
352 |
+
for module in self.children():
|
353 |
+
fn_recursive_feed_forward(module, chunk_size, dim)
|
354 |
+
|
355 |
+
def set_pose_cond_attn_processor(self,
|
356 |
+
add_spatial=False,
|
357 |
+
add_temporal=False,
|
358 |
+
enable_xformers=False,
|
359 |
+
attn_processor_name='attn1',
|
360 |
+
pose_feature_dimensions=[320, 640, 1280, 1280],
|
361 |
+
**attention_processor_kwargs):
|
362 |
+
all_attn_processors = {}
|
363 |
+
set_processor_names = attn_processor_name.split(',')
|
364 |
+
if add_spatial:
|
365 |
+
for processor_key in self.attn_processors.keys():
|
366 |
+
if 'temporal' in processor_key:
|
367 |
+
continue
|
368 |
+
processor_name = processor_key.split('.')[-2]
|
369 |
+
cross_attention_dim = None if processor_name == 'attn1' else self.config.cross_attention_dim
|
370 |
+
if processor_key.startswith("mid_block"):
|
371 |
+
hidden_size = self.config.block_out_channels[-1]
|
372 |
+
block_id = -1
|
373 |
+
add_pose_adaptor = processor_name in set_processor_names
|
374 |
+
pose_feature_dim = pose_feature_dimensions[block_id] if add_pose_adaptor else None
|
375 |
+
elif processor_key.startswith("up_blocks"):
|
376 |
+
block_id = int(processor_key[len("up_blocks.")])
|
377 |
+
hidden_size = list(reversed(self.config.block_out_channels))[block_id]
|
378 |
+
add_pose_adaptor = processor_name in set_processor_names
|
379 |
+
pose_feature_dim = list(reversed(pose_feature_dimensions))[block_id] if add_pose_adaptor else None
|
380 |
+
else:
|
381 |
+
block_id = int(processor_key[len("down_blocks.")])
|
382 |
+
hidden_size = self.config.block_out_channels[block_id]
|
383 |
+
add_pose_adaptor = processor_name in set_processor_names
|
384 |
+
pose_feature_dim = pose_feature_dimensions[block_id] if add_pose_adaptor else None
|
385 |
+
if add_pose_adaptor and enable_xformers:
|
386 |
+
all_attn_processors[processor_key] = PoseAdaptorXFormersAttnProcessor(hidden_size=hidden_size,
|
387 |
+
pose_feature_dim=pose_feature_dim,
|
388 |
+
cross_attention_dim=cross_attention_dim,
|
389 |
+
**attention_processor_kwargs)
|
390 |
+
elif add_pose_adaptor:
|
391 |
+
all_attn_processors[processor_key] = PoseAdaptorAttnProcessor(hidden_size=hidden_size,
|
392 |
+
pose_feature_dim=pose_feature_dim,
|
393 |
+
cross_attention_dim=cross_attention_dim,
|
394 |
+
**attention_processor_kwargs)
|
395 |
+
elif enable_xformers:
|
396 |
+
all_attn_processors[processor_key] = CustomizedXFormerAttnProcessor()
|
397 |
+
else:
|
398 |
+
all_attn_processors[processor_key] = CustomizedAttnProcessor()
|
399 |
+
else:
|
400 |
+
for processor_key in self.attn_processors.keys():
|
401 |
+
if 'temporal' not in processor_key and enable_xformers:
|
402 |
+
all_attn_processors[processor_key] = CustomizedXFormerAttnProcessor()
|
403 |
+
elif 'temporal' not in processor_key:
|
404 |
+
all_attn_processors[processor_key] = CustomizedAttnProcessor()
|
405 |
+
|
406 |
+
if add_temporal:
|
407 |
+
for processor_key in self.attn_processors.keys():
|
408 |
+
if 'temporal' not in processor_key:
|
409 |
+
continue
|
410 |
+
processor_name = processor_key.split('.')[-2]
|
411 |
+
cross_attention_dim = None if processor_name == 'attn1' else self.config.cross_attention_dim
|
412 |
+
if processor_key.startswith("mid_block"):
|
413 |
+
hidden_size = self.config.block_out_channels[-1]
|
414 |
+
block_id = -1
|
415 |
+
add_pose_adaptor = processor_name in set_processor_names
|
416 |
+
pose_feature_dim = pose_feature_dimensions[block_id] if add_pose_adaptor else None
|
417 |
+
elif processor_key.startswith("up_blocks"):
|
418 |
+
block_id = int(processor_key[len("up_blocks.")])
|
419 |
+
hidden_size = list(reversed(self.config.block_out_channels))[block_id]
|
420 |
+
add_pose_adaptor = (processor_name in set_processor_names)
|
421 |
+
pose_feature_dim = list(reversed(pose_feature_dimensions))[block_id] if add_pose_adaptor else None
|
422 |
+
else:
|
423 |
+
block_id = int(processor_key[len("down_blocks.")])
|
424 |
+
hidden_size = self.config.block_out_channels[block_id]
|
425 |
+
add_pose_adaptor = processor_name in set_processor_names
|
426 |
+
pose_feature_dim = pose_feature_dimensions[block_id] if add_pose_adaptor else None
|
427 |
+
if add_pose_adaptor and enable_xformers:
|
428 |
+
all_attn_processors[processor_key] = PoseAdaptorAttnProcessor(hidden_size=hidden_size,
|
429 |
+
pose_feature_dim=pose_feature_dim,
|
430 |
+
cross_attention_dim=cross_attention_dim,
|
431 |
+
**attention_processor_kwargs)
|
432 |
+
elif add_pose_adaptor:
|
433 |
+
all_attn_processors[processor_key] = PoseAdaptorAttnProcessor(hidden_size=hidden_size,
|
434 |
+
pose_feature_dim=pose_feature_dim,
|
435 |
+
cross_attention_dim=cross_attention_dim,
|
436 |
+
**attention_processor_kwargs)
|
437 |
+
elif enable_xformers:
|
438 |
+
all_attn_processors[processor_key] = CustomizedXFormerAttnProcessor()
|
439 |
+
else:
|
440 |
+
all_attn_processors[processor_key] = CustomizedAttnProcessor()
|
441 |
+
else:
|
442 |
+
for processor_key in self.attn_processors.keys():
|
443 |
+
if 'temporal' in processor_key and enable_xformers:
|
444 |
+
all_attn_processors[processor_key] = CustomizedXFormerAttnProcessor()
|
445 |
+
elif 'temporal' in processor_key:
|
446 |
+
all_attn_processors[processor_key] = CustomizedAttnProcessor()
|
447 |
+
|
448 |
+
self.set_attn_processor(all_attn_processors)
|
449 |
+
|
450 |
+
def forward(
|
451 |
+
self,
|
452 |
+
sample: torch.FloatTensor,
|
453 |
+
timestep: Union[torch.Tensor, float, int],
|
454 |
+
encoder_hidden_states: torch.Tensor,
|
455 |
+
added_time_ids: torch.Tensor,
|
456 |
+
pose_features: List[torch.Tensor] = None,
|
457 |
+
return_dict: bool = True,
|
458 |
+
) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
|
459 |
+
r"""
|
460 |
+
The [`UNetSpatioTemporalConditionModel`] forward method.
|
461 |
+
|
462 |
+
Args:
|
463 |
+
sample (`torch.FloatTensor`):
|
464 |
+
The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
|
465 |
+
timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
|
466 |
+
encoder_hidden_states (`torch.FloatTensor`):
|
467 |
+
The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
|
468 |
+
added_time_ids: (`torch.FloatTensor`):
|
469 |
+
The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
|
470 |
+
embeddings and added to the time embeddings.
|
471 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
472 |
+
Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain
|
473 |
+
tuple.
|
474 |
+
Returns:
|
475 |
+
[`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
|
476 |
+
If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise
|
477 |
+
a `tuple` is returned where the first element is the sample tensor.
|
478 |
+
"""
|
479 |
+
# 1. time
|
480 |
+
timesteps = timestep
|
481 |
+
if not torch.is_tensor(timesteps):
|
482 |
+
# TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
|
483 |
+
# This would be a good case for the `match` statement (Python 3.10+)
|
484 |
+
is_mps = sample.device.type == "mps"
|
485 |
+
if isinstance(timestep, float):
|
486 |
+
dtype = torch.float32 if is_mps else torch.float64
|
487 |
+
else:
|
488 |
+
dtype = torch.int32 if is_mps else torch.int64
|
489 |
+
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
|
490 |
+
elif len(timesteps.shape) == 0:
|
491 |
+
timesteps = timesteps[None].to(sample.device)
|
492 |
+
|
493 |
+
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
494 |
+
batch_size, num_frames = sample.shape[:2]
|
495 |
+
timesteps = timesteps.expand(batch_size)
|
496 |
+
|
497 |
+
t_emb = self.time_proj(timesteps)
|
498 |
+
|
499 |
+
# `Timesteps` does not contain any weights and will always return f32 tensors
|
500 |
+
# but time_embedding might actually be running in fp16. so we need to cast here.
|
501 |
+
# there might be better ways to encapsulate this.
|
502 |
+
t_emb = t_emb.to(dtype=sample.dtype)
|
503 |
+
|
504 |
+
emb = self.time_embedding(t_emb)
|
505 |
+
|
506 |
+
time_embeds = self.add_time_proj(added_time_ids.flatten())
|
507 |
+
time_embeds = time_embeds.reshape((batch_size, -1))
|
508 |
+
time_embeds = time_embeds.to(emb.dtype)
|
509 |
+
aug_emb = self.add_embedding(time_embeds)
|
510 |
+
emb = emb + aug_emb
|
511 |
+
|
512 |
+
# Flatten the batch and frames dimensions
|
513 |
+
# sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
|
514 |
+
sample = sample.flatten(0, 1)
|
515 |
+
# Repeat the embeddings num_video_frames times
|
516 |
+
# emb: [batch, channels] -> [batch * frames, channels]
|
517 |
+
emb = emb.repeat_interleave(num_frames, dim=0)
|
518 |
+
# encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
|
519 |
+
encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)
|
520 |
+
|
521 |
+
# 2. pre-process
|
522 |
+
sample = self.conv_in(sample)
|
523 |
+
|
524 |
+
image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
|
525 |
+
|
526 |
+
down_block_res_samples = (sample,)
|
527 |
+
for block_idx, downsample_block in enumerate(self.down_blocks):
|
528 |
+
if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
|
529 |
+
sample, res_samples = downsample_block(
|
530 |
+
hidden_states=sample,
|
531 |
+
temb=emb,
|
532 |
+
encoder_hidden_states=encoder_hidden_states,
|
533 |
+
image_only_indicator=image_only_indicator,
|
534 |
+
pose_feature=pose_features[block_idx]
|
535 |
+
)
|
536 |
+
else:
|
537 |
+
sample, res_samples = downsample_block(
|
538 |
+
hidden_states=sample,
|
539 |
+
temb=emb,
|
540 |
+
image_only_indicator=image_only_indicator,
|
541 |
+
)
|
542 |
+
|
543 |
+
down_block_res_samples += res_samples
|
544 |
+
|
545 |
+
# 4. mid
|
546 |
+
sample = self.mid_block(
|
547 |
+
hidden_states=sample,
|
548 |
+
temb=emb,
|
549 |
+
encoder_hidden_states=encoder_hidden_states,
|
550 |
+
image_only_indicator=image_only_indicator,
|
551 |
+
pose_feature=pose_features[-1]
|
552 |
+
)
|
553 |
+
|
554 |
+
# 5. up
|
555 |
+
for block_idx, upsample_block in enumerate(self.up_blocks):
|
556 |
+
res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
|
557 |
+
down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
|
558 |
+
|
559 |
+
if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
|
560 |
+
sample = upsample_block(
|
561 |
+
hidden_states=sample,
|
562 |
+
temb=emb,
|
563 |
+
res_hidden_states_tuple=res_samples,
|
564 |
+
encoder_hidden_states=encoder_hidden_states,
|
565 |
+
image_only_indicator=image_only_indicator,
|
566 |
+
pose_feature=pose_features[-(block_idx + 1)]
|
567 |
+
)
|
568 |
+
else:
|
569 |
+
sample = upsample_block(
|
570 |
+
hidden_states=sample,
|
571 |
+
temb=emb,
|
572 |
+
res_hidden_states_tuple=res_samples,
|
573 |
+
image_only_indicator=image_only_indicator,
|
574 |
+
)
|
575 |
+
|
576 |
+
# 6. post-process
|
577 |
+
sample = self.conv_norm_out(sample)
|
578 |
+
sample = self.conv_act(sample)
|
579 |
+
sample = self.conv_out(sample)
|
580 |
+
|
581 |
+
# 7. Reshape back to original shape
|
582 |
+
sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
|
583 |
+
|
584 |
+
if not return_dict:
|
585 |
+
return (sample,)
|
586 |
+
|
587 |
+
return UNetSpatioTemporalConditionOutput(sample=sample)
|
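The bookkeeping at the top of forward is the part most easily gotten wrong when driving this UNet directly: the batch and frame axes are flattened, the per-video embeddings are repeated once per frame, and pose_features stays a list with one tensor per resolution, consumed in order by the down blocks and in reverse by the up blocks. A standalone illustration with assumed sizes (the latent resolution and embedding widths below are assumptions for a 320x576, 25-frame SVD-xt setup):

import torch

batch_size, num_frames, latent_c, h, w = 1, 25, 8, 40, 72        # assumed sizes
sample = torch.randn(batch_size, num_frames, latent_c, h, w)     # noisy latents
emb = torch.randn(batch_size, 1280)                              # time embedding, one per video
encoder_hidden_states = torch.randn(batch_size, 1, 1024)         # CLIP image embedding

sample = sample.flatten(0, 1)                                                         # [b*f, 8, h, w]
emb = emb.repeat_interleave(num_frames, dim=0)                                        # [b*f, 1280]
encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)    # [b*f, 1, 1024]

print(sample.shape, emb.shape, encoder_hidden_states.shape)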
cameractrl/models/unet_3d_blocks.py
ADDED
@@ -0,0 +1,461 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from typing import Union, Tuple, Optional, Dict, Any
|
4 |
+
from diffusers.utils import is_torch_version
|
5 |
+
from diffusers.models.resnet import (
|
6 |
+
Downsample2D,
|
7 |
+
SpatioTemporalResBlock,
|
8 |
+
Upsample2D
|
9 |
+
)
|
10 |
+
from diffusers.models.unet_3d_blocks import (
|
11 |
+
DownBlockSpatioTemporal,
|
12 |
+
UpBlockSpatioTemporal,
|
13 |
+
)
|
14 |
+
|
15 |
+
from cameractrl.models.transformer_temporal import TransformerSpatioTemporalModelPoseCond
|
16 |
+
|
17 |
+
|
18 |
+
def get_down_block(
|
19 |
+
down_block_type: str,
|
20 |
+
num_layers: int,
|
21 |
+
in_channels: int,
|
22 |
+
out_channels: int,
|
23 |
+
temb_channels: int,
|
24 |
+
add_downsample: bool,
|
25 |
+
num_attention_heads: int,
|
26 |
+
    cross_attention_dim: Optional[int] = None,
    transformer_layers_per_block: int = 1,
    **kwargs,
) -> Union[
    "DownBlockSpatioTemporal",
    "CrossAttnDownBlockSpatioTemporalPoseCond",
]:
    if down_block_type == "DownBlockSpatioTemporal":
        # added for SDV
        return DownBlockSpatioTemporal(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            add_downsample=add_downsample,
        )
    elif down_block_type == "CrossAttnDownBlockSpatioTemporalPoseCond":
        # added for SDV
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockSpatioTemporal")
        return CrossAttnDownBlockSpatioTemporalPoseCond(
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            num_layers=num_layers,
            transformer_layers_per_block=transformer_layers_per_block,
            add_downsample=add_downsample,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads,
        )

    raise ValueError(f"{down_block_type} does not exist.")


def get_up_block(
    up_block_type: str,
    num_layers: int,
    in_channels: int,
    out_channels: int,
    prev_output_channel: int,
    temb_channels: int,
    add_upsample: bool,
    num_attention_heads: int,
    resolution_idx: Optional[int] = None,
    cross_attention_dim: Optional[int] = None,
    transformer_layers_per_block: int = 1,
    **kwargs,
) -> Union[
    "UpBlockSpatioTemporal",
    "CrossAttnUpBlockSpatioTemporalPoseCond",
]:
    if up_block_type == "UpBlockSpatioTemporal":
        # added for SDV
        return UpBlockSpatioTemporal(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            resolution_idx=resolution_idx,
            add_upsample=add_upsample,
        )
    elif up_block_type == "CrossAttnUpBlockSpatioTemporalPoseCond":
        # added for SDV
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockSpatioTemporal")
        return CrossAttnUpBlockSpatioTemporalPoseCond(
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            num_layers=num_layers,
            transformer_layers_per_block=transformer_layers_per_block,
            add_upsample=add_upsample,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads,
            resolution_idx=resolution_idx,
        )

    raise ValueError(f"{up_block_type} does not exist.")


class CrossAttnDownBlockSpatioTemporalPoseCond(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        num_layers: int = 1,
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        num_attention_heads: int = 1,
        cross_attention_dim: int = 1280,
        add_downsample: bool = True,
    ):
        super().__init__()
        resnets = []
        attentions = []

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        for i in range(num_layers):
            in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                SpatioTemporalResBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=1e-6,
                )
            )
            attentions.append(
                TransformerSpatioTemporalModelPoseCond(
                    num_attention_heads,
                    out_channels // num_attention_heads,
                    in_channels=out_channels,
                    num_layers=transformer_layers_per_block[i],
                    cross_attention_dim=cross_attention_dim,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        if add_downsample:
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        out_channels,
                        use_conv=True,
                        out_channels=out_channels,
                        padding=1,
                        name="op",
                    )
                ]
            )
        else:
            self.downsamplers = None

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.FloatTensor,  # [bs * frame, c, h, w]
        temb: Optional[torch.FloatTensor] = None,  # [bs * frame, c]
        encoder_hidden_states: Optional[torch.FloatTensor] = None,  # [bs * frame, 1, c]
        image_only_indicator: Optional[torch.Tensor] = None,  # [bs, frame]
        pose_feature: Optional[torch.Tensor] = None  # [bs, c, frame, h, w]
    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
        output_states = ()

        blocks = list(zip(self.resnets, self.attentions))
        for resnet, attn in blocks:
            if self.training and self.gradient_checkpointing:  # TODO

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(resnet),
                    hidden_states,
                    temb,
                    image_only_indicator,
                    **ckpt_kwargs,
                )

                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                    return_dict=False,
                )[0]
            else:
                hidden_states = resnet(
                    hidden_states,
                    temb,
                    image_only_indicator=image_only_indicator,
                )  # [bs * frame, c, h, w]
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                    pose_feature=pose_feature,
                    return_dict=False,
                )[0]

            output_states = output_states + (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states = output_states + (hidden_states,)

        return hidden_states, output_states


class UNetMidBlockSpatioTemporalPoseCond(nn.Module):
    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        num_layers: int = 1,
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        num_attention_heads: int = 1,
        cross_attention_dim: int = 1280,
    ):
        super().__init__()

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads

        # support for variable transformer layers per block
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        # there is always at least one resnet
        resnets = [
            SpatioTemporalResBlock(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                eps=1e-5,
            )
        ]
        attentions = []

        for i in range(num_layers):
            attentions.append(
                TransformerSpatioTemporalModelPoseCond(
                    num_attention_heads,
                    in_channels // num_attention_heads,
                    in_channels=in_channels,
                    num_layers=transformer_layers_per_block[i],
                    cross_attention_dim=cross_attention_dim,
                )
            )

            resnets.append(
                SpatioTemporalResBlock(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    temb_channels=temb_channels,
                    eps=1e-5,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        temb: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        image_only_indicator: Optional[torch.Tensor] = None,
        pose_feature: Optional[torch.Tensor] = None  # [bs, c, frame, h, w]
    ) -> torch.FloatTensor:
        hidden_states = self.resnets[0](
            hidden_states,
            temb,
            image_only_indicator=image_only_indicator,
        )

        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            if self.training and self.gradient_checkpointing:  # TODO

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                    return_dict=False,
                )[0]
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(resnet),
                    hidden_states,
                    temb,
                    image_only_indicator,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                    pose_feature=pose_feature,
                    return_dict=False,
                )[0]
                hidden_states = resnet(
                    hidden_states,
                    temb,
                    image_only_indicator=image_only_indicator,
                )

        return hidden_states


class CrossAttnUpBlockSpatioTemporalPoseCond(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        prev_output_channel: int,
        temb_channels: int,
        resolution_idx: Optional[int] = None,
        num_layers: int = 1,
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        resnet_eps: float = 1e-6,
        num_attention_heads: int = 1,
        cross_attention_dim: int = 1280,
        add_upsample: bool = True,
    ):
        super().__init__()
        resnets = []
        attentions = []

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads

        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        for i in range(num_layers):
            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                SpatioTemporalResBlock(
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                )
            )
            attentions.append(
                TransformerSpatioTemporalModelPoseCond(
                    num_attention_heads,
                    out_channels // num_attention_heads,
                    in_channels=out_channels,
                    num_layers=transformer_layers_per_block[i],
                    cross_attention_dim=cross_attention_dim,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        if add_upsample:
            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
        else:
            self.upsamplers = None

        self.gradient_checkpointing = False
        self.resolution_idx = resolution_idx

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
        temb: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        image_only_indicator: Optional[torch.Tensor] = None,
        pose_feature: Optional[torch.Tensor] = None  # [bs, c, frame, h, w]
    ) -> torch.FloatTensor:
        for resnet, attn in zip(self.resnets, self.attentions):
            # pop res hidden states
            res_hidden_states = res_hidden_states_tuple[-1]
            res_hidden_states_tuple = res_hidden_states_tuple[:-1]

            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            if self.training and self.gradient_checkpointing:  # TODO

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(resnet),
                    hidden_states,
                    temb,
                    image_only_indicator,
                    **ckpt_kwargs,
                )
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                    return_dict=False,
                )[0]
            else:
                hidden_states = resnet(
                    hidden_states,
                    temb,
                    image_only_indicator=image_only_indicator,
                )
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                    pose_feature=pose_feature,
                    return_dict=False,
                )[0]

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states)

        return hidden_states
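For reference, a minimal sketch (not part of the commit) of how the `get_down_block` factory above dispatches on the block-type string; the channel and head counts below are illustrative assumptions, not values taken from the repository configs.

```python
# Illustrative only: build a pose-conditioned cross-attention down block via the factory.
# Channel sizes, head count, and cross_attention_dim here are assumed for the example.
from cameractrl.models.unet_3d_blocks import get_down_block

down_block = get_down_block(
    "CrossAttnDownBlockSpatioTemporalPoseCond",
    num_layers=2,
    in_channels=320,
    out_channels=320,
    temb_channels=1280,
    add_downsample=True,
    num_attention_heads=5,
    cross_attention_dim=1024,
    transformer_layers_per_block=1,
)
print(type(down_block).__name__)  # CrossAttnDownBlockSpatioTemporalPoseCond
```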
cameractrl/pipelines/pipeline_animation.py
ADDED
@@ -0,0 +1,523 @@
# Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py

import inspect

import pandas as pd
import torch
import PIL.Image

from typing import Callable, List, Optional, Union, Dict
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
    _resize_with_antialiasing,
    _append_dims,
    tensor2vid,
    StableVideoDiffusionPipelineOutput
)

from cameractrl.models.pose_adaptor import CameraPoseEncoder
from cameractrl.models.unet import UNetSpatioTemporalConditionModelPoseCond


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class StableVideoDiffusionPipelinePoseCond(DiffusionPipeline):
    r"""
    Pipeline to generate video from an input image using Stable Video Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
        unet ([`UNetSpatioTemporalConditionModel`]):
            A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
        scheduler ([`EulerDiscreteScheduler`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images.
    """

    model_cpu_offload_seq = "image_encoder->unet->vae"
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNetSpatioTemporalConditionModelPoseCond,
        scheduler: EulerDiscreteScheduler,
        feature_extractor: CLIPImageProcessor,
        pose_encoder: CameraPoseEncoder
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
            pose_encoder=pose_encoder
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free_guidance, do_resize_normalize):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.image_processor.pil_to_numpy(image)
            image = self.image_processor.numpy_to_pt(image)

            # We normalize the image before resizing to match with the original implementation.
            # Then we unnormalize it after resizing.
            image = image * 2.0 - 1.0
            image = _resize_with_antialiasing(image, (224, 224))
            image = (image + 1.0) / 2.0

            # Normalize the image for CLIP input
            image = self.feature_extractor(
                images=image,
                do_normalize=True,
                do_center_crop=False,
                do_resize=False,
                do_rescale=False,
                return_tensors="pt",
            ).pixel_values
        elif do_resize_normalize:
            image = _resize_with_antialiasing(image, (224, 224))
            image = (image + 1.0) / 2.0
            # Normalize the image for CLIP input
            image = self.feature_extractor(
                images=image,
                do_normalize=True,
                do_center_crop=False,
                do_resize=False,
                do_rescale=False,
                return_tensors="pt",
            ).pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            negative_image_embeddings = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])

        return image_embeddings

    def _encode_vae_image(
        self,
        image: torch.Tensor,
        device,
        num_videos_per_prompt,
        do_classifier_free_guidance,
    ):
        image = image.to(device=device)
        image_latents = self.vae.encode(image).latent_dist.mode()

        if do_classifier_free_guidance:
            negative_image_latents = torch.zeros_like(image_latents)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_latents = torch.cat([negative_image_latents, image_latents])

        # duplicate image_latents for each generation per prompt, using mps friendly method
        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)

        return image_latents

    def _get_add_time_ids(
        self,
        fps,
        motion_bucket_id,
        noise_aug_strength,
        dtype,
        batch_size,
        num_videos_per_prompt,
        do_classifier_free_guidance,
    ):
        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]

        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
            )

        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)

        if do_classifier_free_guidance:
            add_time_ids = torch.cat([add_time_ids, add_time_ids])

        return add_time_ids

    def decode_latents(self, latents, num_frames, decode_chunk_size=14):
        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
        latents = latents.flatten(0, 1)

        latents = 1 / self.vae.config.scaling_factor * latents

        accepts_num_frames = "num_frames" in set(inspect.signature(self.vae.forward).parameters.keys())

        # decode decode_chunk_size frames at a time to avoid OOM
        frames = []
        for i in range(0, latents.shape[0], decode_chunk_size):
            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
            decode_kwargs = {}
            if accepts_num_frames:
                # we only pass num_frames_in if it's expected
                decode_kwargs["num_frames"] = num_frames_in

            frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
            frames.append(frame)
        frames = torch.cat(frames, dim=0)

        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        frames = frames.float()
        return frames

    def check_inputs(self, image, height, width):
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def prepare_latents(
        self,
        batch_size,
        num_frames,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        shape = (
            batch_size,
            num_frames,
            num_channels_latents // 2,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    @property
    def guidance_scale(self):
        return self._guidance_scale

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @torch.no_grad()
    def __call__(
        self,
        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
        pose_embedding: torch.FloatTensor,
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        num_inference_steps: int = 25,
        min_guidance_scale: float = 1.0,
        max_guidance_scale: float = 3.0,
        fps: int = 7,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.02,
        do_resize_normalize: bool = True,
        do_image_process: bool = False,
        decode_chunk_size: Optional[int] = None,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        return_dict: bool = True,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_frames (`int`, *optional*):
                The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            min_guidance_scale (`float`, *optional*, defaults to 1.0):
                The minimum guidance scale. Used for the classifier free guidance with first frame.
            max_guidance_scale (`float`, *optional*, defaults to 3.0):
                The maximum guidance scale. Used for the classifier free guidance with last frame.
            fps (`int`, *optional*, defaults to 7):
                Frames per second. The rate at which the generated images shall be exported to a video after generation.
                Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
            motion_bucket_id (`int`, *optional*, defaults to 127):
                The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video.
            noise_aug_strength (`float`, *optional*, defaults to 0.02):
                The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
            decode_chunk_size (`int`, *optional*):
                The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
                between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
                for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.

        Returns:
            [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list of list with the generated frames.

        Examples:

        ```py
        from diffusers import StableVideoDiffusionPipeline
        from diffusers.utils import load_image, export_to_video

        pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
        pipe.to("cuda")

        image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200")
        image = image.resize((1024, 576))

        frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
        export_to_video(frames, "generated.mp4", fps=7)
        ```
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width)

        # 2. Define call parameters
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        else:
            batch_size = image.shape[0]
        device = pose_embedding.device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = max_guidance_scale > 1.0

        # 3. Encode input image
        image_embeddings = self._encode_image(image, device, num_videos_per_prompt, do_classifier_free_guidance, do_resize_normalize=do_resize_normalize)

        # NOTE: Stable Diffusion Video was conditioned on fps - 1, which
        # is why it is reduced here.
        # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
        fps = fps - 1

        # 4. Encode input image using VAE
        if do_image_process:
            image = self.image_processor.preprocess(image, height=height, width=width).to(image_embeddings.device)
        noise = randn_tensor(image.shape, generator=generator, device=image.device, dtype=image.dtype)
        image = image + noise_aug_strength * noise

        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
        if needs_upcasting:
            self.vae.to(dtype=torch.float32)

        image_latents = self._encode_vae_image(image, device, num_videos_per_prompt, do_classifier_free_guidance)
        image_latents = image_latents.to(image_embeddings.dtype)

        # cast back to fp16 if needed
        if needs_upcasting:
            self.vae.to(dtype=torch.float16)

        # Repeat the image latents for each frame so we can concatenate them with the noise
        # image_latents [batch, channels, height, width] -> [batch, num_frames, channels, height, width]
        image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)

        # 5. Get Added Time IDs
        added_time_ids = self._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            image_embeddings.dtype,
            batch_size,
            num_videos_per_prompt,
            do_classifier_free_guidance,
        )
        added_time_ids = added_time_ids.to(device)

        # 6. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 7. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_frames,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )  # [bs, frame, c, h, w]

        # 8. Prepare guidance scale
        guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
        guidance_scale = guidance_scale.to(device, latents.dtype)
        guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
        guidance_scale = _append_dims(guidance_scale, latents.ndim)  # [bs, frame, 1, 1, 1]

        self._guidance_scale = guidance_scale

        # 9. Prepare pose features
        assert pose_embedding.ndim == 5  # [b, f, c, h, w]
        pose_features = self.pose_encoder(pose_embedding)  # list of [b, c, f, h, w]
        pose_features = [torch.cat([x, x], dim=0) for x in pose_features] if do_classifier_free_guidance else pose_features

        # 10. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Concatenate image_latents over the channels dimension
                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=image_embeddings,
                    added_time_ids=added_time_ids,
                    pose_features=pose_features,
                    return_dict=False,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents).prev_sample

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

        if not output_type == "latent":
            # cast back to fp16 if needed
            if needs_upcasting:
                self.vae.to(dtype=torch.float16)
            frames = self.decode_latents(latents, num_frames, decode_chunk_size)  # [b, c, f, h, w]
            frames = tensor2vid(frames, self.image_processor, output_type=output_type)
        else:
            frames = latents

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames

        return StableVideoDiffusionPipelineOutput(frames=frames)
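A minimal usage sketch (not part of the commit) for the pose-conditioned pipeline above. Module construction (vae, unet, pose_encoder, and the rest) is omitted here; the 320x576 resolution follows the repository's config names, and the 5-D `pose_embedding` shape follows the assert in `__call__`.

```python
# Illustrative only: wrap a call to StableVideoDiffusionPipelinePoseCond.
# How `pipe` and `pose_embedding` are built is left to inference_cameractrl.py / app.py.
import torch
from cameractrl.pipelines.pipeline_animation import StableVideoDiffusionPipelinePoseCond


def generate_video(pipe: StableVideoDiffusionPipelinePoseCond, first_frame, pose_embedding: torch.Tensor):
    # pose_embedding must be 5-D: [batch, frames, channels, height, width]
    output = pipe(
        image=first_frame,
        pose_embedding=pose_embedding,
        height=320,
        width=576,
        num_frames=pose_embedding.shape[1],
        num_inference_steps=25,
        min_guidance_scale=1.0,
        max_guidance_scale=3.0,
        do_image_process=True,
    )
    # output.frames is a list with one video (a list of PIL frames) per prompt
    return output.frames[0]
```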
cameractrl/utils/convert_from_ckpt.py
ADDED
@@ -0,0 +1,556 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Conversion script for the Stable Diffusion checkpoints."""

import re
from transformers import CLIPTextModel

def shave_segments(path, n_shave_prefix_segments=1):
    """
    Removes segments. Positive values shave the first segments, negative shave the last segments.
    """
    if n_shave_prefix_segments >= 0:
        return ".".join(path.split(".")[n_shave_prefix_segments:])
    else:
        return ".".join(path.split(".")[:n_shave_prefix_segments])


def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        new_item = old_item.replace("in_layers.0", "norm1")
        new_item = new_item.replace("in_layers.2", "conv1")

        new_item = new_item.replace("out_layers.0", "norm2")
        new_item = new_item.replace("out_layers.3", "conv2")

        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
        new_item = new_item.replace("skip_connection", "conv_shortcut")

        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

        mapping.append({"old": old_item, "new": new_item})

    return mapping


def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        new_item = old_item

        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

        mapping.append({"old": old_item, "new": new_item})

    return mapping


def renew_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        new_item = old_item

        # new_item = new_item.replace('norm.weight', 'group_norm.weight')
        # new_item = new_item.replace('norm.bias', 'group_norm.bias')

        # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
        # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')

        # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

        mapping.append({"old": old_item, "new": new_item})

    return mapping


def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        new_item = old_item

        new_item = new_item.replace("norm.weight", "group_norm.weight")
        new_item = new_item.replace("norm.bias", "group_norm.bias")

        new_item = new_item.replace("q.weight", "query.weight")
        new_item = new_item.replace("q.bias", "query.bias")

        new_item = new_item.replace("k.weight", "key.weight")
        new_item = new_item.replace("k.bias", "key.bias")

        new_item = new_item.replace("v.weight", "value.weight")
        new_item = new_item.replace("v.bias", "value.bias")

        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")

        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

        mapping.append({"old": old_item, "new": new_item})

    return mapping


def assign_to_checkpoint(
    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
):
    """
    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
    attention layers, and takes into account additional replacements that may arise.

    Assigns the weights to the new checkpoint.
    """
    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

    # Splits the attention layers into three variables.
    if attention_paths_to_split is not None:
        for path, path_map in attention_paths_to_split.items():
            old_tensor = old_checkpoint[path]
            channels = old_tensor.shape[0] // 3

            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)

            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3

            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
            query, key, value = old_tensor.split(channels // num_heads, dim=1)

            checkpoint[path_map["query"]] = query.reshape(target_shape)
            checkpoint[path_map["key"]] = key.reshape(target_shape)
            checkpoint[path_map["value"]] = value.reshape(target_shape)

    for path in paths:
        new_path = path["new"]

        # These have already been assigned
        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
            continue

        # Global renaming happens here
        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")

        if additional_replacements is not None:
            for replacement in additional_replacements:
                new_path = new_path.replace(replacement["old"], replacement["new"])

        # proj_attn.weight has to be converted from conv 1D to linear
        if "proj_attn.weight" in new_path:
            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
        else:
            checkpoint[new_path] = old_checkpoint[path["old"]]


def conv_attn_to_linear(checkpoint):
    keys = list(checkpoint.keys())
    attn_keys = ["query.weight", "key.weight", "value.weight"]
    for key in keys:
        if ".".join(key.split(".")[-2:]) in attn_keys:
            if checkpoint[key].ndim > 2:
                checkpoint[key] = checkpoint[key][:, :, 0, 0]
        elif "proj_attn.weight" in key:
            if checkpoint[key].ndim > 2:
                checkpoint[key] = checkpoint[key][:, :, 0]


def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False):
    """
    Takes a state dict and a config, and returns a converted checkpoint.
    """

    # extract state_dict for UNet
    unet_state_dict = {}
    keys = list(checkpoint.keys())

    if controlnet:
        unet_key = "control_model."
    else:
        unet_key = "model.diffusion_model."

    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
        print(
            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
        )
        for key in keys:
            if key.startswith("model.diffusion_model"):
                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
    else:
        if sum(k.startswith("model_ema") for k in keys) > 100:
            print(
                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
            )

        for key in keys:
            if key.startswith(unet_key):
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)

    new_checkpoint = {}

    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]

    if config["class_embed_type"] is None:
        # No parameters to port
        ...
    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
    else:
        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")

    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]

    if not controlnet:
        new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
        new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
        new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
        new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]

    # Retrieves the keys for the input blocks only
    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
    input_blocks = {
        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
        for layer_id in range(num_input_blocks)
    }

    # Retrieves the keys for the middle blocks only
    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
    middle_blocks = {
        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
        for layer_id in range(num_middle_blocks)
    }

    # Retrieves the keys for the output blocks only
    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
    output_blocks = {
        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
        for layer_id in range(num_output_blocks)
    }

    for i in range(1, num_input_blocks):
        block_id = (i - 1) // (config["layers_per_block"] + 1)
        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)

        resnets = [
            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
        ]
        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]

        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.weight"
            )
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.bias"
            )

        paths = renew_resnet_paths(resnets)
        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
        assign_to_checkpoint(
            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
        )

        if len(attentions):
            paths = renew_attention_paths(attentions)
            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

    resnet_0 = middle_blocks[0]
    attentions = middle_blocks[1]
    resnet_1 = middle_blocks[2]

    resnet_0_paths = renew_resnet_paths(resnet_0)
    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)

    resnet_1_paths = renew_resnet_paths(resnet_1)
    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)

    attentions_paths = renew_attention_paths(attentions)
    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(
        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
    )

    for i in range(num_output_blocks):
        block_id = i // (config["layers_per_block"] + 1)
        layer_in_block_id = i % (config["layers_per_block"] + 1)
        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
        output_block_list = {}

        for layer in output_block_layers:
            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
            if layer_id in output_block_list:
                output_block_list[layer_id].append(layer_name)
            else:
                output_block_list[layer_id] = [layer_name]

        if len(output_block_list) > 1:
            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]

            resnet_0_paths = renew_resnet_paths(resnets)
            paths = renew_resnet_paths(resnets)

            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
            if ["conv.bias", "conv.weight"] in output_block_list.values():
                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.weight"
                ]
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.bias"
                ]

                # Clear attentions as they have been attributed above.
                if len(attentions) == 2:
                    attentions = []

            if len(attentions):
                paths = renew_attention_paths(attentions)
                meta_path = {
                    "old": f"output_blocks.{i}.1",
                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
                }
                assign_to_checkpoint(
                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
                )
        else:
            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
            for path in resnet_0_paths:
                old_path = ".".join(["output_blocks", str(i), path["old"]])
                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])

                new_checkpoint[new_path] = unet_state_dict[old_path]

    if controlnet:
        # conditioning embedding

        orig_index = 0

        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
            f"input_hint_block.{orig_index}.weight"
        )
        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
            f"input_hint_block.{orig_index}.bias"
        )

        orig_index += 2

        diffusers_index = 0

        while diffusers_index < 6:
            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
                f"input_hint_block.{orig_index}.weight"
            )
            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
                f"input_hint_block.{orig_index}.bias"
            )
            diffusers_index += 1
            orig_index += 2

        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
            f"input_hint_block.{orig_index}.weight"
        )
        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
            f"input_hint_block.{orig_index}.bias"
        )

        # down blocks
        for i in range(num_input_blocks):
            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")

        # mid block
        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
|
408 |
+
new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
|
409 |
+
|
410 |
+
return new_checkpoint
|
411 |
+
|
412 |
+
|
413 |
+
def convert_ldm_vae_checkpoint(checkpoint, config):
|
414 |
+
# extract state dict for VAE
|
415 |
+
vae_state_dict = {}
|
416 |
+
keys = list(checkpoint.keys())
|
417 |
+
vae_key = "first_stage_model." if any(k.startswith("first_stage_model.") for k in keys) else ""
|
418 |
+
for key in keys:
|
419 |
+
if key.startswith(vae_key):
|
420 |
+
vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
|
421 |
+
|
422 |
+
new_checkpoint = {}
|
423 |
+
|
424 |
+
new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
|
425 |
+
new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
|
426 |
+
new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
|
427 |
+
new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
|
428 |
+
new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
|
429 |
+
new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
|
430 |
+
|
431 |
+
new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
|
432 |
+
new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
|
433 |
+
new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
|
434 |
+
new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
|
435 |
+
new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
|
436 |
+
new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
|
437 |
+
|
438 |
+
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
|
439 |
+
new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
|
440 |
+
new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
|
441 |
+
new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
|
442 |
+
|
443 |
+
# Retrieves the keys for the encoder down blocks only
|
444 |
+
num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
|
445 |
+
down_blocks = {
|
446 |
+
layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
|
447 |
+
}
|
448 |
+
|
449 |
+
# Retrieves the keys for the decoder up blocks only
|
450 |
+
num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
|
451 |
+
up_blocks = {
|
452 |
+
layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
|
453 |
+
}
|
454 |
+
|
455 |
+
for i in range(num_down_blocks):
|
456 |
+
resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
|
457 |
+
|
458 |
+
if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
|
459 |
+
new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
|
460 |
+
f"encoder.down.{i}.downsample.conv.weight"
|
461 |
+
)
|
462 |
+
new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
|
463 |
+
f"encoder.down.{i}.downsample.conv.bias"
|
464 |
+
)
|
465 |
+
|
466 |
+
paths = renew_vae_resnet_paths(resnets)
|
467 |
+
meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
|
468 |
+
assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
|
469 |
+
|
470 |
+
mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
|
471 |
+
num_mid_res_blocks = 2
|
472 |
+
for i in range(1, num_mid_res_blocks + 1):
|
473 |
+
resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
|
474 |
+
|
475 |
+
paths = renew_vae_resnet_paths(resnets)
|
476 |
+
meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
|
477 |
+
assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
|
478 |
+
|
479 |
+
mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
|
480 |
+
paths = renew_vae_attention_paths(mid_attentions)
|
481 |
+
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
|
482 |
+
assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
|
483 |
+
conv_attn_to_linear(new_checkpoint)
|
484 |
+
|
485 |
+
for i in range(num_up_blocks):
|
486 |
+
block_id = num_up_blocks - 1 - i
|
487 |
+
resnets = [
|
488 |
+
key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
|
489 |
+
]
|
490 |
+
|
491 |
+
if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
|
492 |
+
new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
|
493 |
+
f"decoder.up.{block_id}.upsample.conv.weight"
|
494 |
+
]
|
495 |
+
new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
|
496 |
+
f"decoder.up.{block_id}.upsample.conv.bias"
|
497 |
+
]
|
498 |
+
|
499 |
+
paths = renew_vae_resnet_paths(resnets)
|
500 |
+
meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
|
501 |
+
assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
|
502 |
+
|
503 |
+
mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
|
504 |
+
num_mid_res_blocks = 2
|
505 |
+
for i in range(1, num_mid_res_blocks + 1):
|
506 |
+
resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
|
507 |
+
|
508 |
+
paths = renew_vae_resnet_paths(resnets)
|
509 |
+
meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
|
510 |
+
assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
|
511 |
+
|
512 |
+
mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
|
513 |
+
paths = renew_vae_attention_paths(mid_attentions)
|
514 |
+
meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
|
515 |
+
assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
|
516 |
+
conv_attn_to_linear(new_checkpoint)
|
517 |
+
return new_checkpoint
|
518 |
+
|
519 |
+
|
520 |
+
def convert_ldm_clip_checkpoint(checkpoint):
|
521 |
+
text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
|
522 |
+
keys = list(checkpoint.keys())
|
523 |
+
|
524 |
+
text_model_dict = {}
|
525 |
+
|
526 |
+
for key in keys:
|
527 |
+
if key.startswith("cond_stage_model.transformer"):
|
528 |
+
text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
|
529 |
+
|
530 |
+
text_model.load_state_dict(text_model_dict)
|
531 |
+
|
532 |
+
return text_model
|
533 |
+
|
534 |
+
|
535 |
+
textenc_conversion_lst = [
|
536 |
+
("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"),
|
537 |
+
("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
|
538 |
+
("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
|
539 |
+
("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
|
540 |
+
]
|
541 |
+
textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
|
542 |
+
|
543 |
+
textenc_transformer_conversion_lst = [
|
544 |
+
# (stable-diffusion, HF Diffusers)
|
545 |
+
("resblocks.", "text_model.encoder.layers."),
|
546 |
+
("ln_1", "layer_norm1"),
|
547 |
+
("ln_2", "layer_norm2"),
|
548 |
+
(".c_fc.", ".fc1."),
|
549 |
+
(".c_proj.", ".fc2."),
|
550 |
+
(".attn", ".self_attn"),
|
551 |
+
("ln_final.", "transformer.text_model.final_layer_norm."),
|
552 |
+
("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
|
553 |
+
("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
|
554 |
+
]
|
555 |
+
protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
|
556 |
+
textenc_pattern = re.compile("|".join(protected.keys()))
|
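The `protected` map and `textenc_pattern` built above are normally applied together to rename OpenCLIP-style text-encoder keys in a single regex pass; a minimal sketch, with a made-up checkpoint key for illustration only:

import re

# small subset of the (stable-diffusion -> HF Diffusers) renames listed above
conversion_lst = [
    ("resblocks.", "text_model.encoder.layers."),
    ("ln_1", "layer_norm1"),
    (".c_fc.", ".fc1."),
]
protected = {re.escape(old): new for old, new in conversion_lst}
pattern = re.compile("|".join(protected.keys()))

old_key = "resblocks.0.ln_1.weight"  # hypothetical checkpoint key
new_key = pattern.sub(lambda m: protected[re.escape(m.group(0))], old_key)
print(new_key)  # text_model.encoder.layers.0.layer_norm1.weight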
cameractrl/utils/convert_lora_safetensor_to_diffusers.py
ADDED
@@ -0,0 +1,154 @@
# coding=utf-8
# Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Conversion script for the LoRA's safetensors checkpoints. """

import argparse

import torch
from safetensors.torch import load_file

from diffusers import StableDiffusionPipeline
import pdb


def convert_motion_lora_ckpt_to_diffusers(pipeline, state_dict, alpha=1.0):
    # directly update weight in diffusers model
    for key in state_dict:
        # only process lora down key
        if "up." in key:
            continue

        up_key = key.replace(".down.", ".up.")
        model_key = key.replace("processor.", "").replace("_lora", "").replace("down.", "").replace("up.", "")
        model_key = model_key.replace("to_out.", "to_out.0.")
        layer_infos = model_key.split(".")[:-1]

        curr_layer = pipeline.unet
        while len(layer_infos) > 0:
            temp_name = layer_infos.pop(0)
            curr_layer = curr_layer.__getattr__(temp_name)

        weight_down = state_dict[key]
        weight_up = state_dict[up_key]
        curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)

    return pipeline


def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6):
    # load base model
    # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)

    # load LoRA weight from .safetensors
    # state_dict = load_file(checkpoint_path)

    visited = []

    # directly update weight in diffusers model
    for key in state_dict:
        # it is suggested to print out the key, it usually will be something like below
        # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"

        # as we have set the alpha beforehand, so just skip
        if ".alpha" in key or key in visited:
            continue

        if "text" in key:
            layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
            curr_layer = pipeline.text_encoder
        else:
            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
            curr_layer = pipeline.unet

        # find the target layer
        temp_name = layer_infos.pop(0)
        while len(layer_infos) > -1:
            try:
                curr_layer = curr_layer.__getattr__(temp_name)
                if len(layer_infos) > 0:
                    temp_name = layer_infos.pop(0)
                elif len(layer_infos) == 0:
                    break
            except Exception:
                if len(temp_name) > 0:
                    temp_name += "_" + layer_infos.pop(0)
                else:
                    temp_name = layer_infos.pop(0)

        pair_keys = []
        if "lora_down" in key:
            pair_keys.append(key.replace("lora_down", "lora_up"))
            pair_keys.append(key)
        else:
            pair_keys.append(key)
            pair_keys.append(key.replace("lora_up", "lora_down"))

        # update weight
        if len(state_dict[pair_keys[0]].shape) == 4:
            weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
            weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
        else:
            weight_up = state_dict[pair_keys[0]].to(torch.float32)
            weight_down = state_dict[pair_keys[1]].to(torch.float32)
            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)

        # update visited list
        for item in pair_keys:
            visited.append(item)

    return pipeline


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format."
    )
    parser.add_argument(
        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
    parser.add_argument(
        "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors"
    )
    parser.add_argument(
        "--lora_prefix_text_encoder",
        default="lora_te",
        type=str,
        help="The prefix of text encoder weight in safetensors",
    )
    parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW")
    parser.add_argument(
        "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not."
    )
    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")

    args = parser.parse_args()

    base_model_path = args.base_model_path
    checkpoint_path = args.checkpoint_path
    dump_path = args.dump_path
    lora_prefix_unet = args.lora_prefix_unet
    lora_prefix_text_encoder = args.lora_prefix_text_encoder
    alpha = args.alpha

    # load the base pipeline and the LoRA state dict, then merge them with convert_lora above
    pipe = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)
    lora_state_dict = load_file(checkpoint_path)
    pipe = convert_lora(pipe, lora_state_dict, lora_prefix_unet, lora_prefix_text_encoder, alpha)

    pipe = pipe.to(args.device)
    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
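Both converters apply the update rule quoted in the `--alpha` help string, W = W0 + alpha * deltaW with deltaW = up @ down; a toy shape check of that merge, independent of any real checkpoint:

import torch

rank, out_features, in_features = 4, 8, 16
weight = torch.zeros(out_features, in_features)    # W0, e.g. a Linear weight
lora_up = torch.randn(out_features, rank)          # "lora_up" / ".up." factor
lora_down = torch.randn(rank, in_features)         # "lora_down" / ".down." factor

alpha = 0.6
weight += alpha * torch.mm(lora_up, lora_down)     # same torch.mm update as convert_lora above
assert weight.shape == (out_features, in_features)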
cameractrl/utils/util.py
ADDED
@@ -0,0 +1,148 @@
import os
import functools
import logging
import sys
import imageio
import atexit
import importlib
import torch
import torchvision
import numpy as np
from termcolor import colored

from einops import rearrange


def instantiate_from_config(config, **additional_kwargs):
    if not "target" in config:
        if config == '__is_first_stage__':
            return None
        elif config == "__is_unconditional__":
            return None
        raise KeyError("Expected key `target` to instantiate.")

    additional_kwargs.update(config.get("kwargs", dict()))
    return get_obj_from_str(config["target"])(**additional_kwargs)


def get_obj_from_str(string, reload=False):
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
    videos = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for x in videos:
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
        x = (x * 255).numpy().astype(np.uint8)
        outputs.append(x)

    os.makedirs(os.path.dirname(path), exist_ok=True)
    imageio.mimsave(path, outputs, fps=fps)


# Logger utils are copied from detectron2
class _ColorfulFormatter(logging.Formatter):
    def __init__(self, *args, **kwargs):
        self._root_name = kwargs.pop("root_name") + "."
        self._abbrev_name = kwargs.pop("abbrev_name", "")
        if len(self._abbrev_name):
            self._abbrev_name = self._abbrev_name + "."
        super(_ColorfulFormatter, self).__init__(*args, **kwargs)

    def formatMessage(self, record):
        record.name = record.name.replace(self._root_name, self._abbrev_name)
        log = super(_ColorfulFormatter, self).formatMessage(record)
        if record.levelno == logging.WARNING:
            prefix = colored("WARNING", "red", attrs=["blink"])
        elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
            prefix = colored("ERROR", "red", attrs=["blink", "underline"])
        else:
            return log
        return prefix + " " + log


# cache the opened file object, so that different calls to `setup_logger`
# with the same file name can safely write to the same file.
@functools.lru_cache(maxsize=None)
def _cached_log_stream(filename):
    # use 1K buffer if writing to cloud storage
    io = open(filename, "a", buffering=1024 if "://" in filename else -1)
    atexit.register(io.close)
    return io


@functools.lru_cache()
def setup_logger(output, distributed_rank, color=True, name='AnimateDiff', abbrev_name=None):
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    if abbrev_name is None:
        abbrev_name = 'AD'
    plain_formatter = logging.Formatter(
        "[%(asctime)s] %(name)s:%(lineno)d %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S"
    )

    # stdout logging: master only
    if distributed_rank == 0:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        if color:
            formatter = _ColorfulFormatter(
                colored("[%(asctime)s %(name)s:%(lineno)d]: ", "green") + "%(message)s",
                datefmt="%m/%d %H:%M:%S",
                root_name=name,
                abbrev_name=str(abbrev_name),
            )
        else:
            formatter = plain_formatter
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    # file logging: all workers
    if output is not None:
        if output.endswith(".txt") or output.endswith(".log"):
            filename = output
        else:
            filename = os.path.join(output, "log.txt")
        if distributed_rank > 0:
            filename = filename + ".rank{}".format(distributed_rank)
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        fh = logging.StreamHandler(_cached_log_stream(filename))
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)

    return logger


def format_time(elapsed_time):
    # Time thresholds
    minute = 60
    hour = 60 * minute
    day = 24 * hour

    days, remainder = divmod(elapsed_time, day)
    hours, remainder = divmod(remainder, hour)
    minutes, seconds = divmod(remainder, minute)

    formatted_time = ""

    if days > 0:
        formatted_time += f"{int(days)} days "
    if hours > 0:
        formatted_time += f"{int(hours)} hours "
    if minutes > 0:
        formatted_time += f"{int(minutes)} minutes "
    if seconds > 0:
        formatted_time += f"{seconds:.2f} seconds"

    return formatted_time.strip()
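A small usage sketch of the helpers above (output paths and the tensor are made up; `save_videos_grid` expects a `b c t h w` tensor in [0, 1] when `rescale=False`):

import torch
from cameractrl.utils.util import save_videos_grid, setup_logger, format_time

logger = setup_logger("output/logs", distributed_rank=0, name="CameraCtrl")

videos = torch.rand(1, 3, 14, 320, 576)   # one fake 14-frame RGB clip
save_videos_grid(videos, "output/demo/grid.mp4", rescale=False, fps=8)

logger.info(format_time(3725.0))          # "1 hours 2 minutes 5.00 seconds"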
configs/train_cameractrl/svd_320_576_cameractrl.yaml
ADDED
@@ -0,0 +1,87 @@
output_dir: "output/cameractrl_model"
pretrained_model_path: "/mnt/petrelfs/liangzhengyang.d/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid/snapshots/2586584918a955489b599d4dc76b6bb3fdb3fbb2"
unet_subfolder: "unet"
down_block_types: ['CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'DownBlockSpatioTemporal']
up_block_types: ['UpBlockSpatioTemporal', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond']

train_data:
  root_path: "/mnt/petrelfs/share_data/hehao/datasets/RealEstate10k"
  annotation_json: "annotations/train.json"
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false

validation_data:
  root_path: "/mnt/petrelfs/share_data/hehao/datasets/RealEstate10k"
  annotation_json: "annotations/validation.json"
  sample_stride: 8
  sample_n_frames: 14
  relative_pose: true
  zero_t_first_frame: true
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
  return_clip_name: true

random_null_image_ratio: 0.15

pose_encoder_kwargs:
  downscale_factor: 8
  channels: [320, 640, 1280, 1280]
  nums_rb: 2
  cin: 384
  ksize: 1
  sk: true
  use_conv: false
  compression_factor: 1
  temporal_attention_nhead: 8
  attention_block_types: ["Temporal_Self", ]
  temporal_position_encoding: true
  temporal_position_encoding_max_len: 14

attention_processor_kwargs:
  add_spatial: false
  add_temporal: true
  attn_processor_name: 'attn1'
  pose_feature_dimensions: [320, 640, 1280, 1280]
  query_condition: true
  key_value_condition: true
  scale: 1.0

do_sanity_check: true
sample_before_training: false

max_train_epoch: -1
max_train_steps: 50000
validation_steps: 2500
validation_steps_tuple: [500, ]

learning_rate: 3.e-5

P_mean: 0.7
P_std: 1.6
condition_image_noise_mean: -3.0
condition_image_noise_std: 0.5
sample_latent: true
first_image_cond: true

num_inference_steps: 25
min_guidance_scale: 1.0
max_guidance_scale: 3.0

num_workers: 8
train_batch_size: 1
checkpointing_epochs: -1
checkpointing_steps: 10000

mixed_precision_training: false
enable_xformers_memory_efficient_attention: true

global_seed: 42
logger_interval: 10
configs/train_cameractrl/svdxt_320_576_cameractrl.yaml
ADDED
@@ -0,0 +1,88 @@
output_dir: "output/cameractrl_model"
pretrained_model_path: "/mnt/petrelfs/liangzhengyang.d/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid-xt/snapshots/4420c0886aad9930787308c62d9dd8befd4900f6"
unet_subfolder: "unet"
down_block_types: ['CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'CrossAttnDownBlockSpatioTemporalPoseCond', 'DownBlockSpatioTemporal']
up_block_types: ['UpBlockSpatioTemporal', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond', 'CrossAttnUpBlockSpatioTemporalPoseCond']

train_data:
  root_path: "/mnt/petrelfs/share_data/hehao/datasets/RealEstate10k"
  annotation_json: "annotations/train.json"
  sample_stride: 5
  sample_n_frames: 25
  relative_pose: true
  zero_t_first_frame: true
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false

validation_data:
  root_path: "/mnt/petrelfs/share_data/hehao/datasets/RealEstate10k"
  annotation_json: "annotations/validation.json"
  sample_stride: 5
  sample_n_frames: 25
  relative_pose: true
  zero_t_first_frame: true
  sample_size: [320, 576]
  rescale_fxy: true
  shuffle_frames: false
  use_flip: false
  return_clip_name: true

random_null_image_ratio: 0.15

pose_encoder_kwargs:
  downscale_factor: 8
  channels: [320, 640, 1280, 1280]
  nums_rb: 2
  cin: 384
  ksize: 1
  sk: true
  use_conv: false
  compression_factor: 1
  temporal_attention_nhead: 8
  attention_block_types: ["Temporal_Self", ]
  temporal_position_encoding: true
  temporal_position_encoding_max_len: 25

attention_processor_kwargs:
  add_spatial: false
  add_temporal: true
  attn_processor_name: 'attn1'
  pose_feature_dimensions: [320, 640, 1280, 1280]
  query_condition: true
  key_value_condition: true
  scale: 1.0

do_sanity_check: false
sample_before_training: false
video_length: 25

max_train_epoch: -1
max_train_steps: 50000
validation_steps: 2500
validation_steps_tuple: [1000, ]

learning_rate: 3.e-5

P_mean: 0.7
P_std: 1.6
condition_image_noise_mean: -3.0
condition_image_noise_std: 0.5
sample_latent: true
first_image_cond: true

num_inference_steps: 25
min_guidance_scale: 1.0
max_guidance_scale: 3.0

num_workers: 8
train_batch_size: 1
checkpointing_epochs: -1
checkpointing_steps: 10000

mixed_precision_training: false
enable_xformers_memory_efficient_attention: true

global_seed: 42
logger_interval: 10
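These YAML files also serve as the `--model_config` consumed by `inference_cameractrl.py` below; a minimal sketch of reading one with OmegaConf and the keys the inference script actually uses:

from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/train_cameractrl/svdxt_320_576_cameractrl.yaml")
print(cfg["unet_subfolder"])                                              # "unet"
print(cfg["down_block_types"])
print(cfg["pose_encoder_kwargs"]["temporal_position_encoding_max_len"])   # 25
print(cfg["attention_processor_kwargs"]["attn_processor_name"])           # "attn1"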
inference_cameractrl.py
ADDED
@@ -0,0 +1,255 @@
import argparse
import json
import os
import torch
import numpy as np
from tqdm import tqdm
from omegaconf import OmegaConf
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from diffusers import AutoencoderKLTemporalDecoder, EulerDiscreteScheduler
from diffusers.utils.import_utils import is_xformers_available
from diffusers.models.attention_processor import AttnProcessor2_0
from packaging import version as pver

from cameractrl.pipelines.pipeline_animation import StableVideoDiffusionPipelinePoseCond
from cameractrl.models.unet import UNetSpatioTemporalConditionModelPoseCond
from cameractrl.models.pose_adaptor import CameraPoseEncoder
from cameractrl.utils.util import save_videos_grid


class Camera(object):
    def __init__(self, entry):
        fx, fy, cx, cy = entry[1:5]
        self.fx = fx
        self.fy = fy
        self.cx = cx
        self.cy = cy
        w2c_mat = np.array(entry[7:]).reshape(3, 4)
        w2c_mat_4x4 = np.eye(4)
        w2c_mat_4x4[:3, :] = w2c_mat
        self.w2c_mat = w2c_mat_4x4
        self.c2w_mat = np.linalg.inv(w2c_mat_4x4)


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def custom_meshgrid(*args):
    # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
    if pver.parse(torch.__version__) < pver.parse('1.10'):
        return torch.meshgrid(*args)
    else:
        return torch.meshgrid(*args, indexing='ij')


def get_relative_pose(cam_params, zero_first_frame_scale):
    abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
    abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
    source_cam_c2w = abs_c2ws[0]
    if zero_first_frame_scale:
        cam_to_origin = 0
    else:
        cam_to_origin = np.linalg.norm(source_cam_c2w[:3, 3])
    target_cam_c2w = np.array([
        [1, 0, 0, 0],
        [0, 1, 0, -cam_to_origin],
        [0, 0, 1, 0],
        [0, 0, 0, 1]
    ])
    abs2rel = target_cam_c2w @ abs_w2cs[0]
    ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
    ret_poses = np.array(ret_poses, dtype=np.float32)
    return ret_poses


def ray_condition(K, c2w, H, W, device):
    # c2w: B, V, 4, 4
    # K: B, V, 4

    B = K.shape[0]

    j, i = custom_meshgrid(
        torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
        torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
    )
    i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]
    j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]

    fx, fy, cx, cy = K.chunk(4, dim=-1)  # B,V, 1

    zs = torch.ones_like(i)  # [B, HxW]
    xs = (i - cx) / fx * zs
    ys = (j - cy) / fy * zs
    zs = zs.expand_as(ys)

    directions = torch.stack((xs, ys, zs), dim=-1)  # B, V, HW, 3
    directions = directions / directions.norm(dim=-1, keepdim=True)  # B, V, HW, 3

    rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2)  # B, V, 3, HW
    rays_o = c2w[..., :3, 3]  # B, V, 3
    rays_o = rays_o[:, :, None].expand_as(rays_d)  # B, V, 3, HW
    # c2w @ directions
    rays_dxo = torch.linalg.cross(rays_o, rays_d)
    plucker = torch.cat([rays_dxo, rays_d], dim=-1)
    plucker = plucker.reshape(B, c2w.shape[1], H, W, 6)  # B, V, H, W, 6
    return plucker


def get_pipeline(ori_model_path, unet_subfolder, down_block_types, up_block_types, pose_encoder_kwargs,
                 attention_processor_kwargs, pose_adaptor_ckpt, enable_xformers, device):
    noise_scheduler = EulerDiscreteScheduler.from_pretrained(ori_model_path, subfolder="scheduler")
    feature_extractor = CLIPImageProcessor.from_pretrained(ori_model_path, subfolder="feature_extractor")
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(ori_model_path, subfolder="image_encoder")
    vae = AutoencoderKLTemporalDecoder.from_pretrained(ori_model_path, subfolder="vae")
    unet = UNetSpatioTemporalConditionModelPoseCond.from_pretrained(ori_model_path,
                                                                    subfolder=unet_subfolder,
                                                                    down_block_types=down_block_types,
                                                                    up_block_types=up_block_types)
    pose_encoder = CameraPoseEncoder(**pose_encoder_kwargs)
    print("Setting the attention processors")
    unet.set_pose_cond_attn_processor(enable_xformers=(enable_xformers and is_xformers_available()), **attention_processor_kwargs)
    print(f"Loading weights of camera encoder and attention processor from {pose_adaptor_ckpt}")
    ckpt_dict = torch.load(pose_adaptor_ckpt, map_location=unet.device)
    pose_encoder_state_dict = ckpt_dict['pose_encoder_state_dict']
    pose_encoder_m, pose_encoder_u = pose_encoder.load_state_dict(pose_encoder_state_dict)
    assert len(pose_encoder_m) == 0 and len(pose_encoder_u) == 0
    attention_processor_state_dict = ckpt_dict['attention_processor_state_dict']
    _, attention_processor_u = unet.load_state_dict(attention_processor_state_dict, strict=False)
    assert len(attention_processor_u) == 0
    print("Loading done")
    vae.set_attn_processor(AttnProcessor2_0())
    vae.to(device)
    image_encoder.to(device)
    unet.to(device)
    pipeline = StableVideoDiffusionPipelinePoseCond(
        vae=vae,
        image_encoder=image_encoder,
        unet=unet,
        scheduler=noise_scheduler,
        feature_extractor=feature_extractor,
        pose_encoder=pose_encoder
    )
    pipeline = pipeline.to(device)
    return pipeline


def main(args):
    os.makedirs(os.path.join(args.out_root, 'generated_videos'), exist_ok=True)
    os.makedirs(os.path.join(args.out_root, 'reference_images'), exist_ok=True)
    rank = args.local_rank
    setup_for_distributed(rank == 0)
    gpu_id = rank % torch.cuda.device_count()
    model_configs = OmegaConf.load(args.model_config)
    device = f"cuda:{gpu_id}"
    print('Constructing pipeline')
    pipeline = get_pipeline(args.ori_model_path, model_configs['unet_subfolder'], model_configs['down_block_types'],
                            model_configs['up_block_types'], model_configs['pose_encoder_kwargs'],
                            model_configs['attention_processor_kwargs'], args.pose_adaptor_ckpt, args.enable_xformers, device)
    print('Done')

    print('Loading K, R, t matrix')
    with open(args.trajectory_file, 'r') as f:
        poses = f.readlines()
    poses = [pose.strip().split(' ') for pose in poses[1:]]
    cam_params = [[float(x) for x in pose] for pose in poses]
    cam_params = [Camera(cam_param) for cam_param in cam_params]

    sample_wh_ratio = args.image_width / args.image_height
    pose_wh_ratio = args.original_pose_width / args.original_pose_height
    if pose_wh_ratio > sample_wh_ratio:
        resized_ori_w = args.image_height * pose_wh_ratio
        for cam_param in cam_params:
            cam_param.fx = resized_ori_w * cam_param.fx / args.image_width
    else:
        resized_ori_h = args.image_width / pose_wh_ratio
        for cam_param in cam_params:
            cam_param.fy = resized_ori_h * cam_param.fy / args.image_height
    intrinsic = np.asarray([[cam_param.fx * args.image_width,
                             cam_param.fy * args.image_height,
                             cam_param.cx * args.image_width,
                             cam_param.cy * args.image_height]
                            for cam_param in cam_params], dtype=np.float32)
    K = torch.as_tensor(intrinsic)[None]  # [1, n_frame, 4]
    c2ws = get_relative_pose(cam_params, zero_first_frame_scale=True)
    c2ws = torch.as_tensor(c2ws)[None]  # [1, n_frame, 4, 4]
    plucker_embedding = ray_condition(K, c2ws, args.image_height, args.image_width, device='cpu')  # b f h w 6
    plucker_embedding = plucker_embedding.permute(0, 1, 4, 2, 3).contiguous().to(device=device)

    prompt_dict = json.load(open(args.prompt_file, 'r'))
    prompt_images = prompt_dict['image_paths']
    prompt_captions = prompt_dict['captions']
    N = int(len(prompt_images) // args.n_procs)
    remainder = int(len(prompt_images) % args.n_procs)
    prompts_per_gpu = [N + 1 if gpu_id < remainder else N for gpu_id in range(args.n_procs)]
    low_idx = sum(prompts_per_gpu[:gpu_id])
    high_idx = low_idx + prompts_per_gpu[gpu_id]
    prompt_images = prompt_images[low_idx: high_idx]
    prompt_captions = prompt_captions[low_idx: high_idx]
    print(f"rank {rank} / {torch.cuda.device_count()}, number of prompts: {len(prompt_images)}")

    generator = torch.Generator(device=device)
    generator.manual_seed(42)

    for prompt_image, prompt_caption in tqdm(zip(prompt_images, prompt_captions)):
        save_name = "_".join(prompt_caption.split(" "))
        condition_image = Image.open(prompt_image)
        with torch.no_grad():
            sample = pipeline(
                image=condition_image,
                pose_embedding=plucker_embedding,
                height=args.image_height,
                width=args.image_width,
                num_frames=args.num_frames,
                num_inference_steps=args.num_inference_steps,
                min_guidance_scale=args.min_guidance_scale,
                max_guidance_scale=args.max_guidance_scale,
                do_image_process=True,
                generator=generator,
                output_type='pt'
            ).frames[0].transpose(0, 1).cpu()  # [3, f, h, w] 0-1
        resized_condition_image = condition_image.resize((args.image_width, args.image_height))
        save_videos_grid(sample[None], f"{os.path.join(args.out_root, 'generated_videos')}/{save_name}.mp4", rescale=False)
        resized_condition_image.save(os.path.join(args.out_root, 'reference_images', f'{save_name}.png'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--out_root", type=str)
    parser.add_argument("--image_height", type=int, default=320)
    parser.add_argument("--image_width", type=int, default=576)
    parser.add_argument("--num_frames", type=int, default=14)
    parser.add_argument("--ori_model_path", type=str)
    parser.add_argument("--unet_subfolder", type=str, default='unet')
    parser.add_argument("--enable_xformers", action='store_true')
    parser.add_argument("--pose_adaptor_ckpt", default=None)
    parser.add_argument("--num_inference_steps", type=int, default=25)
    parser.add_argument("--min_guidance_scale", type=float, default=1.0)
    parser.add_argument("--max_guidance_scale", type=float, default=3.0)
    parser.add_argument("--prompt_file", required=True, help='prompts path, json or txt')
    parser.add_argument("--trajectory_file", required=True)
    parser.add_argument("--original_pose_width", type=int, default=1280)
    parser.add_argument("--original_pose_height", type=int, default=720)
    parser.add_argument("--model_config", required=True)
    parser.add_argument("--n_procs", type=int, default=8)

    # DDP args
    parser.add_argument("--world_size", default=1, type=int,
                        help="number of the distributed processes.")
    parser.add_argument('--local-rank', type=int, default=-1,
                        help='Replica rank on the current node. This field is required '
                             'by `torch.distributed.launch`.')
    args = parser.parse_args()
    main(args)
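The trajectory handling in `main` can also be reused on its own; a rough sketch (the pose-file path is a placeholder, and the aspect-ratio rescaling of fx/fy done in `main` is omitted) of turning a pose file into the `pose_embedding` the pipeline expects:

import numpy as np
import torch
from inference_cameractrl import Camera, get_relative_pose, ray_condition

H, W = 320, 576
with open("path/to/trajectory.txt", "r") as f:
    lines = f.readlines()[1:]  # skip the header line, as main() does
cam_params = [Camera([float(x) for x in line.strip().split(" ")]) for line in lines]

intrinsic = np.asarray([[c.fx * W, c.fy * H, c.cx * W, c.cy * H] for c in cam_params], dtype=np.float32)
K = torch.as_tensor(intrinsic)[None]                           # [1, n_frame, 4]
c2ws = torch.as_tensor(get_relative_pose(cam_params, zero_first_frame_scale=True))[None]
plucker = ray_condition(K, c2ws, H, W, device="cpu")           # [1, n_frame, H, W, 6]
pose_embedding = plucker.permute(0, 1, 4, 2, 3).contiguous()   # [1, n_frame, 6, H, W]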
requirements.txt
ADDED
@@ -0,0 +1,20 @@
--extra-index-url https://download.pytorch.org/whl/cu121
torch
torchvision
diffusers==0.24.0
imageio==2.27.0
imageio-ffmpeg==0.4.9
transformers==4.39.3
gradio==4.26.0
accelerate==0.30.0
opencv-python
gdown
einops
decord
omegaconf
safetensors
wandb
triton
termcolor