Initial commit
Browse files
- .gitattributes +2 -0
- LICENSE.md +51 -0
- README.md +11 -5
- app.py +357 -0
- demo_files/comp.gif +3 -0
- demo_files/examples/animal_character.png +3 -0
- demo_files/examples/animal_character_2.png +3 -0
- demo_files/examples/axe.png +3 -0
- demo_files/examples/chair1.png +3 -0
- demo_files/examples/character1.png +3 -0
- demo_files/examples/otter_samurai.png +3 -0
- demo_files/examples/raccoon_wizard.png +3 -0
- demo_files/examples/stylized-rocks.png +3 -0
- demo_files/examples/tree.png +3 -0
- demo_files/hdri/abandoned_tiled_room_1k.hdr +0 -0
- demo_files/hdri/metro_noord_1k.hdr +0 -0
- demo_files/hdri/neon_photostudio_1k.hdr +0 -0
- demo_files/hdri/peppermint_powerplant_1k.hdr +0 -0
- demo_files/hdri/rainforest_trail_1k.hdr +0 -0
- demo_files/hdri/studio_small_08_1k.hdr +0 -0
- demo_files/hdri/urban_alley_01_1k.hdr +0 -0
- demo_files/scatterplot.jpg +0 -0
- demo_files/teaser.gif +3 -0
- load/tets/160_tets.npz +3 -0
- requirements.txt +13 -0
- sf3d/box_uv_unwrap.py +610 -0
- sf3d/models/camera.py +32 -0
- sf3d/models/global_estimator/multi_head_estimator.py +118 -0
- sf3d/models/image_estimator/clip_based_estimator.py +168 -0
- sf3d/models/isosurface.py +229 -0
- sf3d/models/mesh.py +172 -0
- sf3d/models/network.py +195 -0
- sf3d/models/tokenizers/dinov2.py +1196 -0
- sf3d/models/tokenizers/image.py +99 -0
- sf3d/models/tokenizers/triplane.py +49 -0
- sf3d/models/transformers/attention.py +31 -0
- sf3d/models/transformers/backbone.py +515 -0
- sf3d/models/utils.py +292 -0
- sf3d/system.py +483 -0
- sf3d/texture_baker.py +87 -0
- sf3d/texture_baker.slang +93 -0
- sf3d/utils.py +91 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
LICENSE.md
ADDED
@@ -0,0 +1,51 @@
STABILITY AI COMMUNITY LICENSE AGREEMENT
Last Updated: July 5, 2024

I. INTRODUCTION

This Agreement applies to any individual person or entity ("You", "Your" or "Licensee") that uses or distributes any portion or element of the Stability AI Materials or Derivative Works thereof for any Research & Non-Commercial or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below.

This Agreement is intended to allow research, non-commercial, and limited commercial uses of the Models free of charge. In order to ensure that certain limited commercial uses of the Models continue to be allowed, this Agreement preserves free access to the Models for people or organizations generating annual revenue of less than US $1,000,000 (or local currency equivalent).

By clicking "I Accept" or by using or distributing any portion or element of the Stability Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then "You" includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity's behalf.

II. RESEARCH & NON-COMMERCIAL USE LICENSE

Subject to the terms of this Agreement, Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI's intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Research or Non-Commercial Purpose. "Research Purpose" means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others. "Non-Commercial Purpose" means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.

III. COMMERCIAL USE LICENSE

Subject to the terms of this Agreement (including the remainder of this Section III), Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI's intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Commercial Purpose. "Commercial Purpose" means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for commercial advantage or monetary compensation to You or others, including but not limited to, (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, and (ii) for Your business's or organization's internal operations.
If You are using or distributing the Stability AI Materials for a Commercial Purpose, You must register with Stability AI at (https://stability.ai/community-license). If at any time You or Your Affiliate(s), either individually or in aggregate, generate more than USD $1,000,000 in annual revenue (or the equivalent thereof in Your local currency), regardless of whether that revenue is generated directly or indirectly from the Stability AI Materials or Derivative Works, any licenses granted to You under this Agreement shall terminate as of such date. You must request a license from Stability AI at (https://stability.ai/enterprise), which Stability AI may grant to You in its sole discretion. If you receive Stability AI Materials, or any Derivative Works thereof, from a Licensee as part of an integrated end user product, then Section III of this Agreement will not apply to you.

IV. GENERAL TERMS

Your Research, Non-Commercial, and Commercial License(s) under this Agreement are subject to the following terms.
a. Distribution & Attribution. If You distribute or make available the Stability AI Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved", and (iii) prominently display "Powered by Stability AI" on a related website, user interface, blogpost, about page, or product documentation. If You create a Derivative Work, You may add your own attribution notice(s) to the "Notice" text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Stability AI Materials and state in the "Notice" text file that You changed the Stability AI Materials and how it was modified.
b. Use Restrictions. Your use of the Stability AI Materials and Derivative Works, including any output or results of the Stability AI Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to the Documentation and Stability AI's AUP, which is hereby incorporated by reference. Furthermore, You will not use the Stability AI Materials or Derivative Works, or any output or results of the Stability AI Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works).
c. Intellectual Property.
(i) Trademark License. No trademark licenses are granted under this Agreement, and in connection with the Stability AI Materials or Derivative Works, You may not use any name or mark owned by or associated with Stability AI or any of its Affiliates, except as required under Section IV(a) herein.
(ii) Ownership of Derivative Works. As between You and Stability AI, You are the owner of Derivative Works You create, subject to Stability AI's ownership of the Stability AI Materials and any Derivative Works made by or for Stability AI.
(iii) Ownership of Outputs. As between You and Stability AI, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law.
(iv) Disputes. If You or Your Affiliate(s) institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Stability AI Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to Your use or distribution of the Stability AI Materials or Derivative Works in violation of this Agreement.
(v) Feedback. From time to time, You may provide Stability AI with verbal and/or written suggestions, comments or other feedback related to Stability AI's existing or prospective technology, products or services (collectively, "Feedback"). You are not obligated to provide Stability AI with Feedback, but to the extent that You do, You hereby grant Stability AI a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided "AS IS" and You make no warranties whatsoever about any Feedback.
d. Disclaimer Of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE STABILITY AI MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE STABILITY AI MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE STABILITY AI MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.
e. Limitation Of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
f. Term And Termination. The term of this Agreement will commence upon Your acceptance of this Agreement or access to the Stability AI Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Stability AI Materials or Derivative Works. Section IV(d), (e), and (g) shall survive the termination of this Agreement.
g. Governing Law. This Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement.

V. DEFINITIONS

"Affiliate(s)" means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, "control" means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.
"Agreement" means this Stability AI Community License Agreement.
"AUP" means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time.
"Derivative Work(s)" means (a) any derivative work of the Stability AI Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model's output, including "fine tune" and "low-rank adaptation" models derived from a Model or a Model's output, but do not include the output of any Model.
"Documentation" means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software or Models.
"Model(s)" means, collectively, Stability AI's proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing listed on Stability's Core Models Webpage available at https://stability.ai/core-models, as may be updated from time to time.
"Stability AI" or "we" means Stability AI Ltd. and its Affiliates.
"Software" means Stability AI's proprietary software made available under this Agreement now or in the future.
"Stability AI Materials" means, collectively, Stability's proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement.
"Trade Control Laws" means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
README.md
CHANGED
@@ -1,12 +1,18 @@
 ---
-title: Stable Fast
-emoji:
-colorFrom:
-colorTo:
+title: Stable Fast 3D
+emoji: 🎮
+colorFrom: purple
+colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.31.4
+python_version: 3.10.13
 app_file: app.py
 pinned: false
+models:
+- stabilityai/stable-fast-3d
+license: other
+license_name: stabilityai-ai-community
+license_link: LICENSE.md
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,357 @@
import os
import tempfile
import time
from functools import lru_cache
from typing import Any

import gradio as gr
import numpy as np
import rembg
import torch
from gradio_litmodel3d import LitModel3D
import spaces
from PIL import Image

import sf3d.utils as sf3d_utils
from sf3d.system import SF3D

rembg_session = rembg.new_session()

COND_WIDTH = 512
COND_HEIGHT = 512
COND_DISTANCE = 1.6
COND_FOVY_DEG = 40
BACKGROUND_COLOR = [0.5, 0.5, 0.5]

# Cached. Doesn't change
c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
    COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
)


model = SF3D.from_pretrained(
    "stabilityai/stable-fast-3d",
    config_name="config.yaml",
    weight_name="model.safetensors",
)
model.eval().cuda()

example_files = [
    os.path.join("demo_files/examples", f) for f in os.listdir("demo_files/examples")
]


@spaces.GPU
def run_model(input_image):
    start = time.time()
    with torch.no_grad():
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            model_batch = create_batch(input_image)
            model_batch = {k: v.cuda() for k, v in model_batch.items()}
            trimesh_mesh, _glob_dict = model.generate_mesh(model_batch, 1024)
            trimesh_mesh = trimesh_mesh[0]

    # Create new tmp file
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb")

    trimesh_mesh.export(tmp_file.name, file_type="glb")

    print("Generation took:", time.time() - start, "s")

    return tmp_file.name


def create_batch(input_image: Image) -> dict[str, Any]:
    img_cond = (
        torch.from_numpy(
            np.asarray(input_image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32)
            / 255.0
        )
        .float()
        .clip(0, 1)
    )
    mask_cond = img_cond[:, :, -1:]
    rgb_cond = torch.lerp(
        torch.tensor(BACKGROUND_COLOR)[None, None, :], img_cond[:, :, :3], mask_cond
    )

    batch_elem = {
        "rgb_cond": rgb_cond,
        "mask_cond": mask_cond,
        "c2w_cond": c2w_cond.unsqueeze(0),
        "intrinsic_cond": intrinsic.unsqueeze(0),
        "intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
    }
    # Add batch dim
    batched = {k: v.unsqueeze(0) for k, v in batch_elem.items()}
    return batched


@lru_cache
def checkerboard(squares: int, size: int, min_value: float = 0.5):
    base = np.zeros((squares, squares)) + min_value
    base[1::2, ::2] = 1
    base[::2, 1::2] = 1

    repeat_mult = size // squares
    return (
        base.repeat(repeat_mult, axis=0)
        .repeat(repeat_mult, axis=1)[:, :, None]
        .repeat(3, axis=-1)
    )


def remove_background(input_image: Image) -> Image:
    return rembg.remove(input_image, session=rembg_session)


def resize_foreground(
    image: Image,
    ratio: float,
) -> Image:
    image = np.array(image)
    assert image.shape[-1] == 4
    alpha = np.where(image[..., 3] > 0)
    y1, y2, x1, x2 = (
        alpha[0].min(),
        alpha[0].max(),
        alpha[1].min(),
        alpha[1].max(),
    )
    # crop the foreground
    fg = image[y1:y2, x1:x2]
    # pad to square
    size = max(fg.shape[0], fg.shape[1])
    ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
    ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
    new_image = np.pad(
        fg,
        ((ph0, ph1), (pw0, pw1), (0, 0)),
        mode="constant",
        constant_values=((0, 0), (0, 0), (0, 0)),
    )

    # compute padding according to the ratio
    new_size = int(new_image.shape[0] / ratio)
    # pad to size, double side
    ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
    ph1, pw1 = new_size - size - ph0, new_size - size - pw0
    new_image = np.pad(
        new_image,
        ((ph0, ph1), (pw0, pw1), (0, 0)),
        mode="constant",
        constant_values=((0, 0), (0, 0), (0, 0)),
    )
    new_image = Image.fromarray(new_image, mode="RGBA").resize(
        (COND_WIDTH, COND_HEIGHT)
    )
    return new_image


def square_crop(input_image: Image) -> Image:
    # Perform a center square crop
    min_size = min(input_image.size)
    left = (input_image.size[0] - min_size) // 2
    top = (input_image.size[1] - min_size) // 2
    right = (input_image.size[0] + min_size) // 2
    bottom = (input_image.size[1] + min_size) // 2
    return input_image.crop((left, top, right, bottom)).resize(
        (COND_WIDTH, COND_HEIGHT)
    )


def show_mask_img(input_image: Image) -> Image:
    img_numpy = np.array(input_image)
    alpha = img_numpy[:, :, 3] / 255.0
    chkb = checkerboard(32, 512) * 255
    new_img = img_numpy[..., :3] * alpha[:, :, None] + chkb * (1 - alpha[:, :, None])
    return Image.fromarray(new_img.astype(np.uint8), mode="RGB")


def run_button(run_btn, input_image, background_state, foreground_ratio):
    if run_btn == "Run":
        glb_file: str = run_model(background_state)

        return (
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(value=glb_file, visible=True),
            gr.update(visible=True),
        )
    elif run_btn == "Remove Background":
        rem_removed = remove_background(input_image)

        sqr_crop = square_crop(rem_removed)
        fr_res = resize_foreground(sqr_crop, foreground_ratio)

        return (
            gr.update(value="Run", visible=True),
            sqr_crop,
            fr_res,
            gr.update(value=show_mask_img(fr_res), visible=True),
            gr.update(value=None, visible=False),
            gr.update(visible=False),
        )


def requires_bg_remove(image, fr):
    if image is None:
        return (
            gr.update(visible=False, value="Run"),
            None,
            None,
            gr.update(value=None, visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )
    alpha_channel = np.array(image.getchannel("A"))
    min_alpha = alpha_channel.min()

    if min_alpha == 0:
        print("Already has alpha")
        sqr_crop = square_crop(image)
        fr_res = resize_foreground(sqr_crop, fr)
        return (
            gr.update(value="Run", visible=True),
            sqr_crop,
            fr_res,
            gr.update(value=show_mask_img(fr_res), visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
        )
    return (
        gr.update(value="Remove Background", visible=True),
        None,
        None,
        gr.update(value=None, visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
    )


def update_foreground_ratio(img_proc, fr):
    foreground_res = resize_foreground(img_proc, fr)
    return (
        foreground_res,
        gr.update(value=show_mask_img(foreground_res)),
    )


with gr.Blocks() as demo:
    img_proc_state = gr.State()
    background_remove_state = gr.State()
    gr.Markdown("""
# SF3D: Stable Fast 3D Mesh Reconstruction with UV-unwrapping and Illumination Disentanglement

**SF3D** is a state-of-the-art method for 3D mesh reconstruction from a single image.
This demo allows you to upload an image and generate a 3D mesh model from it.

**Tips**
1. If the image already has an alpha channel, you can skip the background removal step.
2. You can adjust the foreground ratio to control the size of the foreground object. This can influence the shape.
3. You can upload your own HDR environment map to light the 3D model.
    """)
    with gr.Row(variant="panel"):
        with gr.Column():
            with gr.Row():
                input_img = gr.Image(
                    type="pil", label="Input Image", sources="upload", image_mode="RGBA"
                )
                preview_removal = gr.Image(
                    label="Preview Background Removal",
                    type="pil",
                    image_mode="RGB",
                    interactive=False,
                    visible=False,
                )

            foreground_ratio = gr.Slider(
                label="Foreground Ratio",
                minimum=0.5,
                maximum=1.0,
                value=0.85,
                step=0.05,
            )

            foreground_ratio.change(
                update_foreground_ratio,
                inputs=[img_proc_state, foreground_ratio],
                outputs=[background_remove_state, preview_removal],
            )

            run_btn = gr.Button("Run", variant="primary", visible=False)

        with gr.Column():
            output_3d = LitModel3D(
                label="3D Model",
                visible=False,
                clear_color=[0.0, 0.0, 0.0, 0.0],
                tonemapping="aces",
                contrast=1.0,
                scale=1.0,
            )
            with gr.Column(visible=False, scale=1.0) as hdr_row:
                gr.Markdown("""## HDR Environment Map

Select an HDR environment map to light the 3D model. You can also upload your own HDR environment maps.
                """)

                with gr.Row():
                    hdr_illumination_file = gr.File(
                        label="HDR Env Map", file_types=[".hdr"], file_count="single"
                    )
                    example_hdris = [
                        os.path.join("demo_files/hdri", f)
                        for f in os.listdir("demo_files/hdri")
                    ]
                    hdr_illumination_example = gr.Examples(
                        examples=example_hdris,
                        inputs=hdr_illumination_file,
                    )

                    hdr_illumination_file.change(
                        lambda x: gr.update(env_map=x.name if x is not None else None),
                        inputs=hdr_illumination_file,
                        outputs=[output_3d],
                    )

    examples = gr.Examples(
        examples=example_files,
        inputs=input_img,
    )

    input_img.change(
        requires_bg_remove,
        inputs=[input_img, foreground_ratio],
        outputs=[
            run_btn,
            img_proc_state,
            background_remove_state,
            preview_removal,
            output_3d,
            hdr_row,
        ],
    )

    run_btn.click(
        run_button,
        inputs=[
            run_btn,
            input_img,
            background_remove_state,
            foreground_ratio,
        ],
        outputs=[
            run_btn,
            img_proc_state,
            background_remove_state,
            preview_removal,
            output_3d,
            hdr_row,
        ],
    )

demo.launch()
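For reference, a minimal headless sketch of the same pipeline without the Gradio UI, mirroring run_model and create_batch above. It assumes a CUDA device; the input image (one of the committed examples) and the output path are illustrative choices, not part of the Space.

# Headless sketch mirroring app.py above (assumes a CUDA device is available).
import numpy as np
import torch
from PIL import Image

import sf3d.utils as sf3d_utils
from sf3d.system import SF3D

COND_WIDTH, COND_HEIGHT, COND_DISTANCE, COND_FOVY_DEG = 512, 512, 1.6, 40

# Same fixed conditioning camera as the demo
c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
    COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
)

model = SF3D.from_pretrained(
    "stabilityai/stable-fast-3d",
    config_name="config.yaml",
    weight_name="model.safetensors",
)
model.eval().cuda()

# Illustrative input: one of the RGBA example images committed above
image = Image.open("demo_files/examples/chair1.png").convert("RGBA")
img = torch.from_numpy(
    np.asarray(image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32) / 255.0
)
mask = img[:, :, -1:]
rgb = torch.lerp(torch.tensor([0.5, 0.5, 0.5])[None, None, :], img[:, :, :3], mask)

batch = {
    "rgb_cond": rgb,
    "mask_cond": mask,
    "c2w_cond": c2w_cond.unsqueeze(0),
    "intrinsic_cond": intrinsic.unsqueeze(0),
    "intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
}
batch = {k: v.unsqueeze(0).cuda() for k, v in batch.items()}  # add the batch dim

with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
    meshes, _ = model.generate_mesh(batch, 1024)
meshes[0].export("output.glb", file_type="glb")  # illustrative output path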
demo_files/comp.gif
ADDED
Git LFS Details
demo_files/examples/animal_character.png
ADDED
Git LFS Details
demo_files/examples/animal_character_2.png
ADDED
Git LFS Details
demo_files/examples/axe.png
ADDED
Git LFS Details
demo_files/examples/chair1.png
ADDED
Git LFS Details
demo_files/examples/character1.png
ADDED
Git LFS Details
demo_files/examples/otter_samurai.png
ADDED
Git LFS Details
demo_files/examples/raccoon_wizard.png
ADDED
Git LFS Details
demo_files/examples/stylized-rocks.png
ADDED
Git LFS Details
demo_files/examples/tree.png
ADDED
Git LFS Details
demo_files/hdri/abandoned_tiled_room_1k.hdr
ADDED
Binary file (478 kB)
demo_files/hdri/metro_noord_1k.hdr
ADDED
Binary file (467 kB)
demo_files/hdri/neon_photostudio_1k.hdr
ADDED
Binary file (438 kB)
demo_files/hdri/peppermint_powerplant_1k.hdr
ADDED
Binary file (473 kB)
demo_files/hdri/rainforest_trail_1k.hdr
ADDED
Binary file (512 kB)
demo_files/hdri/studio_small_08_1k.hdr
ADDED
Binary file (412 kB)
demo_files/hdri/urban_alley_01_1k.hdr
ADDED
Binary file (458 kB)
demo_files/scatterplot.jpg
ADDED
demo_files/teaser.gif
ADDED
Git LFS Details
load/tets/160_tets.npz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f4be37efc604d28d55a1a78c2aabefeeab7e63149f541aa45f9dd858ee35bb9
size 15408790
requirements.txt
ADDED
@@ -0,0 +1,13 @@
torch==2.1.2
torchvision==0.16.2
einops==0.7.0
jaxtyping==0.2.31
omegaconf==2.3.0
transformers==4.42.3
slangtorch==1.2.2
open_clip_torch==2.24.0
trimesh==4.4.1
numpy==1.26.4
huggingface-hub==0.23.4
rembg[gpu]==2.0.57
gradio-litmodel3d==0.0.1
sf3d/box_uv_unwrap.py
ADDED
@@ -0,0 +1,610 @@
1 |
+
import math
|
2 |
+
from typing import Tuple
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from jaxtyping import Float, Integer
|
7 |
+
from torch import Tensor
|
8 |
+
|
9 |
+
from sf3d.models.utils import dot, triangle_intersection_2d
|
10 |
+
|
11 |
+
|
12 |
+
def _box_assign_vertex_to_cube_face(
|
13 |
+
vertex_positions: Float[Tensor, "Nv 3"],
|
14 |
+
vertex_normals: Float[Tensor, "Nv 3"],
|
15 |
+
triangle_idxs: Integer[Tensor, "Nf 3"],
|
16 |
+
bbox: Float[Tensor, "2 3"],
|
17 |
+
) -> Tuple[Float[Tensor, "Nf 3 2"], Integer[Tensor, "Nf 3"]]:
|
18 |
+
# Test to not have a scaled model to fit the space better
|
19 |
+
# bbox_min = bbox[:1].mean(-1, keepdim=True)
|
20 |
+
# bbox_max = bbox[1:].mean(-1, keepdim=True)
|
21 |
+
# v_pos_normalized = (vertex_positions - bbox_min) / (bbox_max - bbox_min)
|
22 |
+
|
23 |
+
# Create a [0, 1] normalized vertex position
|
24 |
+
v_pos_normalized = (vertex_positions - bbox[:1]) / (bbox[1:] - bbox[:1])
|
25 |
+
# And to [-1, 1]
|
26 |
+
v_pos_normalized = 2.0 * v_pos_normalized - 1.0
|
27 |
+
|
28 |
+
# Get all vertex positions for each triangle
|
29 |
+
# Now how do we define to which face the triangle belongs? Mean face pos? Max vertex pos?
|
30 |
+
v0 = v_pos_normalized[triangle_idxs[:, 0]]
|
31 |
+
v1 = v_pos_normalized[triangle_idxs[:, 1]]
|
32 |
+
v2 = v_pos_normalized[triangle_idxs[:, 2]]
|
33 |
+
tri_stack = torch.stack([v0, v1, v2], dim=1)
|
34 |
+
|
35 |
+
vn0 = vertex_normals[triangle_idxs[:, 0]]
|
36 |
+
vn1 = vertex_normals[triangle_idxs[:, 1]]
|
37 |
+
vn2 = vertex_normals[triangle_idxs[:, 2]]
|
38 |
+
tri_stack_nrm = torch.stack([vn0, vn1, vn2], dim=1)
|
39 |
+
|
40 |
+
# Just average the normals per face
|
41 |
+
face_normal = F.normalize(torch.sum(tri_stack_nrm, 1), eps=1e-6, dim=-1)
|
42 |
+
|
43 |
+
# Now decide based on the face normal in which box map we project
|
44 |
+
# abs_x, abs_y, abs_z = tri_stack_nrm.abs().unbind(-1)
|
45 |
+
abs_x, abs_y, abs_z = tri_stack.abs().unbind(-1)
|
46 |
+
|
47 |
+
axis = torch.tensor(
|
48 |
+
[
|
49 |
+
[1, 0, 0], # 0
|
50 |
+
[-1, 0, 0], # 1
|
51 |
+
[0, 1, 0], # 2
|
52 |
+
[0, -1, 0], # 3
|
53 |
+
[0, 0, 1], # 4
|
54 |
+
[0, 0, -1], # 5
|
55 |
+
],
|
56 |
+
device=face_normal.device,
|
57 |
+
dtype=face_normal.dtype,
|
58 |
+
)
|
59 |
+
face_normal_axis = (face_normal[:, None] * axis[None]).sum(-1)
|
60 |
+
index = face_normal_axis.argmax(-1)
|
61 |
+
|
62 |
+
max_axis, uc, vc = (
|
63 |
+
torch.ones_like(abs_x),
|
64 |
+
torch.zeros_like(tri_stack[..., :1]),
|
65 |
+
torch.zeros_like(tri_stack[..., :1]),
|
66 |
+
)
|
67 |
+
mask_pos_x = index == 0
|
68 |
+
max_axis[mask_pos_x] = abs_x[mask_pos_x]
|
69 |
+
uc[mask_pos_x] = tri_stack[mask_pos_x][..., 1:2]
|
70 |
+
vc[mask_pos_x] = -tri_stack[mask_pos_x][..., -1:]
|
71 |
+
|
72 |
+
mask_neg_x = index == 1
|
73 |
+
max_axis[mask_neg_x] = abs_x[mask_neg_x]
|
74 |
+
uc[mask_neg_x] = tri_stack[mask_neg_x][..., 1:2]
|
75 |
+
vc[mask_neg_x] = -tri_stack[mask_neg_x][..., -1:]
|
76 |
+
|
77 |
+
mask_pos_y = index == 2
|
78 |
+
max_axis[mask_pos_y] = abs_y[mask_pos_y]
|
79 |
+
uc[mask_pos_y] = tri_stack[mask_pos_y][..., 0:1]
|
80 |
+
vc[mask_pos_y] = -tri_stack[mask_pos_y][..., -1:]
|
81 |
+
|
82 |
+
mask_neg_y = index == 3
|
83 |
+
max_axis[mask_neg_y] = abs_y[mask_neg_y]
|
84 |
+
uc[mask_neg_y] = tri_stack[mask_neg_y][..., 0:1]
|
85 |
+
vc[mask_neg_y] = -tri_stack[mask_neg_y][..., -1:]
|
86 |
+
|
87 |
+
mask_pos_z = index == 4
|
88 |
+
max_axis[mask_pos_z] = abs_z[mask_pos_z]
|
89 |
+
uc[mask_pos_z] = tri_stack[mask_pos_z][..., 0:1]
|
90 |
+
vc[mask_pos_z] = tri_stack[mask_pos_z][..., 1:2]
|
91 |
+
|
92 |
+
mask_neg_z = index == 5
|
93 |
+
max_axis[mask_neg_z] = abs_z[mask_neg_z]
|
94 |
+
uc[mask_neg_z] = tri_stack[mask_neg_z][..., 0:1]
|
95 |
+
vc[mask_neg_z] = -tri_stack[mask_neg_z][..., 1:2]
|
96 |
+
|
97 |
+
# UC from [-1, 1] to [0, 1]
|
98 |
+
max_dim_div = max_axis.max(dim=0, keepdims=True).values
|
99 |
+
uc = ((uc[..., 0] / max_dim_div + 1.0) * 0.5).clip(0, 1)
|
100 |
+
vc = ((vc[..., 0] / max_dim_div + 1.0) * 0.5).clip(0, 1)
|
101 |
+
|
102 |
+
uv = torch.stack([uc, vc], dim=-1)
|
103 |
+
|
104 |
+
return uv, index
|
105 |
+
|
106 |
+
|
107 |
+
def _assign_faces_uv_to_atlas_index(
|
108 |
+
vertex_positions: Float[Tensor, "Nv 3"],
|
109 |
+
triangle_idxs: Integer[Tensor, "Nf 3"],
|
110 |
+
face_uv: Float[Tensor, "Nf 3 2"],
|
111 |
+
face_index: Integer[Tensor, "Nf 3"],
|
112 |
+
) -> Integer[Tensor, "Nf"]: # noqa: F821
|
113 |
+
triangle_pos = vertex_positions[triangle_idxs]
|
114 |
+
# We need to do perform 3 overlap checks.
|
115 |
+
# The first set is placed in the upper two thirds of the UV atlas.
|
116 |
+
# Conceptually, this is the direct visible surfaces from the each cube side
|
117 |
+
# The second set is placed in the lower thirds and the left half of the UV atlas.
|
118 |
+
# This is the first set of occluded surfaces. They will also be saved in the projected fashion
|
119 |
+
# The third pass finds all non assigned faces. They will be placed in the bottom right half of
|
120 |
+
# the UV atlas in scattered fashion.
|
121 |
+
assign_idx = face_index.clone()
|
122 |
+
for overlap_step in range(3):
|
123 |
+
overlapping_indicator = torch.zeros_like(assign_idx, dtype=torch.bool)
|
124 |
+
for i in range(overlap_step * 6, (overlap_step + 1) * 6):
|
125 |
+
mask = assign_idx == i
|
126 |
+
if not mask.any():
|
127 |
+
continue
|
128 |
+
# Get all elements belonging to the projection face
|
129 |
+
uv_triangle = face_uv[mask]
|
130 |
+
cur_triangle_pos = triangle_pos[mask]
|
131 |
+
# Find the center of the uv coordinates
|
132 |
+
center_uv = uv_triangle.mean(dim=1, keepdim=True)
|
133 |
+
# And also the radius of the triangle
|
134 |
+
uv_triangle_radius = (uv_triangle - center_uv).norm(dim=-1).max(-1).values
|
135 |
+
|
136 |
+
potentially_overlapping_mask = (
|
137 |
+
# Find all close triangles
|
138 |
+
(center_uv[None, ...] - center_uv[:, None]).norm(dim=-1)
|
139 |
+
# Do not select the same element by offseting with an large valued identity matrix
|
140 |
+
+ torch.eye(
|
141 |
+
uv_triangle.shape[0],
|
142 |
+
device=uv_triangle.device,
|
143 |
+
dtype=uv_triangle.dtype,
|
144 |
+
).unsqueeze(-1)
|
145 |
+
* 1000
|
146 |
+
)
|
147 |
+
# Mark all potentially overlapping triangles to reduce the number of triangle intersection tests
|
148 |
+
potentially_overlapping_mask = (
|
149 |
+
potentially_overlapping_mask
|
150 |
+
<= (uv_triangle_radius.view(-1, 1, 1) * 3.0)
|
151 |
+
).squeeze(-1)
|
152 |
+
overlap_coords = torch.stack(torch.where(potentially_overlapping_mask), -1)
|
153 |
+
|
154 |
+
# Only unique triangles (A|B and B|A should be the same)
|
155 |
+
f = torch.min(overlap_coords, dim=-1).values
|
156 |
+
s = torch.max(overlap_coords, dim=-1).values
|
157 |
+
overlap_coords = torch.unique(torch.stack([f, s], dim=1), dim=0)
|
158 |
+
first, second = overlap_coords.unbind(-1)
|
159 |
+
|
160 |
+
# Get the triangles
|
161 |
+
tri_1 = uv_triangle[first]
|
162 |
+
tri_2 = uv_triangle[second]
|
163 |
+
|
164 |
+
# Perform the actual set with the reduced number of potentially overlapping triangles
|
165 |
+
its = triangle_intersection_2d(tri_1, tri_2, eps=1e-6)
|
166 |
+
|
167 |
+
# So we now need to detect which triangles are the occluded ones.
|
168 |
+
# We always assume the first to be the visible one (the others should move)
|
169 |
+
# In the previous step we use a lexigraphical sort to get the unique pairs
|
170 |
+
# In this we use a sort based on the orthographic projection
|
171 |
+
ax = 0 if i < 2 else 1 if i < 4 else 2
|
172 |
+
use_max = i % 2 == 1
|
173 |
+
|
174 |
+
tri1_c = cur_triangle_pos[first].mean(dim=1)
|
175 |
+
tri2_c = cur_triangle_pos[second].mean(dim=1)
|
176 |
+
|
177 |
+
mark_first = (
|
178 |
+
(tri1_c[..., ax] > tri2_c[..., ax])
|
179 |
+
if use_max
|
180 |
+
else (tri1_c[..., ax] < tri2_c[..., ax])
|
181 |
+
)
|
182 |
+
first[mark_first] = second[mark_first]
|
183 |
+
|
184 |
+
# Lastly the same index can be tested multiple times.
|
185 |
+
# If one marks it as overlapping we keep it marked as such.
|
186 |
+
# We do this by testing if it has been marked at least once.
|
187 |
+
unique_idx, rev_idx = torch.unique(first, return_inverse=True)
|
188 |
+
|
189 |
+
add = torch.zeros_like(unique_idx, dtype=torch.float32)
|
190 |
+
add.index_add_(0, rev_idx, its.float())
|
191 |
+
its_mask = add > 0
|
192 |
+
|
193 |
+
# And fill it in the overlapping indicator
|
194 |
+
idx = torch.where(mask)[0][unique_idx]
|
195 |
+
overlapping_indicator[idx] = its_mask
|
196 |
+
|
197 |
+
# Move the index to the overlap regions (shift by 6)
|
198 |
+
assign_idx[overlapping_indicator] += 6
|
199 |
+
|
200 |
+
# We do not care about the correct face placement after the first 2 slices
|
201 |
+
max_idx = 6 * 2
|
202 |
+
return assign_idx.clamp(0, max_idx)
|
203 |
+
|
204 |
+
|
205 |
+
def _find_slice_offset_and_scale(
|
206 |
+
index: Integer[Tensor, "Nf"], # noqa: F821
|
207 |
+
) -> Tuple[
|
208 |
+
Float[Tensor, "Nf"], Float[Tensor, "Nf"], Float[Tensor, "Nf"], Float[Tensor, "Nf"] # noqa: F821
|
209 |
+
]: # noqa: F821
|
210 |
+
# 6 due to the 6 cube faces
|
211 |
+
off = 1 / 3
|
212 |
+
dupl_off = 1 / 6
|
213 |
+
|
214 |
+
# Here, we need to decide how to pack the textures in the case of overlap
|
215 |
+
def x_offset_calc(x, i):
|
216 |
+
offset_calc = i // 6
|
217 |
+
# Initial coordinates - just 3x2 grid
|
218 |
+
if offset_calc == 0:
|
219 |
+
return off * x
|
220 |
+
else:
|
221 |
+
# Smaller 3x2 grid plus eventual shift to right for
|
222 |
+
# second overlap
|
223 |
+
return dupl_off * x + min(offset_calc - 1, 1) * 0.5
|
224 |
+
|
225 |
+
def y_offset_calc(x, i):
|
226 |
+
offset_calc = i // 6
|
227 |
+
# Initial coordinates - just a 3x2 grid
|
228 |
+
if offset_calc == 0:
|
229 |
+
return off * x
|
230 |
+
else:
|
231 |
+
# Smaller coordinates in the lowest row
|
232 |
+
return dupl_off * x + off * 2
|
233 |
+
|
234 |
+
offset_x = torch.zeros_like(index, dtype=torch.float32)
|
235 |
+
offset_y = torch.zeros_like(index, dtype=torch.float32)
|
236 |
+
offset_x_vals = [0, 1, 2, 0, 1, 2]
|
237 |
+
offset_y_vals = [0, 0, 0, 1, 1, 1]
|
238 |
+
for i in range(index.max().item() + 1):
|
239 |
+
mask = index == i
|
240 |
+
if not mask.any():
|
241 |
+
continue
|
242 |
+
offset_x[mask] = x_offset_calc(offset_x_vals[i % 6], i)
|
243 |
+
offset_y[mask] = y_offset_calc(offset_y_vals[i % 6], i)
|
244 |
+
|
245 |
+
div_x = torch.full_like(index, 6 // 2, dtype=torch.float32)
|
246 |
+
# All overlap elements are saved in half scale
|
247 |
+
div_x[index >= 6] = 6
|
248 |
+
div_y = div_x.clone() # Same for y
|
249 |
+
# Except for the random overlaps
|
250 |
+
div_x[index >= 12] = 2
|
251 |
+
# But the random overlaps are saved in a large block in the lower thirds
|
252 |
+
div_y[index >= 12] = 3
|
253 |
+
|
254 |
+
return offset_x, offset_y, div_x, div_y
|
255 |
+
|
256 |
+
|
257 |
+
def rotation_flip_matrix_2d(
|
258 |
+
rad: float, flip_x: bool = False, flip_y: bool = False
|
259 |
+
) -> Float[Tensor, "2 2"]:
|
260 |
+
cos = math.cos(rad)
|
261 |
+
sin = math.sin(rad)
|
262 |
+
rot_mat = torch.tensor([[cos, -sin], [sin, cos]], dtype=torch.float32)
|
263 |
+
flip_mat = torch.tensor(
|
264 |
+
[
|
265 |
+
[-1 if flip_x else 1, 0],
|
266 |
+
[0, -1 if flip_y else 1],
|
267 |
+
],
|
268 |
+
dtype=torch.float32,
|
269 |
+
)
|
270 |
+
|
271 |
+
return flip_mat @ rot_mat
|
272 |
+
|
273 |
+
|
274 |
+
def calculate_tangents(
|
275 |
+
vertex_positions: Float[Tensor, "Nv 3"],
|
276 |
+
vertex_normals: Float[Tensor, "Nv 3"],
|
277 |
+
triangle_idxs: Integer[Tensor, "Nf 3"],
|
278 |
+
face_uv: Float[Tensor, "Nf 3 2"],
|
279 |
+
) -> Float[Tensor, "Nf 3 4"]: # noqa: F821
|
280 |
+
vn_idx = [None] * 3
|
281 |
+
pos = [None] * 3
|
282 |
+
tex = face_uv.unbind(1)
|
283 |
+
for i in range(0, 3):
|
284 |
+
pos[i] = vertex_positions[triangle_idxs[:, i]]
|
285 |
+
# t_nrm_idx is always the same as t_pos_idx
|
286 |
+
vn_idx[i] = triangle_idxs[:, i]
|
287 |
+
|
288 |
+
tangents = torch.zeros_like(vertex_normals)
|
289 |
+
tansum = torch.zeros_like(vertex_normals)
|
290 |
+
|
291 |
+
# Compute tangent space for each triangle
|
292 |
+
duv1 = tex[1] - tex[0]
|
293 |
+
duv2 = tex[2] - tex[0]
|
294 |
+
dpos1 = pos[1] - pos[0]
|
295 |
+
dpos2 = pos[2] - pos[0]
|
296 |
+
|
297 |
+
tng_nom = dpos1 * duv2[..., 1:2] - dpos2 * duv1[..., 1:2]
|
298 |
+
|
299 |
+
denom = duv1[..., 0:1] * duv2[..., 1:2] - duv1[..., 1:2] * duv2[..., 0:1]
|
300 |
+
|
301 |
+
# Avoid division by zero for degenerated texture coordinates
|
302 |
+
denom_safe = denom.clip(1e-6)
|
303 |
+
tang = tng_nom / denom_safe
|
304 |
+
|
305 |
+
# Update all 3 vertices
|
306 |
+
for i in range(0, 3):
|
307 |
+
idx = vn_idx[i][:, None].repeat(1, 3)
|
308 |
+
tangents.scatter_add_(0, idx, tang) # tangents[n_i] = tangents[n_i] + tang
|
309 |
+
tansum.scatter_add_(
|
310 |
+
0, idx, torch.ones_like(tang)
|
311 |
+
) # tansum[n_i] = tansum[n_i] + 1
|
312 |
+
# Also normalize it. Here we do not normalize the individual triangles first so larger area
|
313 |
+
# triangles influence the tangent space more
|
314 |
+
tangents = tangents / tansum
|
315 |
+
|
316 |
+
# Normalize and make sure tangent is perpendicular to normal
|
317 |
+
tangents = F.normalize(tangents, dim=1)
|
318 |
+
tangents = F.normalize(tangents - dot(tangents, vertex_normals) * vertex_normals)
|
319 |
+
|
320 |
+
return tangents
|
321 |
+
|
322 |
+
|
323 |
+
def _rotate_uv_slices_consistent_space(
|
324 |
+
vertex_positions: Float[Tensor, "Nv 3"],
|
325 |
+
vertex_normals: Float[Tensor, "Nv 3"],
|
326 |
+
triangle_idxs: Integer[Tensor, "Nf 3"],
|
327 |
+
uv: Float[Tensor, "Nf 3 2"],
|
328 |
+
index: Integer[Tensor, "Nf"], # noqa: F821
|
329 |
+
):
|
330 |
+
tangents = calculate_tangents(vertex_positions, vertex_normals, triangle_idxs, uv)
|
331 |
+
pos_stack = torch.stack(
|
332 |
+
[
|
333 |
+
-vertex_positions[..., 1],
|
334 |
+
vertex_positions[..., 0],
|
335 |
+
torch.zeros_like(vertex_positions[..., 0]),
|
336 |
+
],
|
337 |
+
dim=-1,
|
338 |
+
)
|
339 |
+
expected_tangents = F.normalize(
|
340 |
+
torch.linalg.cross(
|
341 |
+
vertex_normals, torch.linalg.cross(pos_stack, vertex_normals)
|
342 |
+
),
|
343 |
+
-1,
|
344 |
+
)
|
345 |
+
|
346 |
+
actual_tangents = tangents[triangle_idxs]
|
347 |
+
expected_tangents = expected_tangents[triangle_idxs]
|
348 |
+
|
349 |
+
def rotation_matrix_2d(theta):
|
350 |
+
c, s = torch.cos(theta), torch.sin(theta)
|
351 |
+
return torch.tensor([[c, -s], [s, c]])
|
352 |
+
|
353 |
+
# Now find the rotation
|
354 |
+
index_mod = index % 6 # Shouldn't happen. Just for safety
|
355 |
+
for i in range(6):
|
356 |
+
mask = index_mod == i
|
357 |
+
if not mask.any():
|
358 |
+
continue
|
359 |
+
|
360 |
+
actual_mean_tangent = actual_tangents[mask].mean(dim=(0, 1))
|
361 |
+
expected_mean_tangent = expected_tangents[mask].mean(dim=(0, 1))
|
362 |
+
|
363 |
+
dot_product = torch.dot(actual_mean_tangent, expected_mean_tangent)
|
364 |
+
cross_product = (
|
365 |
+
actual_mean_tangent[0] * expected_mean_tangent[1]
|
366 |
+
- actual_mean_tangent[1] * expected_mean_tangent[0]
|
367 |
+
)
|
368 |
+
angle = torch.atan2(cross_product, dot_product)
|
369 |
+
|
370 |
+
rot_matrix = rotation_matrix_2d(angle).to(mask.device)
|
371 |
+
# Center the uv coordinate to be in the range of -1 to 1 and 0 centered
|
372 |
+
uv_cur = uv[mask] * 2 - 1 # Center it first
|
373 |
+
# Rotate it
|
374 |
+
uv[mask] = torch.einsum("ij,nfj->nfi", rot_matrix, uv_cur)
|
375 |
+
|
376 |
+
# Rescale uv[mask] to be within the 0-1 range
|
377 |
+
uv[mask] = (uv[mask] - uv[mask].min()) / (uv[mask].max() - uv[mask].min())
|
378 |
+
|
379 |
+
return uv
|
380 |
+
|
381 |
+
|
382 |
+
def _handle_slice_uvs(
|
383 |
+
uv: Float[Tensor, "Nf 3 2"],
|
384 |
+
index: Integer[Tensor, "Nf"], # noqa: F821
|
385 |
+
island_padding: float,
|
386 |
+
max_index: int = 6 * 2,
|
387 |
+
) -> Float[Tensor, "Nf 3 2"]: # noqa: F821
|
388 |
+
uc, vc = uv.unbind(-1)
|
389 |
+
|
390 |
+
# Get the second slice (The first overlap)
|
391 |
+
index_filter = [index == i for i in range(6, max_index)]
|
392 |
+
|
393 |
+
# Normalize them to always fully fill the atlas patch
|
394 |
+
for i, fi in enumerate(index_filter):
|
395 |
+
if fi.sum() > 0:
|
396 |
+
# Scale the slice but only up to a factor of 2
|
397 |
+
# This keeps the texture resolution with the first slice in line (Half space in UV)
|
398 |
+
uc[fi] = (uc[fi] - uc[fi].min()) / (uc[fi].max() - uc[fi].min()).clip(0.5)
|
399 |
+
vc[fi] = (vc[fi] - vc[fi].min()) / (vc[fi].max() - vc[fi].min()).clip(0.5)
|
400 |
+
|
401 |
+
uc_padded = (uc * (1 - 2 * island_padding) + island_padding).clip(0, 1)
|
402 |
+
vc_padded = (vc * (1 - 2 * island_padding) + island_padding).clip(0, 1)
|
403 |
+
|
404 |
+
return torch.stack([uc_padded, vc_padded], dim=-1)
|
405 |
+
|
406 |
+
|
407 |
+
def _handle_remaining_uvs(
|
408 |
+
uv: Float[Tensor, "Nf 3 2"],
|
409 |
+
index: Integer[Tensor, "Nf"], # noqa: F821
|
410 |
+
island_padding: float,
|
411 |
+
) -> Float[Tensor, "Nf 3 2"]:
|
412 |
+
uc, vc = uv.unbind(-1)
|
413 |
+
# Get all remaining elements
|
414 |
+
remaining_filter = index >= 6 * 2
|
415 |
+
squares_left = remaining_filter.sum()
|
416 |
+
|
417 |
+
if squares_left == 0:
|
418 |
+
return uv
|
419 |
+
|
420 |
+
uc = uc[remaining_filter]
|
421 |
+
vc = vc[remaining_filter]
|
422 |
+
|
423 |
+
# Or remaining triangles are distributed in a rectangle
|
424 |
+
# The rectangle takes 0.5 of the entire uv space in width and 1/3 in height
|
425 |
+
ratio = 0.5 * (1 / 3) # 1.5
|
426 |
+
# sqrt(744/(0.5*(1/3)))
|
427 |
+
|
428 |
+
mult = math.sqrt(squares_left / ratio)
|
429 |
+
num_square_width = int(math.ceil(0.5 * mult))
|
430 |
+
num_square_height = int(math.ceil(squares_left / num_square_width))
|
431 |
+
|
432 |
+
width = 1 / num_square_width
|
433 |
+
height = 1 / num_square_height
|
434 |
+
|
435 |
+
# The idea is again to keep the texture resolution consistent with the first slice
|
436 |
+
# This only occupys half the region in the texture chart but the scaling on the squares
|
437 |
+
# assumes full coverage.
|
438 |
+
clip_val = min(width, height) * 1.5
|
439 |
+
# Now normalize the UVs with taking into account the maximum scaling
|
440 |
+
uc = (uc - uc.min(dim=1, keepdim=True).values) / (
|
441 |
+
uc.amax(dim=1, keepdim=True) - uc.amin(dim=1, keepdim=True)
|
442 |
+
).clip(clip_val)
|
443 |
+
vc = (vc - vc.min(dim=1, keepdim=True).values) / (
|
444 |
+
vc.amax(dim=1, keepdim=True) - vc.amin(dim=1, keepdim=True)
|
445 |
+
).clip(clip_val)
|
446 |
+
# Add a small padding
|
447 |
+
uc = (
|
448 |
+
uc * (1 - island_padding * num_square_width * 0.5)
|
449 |
+
+ island_padding * num_square_width * 0.25
|
450 |
+
).clip(0, 1)
|
451 |
+
vc = (
|
452 |
+
vc * (1 - island_padding * num_square_height * 0.5)
|
453 |
+
+ island_padding * num_square_height * 0.25
|
454 |
+
).clip(0, 1)
|
455 |
+
|
456 |
+
uc = uc * width
|
457 |
+
vc = vc * height
|
458 |
+
|
459 |
+
# And calculate offsets for each element
|
460 |
+
idx = torch.arange(uc.shape[0], device=uc.device, dtype=torch.int32)
|
461 |
+
x_idx = idx % num_square_width
|
462 |
+
y_idx = idx // num_square_width
|
463 |
+
# And move each triangle to its own spot
|
464 |
+
uc = uc + x_idx[:, None] * width
|
465 |
+
vc = vc + y_idx[:, None] * height
|
466 |
+
|
467 |
+
uc = (uc * (1 - 2 * island_padding * 0.5) + island_padding * 0.5).clip(0, 1)
|
468 |
+
vc = (vc * (1 - 2 * island_padding * 0.5) + island_padding * 0.5).clip(0, 1)
|
469 |
+
|
470 |
+
uv[remaining_filter] = torch.stack([uc, vc], dim=-1)
|
471 |
+
|
472 |
+
return uv
|
473 |
+
|
474 |
+
|
475 |
+
def _distribute_individual_uvs_in_atlas(
|
476 |
+
face_uv: Float[Tensor, "Nf 3 2"],
|
477 |
+
assigned_faces: Integer[Tensor, "Nf"], # noqa: F821
|
478 |
+
offset_x: Float[Tensor, "Nf"], # noqa: F821
|
479 |
+
offset_y: Float[Tensor, "Nf"], # noqa: F821
|
480 |
+
div_x: Float[Tensor, "Nf"], # noqa: F821
|
481 |
+
div_y: Float[Tensor, "Nf"], # noqa: F821
|
482 |
+
island_padding: float,
|
483 |
+
):
|
484 |
+
# Place the slice first
|
485 |
+
placed_uv = _handle_slice_uvs(face_uv, assigned_faces, island_padding)
|
486 |
+
# Then handle the remaining overlap elements
|
487 |
+
placed_uv = _handle_remaining_uvs(placed_uv, assigned_faces, island_padding)
|
488 |
+
|
489 |
+
uc, vc = placed_uv.unbind(-1)
|
490 |
+
    uc = uc / div_x[:, None] + offset_x[:, None]
    vc = vc / div_y[:, None] + offset_y[:, None]

    uv = torch.stack([uc, vc], dim=-1).view(-1, 2)

    return uv


def _get_unique_face_uv(
    uv: Float[Tensor, "Nf 3 2"],
) -> Tuple[Float[Tensor, "Utex 3"], Integer[Tensor, "Nf"]]:  # noqa: F821
    unique_uv, unique_idx = torch.unique(uv, return_inverse=True, dim=0)
    # And add the face to uv index mapping
    vtex_idx = unique_idx.view(-1, 3)

    return unique_uv, vtex_idx


def _align_mesh_with_main_axis(
    vertex_positions: Float[Tensor, "Nv 3"], vertex_normals: Float[Tensor, "Nv 3"]
) -> Tuple[Float[Tensor, "Nv 3"], Float[Tensor, "Nv 3"]]:
    # Use PCA to find the two main axes (the third is derived by cross product)
    # Set the random seed so it's repeatable
    torch.manual_seed(0)
    _, _, v = torch.pca_lowrank(vertex_positions, q=2)
    main_axis, seconday_axis = v[:, 0], v[:, 1]

    main_axis: Float[Tensor, "3"] = F.normalize(main_axis, eps=1e-6, dim=-1)
    # Orthogonalize the second axis
    seconday_axis: Float[Tensor, "3"] = F.normalize(
        seconday_axis - dot(seconday_axis, main_axis) * main_axis, eps=1e-6, dim=-1
    )
    # Create perpendicular third axis
    third_axis: Float[Tensor, "3"] = F.normalize(
        torch.cross(main_axis, seconday_axis), dim=-1, eps=1e-6
    )

    # Check to which canonical axis each aligns
    main_axis_max_idx = main_axis.abs().argmax().item()
    seconday_axis_max_idx = seconday_axis.abs().argmax().item()
    third_axis_max_idx = third_axis.abs().argmax().item()

    # Now sort the axes based on the argmax so they align with the canonical axes
    # If two axes have the same argmax, move one of them
    all_possible_axis = {0, 1, 2}
    cur_index = 1
    while len(set([main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx])) != 3:
        # Find missing axis
        missing_axis = all_possible_axis - set(
            [main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx]
        )
        missing_axis = missing_axis.pop()
        # Just assign it to third axis as it had the smallest contribution to the
        # overall shape
        if cur_index == 1:
            third_axis_max_idx = missing_axis
        elif cur_index == 2:
            seconday_axis_max_idx = missing_axis
        else:
            raise ValueError("Could not find 3 unique axes")
        cur_index += 1

    if len({main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx}) != 3:
        raise ValueError("Could not find 3 unique axes")

    axes = [None] * 3
    axes[main_axis_max_idx] = main_axis
    axes[seconday_axis_max_idx] = seconday_axis
    axes[third_axis_max_idx] = third_axis
    # Create rotation matrix from the individual axes
    rot_mat = torch.stack(axes, dim=1).T

    # Now rotate the vertex positions and vertex normals so the mesh aligns with the main axis
    vertex_positions = torch.einsum("ij,nj->ni", rot_mat, vertex_positions)
    vertex_normals = torch.einsum("ij,nj->ni", rot_mat, vertex_normals)

    return vertex_positions, vertex_normals


def box_projection_uv_unwrap(
    vertex_positions: Float[Tensor, "Nv 3"],
    vertex_normals: Float[Tensor, "Nv 3"],
    triangle_idxs: Integer[Tensor, "Nf 3"],
    island_padding: float,
) -> Tuple[Float[Tensor, "Utex 3"], Integer[Tensor, "Nf"]]:  # noqa: F821
    # Align the mesh with main axis directions first
    vertex_positions, vertex_normals = _align_mesh_with_main_axis(
        vertex_positions, vertex_normals
    )

    bbox: Float[Tensor, "2 3"] = torch.stack(
        [vertex_positions.min(dim=0).values, vertex_positions.max(dim=0).values], dim=0
    )
    # First decide in which cube face the triangle is placed
    face_uv, face_index = _box_assign_vertex_to_cube_face(
        vertex_positions, vertex_normals, triangle_idxs, bbox
    )

    # Rotate the UV islands in a way that they align with the radial z tangent space
    face_uv = _rotate_uv_slices_consistent_space(
        vertex_positions, vertex_normals, triangle_idxs, face_uv, face_index
    )

    # Then find where the face is placed in the atlas.
    # This has to detect potential overlaps
    assigned_atlas_index = _assign_faces_uv_to_atlas_index(
        vertex_positions, triangle_idxs, face_uv, face_index
    )

    # Then figure out the final place in the atlas based on the assignment
    offset_x, offset_y, div_x, div_y = _find_slice_offset_and_scale(
        assigned_atlas_index
    )

    # Next distribute the faces in the uv atlas
    placed_uv = _distribute_individual_uvs_in_atlas(
        face_uv, assigned_atlas_index, offset_x, offset_y, div_x, div_y, island_padding
    )

    # And get the unique per-triangle UV coordinates
    return _get_unique_face_uv(placed_uv)
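For orientation, here is a minimal smoke-test sketch of how box_projection_uv_unwrap could be called on its own. The tetrahedron, the crude normals, and the padding value are illustrative assumptions, not part of the commit.

# Illustrative smoke test (not part of the commit).
import torch
import torch.nn.functional as F

from sf3d.box_uv_unwrap import box_projection_uv_unwrap

# A single tetrahedron: 4 vertices, 4 triangular faces.
v_pos = torch.tensor(
    [[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
)
t_idx = torch.tensor([[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3]])
# Crude outward-pointing vertex normals; good enough to pick cube faces.
v_nrm = F.normalize(v_pos - v_pos.mean(dim=0, keepdim=True), dim=-1)

uv, face_idx = box_projection_uv_unwrap(v_pos, v_nrm, t_idx, island_padding=0.02)
print(uv.shape, face_idx.shape)  # unique UVs (Utex, 2) and per-face indices (Nf, 3)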
sf3d/models/camera.py
ADDED
@@ -0,0 +1,32 @@
from dataclasses import dataclass, field
from typing import List

import torch
import torch.nn as nn

from sf3d.models.utils import BaseModule


class LinearCameraEmbedder(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        in_channels: int = 25
        out_channels: int = 768
        conditions: List[str] = field(default_factory=list)

    cfg: Config

    def configure(self) -> None:
        self.linear = nn.Linear(self.cfg.in_channels, self.cfg.out_channels)

    def forward(self, **kwargs):
        cond_tensors = []
        for cond_name in self.cfg.conditions:
            assert cond_name in kwargs
            cond = kwargs[cond_name]
            # cond in shape (B, Nv, ...)
            cond_tensors.append(cond.view(*cond.shape[:2], -1))
        cond_tensor = torch.cat(cond_tensors, dim=-1)
        assert cond_tensor.shape[-1] == self.cfg.in_channels
        embedding = self.linear(cond_tensor)
        return embedding
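LinearCameraEmbedder flattens each named condition over its trailing dimensions, concatenates them, and applies a single linear projection. Below is a standalone sketch of that flattening step; the condition names and shapes are illustrative assumptions, not taken from this commit's configs.

# Illustrative only: a 4x4 camera-to-world matrix plus a 3x3 intrinsics matrix
# per view gives 16 + 9 = 25 features, matching the default in_channels of 25.
import torch

B, Nv = 2, 1
conds = {
    "c2w_cond": torch.eye(4).expand(B, Nv, 4, 4),
    "intrinsic_cond": torch.eye(3).expand(B, Nv, 3, 3),
}
flat = torch.cat([c.reshape(B, Nv, -1) for c in conds.values()], dim=-1)
print(flat.shape)  # torch.Size([2, 1, 25]) -> input to nn.Linear(25, 768)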
sf3d/models/global_estimator/multi_head_estimator.py
ADDED
@@ -0,0 +1,118 @@
from dataclasses import dataclass, field
from typing import Any, List, Optional

import torch.nn as nn
from jaxtyping import Float
from torch import Tensor

from sf3d.models.network import get_activation
from sf3d.models.utils import BaseModule


@dataclass
class HeadSpec:
    name: str
    out_channels: int
    n_hidden_layers: int
    output_activation: Optional[str] = None
    output_bias: float = 0.0
    add_to_decoder_features: bool = False
    shape: Optional[list[int]] = None


class MultiHeadEstimator(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        triplane_features: int = 1024

        n_layers: int = 2
        hidden_features: int = 512
        activation: str = "relu"

        pool: str = "max"
        # Literal["mean", "max"] = "mean" # noqa: F821

        heads: List[HeadSpec] = field(default_factory=lambda: [])

    cfg: Config

    def configure(self):
        layers = []
        cur_features = self.cfg.triplane_features * 3
        for _ in range(self.cfg.n_layers):
            layers.append(
                nn.Conv2d(
                    cur_features,
                    self.cfg.hidden_features,
                    kernel_size=3,
                    padding=0,
                    stride=2,
                )
            )
            layers.append(self.make_activation(self.cfg.activation))

            cur_features = self.cfg.hidden_features

        self.layers = nn.Sequential(*layers)

        assert len(self.cfg.heads) > 0
        heads = {}
        for head in self.cfg.heads:
            head_layers = []
            for i in range(head.n_hidden_layers):
                head_layers += [
                    nn.Linear(
                        self.cfg.hidden_features,
                        self.cfg.hidden_features,
                    ),
                    self.make_activation(self.cfg.activation),
                ]
            head_layers += [
                nn.Linear(
                    self.cfg.hidden_features,
                    head.out_channels,
                ),
            ]
            heads[head.name] = nn.Sequential(*head_layers)
        self.heads = nn.ModuleDict(heads)

    def make_activation(self, activation):
        if activation == "relu":
            return nn.ReLU(inplace=True)
        elif activation == "silu":
            return nn.SiLU(inplace=True)
        else:
            raise NotImplementedError

    def forward(
        self,
        triplane: Float[Tensor, "B 3 F Ht Wt"],
    ) -> dict[str, Any]:
        x = self.layers(
            triplane.reshape(
                triplane.shape[0], -1, triplane.shape[-2], triplane.shape[-1]
            )
        )

        if self.cfg.pool == "max":
            x = x.amax(dim=[-2, -1])
        elif self.cfg.pool == "mean":
            x = x.mean(dim=[-2, -1])
        else:
            raise NotImplementedError

        out = {
            ("decoder_" if head.add_to_decoder_features else "")
            + head.name: get_activation(head.output_activation)(
                self.heads[head.name](x) + head.output_bias
            )
            for head in self.cfg.heads
        }
        for head in self.cfg.heads:
            if head.shape:
                head_name = (
                    "decoder_" if head.add_to_decoder_features else ""
                ) + head.name
                out[head_name] = out[head_name].reshape(*head.shape)

        return out
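A shape-only sketch of the pooling path above, using the default config values and one hypothetical single-channel head (the real head set comes from the model config, which is not part of this file):

# Shape walkthrough with dummy data (illustrative; defaults: F=1024, 2 layers, max pool).
import torch
import torch.nn as nn

triplane = torch.randn(1, 3, 1024, 32, 32)     # (B, 3 planes, F, Ht, Wt)
x = triplane.reshape(1, -1, 32, 32)            # planes stacked on channels: (1, 3072, 32, 32)
convs = nn.Sequential(
    nn.Conv2d(3072, 512, kernel_size=3, padding=0, stride=2), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, kernel_size=3, padding=0, stride=2), nn.ReLU(inplace=True),
)
x = convs(x).amax(dim=[-2, -1])                # 32 -> 15 -> 7 spatially, then max pool: (1, 512)
head = nn.Sequential(nn.Linear(512, 512), nn.ReLU(inplace=True), nn.Linear(512, 1))
print(head(x).shape)                           # (1, 1): one scalar estimate per sample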
sf3d/models/image_estimator/clip_based_estimator.py
ADDED
@@ -0,0 +1,168 @@
from dataclasses import dataclass, field
from typing import Any, List, Optional

import open_clip
import torch
import torch.nn as nn
from jaxtyping import Float
from torch import Tensor
from torchvision.transforms import Normalize

from sf3d.models.network import get_activation
from sf3d.models.utils import BaseModule


@dataclass
class HeadSpec:
    name: str
    out_channels: int
    n_hidden_layers: int
    output_activation: Optional[str] = None
    output_bias: float = 0.0
    add_to_decoder_features: bool = False
    shape: Optional[list[int]] = None


class ClipBasedHeadEstimator(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        model: str = "ViT-B-32"
        pretrain: str = "laion2b_s34b_b79k"

        distribution: str = "beta"

        # ["mean", "mode", "sample", "sample_mean"]
        distribution_eval: str = "mode"

        activation: str = "relu"
        hidden_features: int = 512
        heads: List[HeadSpec] = field(default_factory=lambda: [])

    cfg: Config

    def configure(self):
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            self.cfg.model, pretrained=self.cfg.pretrain
        )
        self.model.eval()

        # Do not add the weights in self.model to the optimizer
        for param in self.model.parameters():
            param.requires_grad = False

        assert len(self.cfg.heads) > 0
        heads = {}
        for head in self.cfg.heads:
            head_layers = []

            for i in range(head.n_hidden_layers):
                head_layers += [
                    nn.Linear(
                        self.cfg.hidden_features,
                        self.cfg.hidden_features,
                    ),
                    self.make_activation(self.cfg.activation),
                ]

            head_layers = [nn.Sequential(*head_layers)]
            head_layers += [
                nn.Sequential(
                    nn.Linear(
                        self.cfg.hidden_features,
                        self.cfg.hidden_features,
                    ),
                    self.make_activation(self.cfg.activation),
                    nn.Linear(self.cfg.hidden_features, 1),
                )
                for _ in range(2)
            ]
            heads[head.name] = nn.ModuleList(head_layers)
        self.heads = nn.ModuleDict(heads)

    def make_activation(self, activation):
        if activation == "relu":
            return nn.ReLU(inplace=True)
        elif activation == "silu":
            return nn.SiLU(inplace=True)
        else:
            raise NotImplementedError

    def forward(
        self,
        cond_image: Float[Tensor, "B 1 H W 3"],
        sample: bool = True,
    ) -> dict[str, Any]:
        # Run the model
        # Resize cond_image to 224
        cond_image = nn.functional.interpolate(
            cond_image.flatten(0, 1).permute(0, 3, 1, 2),
            size=(224, 224),
            mode="bilinear",
            align_corners=False,
        )
        cond_image = Normalize(
            mean=open_clip.constants.OPENAI_DATASET_MEAN,
            std=open_clip.constants.OPENAI_DATASET_STD,
        )(cond_image)
        image_features = self.model.encode_image(cond_image)

        # Run the heads
        outputs = {}

        for head_dict in self.cfg.heads:
            head_name = head_dict.name
            shared_head, d1_h, d2_h = self.heads[head_name]
            shared_features = shared_head(image_features)
            d1, d2 = [head(shared_features).squeeze(-1) for head in [d1_h, d2_h]]
            if self.cfg.distribution == "normal":
                mean = d1
                var = d2
                if mean.shape[-1] == 1:
                    outputs[head_name] = torch.distributions.Normal(
                        mean + head_dict.output_bias,
                        torch.nn.functional.softplus(var),
                    )
                else:
                    outputs[head_name] = torch.distributions.MultivariateNormal(
                        mean + head_dict.output_bias,
                        torch.nn.functional.softplus(var).diag_embed(),
                    )
            elif self.cfg.distribution == "beta":
                outputs[head_name] = torch.distributions.Beta(
                    torch.nn.functional.softplus(d1 + head_dict.output_bias),
                    torch.nn.functional.softplus(d2 + head_dict.output_bias),
                )
            else:
                raise NotImplementedError

        if sample:
            for head_dict in self.cfg.heads:
                head_name = head_dict.name
                dist = outputs[head_name]

                if self.cfg.distribution_eval == "mean":
                    out = dist.mean
                elif self.cfg.distribution_eval == "mode":
                    out = dist.mode
                elif self.cfg.distribution_eval == "sample_mean":
                    out = dist.sample([10]).mean(-1)
                else:
                    # use rsample if gradient is needed
                    out = dist.rsample() if self.training else dist.sample()

                outputs[head_name] = get_activation(head_dict.output_activation)(out)
                outputs[f"{head_name}_dist"] = dist

        for head in self.cfg.heads:
            if head.shape:
                if not sample:
                    raise ValueError(
                        "Cannot reshape non-sampled probabilistic outputs"
                    )
                outputs[head.name] = outputs[head.name].reshape(*head.shape)

            if head.add_to_decoder_features:
                outputs[f"decoder_{head.name}"] = outputs[head.name]
                del outputs[head.name]

        return outputs
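Each probabilistic head above boils down to predicting two unconstrained scalars, mapping them through softplus, and reading them as Beta parameters; distribution_eval then selects mean, mode, or a (r)sample. A minimal standalone sketch of that idea, with arbitrary values:

# Illustrative only; d1/d2 stand in for the two per-head MLP outputs.
import torch
import torch.nn.functional as F

d1, d2 = torch.tensor([0.7]), torch.tensor([1.3])
dist = torch.distributions.Beta(F.softplus(d1), F.softplus(d2))
print(float(dist.mean), float(dist.sample()))  # both lie in (0, 1)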
sf3d/models/isosurface.py
ADDED
@@ -0,0 +1,229 @@
from typing import Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
from jaxtyping import Float, Integer
from torch import Tensor

from .mesh import Mesh


class IsosurfaceHelper(nn.Module):
    points_range: Tuple[float, float] = (0, 1)

    @property
    def grid_vertices(self) -> Float[Tensor, "N 3"]:
        raise NotImplementedError

    @property
    def requires_instance_per_batch(self) -> bool:
        return False


class MarchingTetrahedraHelper(IsosurfaceHelper):
    def __init__(self, resolution: int, tets_path: str):
        super().__init__()
        self.resolution = resolution
        self.tets_path = tets_path

        self.triangle_table: Float[Tensor, "..."]
        self.register_buffer(
            "triangle_table",
            torch.as_tensor(
                [
                    [-1, -1, -1, -1, -1, -1],
                    [1, 0, 2, -1, -1, -1],
                    [4, 0, 3, -1, -1, -1],
                    [1, 4, 2, 1, 3, 4],
                    [3, 1, 5, -1, -1, -1],
                    [2, 3, 0, 2, 5, 3],
                    [1, 4, 0, 1, 5, 4],
                    [4, 2, 5, -1, -1, -1],
                    [4, 5, 2, -1, -1, -1],
                    [4, 1, 0, 4, 5, 1],
                    [3, 2, 0, 3, 5, 2],
                    [1, 3, 5, -1, -1, -1],
                    [4, 1, 2, 4, 3, 1],
                    [3, 0, 4, -1, -1, -1],
                    [2, 0, 1, -1, -1, -1],
                    [-1, -1, -1, -1, -1, -1],
                ],
                dtype=torch.long,
            ),
            persistent=False,
        )
        self.num_triangles_table: Integer[Tensor, "..."]
        self.register_buffer(
            "num_triangles_table",
            torch.as_tensor(
                [0, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 0], dtype=torch.long
            ),
            persistent=False,
        )
        self.base_tet_edges: Integer[Tensor, "..."]
        self.register_buffer(
            "base_tet_edges",
            torch.as_tensor([0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3], dtype=torch.long),
            persistent=False,
        )

        tets = np.load(self.tets_path)
        self._grid_vertices: Float[Tensor, "..."]
        self.register_buffer(
            "_grid_vertices",
            torch.from_numpy(tets["vertices"]).float(),
            persistent=False,
        )
        self.indices: Integer[Tensor, "..."]
        self.register_buffer(
            "indices", torch.from_numpy(tets["indices"]).long(), persistent=False
        )

        self._all_edges: Optional[Integer[Tensor, "Ne 2"]] = None

        center_indices, boundary_indices = self.get_center_boundary_index(
            self._grid_vertices
        )
        self.center_indices: Integer[Tensor, "..."]
        self.register_buffer("center_indices", center_indices, persistent=False)
        self.boundary_indices: Integer[Tensor, "..."]
        self.register_buffer("boundary_indices", boundary_indices, persistent=False)

    def get_center_boundary_index(self, verts):
        magn = torch.sum(verts**2, dim=-1)

        center_idx = torch.argmin(magn)
        boundary_neg = verts == verts.max()
        boundary_pos = verts == verts.min()

        boundary = torch.bitwise_or(boundary_pos, boundary_neg)
        boundary = torch.sum(boundary.float(), dim=-1)

        boundary_idx = torch.nonzero(boundary)
        return center_idx, boundary_idx.squeeze(dim=-1)

    def normalize_grid_deformation(
        self, grid_vertex_offsets: Float[Tensor, "Nv 3"]
    ) -> Float[Tensor, "Nv 3"]:
        return (
            (self.points_range[1] - self.points_range[0])
            / self.resolution  # half tet size is approximately 1 / self.resolution
            * torch.tanh(grid_vertex_offsets)
        )  # FIXME: hard-coded activation

    @property
    def grid_vertices(self) -> Float[Tensor, "Nv 3"]:
        return self._grid_vertices

    @property
    def all_edges(self) -> Integer[Tensor, "Ne 2"]:
        if self._all_edges is None:
            # compute edges on GPU, or it would be VERY SLOW (basically due to the unique operation)
            edges = torch.tensor(
                [0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3],
                dtype=torch.long,
                device=self.indices.device,
            )
            _all_edges = self.indices[:, edges].reshape(-1, 2)
            _all_edges_sorted = torch.sort(_all_edges, dim=1)[0]
            _all_edges = torch.unique(_all_edges_sorted, dim=0)
            self._all_edges = _all_edges
        return self._all_edges

    def sort_edges(self, edges_ex2):
        with torch.no_grad():
            order = (edges_ex2[:, 0] > edges_ex2[:, 1]).long()
            order = order.unsqueeze(dim=1)

            a = torch.gather(input=edges_ex2, index=order, dim=1)
            b = torch.gather(input=edges_ex2, index=1 - order, dim=1)

        return torch.stack([a, b], -1)

    def _forward(self, pos_nx3, sdf_n, tet_fx4):
        with torch.no_grad():
            occ_n = sdf_n > 0
            occ_fx4 = occ_n[tet_fx4.reshape(-1)].reshape(-1, 4)
            occ_sum = torch.sum(occ_fx4, -1)
            valid_tets = (occ_sum > 0) & (occ_sum < 4)
            occ_sum = occ_sum[valid_tets]

            # find all vertices
            all_edges = tet_fx4[valid_tets][:, self.base_tet_edges].reshape(-1, 2)
            all_edges = self.sort_edges(all_edges)
            unique_edges, idx_map = torch.unique(all_edges, dim=0, return_inverse=True)

            unique_edges = unique_edges.long()
            mask_edges = occ_n[unique_edges.reshape(-1)].reshape(-1, 2).sum(-1) == 1
            mapping = (
                torch.ones(
                    (unique_edges.shape[0]), dtype=torch.long, device=pos_nx3.device
                )
                * -1
            )
            mapping[mask_edges] = torch.arange(
                mask_edges.sum(), dtype=torch.long, device=pos_nx3.device
            )
            idx_map = mapping[idx_map]  # map edges to verts

            interp_v = unique_edges[mask_edges]
        edges_to_interp = pos_nx3[interp_v.reshape(-1)].reshape(-1, 2, 3)
        edges_to_interp_sdf = sdf_n[interp_v.reshape(-1)].reshape(-1, 2, 1)
        edges_to_interp_sdf[:, -1] *= -1

        denominator = edges_to_interp_sdf.sum(1, keepdim=True)

        edges_to_interp_sdf = torch.flip(edges_to_interp_sdf, [1]) / denominator
        verts = (edges_to_interp * edges_to_interp_sdf).sum(1)

        idx_map = idx_map.reshape(-1, 6)

        v_id = torch.pow(2, torch.arange(4, dtype=torch.long, device=pos_nx3.device))
        tetindex = (occ_fx4[valid_tets] * v_id.unsqueeze(0)).sum(-1)
        num_triangles = self.num_triangles_table[tetindex]

        # Generate triangle indices
        faces = torch.cat(
            (
                torch.gather(
                    input=idx_map[num_triangles == 1],
                    dim=1,
                    index=self.triangle_table[tetindex[num_triangles == 1]][:, :3],
                ).reshape(-1, 3),
                torch.gather(
                    input=idx_map[num_triangles == 2],
                    dim=1,
                    index=self.triangle_table[tetindex[num_triangles == 2]][:, :6],
                ).reshape(-1, 3),
            ),
            dim=0,
        )

        return verts, faces

    def forward(
        self,
        level: Float[Tensor, "N3 1"],
        deformation: Optional[Float[Tensor, "N3 3"]] = None,
    ) -> Mesh:
        if deformation is not None:
            grid_vertices = self.grid_vertices + self.normalize_grid_deformation(
                deformation
            )
        else:
            grid_vertices = self.grid_vertices

        v_pos, t_pos_idx = self._forward(grid_vertices, level, self.indices)

        mesh = Mesh(
            v_pos=v_pos,
            t_pos_idx=t_pos_idx,
            # extras
            grid_vertices=grid_vertices,
            tet_edges=self.all_edges,
            grid_level=level,
            grid_deformation=deformation,
        )

        return mesh
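A hedged usage sketch of MarchingTetrahedraHelper: the tets_path is a placeholder for an .npz file holding "vertices" and "indices" arrays, and the sphere SDF assumes a grid normalized to the unit cube; both are assumptions for illustration only.

# Illustrative only; the path and the SDF are placeholders.
import torch

helper = MarchingTetrahedraHelper(resolution=160, tets_path="path/to/tets.npz")

# Signed level values at the grid vertices; positive counts as "inside" (occ_n = sdf_n > 0).
level = 0.25 - (helper.grid_vertices - 0.5).norm(dim=-1, keepdim=True)
mesh = helper(level)  # optional second argument: per-vertex grid deformation
print(mesh.v_pos.shape, mesh.t_pos_idx.shape)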
sf3d/models/mesh.py
ADDED
@@ -0,0 +1,172 @@
from __future__ import annotations

from typing import Any, Dict, Optional

import torch
import torch.nn.functional as F
from jaxtyping import Float, Integer
from torch import Tensor

from sf3d.box_uv_unwrap import box_projection_uv_unwrap
from sf3d.models.utils import dot


class Mesh:
    def __init__(
        self, v_pos: Float[Tensor, "Nv 3"], t_pos_idx: Integer[Tensor, "Nf 3"], **kwargs
    ) -> None:
        self.v_pos: Float[Tensor, "Nv 3"] = v_pos
        self.t_pos_idx: Integer[Tensor, "Nf 3"] = t_pos_idx
        self._v_nrm: Optional[Float[Tensor, "Nv 3"]] = None
        self._v_tng: Optional[Float[Tensor, "Nv 3"]] = None
        self._v_tex: Optional[Float[Tensor, "Nt 3"]] = None
        self._edges: Optional[Integer[Tensor, "Ne 2"]] = None
        self.extras: Dict[str, Any] = {}
        for k, v in kwargs.items():
            self.add_extra(k, v)

    def add_extra(self, k, v) -> None:
        self.extras[k] = v

    @property
    def requires_grad(self):
        return self.v_pos.requires_grad

    @property
    def v_nrm(self):
        if self._v_nrm is None:
            self._v_nrm = self._compute_vertex_normal()
        return self._v_nrm

    @property
    def v_tng(self):
        if self._v_tng is None:
            self._v_tng = self._compute_vertex_tangent()
        return self._v_tng

    @property
    def v_tex(self):
        if self._v_tex is None:
            self.unwrap_uv()
        return self._v_tex

    @property
    def edges(self):
        if self._edges is None:
            self._edges = self._compute_edges()
        return self._edges

    def _compute_vertex_normal(self):
        i0 = self.t_pos_idx[:, 0]
        i1 = self.t_pos_idx[:, 1]
        i2 = self.t_pos_idx[:, 2]

        v0 = self.v_pos[i0, :]
        v1 = self.v_pos[i1, :]
        v2 = self.v_pos[i2, :]

        face_normals = torch.cross(v1 - v0, v2 - v0, dim=-1)

        # Splat face normals to vertices
        v_nrm = torch.zeros_like(self.v_pos)
        v_nrm.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
        v_nrm.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
        v_nrm.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)

        # Normalize, replace zero (degenerated) normals with some default value
        v_nrm = torch.where(
            dot(v_nrm, v_nrm) > 1e-20, v_nrm, torch.as_tensor([0.0, 0.0, 1.0]).to(v_nrm)
        )
        v_nrm = F.normalize(v_nrm, dim=1)

        if torch.is_anomaly_enabled():
            assert torch.all(torch.isfinite(v_nrm))

        return v_nrm

    def _compute_vertex_tangent(self):
        vn_idx = [None] * 3
        pos = [None] * 3
        tex = [None] * 3
        for i in range(0, 3):
            pos[i] = self.v_pos[self.t_pos_idx[:, i]]
            tex[i] = self.v_tex[self.t_pos_idx[:, i]]
            # t_nrm_idx is always the same as t_pos_idx
            vn_idx[i] = self.t_pos_idx[:, i]

        tangents = torch.zeros_like(self.v_nrm)
        tansum = torch.zeros_like(self.v_nrm)

        # Compute tangent space for each triangle
        duv1 = tex[1] - tex[0]
        duv2 = tex[2] - tex[0]
        dpos1 = pos[1] - pos[0]
        dpos2 = pos[2] - pos[0]

        tng_nom = dpos1 * duv2[..., 1:2] - dpos2 * duv1[..., 1:2]

        denom = duv1[..., 0:1] * duv2[..., 1:2] - duv1[..., 1:2] * duv2[..., 0:1]

        # Avoid division by zero for degenerated texture coordinates
        denom_safe = denom.clip(1e-6)
        tang = tng_nom / denom_safe

        # Update all 3 vertices
        for i in range(0, 3):
            idx = vn_idx[i][:, None].repeat(1, 3)
            tangents.scatter_add_(0, idx, tang)  # tangents[n_i] = tangents[n_i] + tang
            tansum.scatter_add_(
                0, idx, torch.ones_like(tang)
            )  # tansum[n_i] = tansum[n_i] + 1
        # Also normalize it. Here we do not normalize the individual triangles first so larger area
        # triangles influence the tangent space more
        tangents = tangents / tansum

        # Normalize and make sure tangent is perpendicular to normal
        tangents = F.normalize(tangents, dim=1)
        tangents = F.normalize(tangents - dot(tangents, self.v_nrm) * self.v_nrm)

        if torch.is_anomaly_enabled():
            assert torch.all(torch.isfinite(tangents))

        return tangents

    @torch.no_grad()
    def unwrap_uv(
        self,
        island_padding: float = 0.02,
    ) -> Mesh:
        uv, indices = box_projection_uv_unwrap(
            self.v_pos, self.v_nrm, self.t_pos_idx, island_padding
        )

        # Do store per vertex UVs.
        # This means we need to duplicate some vertices at the seams
        individual_vertices = self.v_pos[self.t_pos_idx].reshape(-1, 3)
        individual_faces = torch.arange(
            individual_vertices.shape[0],
            device=individual_vertices.device,
            dtype=self.t_pos_idx.dtype,
        ).reshape(-1, 3)
        uv_flat = uv[indices].reshape((-1, 2))
        # uv_flat[:, 1] = 1 - uv_flat[:, 1]

        self.v_pos = individual_vertices
        self.t_pos_idx = individual_faces
        self._v_tex = uv_flat
        self._v_nrm = self._compute_vertex_normal()
        self._v_tng = self._compute_vertex_tangent()

    def _compute_edges(self):
        # Compute edges
        edges = torch.cat(
            [
                self.t_pos_idx[:, [0, 1]],
                self.t_pos_idx[:, [1, 2]],
                self.t_pos_idx[:, [2, 0]],
            ],
            dim=0,
        )
        edges = edges.sort()[0]
        edges = torch.unique(edges, dim=0)
        return edges
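A small sketch of the lazily computed Mesh properties on a single triangle (illustrative, not part of the commit):

# Illustrative only.
import torch

v_pos = torch.tensor([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
t_pos_idx = torch.tensor([[0, 1, 2]])

mesh = Mesh(v_pos, t_pos_idx, source="unit-test")  # extra kwargs land in mesh.extras
print(mesh.v_nrm)   # computed on first access; all (0, 0, 1) for this triangle
print(mesh.edges)   # unique undirected edges: (0, 1), (0, 2), (1, 2)
# Accessing mesh.v_tex would run the box-projection unwrap and re-index the vertices.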
sf3d/models/network.py
ADDED
@@ -0,0 +1,195 @@
from dataclasses import dataclass, field
from typing import Callable, List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from jaxtyping import Float
from torch import Tensor
from torch.autograd import Function
from torch.cuda.amp import custom_bwd, custom_fwd

from sf3d.models.utils import BaseModule, normalize


class PixelShuffleUpsampleNetwork(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        in_channels: int = 1024
        out_channels: int = 40
        scale_factor: int = 4

        conv_layers: int = 4
        conv_kernel_size: int = 3

    cfg: Config

    def configure(self) -> None:
        layers = []
        output_channels = self.cfg.out_channels * self.cfg.scale_factor**2

        in_channels = self.cfg.in_channels
        for i in range(self.cfg.conv_layers):
            cur_out_channels = (
                in_channels if i != self.cfg.conv_layers - 1 else output_channels
            )
            layers.append(
                nn.Conv2d(
                    in_channels,
                    cur_out_channels,
                    self.cfg.conv_kernel_size,
                    padding=(self.cfg.conv_kernel_size - 1) // 2,
                )
            )
            if i != self.cfg.conv_layers - 1:
                layers.append(nn.ReLU(inplace=True))

        layers.append(nn.PixelShuffle(self.cfg.scale_factor))

        self.upsample = nn.Sequential(*layers)

    def forward(
        self, triplanes: Float[Tensor, "B 3 Ci Hp Wp"]
    ) -> Float[Tensor, "B 3 Co Hp2 Wp2"]:
        return rearrange(
            self.upsample(
                rearrange(triplanes, "B Np Ci Hp Wp -> (B Np) Ci Hp Wp", Np=3)
            ),
            "(B Np) Co Hp Wp -> B Np Co Hp Wp",
            Np=3,
        )


class _TruncExp(Function):  # pylint: disable=abstract-method
    # Implementation from torch-ngp:
    # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, x):  # pylint: disable=arguments-differ
        ctx.save_for_backward(x)
        return torch.exp(x)

    @staticmethod
    @custom_bwd
    def backward(ctx, g):  # pylint: disable=arguments-differ
        x = ctx.saved_tensors[0]
        return g * torch.exp(torch.clamp(x, max=15))


trunc_exp = _TruncExp.apply


def get_activation(name) -> Callable:
    if name is None:
        return lambda x: x
    name = name.lower()
    if name == "none" or name == "linear" or name == "identity":
        return lambda x: x
    elif name == "lin2srgb":
        return lambda x: torch.where(
            x > 0.0031308,
            torch.pow(torch.clamp(x, min=0.0031308), 1.0 / 2.4) * 1.055 - 0.055,
            12.92 * x,
        ).clamp(0.0, 1.0)
    elif name == "exp":
        return lambda x: torch.exp(x)
    elif name == "shifted_exp":
        return lambda x: torch.exp(x - 1.0)
    elif name == "trunc_exp":
        return trunc_exp
    elif name == "shifted_trunc_exp":
        return lambda x: trunc_exp(x - 1.0)
    elif name == "sigmoid":
        return lambda x: torch.sigmoid(x)
    elif name == "tanh":
        return lambda x: torch.tanh(x)
    elif name == "shifted_softplus":
        return lambda x: F.softplus(x - 1.0)
    elif name == "scale_-11_01":
        return lambda x: x * 0.5 + 0.5
    elif name == "negative":
        return lambda x: -x
    elif name == "normalize_channel_last":
        return lambda x: normalize(x)
    elif name == "normalize_channel_first":
        return lambda x: normalize(x, dim=1)
    else:
        try:
            return getattr(F, name)
        except AttributeError:
            raise ValueError(f"Unknown activation function: {name}")


@dataclass
class HeadSpec:
    name: str
    out_channels: int
    n_hidden_layers: int
    output_activation: Optional[str] = None
    out_bias: float = 0.0


class MaterialMLP(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        in_channels: int = 120
        n_neurons: int = 64
        activation: str = "silu"
        heads: List[HeadSpec] = field(default_factory=lambda: [])

    cfg: Config

    def configure(self) -> None:
        assert len(self.cfg.heads) > 0
        heads = {}
        for head in self.cfg.heads:
            head_layers = []
            for i in range(head.n_hidden_layers):
                head_layers += [
                    nn.Linear(
                        self.cfg.in_channels if i == 0 else self.cfg.n_neurons,
                        self.cfg.n_neurons,
                    ),
                    self.make_activation(self.cfg.activation),
                ]
            head_layers += [
                nn.Linear(
                    self.cfg.n_neurons,
                    head.out_channels,
                ),
            ]
            heads[head.name] = nn.Sequential(*head_layers)
        self.heads = nn.ModuleDict(heads)

    def make_activation(self, activation):
        if activation == "relu":
            return nn.ReLU(inplace=True)
        elif activation == "silu":
            return nn.SiLU(inplace=True)
        else:
            raise NotImplementedError

    def keys(self):
        return self.heads.keys()

    def forward(
        self, x, include: Optional[List] = None, exclude: Optional[List] = None
    ):
        if include is not None and exclude is not None:
            raise ValueError("Cannot specify both include and exclude.")
        if include is not None:
            heads = [h for h in self.cfg.heads if h.name in include]
        elif exclude is not None:
            heads = [h for h in self.cfg.heads if h.name not in exclude]
        else:
            heads = self.cfg.heads

        out = {
            head.name: get_activation(head.output_activation)(
                self.heads[head.name](x) + head.out_bias
            )
            for head in heads
        }

        return out
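A quick sketch of the activation registry in use (inputs are arbitrary and only for illustration):

# Illustrative only.
import torch

x = torch.linspace(-2.0, 2.0, 5)
print(get_activation("trunc_exp")(x))              # exp with a clamped backward pass
print(get_activation("lin2srgb")(torch.rand(3)))   # linear RGB -> sRGB, clamped to [0, 1]
print(get_activation(None)(x))                     # None / "identity" / "linear" pass through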
sf3d/models/tokenizers/dinov2.py
ADDED
@@ -0,0 +1,1196 @@
# coding=utf-8
# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DINOv2 model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BackboneOutput,
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.models.dinov2.configuration_dinov2 import Dinov2Config
from transformers.pytorch_utils import (
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from transformers.utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.utils.backbone_utils import BackboneMixin

logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Dinov2Config"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
_EXPECTED_OUTPUT_SHAPE = [1, 257, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-base"


DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dinov2-base",
    # See all DINOv2 models at https://huggingface.co/models?filter=dinov2
]


class Dinov2Embeddings(nn.Module):
    """
    Construct the CLS token, mask token, position and patch embeddings.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        # register as mask token as it's not used in optimization
        # to avoid the use of find_unused_parameters_true
        # self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
        self.register_buffer("mask_token", torch.zeros(1, config.hidden_size))
        self.patch_embeddings = Dinov2PatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(
            torch.randn(1, num_patches + 1, config.hidden_size)
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def interpolate_pos_encoding(
        self, embeddings: torch.Tensor, height: int, width: int
    ) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1
        if num_patches == num_positions and height == width:
            return self.position_embeddings
        class_pos_embed = self.position_embeddings[:, 0]
        patch_pos_embed = self.position_embeddings[:, 1:]
        dim = embeddings.shape[-1]
        height = height // self.config.patch_size
        width = width // self.config.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        height, width = height + 0.1, width + 0.1
        patch_pos_embed = patch_pos_embed.reshape(
            1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
        )
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(
                height / math.sqrt(num_positions),
                width / math.sqrt(num_positions),
            ),
            mode="bicubic",
            align_corners=False,
        )
        if (
            int(height) != patch_pos_embed.shape[-2]
            or int(width) != patch_pos_embed.shape[-1]
        ):
            raise ValueError(
                "Width or height does not match with the interpolated position embeddings"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        patch_embeddings = self.patch_embeddings(pixel_values)
        embeddings = patch_embeddings

        if bool_masked_pos is not None:
            embeddings = torch.where(
                bool_masked_pos.unsqueeze(-1),
                self.mask_token.to(embeddings.dtype).unsqueeze(0),
                embeddings,
            )

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + self.interpolate_pos_encoding(
            embeddings, height, width
        )

        embeddings = self.dropout(embeddings)

        return embeddings


class Dinov2PatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = (
            image_size
            if isinstance(image_size, collections.abc.Iterable)
            else (image_size, image_size)
        )
        patch_size = (
            patch_size
            if isinstance(patch_size, collections.abc.Iterable)
            else (patch_size, patch_size)
        )
        num_patches = (image_size[1] // patch_size[1]) * (
            image_size[0] // patch_size[0]
        )
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(
            num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
        )

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        """
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2
class Dinov2SelfAttention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
            config, "embedding_size"
        ):
            raise ValueError(
                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.attention_probs_dropout_prob = config.attention_probs_dropout_prob

        self.query = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.qkv_bias
        )
        self.key = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.qkv_bias
        )
        self.value = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.qkv_bias
        )

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads,
            self.attention_head_size,
        )
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        if hasattr(F, "scaled_dot_product_attention"):
            assert head_mask is None and not output_attentions
            new_size = hidden_states.size()[:-1] + (
                self.num_attention_heads,
                self.attention_head_size,
            )
            key_layer = self.key(hidden_states).reshape(new_size).transpose(1, 2)
            value_layer = self.value(hidden_states).reshape(new_size).transpose(1, 2)
            query_layer = mixed_query_layer.reshape(new_size).transpose(1, 2)
            context_layer = F.scaled_dot_product_attention(
                query_layer,
                key_layer,
                value_layer,
                dropout_p=self.attention_probs_dropout_prob,
                is_causal=False,
            )
            context_layer = context_layer.transpose(1, 2).reshape(
                *hidden_states.size()[:-1], -1
            )
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            query_layer = self.transpose_for_scores(mixed_query_layer)

            # Take the dot product between "query" and "key" to get the raw attention scores.
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

            attention_scores = attention_scores / math.sqrt(self.attention_head_size)

            # Normalize the attention scores to probabilities.
            attention_probs = nn.functional.softmax(attention_scores, dim=-1)

            # This is actually dropping out entire tokens to attend to, which might
            # seem a bit unusual, but is taken from the original Transformer paper.
            attention_probs = self.dropout(attention_probs)

            # Mask heads if we want to
            if head_mask is not None:
                attention_probs = attention_probs * head_mask

            context_layer = torch.matmul(attention_probs, value_layer)

            context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
            new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
            context_layer = context_layer.view(new_context_layer_shape)

        outputs = (
            (context_layer, attention_probs) if output_attentions else (context_layer,)
        )

        return outputs


# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2
class Dinov2SelfOutput(nn.Module):
    """
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
    ) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2
class Dinov2Attention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.attention = Dinov2SelfAttention(config)
        self.output = Dinov2SelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads,
            self.attention.num_attention_heads,
            self.attention.attention_head_size,
            self.pruned_heads,
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(
            heads
        )
        self.attention.all_head_size = (
            self.attention.attention_head_size * self.attention.num_attention_heads
        )
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[
            1:
        ]  # add attentions if we output them
        return outputs


class Dinov2LayerScale(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.lambda1 = nn.Parameter(
            config.layerscale_value * torch.ones(config.hidden_size)
        )

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        return hidden_state * self.lambda1


# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(
    input: torch.Tensor, drop_prob: float = 0.0, training: bool = False
) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (
        input.ndim - 1
    )  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(
        shape, dtype=input.dtype, device=input.device
    )
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


# Copied from transformers.models.beit.modeling_beit.BeitDropPath
class Dinov2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class Dinov2MLP(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
        if isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act
        self.fc2 = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.fc2(hidden_state)
        return hidden_state
|
451 |
+
|
452 |
+
class Dinov2SwiGLUFFN(nn.Module):
|
453 |
+
def __init__(self, config) -> None:
|
454 |
+
super().__init__()
|
455 |
+
in_features = out_features = config.hidden_size
|
456 |
+
hidden_features = int(config.hidden_size * config.mlp_ratio)
|
457 |
+
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
|
458 |
+
|
459 |
+
self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
|
460 |
+
self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
|
461 |
+
|
462 |
+
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
|
463 |
+
hidden_state = self.weights_in(hidden_state)
|
464 |
+
x1, x2 = hidden_state.chunk(2, dim=-1)
|
465 |
+
hidden = nn.functional.silu(x1) * x2
|
466 |
+
return self.weights_out(hidden)
|
467 |
+
|
468 |
+
|
469 |
+
class Dinov2Layer(nn.Module):
|
470 |
+
"""This corresponds to the Block class in the original implementation."""
|
471 |
+
|
472 |
+
def __init__(self, config: Dinov2Config) -> None:
|
473 |
+
super().__init__()
|
474 |
+
|
475 |
+
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
476 |
+
self.norm1_modulation = None
|
477 |
+
self.attention = Dinov2Attention(config)
|
478 |
+
self.layer_scale1 = Dinov2LayerScale(config)
|
479 |
+
self.drop_path1 = (
|
480 |
+
Dinov2DropPath(config.drop_path_rate)
|
481 |
+
if config.drop_path_rate > 0.0
|
482 |
+
else nn.Identity()
|
483 |
+
)
|
484 |
+
|
485 |
+
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
486 |
+
self.norm2_modulation = None
|
487 |
+
|
488 |
+
if config.use_swiglu_ffn:
|
489 |
+
self.mlp = Dinov2SwiGLUFFN(config)
|
490 |
+
else:
|
491 |
+
self.mlp = Dinov2MLP(config)
|
492 |
+
self.layer_scale2 = Dinov2LayerScale(config)
|
493 |
+
self.drop_path2 = (
|
494 |
+
Dinov2DropPath(config.drop_path_rate)
|
495 |
+
if config.drop_path_rate > 0.0
|
496 |
+
else nn.Identity()
|
497 |
+
)
|
498 |
+
|
499 |
+
def forward(
|
500 |
+
self,
|
501 |
+
hidden_states: torch.Tensor,
|
502 |
+
head_mask: Optional[torch.Tensor] = None,
|
503 |
+
modulation_cond: Optional[torch.Tensor] = None,
|
504 |
+
output_attentions: bool = False,
|
505 |
+
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
|
506 |
+
hidden_states_norm = self.norm1(hidden_states)
|
507 |
+
if self.norm1_modulation is not None:
|
508 |
+
assert modulation_cond is not None
|
509 |
+
hidden_states_norm = self.norm1_modulation(
|
510 |
+
hidden_states_norm, modulation_cond
|
511 |
+
)
|
512 |
+
self_attention_outputs = self.attention(
|
513 |
+
hidden_states_norm, # in Dinov2, layernorm is applied before self-attention
|
514 |
+
head_mask,
|
515 |
+
output_attentions=output_attentions,
|
516 |
+
)
|
517 |
+
attention_output = self_attention_outputs[0]
|
518 |
+
|
519 |
+
attention_output = self.layer_scale1(attention_output)
|
520 |
+
outputs = self_attention_outputs[
|
521 |
+
1:
|
522 |
+
] # add self attentions if we output attention weights
|
523 |
+
|
524 |
+
# first residual connection
|
525 |
+
hidden_states = attention_output + hidden_states
|
526 |
+
|
527 |
+
# in Dinov2, layernorm is also applied after self-attention
|
528 |
+
layer_output = self.norm2(hidden_states)
|
529 |
+
if self.norm2_modulation is not None:
|
530 |
+
assert modulation_cond is not None
|
531 |
+
layer_output = self.norm2_modulation(layer_output, modulation_cond)
|
532 |
+
layer_output = self.mlp(layer_output)
|
533 |
+
layer_output = self.layer_scale2(layer_output)
|
534 |
+
|
535 |
+
# second residual connection
|
536 |
+
layer_output = layer_output + hidden_states
|
537 |
+
|
538 |
+
outputs = (layer_output,) + outputs
|
539 |
+
|
540 |
+
return outputs
|
541 |
+
|
542 |
+
def register_ada_norm_modulation(self, norm1_mod: nn.Module, norm2_mod: nn.Module):
|
543 |
+
self.norm1_modulation = norm1_mod
|
544 |
+
self.norm2_modulation = norm2_mod
|
545 |
+
|
546 |
+
|
547 |
+
# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2
|
548 |
+
class Dinov2Encoder(nn.Module):
|
549 |
+
def __init__(self, config: Dinov2Config) -> None:
|
550 |
+
super().__init__()
|
551 |
+
self.config = config
|
552 |
+
self.layer = nn.ModuleList(
|
553 |
+
[Dinov2Layer(config) for _ in range(config.num_hidden_layers)]
|
554 |
+
)
|
555 |
+
self.gradient_checkpointing = False
|
556 |
+
|
557 |
+
def forward(
|
558 |
+
self,
|
559 |
+
hidden_states: torch.Tensor,
|
560 |
+
head_mask: Optional[torch.Tensor] = None,
|
561 |
+
modulation_cond: Optional[torch.Tensor] = None,
|
562 |
+
output_attentions: bool = False,
|
563 |
+
output_hidden_states: bool = False,
|
564 |
+
return_dict: bool = True,
|
565 |
+
) -> Union[tuple, BaseModelOutput]:
|
566 |
+
all_hidden_states = () if output_hidden_states else None
|
567 |
+
all_self_attentions = () if output_attentions else None
|
568 |
+
|
569 |
+
for i, layer_module in enumerate(self.layer):
|
570 |
+
if output_hidden_states:
|
571 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
572 |
+
|
573 |
+
layer_head_mask = head_mask[i] if head_mask is not None else None
|
574 |
+
|
575 |
+
if self.gradient_checkpointing and self.training:
|
576 |
+
|
577 |
+
def create_custom_forward(module):
|
578 |
+
def custom_forward(*inputs):
|
579 |
+
return module(*inputs, output_attentions)
|
580 |
+
|
581 |
+
return custom_forward
|
582 |
+
|
583 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
584 |
+
create_custom_forward(layer_module),
|
585 |
+
hidden_states,
|
586 |
+
layer_head_mask,
|
587 |
+
modulation_cond,
|
588 |
+
use_reentrant=False,
|
589 |
+
)
|
590 |
+
else:
|
591 |
+
layer_outputs = layer_module(
|
592 |
+
hidden_states, layer_head_mask, modulation_cond, output_attentions
|
593 |
+
)
|
594 |
+
|
595 |
+
hidden_states = layer_outputs[0]
|
596 |
+
|
597 |
+
if output_attentions:
|
598 |
+
all_self_attentions = all_self_attentions + (layer_outputs[1],)
|
599 |
+
|
600 |
+
if output_hidden_states:
|
601 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
602 |
+
|
603 |
+
if not return_dict:
|
604 |
+
return tuple(
|
605 |
+
v
|
606 |
+
for v in [hidden_states, all_hidden_states, all_self_attentions]
|
607 |
+
if v is not None
|
608 |
+
)
|
609 |
+
return BaseModelOutput(
|
610 |
+
last_hidden_state=hidden_states,
|
611 |
+
hidden_states=all_hidden_states,
|
612 |
+
attentions=all_self_attentions,
|
613 |
+
)
|
614 |
+
|
615 |
+
|
616 |
+
class Dinov2PreTrainedModel(PreTrainedModel):
|
617 |
+
"""
|
618 |
+
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
619 |
+
models.
|
620 |
+
"""
|
621 |
+
|
622 |
+
config_class = Dinov2Config
|
623 |
+
base_model_prefix = "dinov2"
|
624 |
+
main_input_name = "pixel_values"
|
625 |
+
supports_gradient_checkpointing = True
|
626 |
+
|
627 |
+
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
|
628 |
+
"""Initialize the weights"""
|
629 |
+
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
630 |
+
# Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
|
631 |
+
# `trunc_normal_cpu` not implemented in `half` issues
|
632 |
+
module.weight.data = nn.init.trunc_normal_(
|
633 |
+
module.weight.data.to(torch.float32),
|
634 |
+
mean=0.0,
|
635 |
+
std=self.config.initializer_range,
|
636 |
+
).to(module.weight.dtype)
|
637 |
+
if module.bias is not None:
|
638 |
+
module.bias.data.zero_()
|
639 |
+
elif isinstance(module, nn.LayerNorm):
|
640 |
+
module.bias.data.zero_()
|
641 |
+
module.weight.data.fill_(1.0)
|
642 |
+
elif isinstance(module, Dinov2Embeddings):
|
643 |
+
module.position_embeddings.data = nn.init.trunc_normal_(
|
644 |
+
module.position_embeddings.data.to(torch.float32),
|
645 |
+
mean=0.0,
|
646 |
+
std=self.config.initializer_range,
|
647 |
+
).to(module.position_embeddings.dtype)
|
648 |
+
|
649 |
+
module.cls_token.data = nn.init.trunc_normal_(
|
650 |
+
module.cls_token.data.to(torch.float32),
|
651 |
+
mean=0.0,
|
652 |
+
std=self.config.initializer_range,
|
653 |
+
).to(module.cls_token.dtype)
|
654 |
+
|
655 |
+
def _set_gradient_checkpointing(
|
656 |
+
self, module: Dinov2Encoder, value: bool = False
|
657 |
+
) -> None:
|
658 |
+
if isinstance(module, Dinov2Encoder):
|
659 |
+
module.gradient_checkpointing = value
|
660 |
+
|
661 |
+
|
662 |
+
DINOV2_START_DOCSTRING = r"""
|
663 |
+
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
|
664 |
+
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
|
665 |
+
behavior.
|
666 |
+
|
667 |
+
Parameters:
|
668 |
+
config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
|
669 |
+
Initializing with a config file does not load the weights associated with the model, only the
|
670 |
+
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
671 |
+
"""
|
672 |
+
|
673 |
+
DINOV2_BASE_INPUTS_DOCSTRING = r"""
|
674 |
+
Args:
|
675 |
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
676 |
+
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
|
677 |
+
[`BitImageProcessor.preprocess`] for details.
|
678 |
+
|
679 |
+
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
|
680 |
+
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
|
681 |
+
pre-training.
|
682 |
+
|
683 |
+
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
|
684 |
+
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
|
685 |
+
|
686 |
+
- 1 indicates the head is **not masked**,
|
687 |
+
- 0 indicates the head is **masked**.
|
688 |
+
|
689 |
+
output_attentions (`bool`, *optional*):
|
690 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
691 |
+
tensors for more detail.
|
692 |
+
output_hidden_states (`bool`, *optional*):
|
693 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
694 |
+
more detail.
|
695 |
+
return_dict (`bool`, *optional*):
|
696 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
697 |
+
"""
|
698 |
+
|
699 |
+
DINOV2_INPUTS_DOCSTRING = r"""
|
700 |
+
Args:
|
701 |
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
702 |
+
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
|
703 |
+
[`BitImageProcessor.preprocess`] for details.
|
704 |
+
|
705 |
+
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
|
706 |
+
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
|
707 |
+
|
708 |
+
- 1 indicates the head is **not masked**,
|
709 |
+
- 0 indicates the head is **masked**.
|
710 |
+
|
711 |
+
output_attentions (`bool`, *optional*):
|
712 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
713 |
+
tensors for more detail.
|
714 |
+
output_hidden_states (`bool`, *optional*):
|
715 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
716 |
+
more detail.
|
717 |
+
return_dict (`bool`, *optional*):
|
718 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
719 |
+
"""
|
720 |
+
|
721 |
+
|
722 |
+
@dataclass
|
723 |
+
class CustomBaseModelOutputWithPooling(BaseModelOutputWithPooling):
|
724 |
+
patch_embeddings: Optional[torch.FloatTensor] = None
|
725 |
+
|
726 |
+
|
727 |
+
@add_start_docstrings(
|
728 |
+
"The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
|
729 |
+
DINOV2_START_DOCSTRING,
|
730 |
+
)
|
731 |
+
class Dinov2Model(Dinov2PreTrainedModel):
|
732 |
+
def __init__(self, config: Dinov2Config):
|
733 |
+
super().__init__(config)
|
734 |
+
self.config = config
|
735 |
+
|
736 |
+
self.embeddings = Dinov2Embeddings(config)
|
737 |
+
self.encoder = Dinov2Encoder(config)
|
738 |
+
|
739 |
+
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
740 |
+
|
741 |
+
# Initialize weights and apply final processing
|
742 |
+
self.post_init()
|
743 |
+
|
744 |
+
def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
|
745 |
+
return self.embeddings.patch_embeddings
|
746 |
+
|
747 |
+
def expand_input_channels(self, extra_input_channels: int) -> None:
|
748 |
+
if extra_input_channels == 0:
|
749 |
+
return
|
750 |
+
conv_old = self.embeddings.patch_embeddings.projection
|
751 |
+
conv_new = nn.Conv2d(
|
752 |
+
self.config.num_channels + extra_input_channels,
|
753 |
+
self.config.hidden_size,
|
754 |
+
kernel_size=self.config.patch_size,
|
755 |
+
stride=self.config.patch_size,
|
756 |
+
).to(self.device)
|
757 |
+
with torch.no_grad():
|
758 |
+
conv_new.weight[:, :3] = conv_old.weight
|
759 |
+
conv_new.bias = conv_old.bias
|
760 |
+
self.embeddings.patch_embeddings.projection = conv_new
|
761 |
+
del conv_old
|
762 |
+
|
763 |
+
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
|
764 |
+
"""
|
765 |
+
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
766 |
+
class PreTrainedModel
|
767 |
+
"""
|
768 |
+
for layer, heads in heads_to_prune.items():
|
769 |
+
self.encoder.layer[layer].attention.prune_heads(heads)
|
770 |
+
|
771 |
+
@add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
|
772 |
+
@add_code_sample_docstrings(
|
773 |
+
checkpoint=_CHECKPOINT_FOR_DOC,
|
774 |
+
output_type=BaseModelOutputWithPooling,
|
775 |
+
config_class=_CONFIG_FOR_DOC,
|
776 |
+
modality="vision",
|
777 |
+
expected_output=_EXPECTED_OUTPUT_SHAPE,
|
778 |
+
)
|
779 |
+
def forward(
|
780 |
+
self,
|
781 |
+
pixel_values: Optional[torch.Tensor] = None,
|
782 |
+
bool_masked_pos: Optional[torch.Tensor] = None,
|
783 |
+
head_mask: Optional[torch.Tensor] = None,
|
784 |
+
modulation_cond: Optional[torch.Tensor] = None,
|
785 |
+
output_attentions: Optional[bool] = None,
|
786 |
+
output_hidden_states: Optional[bool] = None,
|
787 |
+
return_dict: Optional[bool] = None,
|
788 |
+
) -> Union[Tuple, BaseModelOutputWithPooling]:
|
789 |
+
output_attentions = (
|
790 |
+
output_attentions
|
791 |
+
if output_attentions is not None
|
792 |
+
else self.config.output_attentions
|
793 |
+
)
|
794 |
+
output_hidden_states = (
|
795 |
+
output_hidden_states
|
796 |
+
if output_hidden_states is not None
|
797 |
+
else self.config.output_hidden_states
|
798 |
+
)
|
799 |
+
return_dict = (
|
800 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
801 |
+
)
|
802 |
+
|
803 |
+
if pixel_values is None:
|
804 |
+
raise ValueError("You have to specify pixel_values")
|
805 |
+
|
806 |
+
# Prepare head mask if needed
|
807 |
+
# 1.0 in head_mask indicate we keep the head
|
808 |
+
# attention_probs has shape bsz x n_heads x N x N
|
809 |
+
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
810 |
+
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
811 |
+
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
|
812 |
+
|
813 |
+
embedding_output = self.embeddings(
|
814 |
+
pixel_values, bool_masked_pos=bool_masked_pos
|
815 |
+
)
|
816 |
+
|
817 |
+
encoder_outputs = self.encoder(
|
818 |
+
embedding_output,
|
819 |
+
head_mask=head_mask,
|
820 |
+
modulation_cond=modulation_cond,
|
821 |
+
output_attentions=output_attentions,
|
822 |
+
output_hidden_states=output_hidden_states,
|
823 |
+
return_dict=return_dict,
|
824 |
+
)
|
825 |
+
sequence_output = encoder_outputs[0]
|
826 |
+
sequence_output = self.layernorm(sequence_output)
|
827 |
+
pooled_output = sequence_output[:, 0, :]
|
828 |
+
|
829 |
+
if not return_dict:
|
830 |
+
head_outputs = (sequence_output, pooled_output)
|
831 |
+
return head_outputs + encoder_outputs[1:]
|
832 |
+
|
833 |
+
return CustomBaseModelOutputWithPooling(
|
834 |
+
last_hidden_state=sequence_output,
|
835 |
+
pooler_output=pooled_output,
|
836 |
+
hidden_states=encoder_outputs.hidden_states,
|
837 |
+
attentions=encoder_outputs.attentions,
|
838 |
+
patch_embeddings=embedding_output,
|
839 |
+
)
|
840 |
+
|
841 |
+
def set_gradient_checkpointing(self, value: bool = False) -> None:
|
842 |
+
self._set_gradient_checkpointing(self.encoder, value)
|
843 |
+
|
844 |
+
|
845 |
+
@add_start_docstrings(
|
846 |
+
"""
|
847 |
+
Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
|
848 |
+
of the [CLS] token) e.g. for ImageNet.
|
849 |
+
""",
|
850 |
+
DINOV2_START_DOCSTRING,
|
851 |
+
)
|
852 |
+
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
|
853 |
+
def __init__(self, config: Dinov2Config) -> None:
|
854 |
+
super().__init__(config)
|
855 |
+
|
856 |
+
self.num_labels = config.num_labels
|
857 |
+
self.dinov2 = Dinov2Model(config)
|
858 |
+
|
859 |
+
# Classifier head
|
860 |
+
self.classifier = (
|
861 |
+
nn.Linear(config.hidden_size * 2, config.num_labels)
|
862 |
+
if config.num_labels > 0
|
863 |
+
else nn.Identity()
|
864 |
+
)
|
865 |
+
|
866 |
+
# Initialize weights and apply final processing
|
867 |
+
self.post_init()
|
868 |
+
|
869 |
+
@add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
|
870 |
+
@add_code_sample_docstrings(
|
871 |
+
checkpoint=_IMAGE_CLASS_CHECKPOINT,
|
872 |
+
output_type=ImageClassifierOutput,
|
873 |
+
config_class=_CONFIG_FOR_DOC,
|
874 |
+
)
|
875 |
+
def forward(
|
876 |
+
self,
|
877 |
+
pixel_values: Optional[torch.Tensor] = None,
|
878 |
+
head_mask: Optional[torch.Tensor] = None,
|
879 |
+
labels: Optional[torch.Tensor] = None,
|
880 |
+
output_attentions: Optional[bool] = None,
|
881 |
+
output_hidden_states: Optional[bool] = None,
|
882 |
+
return_dict: Optional[bool] = None,
|
883 |
+
) -> Union[tuple, ImageClassifierOutput]:
|
884 |
+
r"""
|
885 |
+
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
886 |
+
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
|
887 |
+
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
888 |
+
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
889 |
+
"""
|
890 |
+
return_dict = (
|
891 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
892 |
+
)
|
893 |
+
|
894 |
+
outputs = self.dinov2(
|
895 |
+
pixel_values,
|
896 |
+
head_mask=head_mask,
|
897 |
+
output_attentions=output_attentions,
|
898 |
+
output_hidden_states=output_hidden_states,
|
899 |
+
return_dict=return_dict,
|
900 |
+
)
|
901 |
+
|
902 |
+
sequence_output = outputs[0] # batch_size, sequence_length, hidden_size
|
903 |
+
|
904 |
+
cls_token = sequence_output[:, 0]
|
905 |
+
patch_tokens = sequence_output[:, 1:]
|
906 |
+
|
907 |
+
linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
|
908 |
+
|
909 |
+
logits = self.classifier(linear_input)
|
910 |
+
|
911 |
+
loss = None
|
912 |
+
if labels is not None:
|
913 |
+
# move labels to correct device to enable model parallelism
|
914 |
+
labels = labels.to(logits.device)
|
915 |
+
if self.config.problem_type is None:
|
916 |
+
if self.num_labels == 1:
|
917 |
+
self.config.problem_type = "regression"
|
918 |
+
elif self.num_labels > 1 and (
|
919 |
+
labels.dtype == torch.long or labels.dtype == torch.int
|
920 |
+
):
|
921 |
+
self.config.problem_type = "single_label_classification"
|
922 |
+
else:
|
923 |
+
self.config.problem_type = "multi_label_classification"
|
924 |
+
|
925 |
+
if self.config.problem_type == "regression":
|
926 |
+
loss_fct = MSELoss()
|
927 |
+
if self.num_labels == 1:
|
928 |
+
loss = loss_fct(logits.squeeze(), labels.squeeze())
|
929 |
+
else:
|
930 |
+
loss = loss_fct(logits, labels)
|
931 |
+
elif self.config.problem_type == "single_label_classification":
|
932 |
+
loss_fct = CrossEntropyLoss()
|
933 |
+
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
934 |
+
elif self.config.problem_type == "multi_label_classification":
|
935 |
+
loss_fct = BCEWithLogitsLoss()
|
936 |
+
loss = loss_fct(logits, labels)
|
937 |
+
|
938 |
+
if not return_dict:
|
939 |
+
output = (logits,) + outputs[2:]
|
940 |
+
return ((loss,) + output) if loss is not None else output
|
941 |
+
|
942 |
+
return ImageClassifierOutput(
|
943 |
+
loss=loss,
|
944 |
+
logits=logits,
|
945 |
+
hidden_states=outputs.hidden_states,
|
946 |
+
attentions=outputs.attentions,
|
947 |
+
)
|
948 |
+
|
949 |
+
|
950 |
+
@add_start_docstrings(
|
951 |
+
"""
|
952 |
+
Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
|
953 |
+
""",
|
954 |
+
DINOV2_START_DOCSTRING,
|
955 |
+
)
|
956 |
+
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
|
957 |
+
def __init__(self, config):
|
958 |
+
super().__init__(config)
|
959 |
+
super()._init_backbone(config)
|
960 |
+
|
961 |
+
self.num_features = [
|
962 |
+
config.hidden_size for _ in range(config.num_hidden_layers + 1)
|
963 |
+
]
|
964 |
+
self.embeddings = Dinov2Embeddings(config)
|
965 |
+
self.encoder = Dinov2Encoder(config)
|
966 |
+
|
967 |
+
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
968 |
+
|
969 |
+
# Initialize weights and apply final processing
|
970 |
+
self.post_init()
|
971 |
+
|
972 |
+
def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
|
973 |
+
return self.embeddings.patch_embeddings
|
974 |
+
|
975 |
+
@add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
|
976 |
+
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
|
977 |
+
def forward(
|
978 |
+
self,
|
979 |
+
pixel_values: torch.Tensor,
|
980 |
+
output_hidden_states: Optional[bool] = None,
|
981 |
+
output_attentions: Optional[bool] = None,
|
982 |
+
return_dict: Optional[bool] = None,
|
983 |
+
) -> BackboneOutput:
|
984 |
+
"""
|
985 |
+
Returns:
|
986 |
+
|
987 |
+
Examples:
|
988 |
+
|
989 |
+
```python
|
990 |
+
>>> from transformers import AutoImageProcessor, AutoBackbone
|
991 |
+
>>> import torch
|
992 |
+
>>> from PIL import Image
|
993 |
+
>>> import requests
|
994 |
+
|
995 |
+
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
996 |
+
>>> image = Image.open(requests.get(url, stream=True).raw)
|
997 |
+
|
998 |
+
>>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
|
999 |
+
>>> model = AutoBackbone.from_pretrained(
|
1000 |
+
... "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
|
1001 |
+
... )
|
1002 |
+
|
1003 |
+
>>> inputs = processor(image, return_tensors="pt")
|
1004 |
+
|
1005 |
+
>>> outputs = model(**inputs)
|
1006 |
+
>>> feature_maps = outputs.feature_maps
|
1007 |
+
>>> list(feature_maps[-1].shape)
|
1008 |
+
[1, 768, 16, 16]
|
1009 |
+
```"""
|
1010 |
+
return_dict = (
|
1011 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
1012 |
+
)
|
1013 |
+
output_hidden_states = (
|
1014 |
+
output_hidden_states
|
1015 |
+
if output_hidden_states is not None
|
1016 |
+
else self.config.output_hidden_states
|
1017 |
+
)
|
1018 |
+
output_attentions = (
|
1019 |
+
output_attentions
|
1020 |
+
if output_attentions is not None
|
1021 |
+
else self.config.output_attentions
|
1022 |
+
)
|
1023 |
+
|
1024 |
+
embedding_output = self.embeddings(pixel_values)
|
1025 |
+
|
1026 |
+
outputs = self.encoder(
|
1027 |
+
embedding_output,
|
1028 |
+
output_hidden_states=True,
|
1029 |
+
output_attentions=output_attentions,
|
1030 |
+
return_dict=return_dict,
|
1031 |
+
)
|
1032 |
+
|
1033 |
+
hidden_states = outputs.hidden_states if return_dict else outputs[1]
|
1034 |
+
|
1035 |
+
feature_maps = ()
|
1036 |
+
for stage, hidden_state in zip(self.stage_names, hidden_states):
|
1037 |
+
if stage in self.out_features:
|
1038 |
+
if self.config.apply_layernorm:
|
1039 |
+
hidden_state = self.layernorm(hidden_state)
|
1040 |
+
if self.config.reshape_hidden_states:
|
1041 |
+
batch_size, _, height, width = pixel_values.shape
|
1042 |
+
patch_size = self.config.patch_size
|
1043 |
+
hidden_state = hidden_state[:, 1:, :].reshape(
|
1044 |
+
batch_size, width // patch_size, height // patch_size, -1
|
1045 |
+
)
|
1046 |
+
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
|
1047 |
+
feature_maps += (hidden_state,)
|
1048 |
+
|
1049 |
+
if not return_dict:
|
1050 |
+
if output_hidden_states:
|
1051 |
+
output = (feature_maps,) + outputs[1:]
|
1052 |
+
else:
|
1053 |
+
output = (feature_maps,) + outputs[2:]
|
1054 |
+
return output
|
1055 |
+
|
1056 |
+
return BackboneOutput(
|
1057 |
+
feature_maps=feature_maps,
|
1058 |
+
hidden_states=outputs.hidden_states if output_hidden_states else None,
|
1059 |
+
attentions=outputs.attentions if output_attentions else None,
|
1060 |
+
)
|
1061 |
+
|
1062 |
+
|
1063 |
+
class CustomPatchEmbeddings(nn.Module):
|
1064 |
+
"""
|
1065 |
+
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
|
1066 |
+
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
|
1067 |
+
Transformer.
|
1068 |
+
"""
|
1069 |
+
|
1070 |
+
def __init__(
|
1071 |
+
self, image_size: int, patch_size: int, num_channels: int, hidden_size: int
|
1072 |
+
):
|
1073 |
+
super().__init__()
|
1074 |
+
|
1075 |
+
image_size = (
|
1076 |
+
image_size
|
1077 |
+
if isinstance(image_size, collections.abc.Iterable)
|
1078 |
+
else (image_size, image_size)
|
1079 |
+
)
|
1080 |
+
patch_size = (
|
1081 |
+
patch_size
|
1082 |
+
if isinstance(patch_size, collections.abc.Iterable)
|
1083 |
+
else (patch_size, patch_size)
|
1084 |
+
)
|
1085 |
+
num_patches = (image_size[1] // patch_size[1]) * (
|
1086 |
+
image_size[0] // patch_size[0]
|
1087 |
+
)
|
1088 |
+
self.image_size = image_size
|
1089 |
+
self.patch_size = patch_size
|
1090 |
+
self.num_channels = num_channels
|
1091 |
+
self.num_patches = num_patches
|
1092 |
+
|
1093 |
+
self.projection = nn.Conv2d(
|
1094 |
+
num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
|
1095 |
+
)
|
1096 |
+
|
1097 |
+
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
1098 |
+
num_channels = pixel_values.shape[1]
|
1099 |
+
if num_channels != self.num_channels:
|
1100 |
+
raise ValueError(
|
1101 |
+
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
|
1102 |
+
f" Expected {self.num_channels} but got {num_channels}."
|
1103 |
+
)
|
1104 |
+
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
|
1105 |
+
return embeddings
|
1106 |
+
|
1107 |
+
|
1108 |
+
class CustomEmbeddings(nn.Module):
|
1109 |
+
"""
|
1110 |
+
Construct the CLS token, mask token, position and patch embeddings.
|
1111 |
+
"""
|
1112 |
+
|
1113 |
+
def __init__(
|
1114 |
+
self, image_size: int, patch_size: int, num_channels: int, hidden_size: int
|
1115 |
+
) -> None:
|
1116 |
+
super().__init__()
|
1117 |
+
|
1118 |
+
self.image_size = image_size
|
1119 |
+
self.patch_size = patch_size
|
1120 |
+
self.num_channels = num_channels
|
1121 |
+
self.hidden_size = hidden_size
|
1122 |
+
|
1123 |
+
self.cls_token = nn.Parameter(torch.randn(1, 1, self.hidden_size))
|
1124 |
+
|
1125 |
+
self.patch_embeddings = CustomPatchEmbeddings(
|
1126 |
+
image_size, patch_size, num_channels, hidden_size
|
1127 |
+
)
|
1128 |
+
num_patches = self.patch_embeddings.num_patches
|
1129 |
+
self.position_embeddings = nn.Parameter(
|
1130 |
+
torch.randn(1, num_patches + 1, self.hidden_size)
|
1131 |
+
)
|
1132 |
+
|
1133 |
+
def interpolate_pos_encoding(
|
1134 |
+
self, embeddings: torch.Tensor, height: int, width: int
|
1135 |
+
) -> torch.Tensor:
|
1136 |
+
"""
|
1137 |
+
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
|
1138 |
+
resolution images.
|
1139 |
+
|
1140 |
+
Source:
|
1141 |
+
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
|
1142 |
+
"""
|
1143 |
+
|
1144 |
+
num_patches = embeddings.shape[1] - 1
|
1145 |
+
num_positions = self.position_embeddings.shape[1] - 1
|
1146 |
+
if num_patches == num_positions and height == width:
|
1147 |
+
return self.position_embeddings
|
1148 |
+
class_pos_embed = self.position_embeddings[:, 0]
|
1149 |
+
patch_pos_embed = self.position_embeddings[:, 1:]
|
1150 |
+
dim = embeddings.shape[-1]
|
1151 |
+
height = height // self.patch_size
|
1152 |
+
width = width // self.patch_size
|
1153 |
+
# we add a small number to avoid floating point error in the interpolation
|
1154 |
+
# see discussion at https://github.com/facebookresearch/dino/issues/8
|
1155 |
+
height, width = height + 0.1, width + 0.1
|
1156 |
+
patch_pos_embed = patch_pos_embed.reshape(
|
1157 |
+
1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
|
1158 |
+
)
|
1159 |
+
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
|
1160 |
+
patch_pos_embed = nn.functional.interpolate(
|
1161 |
+
patch_pos_embed,
|
1162 |
+
scale_factor=(
|
1163 |
+
height / math.sqrt(num_positions),
|
1164 |
+
width / math.sqrt(num_positions),
|
1165 |
+
),
|
1166 |
+
mode="bicubic",
|
1167 |
+
align_corners=False,
|
1168 |
+
)
|
1169 |
+
if (
|
1170 |
+
int(height) != patch_pos_embed.shape[-2]
|
1171 |
+
or int(width) != patch_pos_embed.shape[-1]
|
1172 |
+
):
|
1173 |
+
raise ValueError(
|
1174 |
+
"Width or height does not match with the interpolated position embeddings"
|
1175 |
+
)
|
1176 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
1177 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
|
1178 |
+
|
1179 |
+
def forward(
|
1180 |
+
self,
|
1181 |
+
pixel_values: torch.Tensor,
|
1182 |
+
) -> torch.Tensor:
|
1183 |
+
batch_size, _, height, width = pixel_values.shape
|
1184 |
+
patch_embeddings = self.patch_embeddings(pixel_values)
|
1185 |
+
embeddings = patch_embeddings
|
1186 |
+
|
1187 |
+
# add the [CLS] token to the embedded patch tokens
|
1188 |
+
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
|
1189 |
+
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
|
1190 |
+
|
1191 |
+
# add positional encoding to each token
|
1192 |
+
embeddings = embeddings + self.interpolate_pos_encoding(
|
1193 |
+
embeddings, height, width
|
1194 |
+
)
|
1195 |
+
|
1196 |
+
return embeddings
|
sf3d/models/tokenizers/image.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import Optional
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from einops import rearrange
|
7 |
+
from jaxtyping import Float
|
8 |
+
from torch import Tensor
|
9 |
+
|
10 |
+
from sf3d.models.tokenizers.dinov2 import Dinov2Model
|
11 |
+
from sf3d.models.transformers.attention import Modulation
|
12 |
+
from sf3d.models.utils import BaseModule
|
13 |
+
|
14 |
+
|
15 |
+
class DINOV2SingleImageTokenizer(BaseModule):
|
16 |
+
@dataclass
|
17 |
+
class Config(BaseModule.Config):
|
18 |
+
pretrained_model_name_or_path: str = "facebook/dinov2-large"
|
19 |
+
width: int = 512
|
20 |
+
height: int = 512
|
21 |
+
modulation_cond_dim: int = 768
|
22 |
+
|
23 |
+
cfg: Config
|
24 |
+
|
25 |
+
def configure(self) -> None:
|
26 |
+
self.model = Dinov2Model.from_pretrained(self.cfg.pretrained_model_name_or_path)
|
27 |
+
|
28 |
+
for p in self.model.parameters():
|
29 |
+
p.requires_grad_(False)
|
30 |
+
self.model.eval()
|
31 |
+
|
32 |
+
self.model.set_gradient_checkpointing(False)
|
33 |
+
|
34 |
+
# add modulation
|
35 |
+
modulations = []
|
36 |
+
for layer in self.model.encoder.layer:
|
37 |
+
norm1_modulation = Modulation(
|
38 |
+
self.model.config.hidden_size,
|
39 |
+
self.cfg.modulation_cond_dim,
|
40 |
+
zero_init=True,
|
41 |
+
single_layer=True,
|
42 |
+
)
|
43 |
+
norm2_modulation = Modulation(
|
44 |
+
self.model.config.hidden_size,
|
45 |
+
self.cfg.modulation_cond_dim,
|
46 |
+
zero_init=True,
|
47 |
+
single_layer=True,
|
48 |
+
)
|
49 |
+
layer.register_ada_norm_modulation(norm1_modulation, norm2_modulation)
|
50 |
+
modulations += [norm1_modulation, norm2_modulation]
|
51 |
+
self.modulations = nn.ModuleList(modulations)
|
52 |
+
|
53 |
+
self.register_buffer(
|
54 |
+
"image_mean",
|
55 |
+
torch.as_tensor([0.485, 0.456, 0.406]).reshape(1, 1, 3, 1, 1),
|
56 |
+
persistent=False,
|
57 |
+
)
|
58 |
+
self.register_buffer(
|
59 |
+
"image_std",
|
60 |
+
torch.as_tensor([0.229, 0.224, 0.225]).reshape(1, 1, 3, 1, 1),
|
61 |
+
persistent=False,
|
62 |
+
)
|
63 |
+
|
64 |
+
def forward(
|
65 |
+
self,
|
66 |
+
images: Float[Tensor, "B *N C H W"],
|
67 |
+
modulation_cond: Optional[Float[Tensor, "B *N Cc"]],
|
68 |
+
**kwargs,
|
69 |
+
) -> Float[Tensor, "B *N Ct Nt"]:
|
70 |
+
model = self.model
|
71 |
+
|
72 |
+
packed = False
|
73 |
+
if images.ndim == 4:
|
74 |
+
packed = True
|
75 |
+
images = images.unsqueeze(1)
|
76 |
+
if modulation_cond is not None:
|
77 |
+
assert modulation_cond.ndim == 2
|
78 |
+
modulation_cond = modulation_cond.unsqueeze(1)
|
79 |
+
|
80 |
+
batch_size, n_input_views = images.shape[:2]
|
81 |
+
images = (images - self.image_mean) / self.image_std
|
82 |
+
out = model(
|
83 |
+
rearrange(images, "B N C H W -> (B N) C H W"),
|
84 |
+
modulation_cond=rearrange(modulation_cond, "B N Cc -> (B N) Cc")
|
85 |
+
if modulation_cond is not None
|
86 |
+
else None,
|
87 |
+
)
|
88 |
+
local_features = out.last_hidden_state
|
89 |
+
local_features = local_features.permute(0, 2, 1)
|
90 |
+
local_features = rearrange(
|
91 |
+
local_features, "(B N) Ct Nt -> B N Ct Nt", B=batch_size
|
92 |
+
)
|
93 |
+
if packed:
|
94 |
+
local_features = local_features.squeeze(1)
|
95 |
+
|
96 |
+
return local_features
|
97 |
+
|
98 |
+
def detokenize(self, *args, **kwargs):
|
99 |
+
raise NotImplementedError
|
sf3d/models/tokenizers/triplane.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
from dataclasses import dataclass
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from einops import rearrange, repeat
|
7 |
+
from jaxtyping import Float
|
8 |
+
from torch import Tensor
|
9 |
+
|
10 |
+
from sf3d.models.utils import BaseModule
|
11 |
+
|
12 |
+
|
13 |
+
class TriplaneLearnablePositionalEmbedding(BaseModule):
|
14 |
+
@dataclass
|
15 |
+
class Config(BaseModule.Config):
|
16 |
+
plane_size: int = 96
|
17 |
+
num_channels: int = 1024
|
18 |
+
|
19 |
+
cfg: Config
|
20 |
+
|
21 |
+
def configure(self) -> None:
|
22 |
+
self.embeddings = nn.Parameter(
|
23 |
+
torch.randn(
|
24 |
+
(3, self.cfg.num_channels, self.cfg.plane_size, self.cfg.plane_size),
|
25 |
+
dtype=torch.float32,
|
26 |
+
)
|
27 |
+
* 1
|
28 |
+
/ math.sqrt(self.cfg.num_channels)
|
29 |
+
)
|
30 |
+
|
31 |
+
def forward(self, batch_size: int) -> Float[Tensor, "B Ct Nt"]:
|
32 |
+
return rearrange(
|
33 |
+
repeat(self.embeddings, "Np Ct Hp Wp -> B Np Ct Hp Wp", B=batch_size),
|
34 |
+
"B Np Ct Hp Wp -> B Ct (Np Hp Wp)",
|
35 |
+
)
|
36 |
+
|
37 |
+
def detokenize(
|
38 |
+
self, tokens: Float[Tensor, "B Ct Nt"]
|
39 |
+
) -> Float[Tensor, "B 3 Ct Hp Wp"]:
|
40 |
+
batch_size, Ct, Nt = tokens.shape
|
41 |
+
assert Nt == self.cfg.plane_size**2 * 3
|
42 |
+
assert Ct == self.cfg.num_channels
|
43 |
+
return rearrange(
|
44 |
+
tokens,
|
45 |
+
"B Ct (Np Hp Wp) -> B Np Ct Hp Wp",
|
46 |
+
Np=3,
|
47 |
+
Hp=self.cfg.plane_size,
|
48 |
+
Wp=self.cfg.plane_size,
|
49 |
+
)
|
sf3d/models/transformers/attention.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
|
5 |
+
class Modulation(nn.Module):
|
6 |
+
def __init__(
|
7 |
+
self,
|
8 |
+
embedding_dim: int,
|
9 |
+
condition_dim: int,
|
10 |
+
zero_init: bool = False,
|
11 |
+
single_layer: bool = False,
|
12 |
+
):
|
13 |
+
super().__init__()
|
14 |
+
self.silu = nn.SiLU()
|
15 |
+
if single_layer:
|
16 |
+
self.linear1 = nn.Identity()
|
17 |
+
else:
|
18 |
+
self.linear1 = nn.Linear(condition_dim, condition_dim)
|
19 |
+
|
20 |
+
self.linear2 = nn.Linear(condition_dim, embedding_dim * 2)
|
21 |
+
|
22 |
+
# Only zero init the last linear layer
|
23 |
+
if zero_init:
|
24 |
+
nn.init.zeros_(self.linear2.weight)
|
25 |
+
nn.init.zeros_(self.linear2.bias)
|
26 |
+
|
27 |
+
def forward(self, x: torch.Tensor, condition: torch.Tensor) -> torch.Tensor:
|
28 |
+
emb = self.linear2(self.silu(self.linear1(condition)))
|
29 |
+
scale, shift = torch.chunk(emb, 2, dim=1)
|
30 |
+
x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
|
31 |
+
return x
|
sf3d/models/transformers/backbone.py
ADDED
@@ -0,0 +1,515 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import Optional
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from torch import nn
|
7 |
+
|
8 |
+
from sf3d.models.utils import BaseModule
|
9 |
+
|
10 |
+
|
11 |
+
class GEGLU(nn.Module):
|
12 |
+
r"""
|
13 |
+
A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
|
14 |
+
|
15 |
+
Parameters:
|
16 |
+
dim_in (`int`): The number of channels in the input.
|
17 |
+
dim_out (`int`): The number of channels in the output.
|
18 |
+
"""
|
19 |
+
|
20 |
+
def __init__(self, dim_in: int, dim_out: int):
|
21 |
+
super().__init__()
|
22 |
+
self.proj = nn.Linear(dim_in, dim_out * 2)
|
23 |
+
|
24 |
+
def gelu(self, gate: torch.Tensor) -> torch.Tensor:
|
25 |
+
if gate.device.type != "mps":
|
26 |
+
return F.gelu(gate)
|
27 |
+
# mps: gelu is not implemented for float16
|
28 |
+
return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
|
29 |
+
|
30 |
+
def forward(self, hidden_states, scale: float = 1.0):
|
31 |
+
args = ()
|
32 |
+
hidden_states, gate = self.proj(hidden_states, *args).chunk(2, dim=-1)
|
33 |
+
return hidden_states * self.gelu(gate)
|
34 |
+
|
35 |
+
|
36 |
+
class CrossAttention(nn.Module):
|
37 |
+
def __init__(
|
38 |
+
self,
|
39 |
+
dim,
|
40 |
+
kv_dim=None,
|
41 |
+
num_heads=16,
|
42 |
+
qkv_bias=False,
|
43 |
+
attn_drop=0.0,
|
44 |
+
proj_drop=0.0,
|
45 |
+
):
|
46 |
+
super().__init__()
|
47 |
+
self.num_heads = num_heads
|
48 |
+
head_dim = dim // num_heads
|
49 |
+
self.scale = head_dim**-0.5
|
50 |
+
kv_dim = dim if not kv_dim else kv_dim
|
51 |
+
self.wq = nn.Linear(dim, dim, bias=qkv_bias)
|
52 |
+
self.wk = nn.Linear(kv_dim, dim, bias=qkv_bias)
|
53 |
+
self.wv = nn.Linear(kv_dim, dim, bias=qkv_bias)
|
54 |
+
self.attn_drop = attn_drop
|
55 |
+
self.proj = nn.Linear(dim, dim)
|
56 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
57 |
+
|
58 |
+
def forward(self, x_q, x_kv):
|
59 |
+
B, N_q, C = x_q.shape
|
60 |
+
B, N_kv, _ = x_kv.shape
|
61 |
+
# [B, N_q, C] -> [B, N_q, H, C/H]
|
62 |
+
q = self.wq(x_q).reshape(B, N_q, self.num_heads, C // self.num_heads)
|
63 |
+
# [B, N_kv, C] -> [B, N_kv, H, C/H]
|
64 |
+
k = self.wk(x_kv).reshape(B, N_kv, self.num_heads, C // self.num_heads)
|
65 |
+
v = self.wv(x_kv).reshape(B, N_kv, self.num_heads, C // self.num_heads)
|
66 |
+
|
67 |
+
# attention
|
68 |
+
x = torch.nn.functional.scaled_dot_product_attention(
|
69 |
+
q.permute(0, 2, 1, 3),
|
70 |
+
k.permute(0, 2, 1, 3),
|
71 |
+
v.permute(0, 2, 1, 3),
|
72 |
+
attn_mask=None,
|
73 |
+
dropout_p=self.attn_drop,
|
74 |
+
scale=self.scale,
|
75 |
+
).permute(0, 2, 1, 3)
|
76 |
+
|
77 |
+
# [B, N_q, H, C/H] -> [B, N_q, C]
|
78 |
+
x = x.reshape(B, N_q, C)
|
79 |
+
x = self.proj(x)
|
80 |
+
x = self.proj_drop(x)
|
81 |
+
return x
|
82 |
+
|
83 |
+
|
84 |
+
class FeedForward(nn.Module):
|
85 |
+
def __init__(
|
86 |
+
self,
|
87 |
+
dim: int,
|
88 |
+
dim_out: Optional[int] = None,
|
89 |
+
mult: int = 4,
|
90 |
+
dropout: float = 0.0,
|
91 |
+
):
|
92 |
+
super().__init__()
|
93 |
+
inner_dim = int(dim * mult)
|
94 |
+
dim_out = dim_out if dim_out is not None else dim
|
95 |
+
act_fn = GEGLU(dim, inner_dim)
|
96 |
+
self.net = nn.ModuleList([])
|
97 |
+
self.net.append(act_fn)
|
98 |
+
self.net.append(nn.Dropout(dropout))
|
99 |
+
self.net.append(nn.Linear(inner_dim, dim_out))
|
100 |
+
|
101 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
102 |
+
for module in self.net:
|
103 |
+
x = module(x)
|
104 |
+
return x
|
105 |
+
|
106 |
+
|
107 |
+
class BasicBlock(nn.Module):
|
108 |
+
def __init__(
|
109 |
+
self,
|
110 |
+
dim: int,
|
111 |
+
kv_dim: Optional[int] = None,
|
112 |
+
num_heads: int = 16,
|
113 |
+
qkv_bias: bool = False,
|
114 |
+
attn_drop: float = 0.0,
|
115 |
+
proj_drop: float = 0.0,
|
116 |
+
ff_drop: float = 0.0,
|
117 |
+
):
|
118 |
+
super().__init__()
|
119 |
+
self.norm1 = nn.LayerNorm(dim)
|
120 |
+
self.attn1 = CrossAttention(
|
121 |
+
dim,
|
122 |
+
kv_dim=dim,
|
123 |
+
num_heads=num_heads,
|
124 |
+
qkv_bias=qkv_bias,
|
125 |
+
attn_drop=attn_drop,
|
126 |
+
proj_drop=proj_drop,
|
127 |
+
)
|
128 |
+
self.norm2 = nn.LayerNorm(dim)
|
129 |
+
self.attn2 = CrossAttention(
|
130 |
+
dim,
|
131 |
+
kv_dim=kv_dim,
|
132 |
+
num_heads=num_heads,
|
133 |
+
qkv_bias=qkv_bias,
|
134 |
+
attn_drop=attn_drop,
|
135 |
+
proj_drop=proj_drop,
|
136 |
+
)
|
137 |
+
self.norm3 = nn.LayerNorm(dim)
|
138 |
+
self.ff = FeedForward(dim, dropout=ff_drop)
|
139 |
+
|
140 |
+
def forward(self, z, x):
|
141 |
+
z_norm = self.norm1(z)
|
142 |
+
z = z + self.attn1(z_norm, z_norm)
|
143 |
+
# TODO: do we need to have the second attention when x is None?
|
144 |
+
z_norm = self.norm2(z)
|
145 |
+
z = z + self.attn2(z_norm, x if x is not None else z_norm)
|
146 |
+
z_norm = self.norm3(z)
|
147 |
+
z = z + self.ff(z_norm)
|
148 |
+
return z
|
149 |
+
|
150 |
+
|
151 |
+
class SingleStreamTransformer(BaseModule):
|
152 |
+
@dataclass
|
153 |
+
class Config(BaseModule.Config):
|
154 |
+
num_attention_heads: int = 16
|
155 |
+
attention_head_dim: int = 88
|
156 |
+
in_channels: Optional[int] = None
|
157 |
+
out_channels: Optional[int] = None
|
158 |
+
num_layers: int = 16
|
159 |
+
dropout: float = 0.0
|
160 |
+
norm_num_groups: int = 32
|
161 |
+
cross_attention_dim: Optional[int] = None
|
162 |
+
attention_bias: bool = False
|
163 |
+
|
164 |
+
cfg: Config
|
165 |
+
|
166 |
+
def configure(self) -> None:
|
167 |
+
self.num_attention_heads = self.cfg.num_attention_heads
|
168 |
+
self.attention_head_dim = self.cfg.attention_head_dim
|
169 |
+
inner_dim = self.num_attention_heads * self.attention_head_dim
|
170 |
+
|
171 |
+
# Define input layers
|
172 |
+
self.norm = torch.nn.GroupNorm(
|
173 |
+
num_groups=self.cfg.norm_num_groups,
|
174 |
+
num_channels=self.cfg.in_channels,
|
175 |
+
eps=1e-6,
|
176 |
+
affine=True,
|
177 |
+
)
|
178 |
+
self.proj_in = nn.Linear(self.cfg.in_channels, inner_dim)
|
179 |
+
|
180 |
+
# Define transformers blocks
|
181 |
+
self.transformer_blocks = nn.ModuleList(
|
182 |
+
[
|
183 |
+
BasicBlock(
|
184 |
+
inner_dim,
|
185 |
+
kv_dim=self.cfg.cross_attention_dim,
|
186 |
+
num_heads=self.num_attention_heads,
|
187 |
+
qkv_bias=self.cfg.attention_bias,
|
188 |
+
proj_drop=self.cfg.dropout,
|
189 |
+
ff_drop=self.cfg.dropout,
|
190 |
+
)
|
191 |
+
for d in range(self.cfg.num_layers)
|
192 |
+
]
|
193 |
+
)
|
194 |
+
|
195 |
+
# 4. Define output layers
|
196 |
+
self.proj_out = nn.Linear(inner_dim, self.cfg.in_channels)
|
197 |
+
|
198 |
+
def forward(self, hidden_states, encoder_hidden_states=None, **kwargs):
|
199 |
+
residual = hidden_states
|
200 |
+
hidden_states = self.norm(hidden_states)
|
201 |
+
hidden_states = hidden_states.permute(0, 2, 1)
|
202 |
+
hidden_states = self.proj_in(hidden_states)
|
203 |
+
for block in self.transformer_blocks:
|
204 |
+
hidden_states = block(hidden_states, encoder_hidden_states)
|
205 |
+
hidden_states = self.proj_out(hidden_states).permute(0, 2, 1).contiguous()
|
206 |
+
# TODO: do we really need to add the residual?
|
207 |
+
hidden_states = hidden_states + residual
|
208 |
+
return hidden_states
|
209 |
+
|
210 |
+
|
211 |
+
class FuseBlock(nn.Module):
|
212 |
+
"""
|
213 |
+
Fuse X in to Z with cross attention
|
214 |
+
"""
|
215 |
+
|
216 |
+
def __init__(
|
217 |
+
self,
|
218 |
+
dim_z: int,
|
219 |
+
dim_x: int,
|
220 |
+
num_heads: int = 16,
|
221 |
+
qkv_bias: bool = False,
|
222 |
+
attn_drop: float = 0.0,
|
223 |
+
proj_drop: float = 0.0,
|
224 |
+
ff_drop: float = 0.0,
|
225 |
+
norm_x_input: bool = True,
|
226 |
+
):
|
227 |
+
super().__init__()
|
228 |
+
self.norm_x_input = norm_x_input
|
229 |
+
if self.norm_x_input:
|
230 |
+
self.norm_x = nn.LayerNorm(dim_x)
|
231 |
+
self.attn = CrossAttention(
|
232 |
+
dim_z,
|
233 |
+
kv_dim=dim_x,
|
234 |
+
num_heads=num_heads,
|
235 |
+
qkv_bias=qkv_bias,
|
236 |
+
attn_drop=attn_drop,
|
237 |
+
proj_drop=proj_drop,
|
238 |
+
)
|
239 |
+
self.norm_z1 = nn.LayerNorm(dim_z)
|
240 |
+
self.norm_z2 = nn.LayerNorm(dim_z)
|
241 |
+
self.ff = FeedForward(dim_z, dropout=ff_drop)
|
242 |
+
|
243 |
+
def forward(self, z, x):
|
244 |
+
# TODO: do we need to normalize x?
|
245 |
+
z = z + self.attn(self.norm_z1(z), self.norm_x(x) if self.norm_x_input else x)
|
246 |
+
z = z + self.ff(self.norm_z2(z))
|
247 |
+
return z
|
248 |
+
|
249 |
+
|
250 |
+
@torch.no_grad()
|
251 |
+
def get_triplane_attention_mask(res):
|
252 |
+
N = 3 * res * res
|
253 |
+
attn_mask = torch.zeros(3, res, res, 3, res, res)
|
254 |
+
|
255 |
+
i, j = torch.meshgrid(torch.arange(res), torch.arange(res))
|
256 |
+
|
257 |
+
attn_mask[0, i, j, 1, i, :] = 1.0
|
258 |
+
attn_mask[0, i, j, 2, j, :] = 1.0
|
259 |
+
attn_mask[1, i, j, 0, i, :] = 1.0
|
260 |
+
attn_mask[1, i, j, 2, :, j] = 1.0
|
261 |
+
attn_mask[2, i, j, 0, :, i] = 1.0
|
262 |
+
attn_mask[2, i, j, 1, :, j] = 1.0
|
263 |
+
attn_mask = attn_mask.bool()
|
264 |
+
|
265 |
+
attn_bias = torch.empty_like(attn_mask, dtype=torch.float)
|
266 |
+
attn_bias.masked_fill_(attn_mask, 0.0)
|
267 |
+
attn_bias.masked_fill_(~attn_mask, float("-inf"))
|
268 |
+
|
269 |
+
return attn_bias.reshape(N, N)
|
270 |
+
|
271 |
+
|
272 |
+
class TriplaneAttention(nn.Module):
|
273 |
+
def __init__(
|
274 |
+
self,
|
275 |
+
dim: int,
|
276 |
+
resolution: int,
|
277 |
+
num_heads: int = 16,
|
278 |
+
qkv_bias: bool = False,
|
279 |
+
attn_drop: float = 0.0,
|
280 |
+
proj_drop: float = 0.0,
|
281 |
+
full_attention: bool = False,
|
282 |
+
):
|
283 |
+
super().__init__()
|
284 |
+
self.num_heads = num_heads
|
285 |
+
head_dim = dim // num_heads
|
286 |
+
self.scale = head_dim**-0.5
|
287 |
+
self.wq = nn.Linear(dim, dim, bias=qkv_bias)
|
288 |
+
self.wk = nn.Linear(dim, dim, bias=qkv_bias)
|
289 |
+
self.wv = nn.Linear(dim, dim, bias=qkv_bias)
|
290 |
+
self.attn_drop = attn_drop
|
291 |
+
self.proj = nn.Linear(dim, dim)
|
292 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
293 |
+
|
294 |
+
self.resolution = resolution
|
295 |
+
self.full_attention = full_attention
|
296 |
+
self.attn_mask = (
|
297 |
+
get_triplane_attention_mask(resolution) if not full_attention else None
|
298 |
+
)
|
299 |
+
|
300 |
+
def forward(self, x):
|
301 |
+
B, N, C = x.shape
|
302 |
+
# [B, N, C] -> [B, N, H, C/H]
|
303 |
+
q = self.wq(x).reshape(B, N, self.num_heads, C // self.num_heads)
|
304 |
+
k = self.wk(x).reshape(B, N, self.num_heads, C // self.num_heads)
|
305 |
+
v = self.wv(x).reshape(B, N, self.num_heads, C // self.num_heads)
|
306 |
+
|
307 |
+
# detokenize the planes
|
308 |
+
assert N == self.resolution**2 * 3
|
309 |
+
attn_bias = (
|
310 |
+
self.attn_mask.to(q)
|
311 |
+
.unsqueeze(0)
|
312 |
+
.unsqueeze(0)
|
313 |
+
.expand(B, self.num_heads, -1, -1)
|
314 |
+
if not self.full_attention
|
315 |
+
else None
|
316 |
+
)
|
317 |
+
|
318 |
+
# full attention
|
319 |
+
x = torch.nn.functional.scaled_dot_product_attention(
|
320 |
+
q.permute(0, 2, 1, 3),
|
321 |
+
k.permute(0, 2, 1, 3),
|
322 |
+
v.permute(0, 2, 1, 3),
|
323 |
+
attn_mask=attn_bias,
|
324 |
+
dropout_p=self.attn_drop,
|
325 |
+
scale=self.scale,
|
326 |
+
).permute(0, 2, 1, 3)
|
327 |
+
|
328 |
+
# [B, N_q, H, C/H] -> [B, N_q, C]
|
329 |
+
x = x.reshape(B, N, C)
|
330 |
+
x = self.proj(x)
|
331 |
+
x = self.proj_drop(x)
|
332 |
+
return x
|
333 |
+
|
334 |
+
|
335 |
+
class TwoStreamBlock(nn.Module):
|
336 |
+
def __init__(
|
337 |
+
self,
|
338 |
+
dim_latent: int,
|
339 |
+
dim_input: int,
|
340 |
+
num_basic_blocks: int = 4,
|
341 |
+
num_heads: int = 16,
|
342 |
+
qkv_bias: bool = False,
|
343 |
+
attn_drop: float = 0.0,
|
344 |
+
proj_drop: float = 0.0,
|
345 |
+
ff_drop: float = 0.0,
|
346 |
+
norm_x_input: bool = True,
|
347 |
+
dim_cross: Optional[int] = None,
|
348 |
+
):
|
349 |
+
super().__init__()
|
350 |
+
|
351 |
+
# Define the fuse block that fuse the input into the latent
|
352 |
+
self.fuse_block_in = FuseBlock(
|
353 |
+
dim_latent,
|
354 |
+
dim_input,
|
355 |
+
num_heads=num_heads,
|
356 |
+
qkv_bias=qkv_bias,
|
357 |
+
attn_drop=attn_drop,
|
358 |
+
proj_drop=proj_drop,
|
359 |
+
ff_drop=ff_drop,
|
360 |
+
norm_x_input=norm_x_input,
|
361 |
+
)
|
362 |
+
|
363 |
+
# Define the transformer block that process the latent
|
364 |
+
self.transformer_block = nn.ModuleList(
|
365 |
+
[
|
366 |
+
BasicBlock(
|
367 |
+
dim_latent,
|
368 |
+
kv_dim=dim_cross,
|
369 |
+
num_heads=num_heads,
|
370 |
+
qkv_bias=qkv_bias,
|
371 |
+
proj_drop=proj_drop,
|
372 |
+
ff_drop=ff_drop,
|
373 |
+
)
|
374 |
+
for _ in range(num_basic_blocks)
|
375 |
+
]
|
376 |
+
)
|
377 |
+
|
378 |
+
# Define the fuse block that fuse the latent into the input
|
379 |
+
self.fuse_block_out = FuseBlock(
|
380 |
+
dim_input,
|
381 |
+
dim_latent,
|
382 |
+
num_heads=num_heads,
|
383 |
+
qkv_bias=qkv_bias,
|
384 |
+
attn_drop=attn_drop,
|
385 |
+
proj_drop=proj_drop,
|
386 |
+
ff_drop=ff_drop,
|
387 |
+
norm_x_input=norm_x_input,
|
388 |
+
)
|
389 |
+
|
390 |
+
def forward(self, latent, input, cross_input):
|
391 |
+
latent = self.fuse_block_in(latent, input)
|
392 |
+
for block in self.transformer_block:
|
393 |
+
latent = block(latent, cross_input)
|
394 |
+
input = self.fuse_block_out(input, latent)
|
395 |
+
return latent, input
|
396 |
+
|
397 |
+
|
398 |
+
class TwoStreamInterleaveTransformer(BaseModule):
|
399 |
+
@dataclass
|
400 |
+
class Config(BaseModule.Config):
|
401 |
+
num_attention_heads: int = 16
|
402 |
+
attention_head_dim: int = 64
|
403 |
+
raw_triplane_channels: int = 1024
|
404 |
+
triplane_channels: int = 1024
|
405 |
+
raw_image_channels: int = 1024
|
406 |
+
num_latents: int = 1792
|
407 |
+
num_blocks: int = 4
|
408 |
+
num_basic_blocks: int = 3
|
409 |
+
dropout: float = 0.0
|
410 |
+
latent_init_std: float = 0.02
|
411 |
+
norm_num_groups: int = 32
|
412 |
+
attention_bias: bool = False
|
413 |
+
norm_x_input: bool = False
|
414 |
+
cross_attention_dim: int = 1024
|
415 |
+
mix_latent: bool = True
|
416 |
+
|
417 |
+
cfg: Config
|
418 |
+
|
419 |
+
def configure(self) -> None:
|
420 |
+
self.mix_latent = self.cfg.mix_latent
|
421 |
+
|
422 |
+
# Define the dimensions
|
423 |
+
self.num_attention_heads = self.cfg.num_attention_heads
|
424 |
+
self.attention_head_dim = self.cfg.attention_head_dim
|
425 |
+
self.num_latents = self.cfg.num_latents
|
426 |
+
self.latent_dim = self.num_attention_heads * self.attention_head_dim
|
427 |
+
|
428 |
+
# Define input layers
|
429 |
+
if self.cfg.norm_num_groups > 0:
|
430 |
+
self.norm_triplane = torch.nn.GroupNorm(
|
431 |
+
num_groups=self.cfg.norm_num_groups,
|
432 |
+
num_channels=self.cfg.raw_triplane_channels,
|
433 |
+
eps=1e-6,
|
434 |
+
affine=True,
|
435 |
+
)
|
436 |
+
else:
|
437 |
+
self.norm_triplane = nn.LayerNorm(self.cfg.raw_triplane_channels)
|
438 |
+
self.proj_triplane = nn.Linear(
|
439 |
+
self.cfg.raw_triplane_channels, self.cfg.triplane_channels
|
440 |
+
)
|
441 |
+
if self.mix_latent:
|
442 |
+
self.norm_image = nn.LayerNorm(self.cfg.raw_image_channels)
|
443 |
+
self.proj_image = nn.Linear(self.cfg.raw_image_channels, self.latent_dim)
|
444 |
+
self.norm_latent = nn.LayerNorm(self.latent_dim)
|
445 |
+
self.proj_latent = nn.Linear(self.latent_dim, self.latent_dim)
|
446 |
+
|
447 |
+
# Define the latents
|
448 |
+
self.latent_init = nn.Parameter(
|
449 |
+
torch.zeros(1, self.num_latents, self.latent_dim)
|
450 |
+
)
|
451 |
+
nn.init.normal_(self.latent_init, std=self.cfg.latent_init_std)
|
452 |
+
|
453 |
+
# Define the transformer blocks
|
454 |
+
self.main_blocks = nn.ModuleList(
|
455 |
+
[
|
456 |
+
TwoStreamBlock(
|
457 |
+
self.latent_dim,
|
458 |
+
self.cfg.triplane_channels,
|
459 |
+
num_basic_blocks=self.cfg.num_basic_blocks,
|
460 |
+
num_heads=self.num_attention_heads,
|
461 |
+
qkv_bias=self.cfg.attention_bias,
|
462 |
+
proj_drop=self.cfg.dropout,
|
463 |
+
ff_drop=self.cfg.dropout,
|
464 |
+
norm_x_input=self.cfg.norm_x_input,
|
465 |
+
dim_cross=self.cfg.cross_attention_dim,
|
466 |
+
)
|
467 |
+
for _ in range(self.cfg.num_blocks)
|
468 |
+
]
|
469 |
+
)
|
470 |
+
|
471 |
+
# 4. Define output layers
|
472 |
+
self.proj_out = nn.Linear(
|
473 |
+
self.cfg.triplane_channels, self.cfg.raw_triplane_channels
|
474 |
+
)
|
475 |
+
|
476 |
+
def forward(self, hidden_states, encoder_hidden_states, **kwargs):
|
477 |
+
# hidden_states: [B, triplane_dim, N_triplane] is triplane tokens
|
478 |
+
# encoder_hidden_states: [B, N_image, image_dim] is the image tokens
|
479 |
+
if isinstance(self.norm_triplane, nn.GroupNorm):
|
480 |
+
triplane_tokens = self.norm_triplane(hidden_states)
|
481 |
+
triplane_tokens = triplane_tokens.permute(
|
482 |
+
0, 2, 1
|
483 |
+
) # [B, N_triplane, triplane_dim]
|
484 |
+
elif isinstance(self.norm_triplane, nn.LayerNorm):
|
485 |
+
triplane_tokens = self.norm_triplane(hidden_states.permute(0, 2, 1))
|
486 |
+
else:
|
487 |
+
raise ValueError("Unknown normalization layer")
|
488 |
+
triplane_tokens = self.proj_triplane(triplane_tokens)
|
489 |
+
if self.mix_latent:
|
490 |
+
image_tokens = self.norm_image(
|
491 |
+
encoder_hidden_states
|
492 |
+
) # [B, N_image, image_dim]
|
493 |
+
image_tokens = self.proj_image(image_tokens)
|
494 |
+
init_latents = self.latent_init.expand(
|
495 |
+
hidden_states.shape[0], -1, -1
|
496 |
+
) # [B, N_latent_init, latent_dim]
|
497 |
+
init_latents = self.norm_latent(init_latents)
|
498 |
+
init_latents = self.proj_latent(init_latents)
|
499 |
+
if self.mix_latent:
|
500 |
+
latent_tokens = torch.cat(
|
501 |
+
[image_tokens, init_latents], dim=1
|
502 |
+
) # [B, N_latent, latent_dim]
|
503 |
+
else:
|
504 |
+
latent_tokens = init_latents
|
505 |
+
|
506 |
+
# forward the main blocks
|
507 |
+
for block in self.main_blocks:
|
508 |
+
latent_tokens, triplane_tokens = block(
|
509 |
+
latent_tokens, triplane_tokens, encoder_hidden_states
|
510 |
+
)
|
511 |
+
|
512 |
+
# project the triplane tokens back to the original dimension
|
513 |
+
triplane_tokens = self.proj_out(triplane_tokens).permute(0, 2, 1).contiguous()
|
514 |
+
triplane_tokens = triplane_tokens + hidden_states
|
515 |
+
return triplane_tokens
|
sf3d/models/utils.py
ADDED
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import dataclasses
|
2 |
+
import importlib
|
3 |
+
import math
|
4 |
+
from dataclasses import dataclass
|
5 |
+
from typing import Any, List, Optional, Tuple, Union
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import PIL
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from jaxtyping import Bool, Float, Int, Num
|
13 |
+
from omegaconf import DictConfig, OmegaConf
|
14 |
+
from torch import Tensor
|
15 |
+
|
16 |
+
|
17 |
+
class BaseModule(nn.Module):
|
18 |
+
@dataclass
|
19 |
+
class Config:
|
20 |
+
pass
|
21 |
+
|
22 |
+
cfg: Config # add this to every subclass of BaseModule to enable static type checking
|
23 |
+
|
24 |
+
def __init__(
|
25 |
+
self, cfg: Optional[Union[dict, DictConfig]] = None, *args, **kwargs
|
26 |
+
) -> None:
|
27 |
+
super().__init__()
|
28 |
+
self.cfg = parse_structured(self.Config, cfg)
|
29 |
+
self.configure(*args, **kwargs)
|
30 |
+
|
31 |
+
def configure(self, *args, **kwargs) -> None:
|
32 |
+
raise NotImplementedError
|
33 |
+
|
34 |
+
|
35 |
+
def find_class(cls_string):
|
36 |
+
module_string = ".".join(cls_string.split(".")[:-1])
|
37 |
+
cls_name = cls_string.split(".")[-1]
|
38 |
+
module = importlib.import_module(module_string, package=None)
|
39 |
+
cls = getattr(module, cls_name)
|
40 |
+
return cls
|
41 |
+
|
42 |
+
|
43 |
+
def parse_structured(fields: Any, cfg: Optional[Union[dict, DictConfig]] = None) -> Any:
|
44 |
+
# Check if cfg.keys are in fields
|
45 |
+
cfg_ = cfg.copy()
|
46 |
+
keys = list(cfg_.keys())
|
47 |
+
|
48 |
+
field_names = {f.name for f in dataclasses.fields(fields)}
|
49 |
+
for key in keys:
|
50 |
+
# This is helpful when swapping out modules from CLI
|
51 |
+
if key not in field_names:
|
52 |
+
print(f"Ignoring {key} as it's not supported by {fields}")
|
53 |
+
cfg_.pop(key)
|
54 |
+
scfg = OmegaConf.merge(OmegaConf.structured(fields), cfg_)
|
55 |
+
return scfg
|
56 |
+
|
57 |
+
|
58 |
+
EPS_DTYPE = {
|
59 |
+
torch.float16: 1e-4,
|
60 |
+
torch.bfloat16: 1e-4,
|
61 |
+
torch.float32: 1e-7,
|
62 |
+
torch.float64: 1e-8,
|
63 |
+
}
|
64 |
+
|
65 |
+
|
66 |
+
def dot(x, y, dim=-1):
|
67 |
+
return torch.sum(x * y, dim, keepdim=True)
|
68 |
+
|
69 |
+
|
70 |
+
def reflect(x, n):
|
71 |
+
return x - 2 * dot(x, n) * n
|
72 |
+
|
73 |
+
|
74 |
+
def normalize(x, dim=-1, eps=None):
|
75 |
+
if eps is None:
|
76 |
+
eps = EPS_DTYPE[x.dtype]
|
77 |
+
return F.normalize(x, dim=dim, p=2, eps=eps)
|
78 |
+
|
79 |
+
|
80 |
+
def tri_winding(tri: Float[Tensor, "*B 3 2"]) -> Float[Tensor, "*B 3 3"]:
|
81 |
+
# One pad for determinant
|
82 |
+
tri_sq = F.pad(tri, (0, 1), "constant", 1.0)
|
83 |
+
det_tri = torch.det(tri_sq)
|
84 |
+
tri_rev = torch.cat(
|
85 |
+
(tri_sq[..., 0:1, :], tri_sq[..., 2:3, :], tri_sq[..., 1:2, :]), -2
|
86 |
+
)
|
87 |
+
tri_sq[det_tri < 0] = tri_rev[det_tri < 0]
|
88 |
+
return tri_sq
|
89 |
+
|
90 |
+
|
91 |
+
def triangle_intersection_2d(
|
92 |
+
t1: Float[Tensor, "*B 3 2"],
|
93 |
+
t2: Float[Tensor, "*B 3 2"],
|
94 |
+
eps=1e-12,
|
95 |
+
) -> Float[Tensor, "*B"]: # noqa: F821
|
96 |
+
"""Returns True if triangles collide, False otherwise"""
|
97 |
+
|
98 |
+
def chk_edge(x: Float[Tensor, "*B 3 3"]) -> Bool[Tensor, "*B"]: # noqa: F821
|
99 |
+
logdetx = torch.logdet(x.double())
|
100 |
+
if eps is None:
|
101 |
+
return ~torch.isfinite(logdetx)
|
102 |
+
return ~(torch.isfinite(logdetx) & (logdetx > math.log(eps)))
|
103 |
+
|
104 |
+
t1s = tri_winding(t1)
|
105 |
+
t2s = tri_winding(t2)
|
106 |
+
|
107 |
+
# Assume the triangles do not collide in the begging
|
108 |
+
ret = torch.zeros(t1.shape[0], dtype=torch.bool, device=t1.device)
|
109 |
+
for i in range(3):
|
110 |
+
edge = torch.roll(t1s, i, dims=1)[:, :2, :]
|
111 |
+
# Check if all points of triangle 2 lay on the external side of edge E.
|
112 |
+
# If this is the case the triangle do not collide
|
113 |
+
upd = (
|
114 |
+
chk_edge(torch.cat((edge, t2s[:, 0:1]), 1))
|
115 |
+
& chk_edge(torch.cat((edge, t2s[:, 1:2]), 1))
|
116 |
+
& chk_edge(torch.cat((edge, t2s[:, 2:3]), 1))
|
117 |
+
)
|
118 |
+
# Here no collision is still True due to inversion
|
119 |
+
ret = ret | upd
|
120 |
+
|
121 |
+
for i in range(3):
|
122 |
+
edge = torch.roll(t2s, i, dims=1)[:, :2, :]
|
123 |
+
|
124 |
+
upd = (
|
125 |
+
chk_edge(torch.cat((edge, t1s[:, 0:1]), 1))
|
126 |
+
& chk_edge(torch.cat((edge, t1s[:, 1:2]), 1))
|
127 |
+
& chk_edge(torch.cat((edge, t1s[:, 2:3]), 1))
|
128 |
+
)
|
129 |
+
# Here no collision is still True due to inversion
|
130 |
+
ret = ret | upd
|
131 |
+
|
132 |
+
return ~ret # Do the inversion
|
133 |
+
|
134 |
+
|
135 |
+
ValidScale = Union[Tuple[float, float], Num[Tensor, "2 D"]]
|
136 |
+
|
137 |
+
|
138 |
+
def scale_tensor(
|
139 |
+
dat: Num[Tensor, "... D"], inp_scale: ValidScale, tgt_scale: ValidScale
|
140 |
+
):
|
141 |
+
if inp_scale is None:
|
142 |
+
inp_scale = (0, 1)
|
143 |
+
if tgt_scale is None:
|
144 |
+
tgt_scale = (0, 1)
|
145 |
+
if isinstance(tgt_scale, Tensor):
|
146 |
+
assert dat.shape[-1] == tgt_scale.shape[-1]
|
147 |
+
dat = (dat - inp_scale[0]) / (inp_scale[1] - inp_scale[0])
|
148 |
+
dat = dat * (tgt_scale[1] - tgt_scale[0]) + tgt_scale[0]
|
149 |
+
return dat
|
150 |
+
|
151 |
+
|
152 |
+
def dilate_fill(img, mask, iterations=10):
|
153 |
+
oldMask = mask.float()
|
154 |
+
oldImg = img
|
155 |
+
|
156 |
+
mask_kernel = torch.ones(
|
157 |
+
(1, 1, 3, 3),
|
158 |
+
dtype=oldMask.dtype,
|
159 |
+
device=oldMask.device,
|
160 |
+
)
|
161 |
+
|
162 |
+
for i in range(iterations):
|
163 |
+
newMask = torch.nn.functional.max_pool2d(oldMask, 3, 1, 1)
|
164 |
+
|
165 |
+
# Fill the extension with mean color of old valid regions
|
166 |
+
img_unfold = F.unfold(oldImg, (3, 3)).view(1, 3, 3 * 3, -1)
|
167 |
+
mask_unfold = F.unfold(oldMask, (3, 3)).view(1, 1, 3 * 3, -1)
|
168 |
+
new_mask_unfold = F.unfold(newMask, (3, 3)).view(1, 1, 3 * 3, -1)
|
169 |
+
|
170 |
+
# Average color of the valid region
|
171 |
+
mean_color = (img_unfold.sum(dim=2) / mask_unfold.sum(dim=2).clip(1)).unsqueeze(
|
172 |
+
2
|
173 |
+
)
|
174 |
+
# Extend it to the new region
|
175 |
+
fill_color = (mean_color * new_mask_unfold).view(1, 3 * 3 * 3, -1)
|
176 |
+
|
177 |
+
mask_conv = F.conv2d(
|
178 |
+
newMask, mask_kernel, padding=1
|
179 |
+
) # Get the sum for each kernel patch
|
180 |
+
newImg = F.fold(
|
181 |
+
fill_color, (img.shape[-2], img.shape[-1]), (3, 3)
|
182 |
+
) / mask_conv.clamp(1)
|
183 |
+
|
184 |
+
diffMask = newMask - oldMask
|
185 |
+
|
186 |
+
oldMask = newMask
|
187 |
+
oldImg = torch.lerp(oldImg, newImg, diffMask)
|
188 |
+
|
189 |
+
return oldImg
|
190 |
+
|
191 |
+
|
192 |
+
def float32_to_uint8_np(
|
193 |
+
x: Float[np.ndarray, "*B H W C"],
|
194 |
+
dither: bool = True,
|
195 |
+
dither_mask: Optional[Float[np.ndarray, "*B H W C"]] = None,
|
196 |
+
dither_strength: float = 1.0,
|
197 |
+
) -> Int[np.ndarray, "*B H W C"]:
|
198 |
+
if dither:
|
199 |
+
dither = (
|
200 |
+
dither_strength * np.random.rand(*x[..., :1].shape).astype(np.float32) - 0.5
|
201 |
+
)
|
202 |
+
if dither_mask is not None:
|
203 |
+
dither = dither * dither_mask
|
204 |
+
return np.clip(np.floor((256.0 * x + dither)), 0, 255).astype(np.uint8)
|
205 |
+
return np.clip(np.floor((256.0 * x)), 0, 255).astype(torch.uint8)
|
206 |
+
|
207 |
+
|
208 |
+
def convert_data(data):
|
209 |
+
if data is None:
|
210 |
+
return None
|
211 |
+
elif isinstance(data, np.ndarray):
|
212 |
+
return data
|
213 |
+
elif isinstance(data, torch.Tensor):
|
214 |
+
if data.dtype in [torch.float16, torch.bfloat16]:
|
215 |
+
data = data.float()
|
216 |
+
return data.detach().cpu().numpy()
|
217 |
+
elif isinstance(data, list):
|
218 |
+
return [convert_data(d) for d in data]
|
219 |
+
elif isinstance(data, dict):
|
220 |
+
return {k: convert_data(v) for k, v in data.items()}
|
221 |
+
else:
|
222 |
+
raise TypeError(
|
223 |
+
"Data must be in type numpy.ndarray, torch.Tensor, list or dict, getting",
|
224 |
+
type(data),
|
225 |
+
)
|
226 |
+
|
227 |
+
|
228 |
+
class ImageProcessor:
|
229 |
+
def convert_and_resize(
|
230 |
+
self,
|
231 |
+
image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
|
232 |
+
size: int,
|
233 |
+
):
|
234 |
+
if isinstance(image, PIL.Image.Image):
|
235 |
+
image = torch.from_numpy(np.array(image).astype(np.float32) / 255.0)
|
236 |
+
elif isinstance(image, np.ndarray):
|
237 |
+
if image.dtype == np.uint8:
|
238 |
+
image = torch.from_numpy(image.astype(np.float32) / 255.0)
|
239 |
+
else:
|
240 |
+
image = torch.from_numpy(image)
|
241 |
+
elif isinstance(image, torch.Tensor):
|
242 |
+
pass
|
243 |
+
|
244 |
+
batched = image.ndim == 4
|
245 |
+
|
246 |
+
if not batched:
|
247 |
+
image = image[None, ...]
|
248 |
+
image = F.interpolate(
|
249 |
+
image.permute(0, 3, 1, 2),
|
250 |
+
(size, size),
|
251 |
+
mode="bilinear",
|
252 |
+
align_corners=False,
|
253 |
+
antialias=True,
|
254 |
+
).permute(0, 2, 3, 1)
|
255 |
+
if not batched:
|
256 |
+
image = image[0]
|
257 |
+
return image
|
258 |
+
|
259 |
+
def __call__(
|
260 |
+
self,
|
261 |
+
image: Union[
|
262 |
+
PIL.Image.Image,
|
263 |
+
np.ndarray,
|
264 |
+
torch.FloatTensor,
|
265 |
+
List[PIL.Image.Image],
|
266 |
+
List[np.ndarray],
|
267 |
+
List[torch.FloatTensor],
|
268 |
+
],
|
269 |
+
size: int,
|
270 |
+
) -> Any:
|
271 |
+
if isinstance(image, (np.ndarray, torch.FloatTensor)) and image.ndim == 4:
|
272 |
+
image = self.convert_and_resize(image, size)
|
273 |
+
else:
|
274 |
+
if not isinstance(image, list):
|
275 |
+
image = [image]
|
276 |
+
image = [self.convert_and_resize(im, size) for im in image]
|
277 |
+
image = torch.stack(image, dim=0)
|
278 |
+
return image
|
279 |
+
|
280 |
+
|
281 |
+
def get_intrinsic_from_fov(fov, H, W, bs=-1):
|
282 |
+
focal_length = 0.5 * H / np.tan(0.5 * fov)
|
283 |
+
intrinsic = np.identity(3, dtype=np.float32)
|
284 |
+
intrinsic[0, 0] = focal_length
|
285 |
+
intrinsic[1, 1] = focal_length
|
286 |
+
intrinsic[0, 2] = W / 2.0
|
287 |
+
intrinsic[1, 2] = H / 2.0
|
288 |
+
|
289 |
+
if bs > 0:
|
290 |
+
intrinsic = intrinsic[None].repeat(bs, axis=0)
|
291 |
+
|
292 |
+
return torch.from_numpy(intrinsic)
|
sf3d/system.py
ADDED
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dataclasses import dataclass, field
|
3 |
+
from typing import Any, List, Optional, Tuple
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
import torch.nn.functional as F
|
8 |
+
import trimesh
|
9 |
+
from einops import rearrange
|
10 |
+
from huggingface_hub import hf_hub_download
|
11 |
+
from jaxtyping import Float
|
12 |
+
from omegaconf import OmegaConf
|
13 |
+
from PIL import Image
|
14 |
+
from safetensors.torch import load_model
|
15 |
+
from torch import Tensor
|
16 |
+
|
17 |
+
from sf3d.models.isosurface import MarchingTetrahedraHelper
|
18 |
+
from sf3d.models.mesh import Mesh
|
19 |
+
from sf3d.models.utils import (
|
20 |
+
BaseModule,
|
21 |
+
ImageProcessor,
|
22 |
+
convert_data,
|
23 |
+
dilate_fill,
|
24 |
+
dot,
|
25 |
+
find_class,
|
26 |
+
float32_to_uint8_np,
|
27 |
+
normalize,
|
28 |
+
scale_tensor,
|
29 |
+
)
|
30 |
+
from sf3d.utils import create_intrinsic_from_fov_deg, default_cond_c2w
|
31 |
+
|
32 |
+
from .texture_baker import TextureBaker
|
33 |
+
|
34 |
+
|
35 |
+
class SF3D(BaseModule):
|
36 |
+
@dataclass
|
37 |
+
class Config(BaseModule.Config):
|
38 |
+
cond_image_size: int
|
39 |
+
isosurface_resolution: int
|
40 |
+
isosurface_threshold: float = 10.0
|
41 |
+
radius: float = 1.0
|
42 |
+
background_color: list[float] = field(default_factory=lambda: [0.5, 0.5, 0.5])
|
43 |
+
default_fovy_deg: float = 40.0
|
44 |
+
default_distance: float = 1.6
|
45 |
+
|
46 |
+
camera_embedder_cls: str = ""
|
47 |
+
camera_embedder: dict = field(default_factory=dict)
|
48 |
+
|
49 |
+
image_tokenizer_cls: str = ""
|
50 |
+
image_tokenizer: dict = field(default_factory=dict)
|
51 |
+
|
52 |
+
tokenizer_cls: str = ""
|
53 |
+
tokenizer: dict = field(default_factory=dict)
|
54 |
+
|
55 |
+
backbone_cls: str = ""
|
56 |
+
backbone: dict = field(default_factory=dict)
|
57 |
+
|
58 |
+
post_processor_cls: str = ""
|
59 |
+
post_processor: dict = field(default_factory=dict)
|
60 |
+
|
61 |
+
decoder_cls: str = ""
|
62 |
+
decoder: dict = field(default_factory=dict)
|
63 |
+
|
64 |
+
image_estimator_cls: str = ""
|
65 |
+
image_estimator: dict = field(default_factory=dict)
|
66 |
+
|
67 |
+
global_estimator_cls: str = ""
|
68 |
+
global_estimator: dict = field(default_factory=dict)
|
69 |
+
|
70 |
+
cfg: Config
|
71 |
+
|
72 |
+
@classmethod
|
73 |
+
def from_pretrained(
|
74 |
+
cls, pretrained_model_name_or_path: str, config_name: str, weight_name: str
|
75 |
+
):
|
76 |
+
if os.path.isdir(pretrained_model_name_or_path):
|
77 |
+
config_path = os.path.join(pretrained_model_name_or_path, config_name)
|
78 |
+
weight_path = os.path.join(pretrained_model_name_or_path, weight_name)
|
79 |
+
else:
|
80 |
+
config_path = hf_hub_download(
|
81 |
+
repo_id=pretrained_model_name_or_path, filename=config_name
|
82 |
+
)
|
83 |
+
weight_path = hf_hub_download(
|
84 |
+
repo_id=pretrained_model_name_or_path, filename=weight_name
|
85 |
+
)
|
86 |
+
|
87 |
+
cfg = OmegaConf.load(config_path)
|
88 |
+
OmegaConf.resolve(cfg)
|
89 |
+
model = cls(cfg)
|
90 |
+
load_model(model, weight_path)
|
91 |
+
return model
|
92 |
+
|
93 |
+
@property
|
94 |
+
def device(self):
|
95 |
+
return next(self.parameters()).device
|
96 |
+
|
97 |
+
def configure(self):
|
98 |
+
self.image_tokenizer = find_class(self.cfg.image_tokenizer_cls)(
|
99 |
+
self.cfg.image_tokenizer
|
100 |
+
)
|
101 |
+
self.tokenizer = find_class(self.cfg.tokenizer_cls)(self.cfg.tokenizer)
|
102 |
+
self.camera_embedder = find_class(self.cfg.camera_embedder_cls)(
|
103 |
+
self.cfg.camera_embedder
|
104 |
+
)
|
105 |
+
self.backbone = find_class(self.cfg.backbone_cls)(self.cfg.backbone)
|
106 |
+
self.post_processor = find_class(self.cfg.post_processor_cls)(
|
107 |
+
self.cfg.post_processor
|
108 |
+
)
|
109 |
+
self.decoder = find_class(self.cfg.decoder_cls)(self.cfg.decoder)
|
110 |
+
self.image_estimator = find_class(self.cfg.image_estimator_cls)(
|
111 |
+
self.cfg.image_estimator
|
112 |
+
)
|
113 |
+
self.global_estimator = find_class(self.cfg.global_estimator_cls)(
|
114 |
+
self.cfg.global_estimator
|
115 |
+
)
|
116 |
+
|
117 |
+
self.bbox: Float[Tensor, "2 3"]
|
118 |
+
self.register_buffer(
|
119 |
+
"bbox",
|
120 |
+
torch.as_tensor(
|
121 |
+
[
|
122 |
+
[-self.cfg.radius, -self.cfg.radius, -self.cfg.radius],
|
123 |
+
[self.cfg.radius, self.cfg.radius, self.cfg.radius],
|
124 |
+
],
|
125 |
+
dtype=torch.float32,
|
126 |
+
),
|
127 |
+
)
|
128 |
+
self.isosurface_helper = MarchingTetrahedraHelper(
|
129 |
+
self.cfg.isosurface_resolution,
|
130 |
+
os.path.join(
|
131 |
+
os.path.dirname(__file__),
|
132 |
+
"..",
|
133 |
+
"load",
|
134 |
+
"tets",
|
135 |
+
f"{self.cfg.isosurface_resolution}_tets.npz",
|
136 |
+
),
|
137 |
+
)
|
138 |
+
|
139 |
+
self.baker = TextureBaker()
|
140 |
+
self.image_processor = ImageProcessor()
|
141 |
+
|
142 |
+
def triplane_to_meshes(
|
143 |
+
self, triplanes: Float[Tensor, "B 3 Cp Hp Wp"]
|
144 |
+
) -> list[Mesh]:
|
145 |
+
meshes = []
|
146 |
+
for i in range(triplanes.shape[0]):
|
147 |
+
triplane = triplanes[i]
|
148 |
+
grid_vertices = scale_tensor(
|
149 |
+
self.isosurface_helper.grid_vertices.to(triplanes.device),
|
150 |
+
self.isosurface_helper.points_range,
|
151 |
+
self.bbox,
|
152 |
+
)
|
153 |
+
|
154 |
+
values = self.query_triplane(grid_vertices, triplane)
|
155 |
+
decoded = self.decoder(values, include=["vertex_offset", "density"])
|
156 |
+
sdf = decoded["density"] - self.cfg.isosurface_threshold
|
157 |
+
|
158 |
+
deform = decoded["vertex_offset"].squeeze(0)
|
159 |
+
|
160 |
+
mesh: Mesh = self.isosurface_helper(
|
161 |
+
sdf.view(-1, 1), deform.view(-1, 3) if deform is not None else None
|
162 |
+
)
|
163 |
+
mesh.v_pos = scale_tensor(
|
164 |
+
mesh.v_pos, self.isosurface_helper.points_range, self.bbox
|
165 |
+
)
|
166 |
+
|
167 |
+
meshes.append(mesh)
|
168 |
+
|
169 |
+
return meshes
|
170 |
+
|
171 |
+
def query_triplane(
|
172 |
+
self,
|
173 |
+
positions: Float[Tensor, "*B N 3"],
|
174 |
+
triplanes: Float[Tensor, "*B 3 Cp Hp Wp"],
|
175 |
+
) -> Float[Tensor, "*B N F"]:
|
176 |
+
batched = positions.ndim == 3
|
177 |
+
if not batched:
|
178 |
+
# no batch dimension
|
179 |
+
triplanes = triplanes[None, ...]
|
180 |
+
positions = positions[None, ...]
|
181 |
+
assert triplanes.ndim == 5 and positions.ndim == 3
|
182 |
+
|
183 |
+
positions = scale_tensor(
|
184 |
+
positions, (-self.cfg.radius, self.cfg.radius), (-1, 1)
|
185 |
+
)
|
186 |
+
|
187 |
+
indices2D: Float[Tensor, "B 3 N 2"] = torch.stack(
|
188 |
+
(positions[..., [0, 1]], positions[..., [0, 2]], positions[..., [1, 2]]),
|
189 |
+
dim=-3,
|
190 |
+
).to(triplanes.dtype)
|
191 |
+
out: Float[Tensor, "B3 Cp 1 N"] = F.grid_sample(
|
192 |
+
rearrange(triplanes, "B Np Cp Hp Wp -> (B Np) Cp Hp Wp", Np=3).float(),
|
193 |
+
rearrange(indices2D, "B Np N Nd -> (B Np) () N Nd", Np=3).float(),
|
194 |
+
align_corners=True,
|
195 |
+
mode="bilinear",
|
196 |
+
)
|
197 |
+
out = rearrange(out, "(B Np) Cp () N -> B N (Np Cp)", Np=3)
|
198 |
+
|
199 |
+
return out
|
200 |
+
|
201 |
+
def get_scene_codes(self, batch) -> Float[Tensor, "B 3 C H W"]:
|
202 |
+
# if batch[rgb_cond] is only one view, add a view dimension
|
203 |
+
if len(batch["rgb_cond"].shape) == 4:
|
204 |
+
batch["rgb_cond"] = batch["rgb_cond"].unsqueeze(1)
|
205 |
+
batch["mask_cond"] = batch["mask_cond"].unsqueeze(1)
|
206 |
+
batch["c2w_cond"] = batch["c2w_cond"].unsqueeze(1)
|
207 |
+
batch["intrinsic_cond"] = batch["intrinsic_cond"].unsqueeze(1)
|
208 |
+
batch["intrinsic_normed_cond"] = batch["intrinsic_normed_cond"].unsqueeze(1)
|
209 |
+
batch_size, n_input_views = batch["rgb_cond"].shape[:2]
|
210 |
+
|
211 |
+
camera_embeds: Optional[Float[Tensor, "B Nv Cc"]]
|
212 |
+
camera_embeds = self.camera_embedder(**batch)
|
213 |
+
|
214 |
+
input_image_tokens: Float[Tensor, "B Nv Cit Nit"] = self.image_tokenizer(
|
215 |
+
rearrange(batch["rgb_cond"], "B Nv H W C -> B Nv C H W"),
|
216 |
+
modulation_cond=camera_embeds,
|
217 |
+
)
|
218 |
+
|
219 |
+
input_image_tokens = rearrange(
|
220 |
+
input_image_tokens, "B Nv C Nt -> B (Nv Nt) C", Nv=n_input_views
|
221 |
+
)
|
222 |
+
|
223 |
+
tokens: Float[Tensor, "B Ct Nt"] = self.tokenizer(batch_size)
|
224 |
+
|
225 |
+
tokens = self.backbone(
|
226 |
+
tokens,
|
227 |
+
encoder_hidden_states=input_image_tokens,
|
228 |
+
modulation_cond=None,
|
229 |
+
)
|
230 |
+
|
231 |
+
direct_codes = self.tokenizer.detokenize(tokens)
|
232 |
+
scene_codes = self.post_processor(direct_codes)
|
233 |
+
return scene_codes, direct_codes
|
234 |
+
|
235 |
+
def run_image(
|
236 |
+
self,
|
237 |
+
image: Image,
|
238 |
+
bake_resolution: int,
|
239 |
+
estimate_illumination: bool = False,
|
240 |
+
) -> Tuple[trimesh.Trimesh, dict[str, Any]]:
|
241 |
+
if image.mode != "RGBA":
|
242 |
+
raise ValueError("Image must be in RGBA mode")
|
243 |
+
img_cond = (
|
244 |
+
torch.from_numpy(
|
245 |
+
np.asarray(
|
246 |
+
image.resize((self.cfg.cond_image_size, self.cfg.cond_image_size))
|
247 |
+
).astype(np.float32)
|
248 |
+
/ 255.0
|
249 |
+
)
|
250 |
+
.float()
|
251 |
+
.clip(0, 1)
|
252 |
+
.to(self.device)
|
253 |
+
)
|
254 |
+
mask_cond = img_cond[:, :, -1:]
|
255 |
+
rgb_cond = torch.lerp(
|
256 |
+
torch.tensor(self.cfg.background_color, device=self.device)[None, None, :],
|
257 |
+
img_cond[:, :, :3],
|
258 |
+
mask_cond,
|
259 |
+
)
|
260 |
+
|
261 |
+
c2w_cond = default_cond_c2w(self.cfg.default_distance).to(self.device)
|
262 |
+
intrinsic, intrinsic_normed_cond = create_intrinsic_from_fov_deg(
|
263 |
+
self.cfg.default_fovy_deg,
|
264 |
+
self.cfg.cond_image_size,
|
265 |
+
self.cfg.cond_image_size,
|
266 |
+
)
|
267 |
+
|
268 |
+
batch = {
|
269 |
+
"rgb_cond": rgb_cond,
|
270 |
+
"mask_cond": mask_cond,
|
271 |
+
"c2w_cond": c2w_cond.unsqueeze(0),
|
272 |
+
"intrinsic_cond": intrinsic.to(self.device).unsqueeze(0),
|
273 |
+
"intrinsic_normed_cond": intrinsic_normed_cond.to(self.device).unsqueeze(0),
|
274 |
+
}
|
275 |
+
|
276 |
+
meshes, global_dict = self.generate_mesh(
|
277 |
+
batch, bake_resolution, estimate_illumination
|
278 |
+
)
|
279 |
+
return meshes[0], global_dict
|
280 |
+
|
281 |
+
def generate_mesh(
|
282 |
+
self,
|
283 |
+
batch,
|
284 |
+
bake_resolution: int,
|
285 |
+
estimate_illumination: bool = False,
|
286 |
+
) -> Tuple[List[trimesh.Trimesh], dict[str, Any]]:
|
287 |
+
batch["rgb_cond"] = self.image_processor(
|
288 |
+
batch["rgb_cond"], self.cfg.cond_image_size
|
289 |
+
)
|
290 |
+
batch["mask_cond"] = self.image_processor(
|
291 |
+
batch["mask_cond"], self.cfg.cond_image_size
|
292 |
+
)
|
293 |
+
scene_codes, non_postprocessed_codes = self.get_scene_codes(batch)
|
294 |
+
|
295 |
+
global_dict = {}
|
296 |
+
if self.image_estimator is not None:
|
297 |
+
global_dict.update(
|
298 |
+
self.image_estimator(batch["rgb_cond"] * batch["mask_cond"])
|
299 |
+
)
|
300 |
+
if self.global_estimator is not None and estimate_illumination:
|
301 |
+
global_dict.update(self.global_estimator(non_postprocessed_codes))
|
302 |
+
|
303 |
+
with torch.no_grad():
|
304 |
+
with torch.autocast(device_type="cuda", enabled=False):
|
305 |
+
meshes = self.triplane_to_meshes(scene_codes)
|
306 |
+
|
307 |
+
rets = []
|
308 |
+
for i, mesh in enumerate(meshes):
|
309 |
+
# Check for empty mesh
|
310 |
+
if mesh.v_pos.shape[0] == 0:
|
311 |
+
rets.append(trimesh.Trimesh())
|
312 |
+
continue
|
313 |
+
|
314 |
+
mesh.unwrap_uv()
|
315 |
+
|
316 |
+
# Build textures
|
317 |
+
rast = self.baker.rasterize(
|
318 |
+
mesh.v_tex, mesh.t_pos_idx, bake_resolution
|
319 |
+
)
|
320 |
+
bake_mask = self.baker.get_mask(rast)
|
321 |
+
|
322 |
+
pos_bake = self.baker.interpolate(
|
323 |
+
mesh.v_pos,
|
324 |
+
rast,
|
325 |
+
mesh.t_pos_idx,
|
326 |
+
mesh.v_tex,
|
327 |
+
)
|
328 |
+
gb_pos = pos_bake[bake_mask]
|
329 |
+
|
330 |
+
tri_query = self.query_triplane(gb_pos, scene_codes[i])[0]
|
331 |
+
decoded = self.decoder(
|
332 |
+
tri_query, exclude=["density", "vertex_offset"]
|
333 |
+
)
|
334 |
+
|
335 |
+
nrm = self.baker.interpolate(
|
336 |
+
mesh.v_nrm,
|
337 |
+
rast,
|
338 |
+
mesh.t_pos_idx,
|
339 |
+
mesh.v_tex,
|
340 |
+
)
|
341 |
+
gb_nrm = F.normalize(nrm[bake_mask], dim=-1)
|
342 |
+
decoded["normal"] = gb_nrm
|
343 |
+
|
344 |
+
# Check if any keys in global_dict start with decoded_
|
345 |
+
for k, v in global_dict.items():
|
346 |
+
if k.startswith("decoder_"):
|
347 |
+
decoded[k.replace("decoder_", "")] = v[i]
|
348 |
+
|
349 |
+
mat_out = {
|
350 |
+
"albedo": decoded["features"],
|
351 |
+
"roughness": decoded["roughness"],
|
352 |
+
"metallic": decoded["metallic"],
|
353 |
+
"normal": normalize(decoded["perturb_normal"]),
|
354 |
+
"bump": None,
|
355 |
+
}
|
356 |
+
|
357 |
+
for k, v in mat_out.items():
|
358 |
+
if v is None:
|
359 |
+
continue
|
360 |
+
if v.shape[0] == 1:
|
361 |
+
# Skip and directly add a single value
|
362 |
+
mat_out[k] = v[0]
|
363 |
+
else:
|
364 |
+
f = torch.zeros(
|
365 |
+
bake_resolution,
|
366 |
+
bake_resolution,
|
367 |
+
v.shape[-1],
|
368 |
+
dtype=v.dtype,
|
369 |
+
device=v.device,
|
370 |
+
)
|
371 |
+
if v.shape == f.shape:
|
372 |
+
continue
|
373 |
+
if k == "normal":
|
374 |
+
# Use un-normalized tangents here so that larger smaller tris
|
375 |
+
# Don't effect the tangents that much
|
376 |
+
tng = self.baker.interpolate(
|
377 |
+
mesh.v_tng,
|
378 |
+
rast,
|
379 |
+
mesh.t_pos_idx,
|
380 |
+
mesh.v_tex,
|
381 |
+
)
|
382 |
+
gb_tng = tng[bake_mask]
|
383 |
+
gb_tng = F.normalize(gb_tng, dim=-1)
|
384 |
+
gb_btng = F.normalize(
|
385 |
+
torch.cross(gb_tng, gb_nrm, dim=-1), dim=-1
|
386 |
+
)
|
387 |
+
normal = F.normalize(mat_out["normal"], dim=-1)
|
388 |
+
|
389 |
+
bump = torch.cat(
|
390 |
+
# Check if we have to flip some things
|
391 |
+
(
|
392 |
+
dot(normal, gb_tng),
|
393 |
+
dot(normal, gb_btng),
|
394 |
+
dot(normal, gb_nrm).clip(
|
395 |
+
0.3, 1
|
396 |
+
), # Never go below 0.3. This would indicate a flipped (or close to one) normal
|
397 |
+
),
|
398 |
+
-1,
|
399 |
+
)
|
400 |
+
bump[..., :2] *= 0.5
|
401 |
+
bump = (bump * 0.5 + 0.5).clamp(0, 1)
|
402 |
+
|
403 |
+
f[bake_mask] = bump.view(-1, 3)
|
404 |
+
mat_out["bump"] = f
|
405 |
+
else:
|
406 |
+
f[bake_mask] = v.view(-1, v.shape[-1])
|
407 |
+
mat_out[k] = f
|
408 |
+
|
409 |
+
def uv_padding(arr):
|
410 |
+
if arr.ndim == 1:
|
411 |
+
return arr
|
412 |
+
return (
|
413 |
+
dilate_fill(
|
414 |
+
arr.permute(2, 0, 1)[None, ...],
|
415 |
+
bake_mask.unsqueeze(0).unsqueeze(0),
|
416 |
+
iterations=bake_resolution // 150,
|
417 |
+
)
|
418 |
+
.squeeze(0)
|
419 |
+
.permute(1, 2, 0)
|
420 |
+
)
|
421 |
+
|
422 |
+
verts_np = convert_data(mesh.v_pos)
|
423 |
+
faces = convert_data(mesh.t_pos_idx)
|
424 |
+
uvs = convert_data(mesh.v_tex)
|
425 |
+
|
426 |
+
basecolor_tex = Image.fromarray(
|
427 |
+
float32_to_uint8_np(convert_data(uv_padding(mat_out["albedo"])))
|
428 |
+
).convert("RGB")
|
429 |
+
basecolor_tex.format = "JPEG"
|
430 |
+
|
431 |
+
metallic = mat_out["metallic"].squeeze().cpu().item()
|
432 |
+
roughness = mat_out["roughness"].squeeze().cpu().item()
|
433 |
+
|
434 |
+
if "bump" in mat_out and mat_out["bump"] is not None:
|
435 |
+
bump_np = convert_data(uv_padding(mat_out["bump"]))
|
436 |
+
bump_up = np.ones_like(bump_np)
|
437 |
+
bump_up[..., :2] = 0.5
|
438 |
+
bump_up[..., 2:] = 1
|
439 |
+
bump_tex = Image.fromarray(
|
440 |
+
float32_to_uint8_np(
|
441 |
+
bump_np,
|
442 |
+
dither=True,
|
443 |
+
# Do not dither if something is perfectly flat
|
444 |
+
dither_mask=np.all(
|
445 |
+
bump_np == bump_up, axis=-1, keepdims=True
|
446 |
+
).astype(np.float32),
|
447 |
+
)
|
448 |
+
).convert("RGB")
|
449 |
+
bump_tex.format = (
|
450 |
+
"JPEG" # PNG would be better but the assets are larger
|
451 |
+
)
|
452 |
+
else:
|
453 |
+
bump_tex = None
|
454 |
+
|
455 |
+
material = trimesh.visual.material.PBRMaterial(
|
456 |
+
baseColorTexture=basecolor_tex,
|
457 |
+
roughnessFactor=roughness,
|
458 |
+
metallicFactor=metallic,
|
459 |
+
normalTexture=bump_tex,
|
460 |
+
)
|
461 |
+
|
462 |
+
tmesh = trimesh.Trimesh(
|
463 |
+
vertices=verts_np,
|
464 |
+
faces=faces,
|
465 |
+
visual=trimesh.visual.texture.TextureVisuals(
|
466 |
+
uv=uvs, material=material
|
467 |
+
),
|
468 |
+
)
|
469 |
+
rot = trimesh.transformations.rotation_matrix(
|
470 |
+
np.radians(-90), [1, 0, 0]
|
471 |
+
)
|
472 |
+
tmesh.apply_transform(rot)
|
473 |
+
tmesh.apply_transform(
|
474 |
+
trimesh.transformations.rotation_matrix(
|
475 |
+
np.radians(90), [0, 1, 0]
|
476 |
+
)
|
477 |
+
)
|
478 |
+
|
479 |
+
tmesh.invert()
|
480 |
+
|
481 |
+
rets.append(tmesh)
|
482 |
+
|
483 |
+
return rets, global_dict
|
sf3d/texture_baker.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import slangtorch
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from jaxtyping import Bool, Float
|
7 |
+
from torch import Tensor
|
8 |
+
|
9 |
+
|
10 |
+
class TextureBaker(nn.Module):
|
11 |
+
def __init__(self):
|
12 |
+
super().__init__()
|
13 |
+
self.baker = slangtorch.loadModule(
|
14 |
+
os.path.join(os.path.dirname(__file__), "texture_baker.slang")
|
15 |
+
)
|
16 |
+
|
17 |
+
def rasterize(
|
18 |
+
self,
|
19 |
+
uv: Float[Tensor, "Nv 2"],
|
20 |
+
face_indices: Float[Tensor, "Nf 3"],
|
21 |
+
bake_resolution: int,
|
22 |
+
) -> Float[Tensor, "bake_resolution bake_resolution 4"]:
|
23 |
+
if not face_indices.is_cuda or not uv.is_cuda:
|
24 |
+
raise ValueError("All input tensors must be on cuda")
|
25 |
+
|
26 |
+
face_indices = face_indices.to(torch.int32)
|
27 |
+
uv = uv.to(torch.float32)
|
28 |
+
|
29 |
+
rast_result = torch.empty(
|
30 |
+
bake_resolution, bake_resolution, 4, device=uv.device, dtype=torch.float32
|
31 |
+
)
|
32 |
+
|
33 |
+
block_size = 16
|
34 |
+
grid_size = bake_resolution // block_size
|
35 |
+
self.baker.bake_uv(uv=uv, indices=face_indices, output=rast_result).launchRaw(
|
36 |
+
blockSize=(block_size, block_size, 1), gridSize=(grid_size, grid_size, 1)
|
37 |
+
)
|
38 |
+
|
39 |
+
return rast_result
|
40 |
+
|
41 |
+
def get_mask(
|
42 |
+
self, rast: Float[Tensor, "bake_resolution bake_resolution 4"]
|
43 |
+
) -> Bool[Tensor, "bake_resolution bake_resolution"]:
|
44 |
+
return rast[..., -1] >= 0
|
45 |
+
|
46 |
+
def interpolate(
|
47 |
+
self,
|
48 |
+
attr: Float[Tensor, "Nv 3"],
|
49 |
+
rast: Float[Tensor, "bake_resolution bake_resolution 4"],
|
50 |
+
face_indices: Float[Tensor, "Nf 3"],
|
51 |
+
uv: Float[Tensor, "Nv 2"],
|
52 |
+
) -> Float[Tensor, "bake_resolution bake_resolution 3"]:
|
53 |
+
# Make sure all input tensors are on torch
|
54 |
+
if not attr.is_cuda or not face_indices.is_cuda or not rast.is_cuda:
|
55 |
+
raise ValueError("All input tensors must be on cuda")
|
56 |
+
|
57 |
+
attr = attr.to(torch.float32)
|
58 |
+
face_indices = face_indices.to(torch.int32)
|
59 |
+
uv = uv.to(torch.float32)
|
60 |
+
|
61 |
+
pos_bake = torch.zeros(
|
62 |
+
rast.shape[0],
|
63 |
+
rast.shape[1],
|
64 |
+
3,
|
65 |
+
device=attr.device,
|
66 |
+
dtype=attr.dtype,
|
67 |
+
)
|
68 |
+
|
69 |
+
block_size = 16
|
70 |
+
grid_size = rast.shape[0] // block_size
|
71 |
+
self.baker.interpolate(
|
72 |
+
attr=attr, indices=face_indices, rast=rast, output=pos_bake
|
73 |
+
).launchRaw(
|
74 |
+
blockSize=(block_size, block_size, 1), gridSize=(grid_size, grid_size, 1)
|
75 |
+
)
|
76 |
+
|
77 |
+
return pos_bake
|
78 |
+
|
79 |
+
def forward(
|
80 |
+
self,
|
81 |
+
attr: Float[Tensor, "Nv 3"],
|
82 |
+
uv: Float[Tensor, "Nv 2"],
|
83 |
+
face_indices: Float[Tensor, "Nf 3"],
|
84 |
+
bake_resolution: int,
|
85 |
+
) -> Float[Tensor, "bake_resolution bake_resolution 3"]:
|
86 |
+
rast = self.rasterize(uv, face_indices, bake_resolution)
|
87 |
+
return self.interpolate(attr, rast, face_indices, uv)
|
sf3d/texture_baker.slang
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// xy: 2D test position
|
2 |
+
// v1: vertex position 1
|
3 |
+
// v2: vertex position 2
|
4 |
+
// v3: vertex position 3
|
5 |
+
//
|
6 |
+
bool barycentric_coordinates(float2 xy, float2 v1, float2 v2, float2 v3, out float u, out float v, out float w)
|
7 |
+
{
|
8 |
+
// Return true if the point (x,y) is inside the triangle defined by the vertices v1, v2, v3.
|
9 |
+
// If the point is inside the triangle, the barycentric coordinates are stored in u, v, and w.
|
10 |
+
float2 v1v2 = v2 - v1;
|
11 |
+
float2 v1v3 = v3 - v1;
|
12 |
+
float2 xyv1 = xy - v1;
|
13 |
+
|
14 |
+
float d00 = dot(v1v2, v1v2);
|
15 |
+
float d01 = dot(v1v2, v1v3);
|
16 |
+
float d11 = dot(v1v3, v1v3);
|
17 |
+
float d20 = dot(xyv1, v1v2);
|
18 |
+
float d21 = dot(xyv1, v1v3);
|
19 |
+
|
20 |
+
float denom = d00 * d11 - d01 * d01;
|
21 |
+
v = (d11 * d20 - d01 * d21) / denom;
|
22 |
+
w = (d00 * d21 - d01 * d20) / denom;
|
23 |
+
u = 1.0 - v - w;
|
24 |
+
|
25 |
+
return (v >= 0.0) && (w >= 0.0) && (v + w <= 1.0);
|
26 |
+
}
|
27 |
+
|
28 |
+
[AutoPyBindCUDA]
|
29 |
+
[CUDAKernel]
|
30 |
+
void interpolate(
|
31 |
+
TensorView<float3> attr,
|
32 |
+
TensorView<int3> indices,
|
33 |
+
TensorView<float4> rast,
|
34 |
+
TensorView<float3> output)
|
35 |
+
{
|
36 |
+
// Interpolate the attr into output based on the rast result (barycentric coordinates, + triangle idx)
|
37 |
+
|
38 |
+
uint3 dispatch_id = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx();
|
39 |
+
|
40 |
+
if (dispatch_id.x > output.size(0) || dispatch_id.y > output.size(1))
|
41 |
+
return;
|
42 |
+
|
43 |
+
float4 barycentric = rast[dispatch_id.x, dispatch_id.y];
|
44 |
+
int triangle_idx = int(barycentric.w);
|
45 |
+
|
46 |
+
if (triangle_idx < 0) {
|
47 |
+
output[dispatch_id.x, dispatch_id.y] = float3(0.0, 0.0, 0.0);
|
48 |
+
return;
|
49 |
+
}
|
50 |
+
|
51 |
+
float3 v1 = attr[indices[triangle_idx].x];
|
52 |
+
float3 v2 = attr[indices[triangle_idx].y];
|
53 |
+
float3 v3 = attr[indices[triangle_idx].z];
|
54 |
+
|
55 |
+
output[dispatch_id.x, dispatch_id.y] = v1 * barycentric.x + v2 * barycentric.y + v3 * barycentric.z;
|
56 |
+
}
|
57 |
+
|
58 |
+
[AutoPyBindCUDA]
|
59 |
+
[CUDAKernel]
|
60 |
+
void bake_uv(
|
61 |
+
TensorView<float2> uv,
|
62 |
+
TensorView<int3> indices,
|
63 |
+
TensorView<float4> output)
|
64 |
+
{
|
65 |
+
uint3 dispatch_id = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx();
|
66 |
+
|
67 |
+
if (dispatch_id.y > output.size(0) || dispatch_id.x > output.size(1))
|
68 |
+
return;
|
69 |
+
|
70 |
+
// We index x,y but the orginal coords are HW. So swap them
|
71 |
+
float2 pixel_coord = float2(dispatch_id.y, dispatch_id.x);
|
72 |
+
// Normalize to [0, 1]
|
73 |
+
pixel_coord /= float2(output.size(1), output.size(0));
|
74 |
+
pixel_coord = clamp(pixel_coord, 0.0, 1.0);
|
75 |
+
// Flip x-axis
|
76 |
+
pixel_coord.y = 1 - pixel_coord.y;
|
77 |
+
|
78 |
+
for (int i = 0; i < indices.size(0); i++) {
|
79 |
+
float2 v1 = float2(uv[indices[i].x].x, uv[indices[i].x].y);
|
80 |
+
float2 v2 = float2(uv[indices[i].y].x, uv[indices[i].y].y);
|
81 |
+
float2 v3 = float2(uv[indices[i].z].x, uv[indices[i].z].y);
|
82 |
+
|
83 |
+
float u, v, w;
|
84 |
+
bool hit = barycentric_coordinates(pixel_coord, v1, v2, v3, u, v, w);
|
85 |
+
|
86 |
+
if (hit){
|
87 |
+
output[dispatch_id.x, dispatch_id.y] = float4(u, v, w, i);
|
88 |
+
return;
|
89 |
+
}
|
90 |
+
}
|
91 |
+
|
92 |
+
output[dispatch_id.x, dispatch_id.y] = float4(0.0, 0.0, 0.0, -1);
|
93 |
+
}
|
sf3d/utils.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import rembg
|
5 |
+
import torch
|
6 |
+
from PIL import Image
|
7 |
+
|
8 |
+
import sf3d.models.utils as sf3d_utils
|
9 |
+
|
10 |
+
|
11 |
+
def create_intrinsic_from_fov_deg(fov_deg: float, cond_height: int, cond_width: int):
|
12 |
+
intrinsic = sf3d_utils.get_intrinsic_from_fov(
|
13 |
+
np.deg2rad(fov_deg),
|
14 |
+
H=cond_height,
|
15 |
+
W=cond_width,
|
16 |
+
)
|
17 |
+
intrinsic_normed_cond = intrinsic.clone()
|
18 |
+
intrinsic_normed_cond[..., 0, 2] /= cond_width
|
19 |
+
intrinsic_normed_cond[..., 1, 2] /= cond_height
|
20 |
+
intrinsic_normed_cond[..., 0, 0] /= cond_width
|
21 |
+
intrinsic_normed_cond[..., 1, 1] /= cond_height
|
22 |
+
|
23 |
+
return intrinsic, intrinsic_normed_cond
|
24 |
+
|
25 |
+
|
26 |
+
def default_cond_c2w(distance: float):
|
27 |
+
c2w_cond = torch.as_tensor(
|
28 |
+
[
|
29 |
+
[0, 0, 1, distance],
|
30 |
+
[1, 0, 0, 0],
|
31 |
+
[0, 1, 0, 0],
|
32 |
+
[0, 0, 0, 1],
|
33 |
+
]
|
34 |
+
).float()
|
35 |
+
return c2w_cond
|
36 |
+
|
37 |
+
|
38 |
+
def remove_background(
|
39 |
+
image: Image,
|
40 |
+
rembg_session: Any = None,
|
41 |
+
force: bool = False,
|
42 |
+
**rembg_kwargs,
|
43 |
+
) -> Image:
|
44 |
+
do_remove = True
|
45 |
+
if image.mode == "RGBA" and image.getextrema()[3][0] < 255:
|
46 |
+
do_remove = False
|
47 |
+
do_remove = do_remove or force
|
48 |
+
if do_remove:
|
49 |
+
image = rembg.remove(image, session=rembg_session, **rembg_kwargs)
|
50 |
+
return image
|
51 |
+
|
52 |
+
|
53 |
+
def resize_foreground(
|
54 |
+
image: Image,
|
55 |
+
ratio: float,
|
56 |
+
) -> Image:
|
57 |
+
image = np.array(image)
|
58 |
+
assert image.shape[-1] == 4
|
59 |
+
alpha = np.where(image[..., 3] > 0)
|
60 |
+
y1, y2, x1, x2 = (
|
61 |
+
alpha[0].min(),
|
62 |
+
alpha[0].max(),
|
63 |
+
alpha[1].min(),
|
64 |
+
alpha[1].max(),
|
65 |
+
)
|
66 |
+
# crop the foreground
|
67 |
+
fg = image[y1:y2, x1:x2]
|
68 |
+
# pad to square
|
69 |
+
size = max(fg.shape[0], fg.shape[1])
|
70 |
+
ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
|
71 |
+
ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
|
72 |
+
new_image = np.pad(
|
73 |
+
fg,
|
74 |
+
((ph0, ph1), (pw0, pw1), (0, 0)),
|
75 |
+
mode="constant",
|
76 |
+
constant_values=((0, 0), (0, 0), (0, 0)),
|
77 |
+
)
|
78 |
+
|
79 |
+
# compute padding according to the ratio
|
80 |
+
new_size = int(new_image.shape[0] / ratio)
|
81 |
+
# pad to size, double side
|
82 |
+
ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
|
83 |
+
ph1, pw1 = new_size - size - ph0, new_size - size - pw0
|
84 |
+
new_image = np.pad(
|
85 |
+
new_image,
|
86 |
+
((ph0, ph1), (pw0, pw1), (0, 0)),
|
87 |
+
mode="constant",
|
88 |
+
constant_values=((0, 0), (0, 0), (0, 0)),
|
89 |
+
)
|
90 |
+
new_image = Image.fromarray(new_image, mode="RGBA")
|
91 |
+
return new_image
|