#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import math
import os

# read .exr files for the RTMV dataset; this flag must be set before cv2 is first imported
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"

import cv2
import einops
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.utils.checkpoint
import torchvision
from accelerate.utils import set_seed
from PIL import Image
from skimage.io import imsave
from torchvision import transforms
from tqdm.auto import tqdm
def parse_args(input_args=None):
    parser = argparse.ArgumentParser(description="Simple example of an EscherNet evaluation script.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
default="lambdalabs/sd-image-variations-diffusers",
required=True,
help="Path to pretrained model or model identifier from huggingface.co/models.",
)
parser.add_argument(
"--revision",
type=str,
default=None,
required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
)
parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
parser.add_argument(
"--resolution",
type=int,
default=256,
        help=(
            "The resolution for input images; all images will be resized to this"
            " resolution."
        ),
)
parser.add_argument("--num_train_epochs", type=int, default=1)
parser.add_argument(
"--T_in", type=int, default=1, help="Number of input views"
)
parser.add_argument(
"--T_out", type=int, default=1, help="Number of output views"
)
parser.add_argument(
"--guidance_scale",
type=float,
default=3.0,
help="unconditional guidance scale, if guidance_scale>1.0, do_classifier_free_guidance"
)
parser.add_argument(
"--data_dir",
type=str,
default=".",
help=(
"The input data dir. Should contain the .png files (or other data files) for the task."
),
)
parser.add_argument(
"--data_type",
type=str,
default="GSO25",
help=(
"The input data type. Chosen from GSO25, GSO3D, GSO100, RTMV, NeRF, Franka, MVDream, Text2Img"
),
)
parser.add_argument(
"--cape_type",
type=str,
default="6DoF",
help=(
"The camera pose encoding CaPE type. Chosen from 4DoF, 6DoF"
),
)
parser.add_argument(
"--output_dir",
type=str,
default="logs_eval",
help=(
"The output directory where the model predictions and checkpoints will be written."
),
)
parser.add_argument(
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
help=(
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
),
)
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", default=True,
        help="Whether or not to use xformers memory-efficient attention.",
    )
if input_args is not None:
args = parser.parse_args(input_args)
else:
args = parser.parse_args()
if args.resolution % 8 != 0:
raise ValueError(
"`--resolution` must be divisible by 8 for consistently sized encoded images."
)
return args
# create poses along an archimedean spiral with num_steps points
def get_archimedean_spiral(sphere_radius, num_steps=250):
# x-z plane, around upper y
'''
https://en.wikipedia.org/wiki/Spiral, section "Spherical spiral". c = a / pi
'''
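    # 'a' fixes the total azimuth sweep: theta (polar) runs 0 -> pi while the azimuth
    # (-i) sweeps ~40 rad (~6.4 revolutions), tracing a pole-to-pole spherical spiral.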
a = 40
r = sphere_radius
translations = []
angles = []
# i = a / 2
i = 0.01
while i < a:
theta = i / a * math.pi
x = r * math.sin(theta) * math.cos(-i)
z = r * math.sin(-theta + math.pi) * math.sin(-i)
        y = -r * math.cos(theta)
# translations.append((x, y, z)) # origin
translations.append((x, z, -y))
angles.append([np.rad2deg(-i), np.rad2deg(theta)])
# i += a / (2 * num_steps)
i += a / (1 * num_steps)
return np.array(translations), np.stack(angles)
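# A minimal usage sketch (matches the dust3r trajectory below): 100 poses on a
# radius-1.5 sphere; xyzs has shape (100, 3) and angles[k] is [azimuth_deg, polar_deg].
#   xyzs, angles = get_archimedean_spiral(1.5, num_steps=100)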
# 36 views around the circle, with elevation degree
def get_circle_traj(sphere_radius, elevation=0, num_steps=36):
translations = []
angles = []
elevation = np.deg2rad(elevation)
for i in range(num_steps):
theta = i / num_steps * 2 * math.pi
x = sphere_radius * math.sin(theta) * math.cos(elevation)
z = sphere_radius * math.sin(-theta+math.pi) * math.sin(-elevation)
y = sphere_radius * -math.cos(theta)
translations.append((x, z, -y))
angles.append([np.rad2deg(-elevation), np.rad2deg(theta)])
return np.array(translations), np.stack(angles)
def look_at(origin, target, up):
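    # Returns a 4x4 camera-to-world matrix with columns (right, up, -forward, position).
    # Note the call sites use look_at(origin, camera_xyz, up): 'target' is the camera
    # position, so -forward points from the camera back toward the object at the origin.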
forward = (target - origin)
forward = forward / np.linalg.norm(forward)
right = np.cross(up, forward)
right = right / np.linalg.norm(right)
new_up = np.cross(forward, right)
    rotation_matrix = np.column_stack((right, new_up, -forward, target))
    matrix = np.vstack((rotation_matrix, [0, 0, 0, 1]))  # np.row_stack was removed in NumPy 2.0
return matrix
# from carvekit.api.high import HiInterface
# def create_carvekit_interface():
# # Check doc strings for more information
# interface = HiInterface(object_type="object", # Can be "object" or "hairs-like".
# batch_size_seg=5,
# batch_size_matting=1,
# device='cuda' if torch.cuda.is_available() else 'cpu',
# seg_mask_size=640, # Use 640 for Tracer B7 and 320 for U2Net
# matting_mask_size=2048,
# trimap_prob_threshold=231,
# trimap_dilation=30,
# trimap_erosion_iters=5,
# fp16=False)
#
# return interface
import rembg
def create_rembg_interface():
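    # create a single rembg session up front so the segmentation model is loaded only once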
rembg_session = rembg.new_session()
return rembg_session
def main(args):
if args.seed is not None:
set_seed(args.seed)
CaPE_TYPE = args.cape_type
if CaPE_TYPE == "6DoF":
import sys
sys.path.insert(0, "./6DoF/")
# use the customized diffusers modules
from diffusers import DDIMScheduler
from dataset import get_pose
from CN_encoder import CN_encoder
from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
elif CaPE_TYPE == "4DoF":
import sys
sys.path.insert(0, "./4DoF/")
# use the customized diffusers modules
from diffusers import DDIMScheduler
from dataset import get_pose
from CN_encoder import CN_encoder
from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
else:
raise ValueError("CaPE_TYPE must be chosen from 4DoF, 6DoF")
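    # Both variants expose the same module names: 4DoF CaPE consumes 4-vector
    # (polar, azimuth, 0, 0) poses, while 6DoF CaPE consumes full 4x4 extrinsics
    # together with their inverses (see the pose preparation below).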
DATA_DIR = args.data_dir
DATA_TYPE = args.data_type
if DATA_TYPE == "GSO25":
T_in_DATA_TYPE = "render_mvs_25" # same condition for GSO
T_out_DATA_TYPE = "render_mvs_25" # for 2D metrics
T_out = 25
elif DATA_TYPE == "GSO25_6dof":
T_in_DATA_TYPE = "render_6dof_25" # same condition for GSO
T_out_DATA_TYPE = "render_6dof_25" # for 2D metrics
T_out = 25
elif DATA_TYPE == "GSO3D":
T_in_DATA_TYPE = "render_mvs_25" # same condition for GSO
T_out_DATA_TYPE = "render_sync_36_single" # for 3D metrics
T_out = 36
elif DATA_TYPE == "GSO100":
T_in_DATA_TYPE = "render_mvs_25" # same condition for GSO
T_out_DATA_TYPE = "render_spiral_100" # for 360 gif
T_out = 100
elif DATA_TYPE == "NeRF":
T_out = 200
elif DATA_TYPE == "RTMV":
T_out = 20
elif DATA_TYPE == "Franka":
T_out = 100 # do a 360 gif
elif DATA_TYPE == "MVDream":
T_out = 100 # do a 360 gif
elif DATA_TYPE == "Text2Img":
T_out = 100 # do a 360 gif
elif DATA_TYPE == "dust3r":
# carvekit = create_carvekit_interface()
rembg_session = create_rembg_interface()
T_out = 50 # do a 360 gif
        # collect the input .png views; T_in follows their count
        obj_names = [f for f in os.listdir(os.path.join(DATA_DIR, "user_object")) if f.endswith('.png')]
        args.T_in = len(obj_names)
else:
raise NotImplementedError
T_in = args.T_in
OUTPUT_DIR= f"logs_{CaPE_TYPE}/{DATA_TYPE}/N{T_in}M{T_out}"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# get all folders in DATA_DIR
if DATA_TYPE == "Text2Img":
# get all rgba_png in DATA_DIR
obj_names = [f for f in os.listdir(DATA_DIR) if f.endswith('rgba.png')]
else:
obj_names = [f for f in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, f))]
weight_dtype = torch.float16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
h, w = args.resolution, args.resolution
bg_color = [1., 1., 1., 1.]
    radius = 2.2  # Objaverse training radius range [1.5, 2.2]; alternative values: 1.5, 1.8
# radius_4dof = np.pi * (np.log(radius) - np.log(1.5)) / (np.log(2.2)-np.log(1.5))
# Init Dataset
image_transforms = torchvision.transforms.Compose(
[
torchvision.transforms.Resize((args.resolution, args.resolution)), # 256, 256
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5])
]
)
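    # Normalize([0.5], [0.5]) maps pixels from [0, 1] to [-1, 1], the range the diffusion
    # pipeline expects (the saving code below undoes this with (x + 1) / 2).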
# Init pipeline
scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler",
revision=args.revision)
image_encoder = CN_encoder.from_pretrained(args.pretrained_model_name_or_path, subfolder="image_encoder", revision=args.revision)
pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
revision=args.revision,
scheduler=scheduler,
image_encoder=None,
safety_checker=None,
feature_extractor=None,
torch_dtype=weight_dtype,
)
pipeline.image_encoder = image_encoder
pipeline = pipeline.to(device)
pipeline.set_progress_bar_config(disable=False)
if args.enable_xformers_memory_efficient_attention:
pipeline.enable_xformers_memory_efficient_attention()
# enable vae slicing
pipeline.enable_vae_slicing()
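    # slicing decodes latents one image at a time, bounding peak memory for large T_out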
if args.seed is None:
generator = None
else:
generator = torch.Generator(device=device).manual_seed(args.seed)
for obj_name in tqdm(obj_names):
print(f"Processing {obj_name}")
if DATA_TYPE == "NeRF":
            if os.path.exists(os.path.join(OUTPUT_DIR, obj_name, "output.gif")):  # resume: skip finished objects
continue
# load train info
with open(os.path.join(DATA_DIR, obj_name, "transforms_train.json"), "r") as f:
train_info = json.load(f)["frames"]
# load test info
with open(os.path.join(DATA_DIR, obj_name, "transforms_test.json"), "r") as f:
test_info = json.load(f)["frames"]
# find the radius [min_t, max_t] of the object, we later scale it to training radius [1.5, 2.2]
max_t = 0
min_t = 100
for i in range(len(train_info)):
pose = np.array(train_info[i]["transform_matrix"]).reshape(4, 4)
translation = pose[:3, -1]
radii = np.linalg.norm(translation)
if max_t < radii:
max_t = radii
if min_t > radii:
min_t = radii
info_dir = os.path.join("metrics/NeRF_idx", obj_name)
assert os.path.exists(info_dir) # use fixed train index
train_index = np.load(os.path.join(info_dir, f"train_N{T_in}M20_random.npy"))
test_index = np.arange(len(test_info)) # use all test views
elif DATA_TYPE == "Franka":
angles_in = np.load(os.path.join(DATA_DIR, obj_name, "angles.npy")) # azimuth, elevation in radians
assert T_in <= len(angles_in)
total_index = np.arange(0, len(angles_in)) # num of input views
# random shuffle total_index
np.random.shuffle(total_index)
train_index = total_index[:T_in]
xyzs, angles_out = get_archimedean_spiral(radius, T_out)
origin = np.array([0, 0, 0])
up = np.array([0, 0, 1])
test_index = np.arange(len(angles_out)) # use all 100 test views
elif DATA_TYPE == "MVDream": # 4 input views front right back left
angles_in = []
for polar in [90]: # 1
for azimu in np.arange(0, 360, 90): # 4
angles_in.append(np.array([azimu, polar]))
assert T_in == len(angles_in)
xyzs, angles_out = get_archimedean_spiral(radius, T_out)
origin = np.array([0, 0, 0])
up = np.array([0, 0, 1])
train_index = np.arange(T_in)
test_index = np.arange(T_out)
elif DATA_TYPE == "Text2Img": # 1 input view
angles_in = []
angles_in.append(np.array([0, 90]))
assert T_in == len(angles_in)
xyzs, angles_out = get_archimedean_spiral(radius, T_out)
origin = np.array([0, 0, 0])
up = np.array([0, 0, 1])
train_index = np.arange(T_in)
test_index = np.arange(T_out)
elif DATA_TYPE == "dust3r":
# TODO full archimedean spiral traj
# xyzs, angles_out = get_archimedean_spiral(radius, T_out)
# TODO only top circle traj
xyzs, angles_out = get_archimedean_spiral(1.5, 100)
xyzs = xyzs[:T_out]
angles_out = angles_out[:T_out]
# # TODO circle traj
# xyzs, angles_out = get_circle_traj(radius, elevation=30, num_steps=T_out)
origin = np.array([0, 0, 0])
up = np.array([0, 0, 1])
train_index = np.arange(T_in)
test_index = np.arange(T_out)
# get the max_t
radii = np.load(os.path.join(DATA_DIR, obj_name, "radii.npy"))
max_t = np.max(radii)
min_t = np.min(radii)
else:
train_index = np.arange(T_in)
test_index = np.arange(T_out)
# prepare input img + pose, output pose
input_image = []
pose_in = []
pose_out = []
gt_image = []
for T_in_index in train_index:
if DATA_TYPE == "RTMV":
img_path = os.path.join(DATA_DIR, obj_name, '%05d.exr' % T_in_index)
input_im = cv2.imread(img_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)
img = cv2.cvtColor(input_im, cv2.COLOR_BGR2RGB, input_im)
img = Image.fromarray(np.uint8(img[:, :, :3] * 255.)).convert("RGB")
input_image.append(image_transforms(img))
# load input pose
pose_path = os.path.join(DATA_DIR, obj_name, '%05d.json' % T_in_index)
with open(pose_path, "r") as f:
pose_dict = json.load(f)
input_RT = np.array(pose_dict["camera_data"]["cam2world"]).T
input_RT = np.linalg.inv(input_RT)[:3]
pose_in.append(get_pose(np.concatenate([input_RT[:3, :], np.array([[0, 0, 0, 1]])], axis=0)))
else:
if DATA_TYPE == "NeRF":
img_path = os.path.join(DATA_DIR, obj_name, train_info[T_in_index]["file_path"] + ".png")
pose = np.array(train_info[T_in_index]["transform_matrix"])
if CaPE_TYPE == "6DoF":
# blender to opencv
pose[1:3, :] *= -1
pose = np.linalg.inv(pose)
# scale radius to [1.5, 2.2]
pose[:3, 3] *= 1. / max_t * radius
elif CaPE_TYPE == "4DoF":
pose = np.linalg.inv(pose)
pose_in.append(torch.from_numpy(get_pose(pose)))
elif DATA_TYPE == "Franka":
img_path = os.path.join(DATA_DIR, obj_name, "images_rgba", f"frame{T_in_index:06d}.png")
azimuth, elevation = np.rad2deg(angles_in[T_in_index])
print("input angles index", T_in_index, "azimuth", azimuth, "elevation", elevation)
if CaPE_TYPE == "4DoF":
                        pose_in.append(torch.tensor([np.deg2rad(90. - elevation), np.deg2rad(azimuth - 180), 0., 0.]))
elif CaPE_TYPE == "6DoF":
neg_i = np.deg2rad(azimuth - 180)
neg_theta = np.deg2rad(90. - elevation)
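                        # spherical (polar, azimuth) -> Cartesian point on the viewing sphere,
                        # matching the parametrization used in get_archimedean_spiral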
xyz = np.array([np.sin(neg_theta) * np.cos(neg_i),
np.sin(-neg_theta + np.pi) * np.sin(neg_i),
np.cos(neg_theta)]) * radius
pose = look_at(origin, xyz, up)
pose = np.linalg.inv(pose)
pose[2, :] *= -1
pose_in.append(torch.from_numpy(get_pose(pose)))
elif DATA_TYPE == "MVDream" or DATA_TYPE == "Text2Img":
if DATA_TYPE == "MVDream":
img_path = os.path.join(DATA_DIR, obj_name, f"{T_in_index}_rgba.png")
elif DATA_TYPE == "Text2Img":
img_path = os.path.join(DATA_DIR, obj_name)
azimuth, polar = angles_in[T_in_index]
if CaPE_TYPE == "4DoF":
pose_in.append(torch.tensor([np.deg2rad(polar), np.deg2rad(azimuth), 0., 0.]))
elif CaPE_TYPE == "6DoF":
neg_theta = np.deg2rad(polar)
neg_i = np.deg2rad(azimuth)
xyz = np.array([np.sin(neg_theta) * np.cos(neg_i),
np.sin(-neg_theta + np.pi) * np.sin(neg_i),
np.cos(neg_theta)]) * radius
pose = look_at(origin, xyz, up)
pose = np.linalg.inv(pose)
pose[2, :] *= -1
pose_in.append(torch.from_numpy(get_pose(pose)))
elif DATA_TYPE == "dust3r": # TODO get the object coordinate, now one of the camera is the center
img_path = os.path.join(DATA_DIR, obj_name, "%03d.png" % T_in_index)
pose = get_pose(np.linalg.inv(np.load(os.path.join(DATA_DIR, obj_name, "%03d.npy" % T_in_index))))
pose[1:3, :] *= -1
# scale radius to [1.5, 2.2]
pose[:3, 3] *= 1. / max_t * radius
pose_in.append(torch.from_numpy(pose))
else: # GSO
img_path = os.path.join(DATA_DIR, obj_name, T_in_DATA_TYPE, "model/%03d.png" % T_in_index)
pose_path = os.path.join(DATA_DIR, obj_name, T_in_DATA_TYPE, "model/%03d.npy" % T_in_index)
if T_in_DATA_TYPE == "render_mvs_25" or T_in_DATA_TYPE == "render_6dof_25": # blender coordinate
pose_in.append(get_pose(np.concatenate([np.load(pose_path)[:3, :], np.array([[0, 0, 0, 1]])], axis=0)))
else: # opencv coordinate
pose = get_pose(np.concatenate([np.load(pose_path)[:3, :], np.array([[0, 0, 0, 1]])], axis=0))
pose[1:3, :] *= -1 # pose out 36 is in opencv coordinate, pose in 25 is in blender coordinate
pose_in.append(torch.from_numpy(pose))
# pose_in.append(get_pose(np.concatenate([np.load(pose_path)[:3, :], np.array([[0, 0, 0, 1]])], axis=0)))
# load image
img = plt.imread(img_path)
if (img.shape[-1] == 3 or (img[:,:,-1] == 1).all()) and DATA_TYPE == "dust3r":
img_pil = Image.fromarray(np.uint8(img * 255.)).convert("RGB") # to PIL image
## use carvekit
# image_without_background = carvekit([img_pil])[0]
# image_without_background = np.array(image_without_background)
# est_seg = image_without_background > 127
# foreground = est_seg[:, :, -1].astype(np.bool_)
# img = np.concatenate([img[:,:,:3], foreground[:, :, np.newaxis]], axis=-1)
# use rembg
image = rembg.remove(img_pil, session=rembg_session)
foreground = np.array(image)[:,:,-1] > 127
img = np.concatenate([img[:,:,:3], foreground[:, :, np.newaxis]], axis=-1)
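                # composite the foreground onto the white bg_color via the alpha channel, then drop alpha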
img[img[:, :, -1] == 0.] = bg_color
img = Image.fromarray(np.uint8(img[:, :, :3] * 255.)).convert("RGB")
input_image.append(image_transforms(img))
for T_out_index in test_index:
if DATA_TYPE == "RTMV":
img_path = os.path.join(DATA_DIR, obj_name, '%05d.exr' % T_out_index)
gt_im = cv2.imread(img_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)
img = cv2.cvtColor(gt_im, cv2.COLOR_BGR2RGB, gt_im)
img = Image.fromarray(np.uint8(img[:, :, :3] * 255.)).convert("RGB")
gt_image.append(image_transforms(img))
# load pose
pose_path = os.path.join(DATA_DIR, obj_name, '%05d.json' % T_out_index)
with open(pose_path, "r") as f:
pose_dict = json.load(f)
output_RT = np.array(pose_dict["camera_data"]["cam2world"]).T
output_RT = np.linalg.inv(output_RT)[:3]
pose_out.append(get_pose(np.concatenate([output_RT[:3, :], np.array([[0, 0, 0, 1]])], axis=0)))
else:
if DATA_TYPE == "NeRF":
img_path = os.path.join(DATA_DIR, obj_name, test_info[T_out_index]["file_path"] + ".png")
pose = np.array(test_info[T_out_index]["transform_matrix"])
if CaPE_TYPE == "6DoF":
# blender to opencv
pose[1:3, :] *= -1
pose = np.linalg.inv(pose)
# scale radius to [1.5, 2.2]
pose[:3, 3] *= 1. / max_t * radius
elif CaPE_TYPE == "4DoF":
pose = np.linalg.inv(pose)
pose_out.append(torch.from_numpy(get_pose(pose)))
elif DATA_TYPE == "Franka":
img_path = None
azimuth, polar = angles_out[T_out_index]
if CaPE_TYPE == "4DoF":
                        pose_out.append(torch.tensor([np.deg2rad(polar), np.deg2rad(azimuth), 0., 0.]))
elif CaPE_TYPE == "6DoF":
pose = look_at(origin, xyzs[T_out_index], up)
neg_theta = np.deg2rad(polar)
neg_i = np.deg2rad(azimuth)
xyz = np.array([np.sin(neg_theta) * np.cos(neg_i),
np.sin(-neg_theta + np.pi) * np.sin(neg_i),
np.cos(neg_theta)]) * radius
assert np.allclose(xyzs[T_out_index], xyz)
pose = np.linalg.inv(pose)
pose[2, :] *= -1
pose_out.append(torch.from_numpy(get_pose(pose)))
elif DATA_TYPE == "MVDream" or DATA_TYPE == "Text2Img" or DATA_TYPE == "dust3r":
img_path = None
azimuth, polar = angles_out[T_out_index]
if CaPE_TYPE == "4DoF":
pose_out.append(torch.tensor([np.deg2rad(polar), np.deg2rad(azimuth), 0., 0.]))
elif CaPE_TYPE == "6DoF":
pose = look_at(origin, xyzs[T_out_index], up)
pose = np.linalg.inv(pose)
pose[2, :] *= -1
pose_out.append(torch.from_numpy(get_pose(pose)))
else: # GSO
img_path = os.path.join(DATA_DIR, obj_name, T_out_DATA_TYPE, "model/%03d.png" % T_out_index)
pose_path = os.path.join(DATA_DIR, obj_name, T_out_DATA_TYPE, "model/%03d.npy" % T_out_index)
if T_out_DATA_TYPE == "render_mvs_25" or T_out_DATA_TYPE == "render_6dof_25": # blender coordinate
pose_out.append(get_pose(np.concatenate([np.load(pose_path)[:3, :], np.array([[0, 0, 0, 1]])], axis=0)))
else: # opencv coordinate
pose = get_pose(np.concatenate([np.load(pose_path)[:3, :], np.array([[0, 0, 0, 1]])], axis=0))
pose[1:3, :] *= -1 # pose out 36 is in opencv coordinate, pose in 25 is in blender coordinate
pose_out.append(torch.from_numpy(pose))
# load image
if img_path is not None: # sometimes don't have GT target view image
img = plt.imread(img_path)
img[img[:, :, -1] == 0.] = bg_color
img = Image.fromarray(np.uint8(img[:, :, :3] * 255.)).convert("RGB")
gt_image.append(image_transforms(img))
# [B, T, C, H, W]
input_image = torch.stack(input_image, dim=0).to(device).to(weight_dtype).unsqueeze(0)
if len(gt_image)>0:
gt_image = torch.stack(gt_image, dim=0).to(device).to(weight_dtype).unsqueeze(0)
# [B, T, 4]
pose_in = np.stack(pose_in)
pose_out = np.stack(pose_out)
if CaPE_TYPE == "6DoF":
pose_in_inv = np.linalg.inv(pose_in).transpose([0, 2, 1])
pose_out_inv = np.linalg.inv(pose_out).transpose([0, 2, 1])
pose_in_inv = torch.from_numpy(pose_in_inv).to(device).to(weight_dtype).unsqueeze(0)
pose_out_inv = torch.from_numpy(pose_out_inv).to(device).to(weight_dtype).unsqueeze(0)
pose_in = torch.from_numpy(pose_in).to(device).to(weight_dtype).unsqueeze(0)
pose_out = torch.from_numpy(pose_out).to(device).to(weight_dtype).unsqueeze(0)
input_image = einops.rearrange(input_image, "b t c h w -> (b t) c h w")
if len(gt_image)>0:
gt_image = einops.rearrange(gt_image, "b t c h w -> (b t) c h w")
assert T_in == input_image.shape[0]
assert T_in == pose_in.shape[1]
assert T_out == pose_out.shape[1]
# run inference
if CaPE_TYPE == "6DoF":
with torch.autocast("cuda"):
image = pipeline(input_imgs=input_image, prompt_imgs=input_image, poses=[[pose_out, pose_out_inv], [pose_in, pose_in_inv]],
height=h, width=w, T_in=T_in, T_out=T_out,
guidance_scale=args.guidance_scale, num_inference_steps=50, generator=generator,
output_type="numpy").images
elif CaPE_TYPE == "4DoF":
with torch.autocast("cuda"):
image = pipeline(input_imgs=input_image, prompt_imgs=input_image, poses=[pose_out, pose_in],
height=h, width=w, T_in=T_in, T_out=T_out,
guidance_scale=args.guidance_scale, num_inference_steps=50, generator=generator,
output_type="numpy").images
# save results
output_dir = os.path.join(OUTPUT_DIR, obj_name)
os.makedirs(output_dir, exist_ok=True)
# save input image for visualization
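        # inputs were normalized to [-1, 1]; map back to [0, 255] before writing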
imsave(os.path.join(output_dir, 'input.png'),
((np.concatenate(input_image.permute(0, 2, 3, 1).cpu().numpy(), 1) + 1) / 2 * 255).astype(np.uint8))
# save output image
if T_out >= 30:
# save to N imgs
for i in range(T_out):
imsave(os.path.join(output_dir, f'{i}.png'), (image[i] * 255).astype(np.uint8))
# make a gif
frames = [Image.fromarray((image[i] * 255).astype(np.uint8)) for i in range(T_out)]
frame_one = frames[0]
            frame_one.save(os.path.join(output_dir, "output.gif"), format="GIF", append_images=frames[1:],
                           save_all=True, duration=50, loop=1)  # frames[0] is already the base frame
else:
imsave(os.path.join(output_dir, '0.png'), (np.concatenate(image, 1) * 255).astype(np.uint8))
# save gt for visualization
if len(gt_image)>0:
imsave(os.path.join(output_dir, 'gt.png'),
((np.concatenate(gt_image.permute(0, 2, 3, 1).cpu().numpy(), 1) + 1) / 2 * 255).astype(np.uint8))
if __name__ == "__main__":
args = parse_args()
main(args)