Spaces: Running on Zero

Commit 52d2875 • Parent(s): c00df70
NIRVANALAN committed

update

Browse files
- app.py +2 -1
- nsr/lsgm/flow_matching_trainer.py +21 -8
- nsr/train_util_diffusion.py +31 -17
- requirements.txt +2 -1
app.py CHANGED

@@ -341,7 +341,8 @@ def main(args):
     with gr.Row():
         with gr.Tab("Reconstruction"):
             with gr.Column():
-                output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                # output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True, loop=True)
                 output_model = gr.Model3D(
                     height=384,
                     clear_color=(1,1,1,1),
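For reference, the app.py change only adds loop=True so the autoplayed turntable video restarts when it ends. A minimal standalone sketch of the widget wiring, assuming a bare Blocks context (the real app nests these inside a larger layout with input and run controls):

# Minimal sketch of the changed widgets; the surrounding layout is an assumption.
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Tab("Reconstruction"):
            with gr.Column():
                # loop=True restarts the autoplayed preview when it finishes
                # (supported by gr.Video in gradio 4.x, hence the gradio==4.29 pin below).
                output_video = gr.Video(value=None, width=384,
                                        label="Rendered Video",
                                        autoplay=True, loop=True)
                output_model = gr.Model3D(height=384, clear_color=(1, 1, 1, 1))

demo.launch()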
nsr/lsgm/flow_matching_trainer.py CHANGED

@@ -678,25 +678,36 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         self.ddpm_model.train()
 
+
     @th.inference_mode()
     def eval_i23d_and_export(
         self,
         inp_img,
+        num_steps=250,
+        seed=42,
+        mesh_size=192,
+        mesh_thres=10,
+        unconditional_guidance_scale=4.0, # default value in neural ode
         # camera,
         prompt="",
         save_img=False,
         use_train_trajectory=False,
         num_samples=1,
         num_instances=1,
-        unconditional_guidance_scale=4.0, # default value in neural ode
         export_mesh=True,
         **kwargs,
     ):
 
-        output_model = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.obj'
-        output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
+        # output_model, output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.obj', './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
 
-        return output_model, output_video
+        # return output_model, output_video
+        logger.log(
+            num_steps,
+            unconditional_guidance_scale,
+            seed,
+            mesh_size,
+            mesh_thres,
+        )
 
         camera = th.load('assets/objv_eval_pose.pt', map_location=dist_util.dev())[:]
         inp_img = th.from_numpy(inp_img).permute(2,0,1).unsqueeze(0) / 127.5 - 1 # to [-1,1]

@@ -722,7 +733,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         ucg_keys = [self.cond_key] # i23d
 
-        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale}
+        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale, 'num_steps': num_steps, 'seed': seed}
 
         N = num_samples # hard coded, to update
         z_shape = (

@@ -769,7 +780,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
         th.cuda.empty_cache()
 
         # ! render sampled latent
-        name_prefix = f'
+        name_prefix = f'cfg_{unconditional_guidance_scale}_sample-{i}'
 
         if self.cond_key == 'caption':
             name_prefix = f'{name_prefix}_{prompt}'

@@ -784,7 +795,9 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
             save_img=save_img,
             render_reference=batch,
             export_mesh=export_mesh,
-            render_all=True)
+            render_all=True,
+            mesh_size=mesh_size,
+            mesh_thres=mesh_thres)
 
         all_vid_dump_path.append(vid_dump_path)
         all_mesh_dump_path.append(mesh_dump_path)

@@ -810,4 +823,4 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         # else:
         batch_c = {self.cond_key: inp_img.to(dist_util.dev()).to(self.dtype)}
-        return sample_and_save(batch_c)
+        return sample_and_save(batch_c)
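In short, eval_i23d_and_export now exposes the sampler knobs (num_steps, seed, cfg scale) and the mesh-extraction knobs (mesh_size, mesh_thres) instead of hard-coding them, so the Gradio controls can drive them. A hedged call sketch follows; `engine` and `rgb` are illustrative stand-ins (not names from the commit), and the kwargs mapping is taken from the hunks above:

import numpy as np

# Hypothetical driver code (assumption): `engine` is an initialized
# FlowMatchingEngine; `rgb` is the HxWx3 uint8 input image, which
# eval_i23d_and_export rescales to [-1, 1] itself.
rgb = np.zeros((256, 256, 3), dtype=np.uint8)  # placeholder input image
result = engine.eval_i23d_and_export(
    rgb,
    num_steps=250,                     # forwarded via sampling_kwargs['num_steps']
    seed=42,                           # forwarded via sampling_kwargs['seed']
    unconditional_guidance_scale=4.0,  # forwarded via sampling_kwargs['cfg_scale']
    mesh_size=192,                     # marching-cubes grid resolution
    mesh_thres=10,                     # density threshold for surface extraction
    export_mesh=True,
)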
nsr/train_util_diffusion.py CHANGED

@@ -18,6 +18,8 @@ from torch.utils.tensorboard.writer import SummaryWriter
 from tqdm import tqdm
 import matplotlib.pyplot as plt
 
+from safetensors.torch import load_file
+
 from guided_diffusion.gaussian_diffusion import _extract_into_tensor
 from guided_diffusion import dist_util, logger
 from guided_diffusion.fp16_util import MixedPrecisionTrainer

@@ -31,14 +33,10 @@ from guided_diffusion.train_util import (TrainLoop, calc_average_loss,
                                          log_rec3d_loss_dict,
                                          parse_resume_step_from_filename)
 
-
-import mcubes
-import trimesh
 import dnnlib
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
 
 from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
+from huggingface_hub import hf_hub_download
 
 # AMP
 # from accelerate import Accelerator

@@ -48,6 +46,16 @@ from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
 # use_amp = False
 # use_amp = True
 
+# Function to generate a rotation matrix for an arbitrary theta along the x-axis
+def rotation_matrix_x(theta_degrees):
+    theta = np.radians(theta_degrees)  # Convert degrees to radians
+    cos_theta = np.cos(theta)
+    sin_theta = np.sin(theta)
+
+    rotation_matrix = np.array([[1, 0, 0],
+                                [0, cos_theta, -sin_theta],
+                                [0, sin_theta, cos_theta]])
+    return rotation_matrix
 
 class TrainLoopDiffusionWithRec(TrainLoop):
     """an interface with rec_model required apis

@@ -173,7 +181,9 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             save_img=False,
             render_reference=None,
             export_mesh=False,
-            render_all=False):
+            render_all=False,
+            mesh_size=192,
+            mesh_thres=10):
 
         planes *= self.triplane_scaling_divider  # if setting clip_denoised=True, the sampled planes will lie in [-1,1]. Thus, values beyond [+- std] will be abandoned in this version. Move to IN for later experiments.
 

@@ -196,9 +206,8 @@ class TrainLoopDiffusionWithRec(TrainLoop):
                 behaviour='decode_after_vae_no_render'))
 
         if export_mesh:
-
-
-            mesh_thres = 10 # TODO, requires tuning
+            import mcubes
+            import trimesh
             dump_path = f'{logger.get_dir()}/mesh/'
 
             os.makedirs(dump_path, exist_ok=True)

@@ -220,6 +229,10 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             vtx_colors = rec_model.decoder.forward_points(ddpm_latent['latent_after_vit'], vtx_tensor)['rgb'].float().squeeze(0).cpu().numpy() # (0, 1)
             vtx_colors = (vtx_colors.clip(0,1) * 255).astype(np.uint8)
 
+            # rotate mesh along x dim
+            vtx = np.transpose(rotation_matrix_x(-90) @ np.transpose(vtx))
+
+
             mesh = trimesh.Trimesh(vertices=vtx, faces=faces, vertex_colors=vtx_colors)
             # st()
             # mesh = trimesh.Trimesh(

@@ -227,16 +240,17 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             #     faces=faces,
             # )
 
-            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.
-            mesh.export(mesh_dump_path, '
+            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.obj')
+            mesh.export(mesh_dump_path, 'obj')
 
-
+            logger.log(f"Mesh dumped to {mesh_dump_path}")
             del grid_out, mesh
             th.cuda.empty_cache()
             # return
 
+        vid_dump_path = f'{logger.get_dir()}/triplane_{name_prefix}.mp4'
         video_out = imageio.get_writer(
-            f'{logger.get_dir()}/triplane_{name_prefix}.mp4',
+            vid_dump_path,
            mode='I',
            fps=15,
            codec='libx264')

@@ -331,8 +345,7 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             ],
             dim=-1) # B, 3, H, W
 
-        if
-        # if save_img:
+        if save_img:
             for batch_idx in range(gen_img.shape[0]):
                 sampled_img = Image.fromarray(
                     (gen_img[batch_idx].permute(1, 2, 0).cpu().numpy() *

@@ -357,11 +370,12 @@ class TrainLoopDiffusionWithRec(TrainLoop):
         # if not save_img:
         video_out.close()
         del video_out
-        print('logged video to: ',
-              f'{logger.get_dir()}/triplane_{name_prefix}.mp4')
+        print('logged video to: ', f'{vid_dump_path}')
 
         del vis, pred_vis, micro, pred,
 
+        return vid_dump_path, mesh_dump_path
+
     def _init_optim_groups(self, rec_model, freeze_decoder=False):
         """for initializing the reconstruction model; fixing decoder part.
         """
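The rotation_matrix_x helper added above is the standard right-handed rotation about the x axis; the commit applies it with theta = -90 to re-orient the marching-cubes vertices, presumably converting between y-up and z-up conventions so the exported .obj sits upright in the gr.Model3D viewer. A self-contained check of what that -90 degree rotation does to one sample point (the point itself is arbitrary):

import numpy as np

def rotation_matrix_x(theta_degrees):
    # Same right-handed x-axis rotation as added in the commit.
    theta = np.radians(theta_degrees)
    c, s = np.cos(theta), np.sin(theta)
    return np.array([[1, 0, 0],
                     [0, c, -s],
                     [0, s, c]])

# (N, 3) vertex array; the commit rotates as R @ vtx.T, then transposes back.
vtx = np.array([[0.0, 1.0, 0.0]])           # sample point on +y
rotated = (rotation_matrix_x(-90) @ vtx.T).T
print(np.round(rotated, 6))                 # -> [[0. 0. -1.]]: +y maps to -z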
requirements.txt CHANGED

@@ -31,4 +31,5 @@ safetensors
 matplotlib
 git+https://github.com/nupurkmr9/vision-aided-gan
 PyMCubes
-trimesh
+trimesh
+gradio==4.29
|