NIRVANALAN committed on
Commit 52d2875
1 Parent(s): c00df70
app.py CHANGED
@@ -341,7 +341,8 @@ def main(args):
         with gr.Row():
             with gr.Tab("Reconstruction"):
                 with gr.Column():
-                    output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                    # output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                    output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True, loop=True)
                     output_model = gr.Model3D(
                         height=384,
                         clear_color=(1,1,1,1),
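
A minimal, standalone sketch of the two changed output components, assuming the gradio==4.29 pin added in requirements.txt at the end of this commit (a 4.x release whose gr.Video accepts loop=). The gr.Blocks wrapper and the Model3D label are illustrative placeholders, not taken from app.py.

    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Tab("Reconstruction"):
                with gr.Column():
                    # loop=True makes the rendered turntable video replay automatically
                    output_video = gr.Video(value=None, width=384, label="Rendered Video",
                                            autoplay=True, loop=True)
                    output_model = gr.Model3D(height=384, clear_color=(1, 1, 1, 1),
                                              label="Exported Mesh")

    if __name__ == "__main__":
        demo.launch()
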
nsr/lsgm/flow_matching_trainer.py CHANGED
@@ -678,25 +678,36 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         self.ddpm_model.train()
 
+
     @th.inference_mode()
     def eval_i23d_and_export(
             self,
             inp_img,
+            num_steps=250,
+            seed=42,
+            mesh_size=192,
+            mesh_thres=10,
+            unconditional_guidance_scale=4.0, # default value in neural ode
             # camera,
             prompt="",
             save_img=False,
             use_train_trajectory=False,
             num_samples=1,
             num_instances=1,
-            unconditional_guidance_scale=4.0, # default value in neural ode
             export_mesh=True,
             **kwargs,
     ):
 
-        output_model= './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.ply'
-        output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
+        # output_model, output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.obj', './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
 
-        return output_video, output_model
+        # return output_model, output_video
+        logger.log(
+            num_steps,
+            unconditional_guidance_scale,
+            seed,
+            mesh_size,
+            mesh_thres,
+        )
 
         camera = th.load('assets/objv_eval_pose.pt', map_location=dist_util.dev())[:]
         inp_img = th.from_numpy(inp_img).permute(2,0,1).unsqueeze(0) / 127.5 - 1 # to [-1,1]
@@ -722,7 +733,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         ucg_keys = [self.cond_key] # i23d
 
-        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale}
+        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale, 'num_steps': num_steps, 'seed': seed}
 
         N = num_samples # hard coded, to update
         z_shape = (
@@ -769,7 +780,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
             th.cuda.empty_cache()
 
             # ! render sampled latent
-            name_prefix = f'cfg={unconditional_guidance_scale}_sample-{i}'
+            name_prefix = f'cfg_{unconditional_guidance_scale}_sample-{i}'
 
             if self.cond_key == 'caption':
                 name_prefix = f'{name_prefix}_{prompt}'
@@ -784,7 +795,9 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
                 save_img=save_img,
                 render_reference=batch,
                 export_mesh=export_mesh,
-                render_all=True)
+                render_all=True,
+                mesh_size=mesh_size,
+                mesh_thres=mesh_thres)
 
             all_vid_dump_path.append(vid_dump_path)
             all_mesh_dump_path.append(mesh_dump_path)
@@ -810,4 +823,4 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         # else:
         batch_c = {self.cond_key: inp_img.to(dist_util.dev()).to(self.dtype)}
-        return sample_and_save(batch_c)
+        return sample_and_save(batch_c)
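
The reworked eval_i23d_and_export now exposes the sampling controls (steps, seed, CFG scale) and the marching-cubes settings as keyword arguments. A rough sketch of how a UI callback might drive it is shown below; `trainer` (an already-constructed FlowMatchingEngine) and the image path are assumptions, while the keyword names and defaults come from the hunks above.

    import numpy as np
    from PIL import Image

    def run_i23d(trainer, image_path):
        # the method expects an HWC uint8 image; it rescales to [-1, 1] itself
        inp_img = np.asarray(Image.open(image_path).convert('RGB'))
        return trainer.eval_i23d_and_export(
            inp_img,
            num_steps=250,                     # flow-matching sampling steps
            seed=42,                           # forwarded via sampling_kwargs
            mesh_size=192,                     # marching-cubes grid resolution
            mesh_thres=10,                     # iso-surface threshold
            unconditional_guidance_scale=4.0,  # classifier-free guidance scale
            export_mesh=True,
        )  # forwards sample_and_save's return (paths of the dumped video/mesh)
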
nsr/train_util_diffusion.py CHANGED
@@ -18,6 +18,8 @@ from torch.utils.tensorboard.writer import SummaryWriter
 from tqdm import tqdm
 import matplotlib.pyplot as plt
 
+from safetensors.torch import load_file
+
 from guided_diffusion.gaussian_diffusion import _extract_into_tensor
 from guided_diffusion import dist_util, logger
 from guided_diffusion.fp16_util import MixedPrecisionTrainer
@@ -31,14 +33,10 @@ from guided_diffusion.train_util import (TrainLoop, calc_average_loss,
                                          log_rec3d_loss_dict,
                                          parse_resume_step_from_filename)
 
-
-import mcubes
-import trimesh
 import dnnlib
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
 
 from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
+from huggingface_hub import hf_hub_download
 
 # AMP
 # from accelerate import Accelerator
@@ -48,6 +46,16 @@ from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
 # use_amp = False
 # use_amp = True
 
+# Function to generate a rotation matrix for an arbitrary theta along the x-axis
+def rotation_matrix_x(theta_degrees):
+    theta = np.radians(theta_degrees)  # Convert degrees to radians
+    cos_theta = np.cos(theta)
+    sin_theta = np.sin(theta)
+
+    rotation_matrix = np.array([[1, 0, 0],
+                                [0, cos_theta, -sin_theta],
+                                [0, sin_theta, cos_theta]])
+    return rotation_matrix
 
 class TrainLoopDiffusionWithRec(TrainLoop):
     """an interface with rec_model required apis
@@ -173,7 +181,9 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             save_img=False,
             render_reference=None,
             export_mesh=False,
-            render_all=False):
+            render_all=False,
+            mesh_size=192,
+            mesh_thres=10):
 
         planes *= self.triplane_scaling_divider # if setting clip_denoised=True, the sampled planes will lie in [-1,1]. Thus, values beyond [+- std] will be abandoned in this version. Move to IN for later experiments.
 
@@ -196,9 +206,8 @@ class TrainLoopDiffusionWithRec(TrainLoop):
                     behaviour='decode_after_vae_no_render'))
 
         if export_mesh:
-            # if True:
-            mesh_size = 192 # avoid OOM on V100
-            mesh_thres = 10 # TODO, requires tuning
+            import mcubes
+            import trimesh
             dump_path = f'{logger.get_dir()}/mesh/'
 
             os.makedirs(dump_path, exist_ok=True)
@@ -220,6 +229,10 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             vtx_colors = rec_model.decoder.forward_points(ddpm_latent['latent_after_vit'], vtx_tensor)['rgb'].float().squeeze(0).cpu().numpy() # (0, 1)
             vtx_colors = (vtx_colors.clip(0,1) * 255).astype(np.uint8)
 
+            # rotate mesh along x dim
+            vtx = np.transpose(rotation_matrix_x(-90) @ np.transpose(vtx))
+
+
             mesh = trimesh.Trimesh(vertices=vtx, faces=faces, vertex_colors=vtx_colors)
             # st()
             # mesh = trimesh.Trimesh(
@@ -227,16 +240,17 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             #     faces=faces,
             # )
 
-            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.ply')
-            mesh.export(mesh_dump_path, 'ply')
+            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.obj')
+            mesh.export(mesh_dump_path, 'obj')
 
-            print(f"Mesh dumped to {dump_path}")
+            logger.log(f"Mesh dumped to {mesh_dump_path}")
             del grid_out, mesh
             th.cuda.empty_cache()
             # return
 
+        vid_dump_path = f'{logger.get_dir()}/triplane_{name_prefix}.mp4'
         video_out = imageio.get_writer(
-            f'{logger.get_dir()}/triplane_{name_prefix}.mp4',
+            vid_dump_path,
             mode='I',
             fps=15,
             codec='libx264')
@@ -331,8 +345,7 @@ class TrainLoopDiffusionWithRec(TrainLoop):
                 ],
                 dim=-1) # B, 3, H, W
 
-            if False:
-            # if save_img:
+            if save_img:
                 for batch_idx in range(gen_img.shape[0]):
                     sampled_img = Image.fromarray(
                         (gen_img[batch_idx].permute(1, 2, 0).cpu().numpy() *
@@ -357,11 +370,12 @@ class TrainLoopDiffusionWithRec(TrainLoop):
         # if not save_img:
         video_out.close()
         del video_out
-        print('logged video to: ',
-              f'{logger.get_dir()}/triplane_{name_prefix}.mp4')
+        print('logged video to: ', f'{vid_dump_path}')
 
         del vis, pred_vis, micro, pred,
 
+        return vid_dump_path, mesh_dump_path
+
     def _init_optim_groups(self, rec_model, freeze_decoder=False):
         """for initializing the reconstruction model; fixing decoder part.
         """
requirements.txt CHANGED
@@ -31,4 +31,5 @@ safetensors
 matplotlib
 git+https://github.com/nupurkmr9/vision-aided-gan
 PyMCubes
-trimesh
+trimesh
+gradio==4.29