Spaces:

geopavlakos
/

HaMeR

Build error

App Files Files Community

HaMeR / app.py

geopavlakos

Update app.py

aabc78e verified 2 days ago

raw

history blame contribute delete

9.67 kB

	import argparse
	import os
	from pathlib import Path
	import tempfile
	import tarfile
	import sys
	import cv2
	import gradio as gr
	import numpy as np
	import torch
	from PIL import Image

	# Log the absolute path of this script at startup.
	print(os.path.abspath(__file__))
	# Rendering-related environment variables. They are set here, before the
	# hamer renderer import further down, presumably so that an offscreen EGL
	# context is used on a display-less host -- TODO confirm against pyrender.
	os.environ["PYOPENGL_PLATFORM"] = "egl"
	os.environ["MESA_GL_VERSION_OVERRIDE"] = "4.1"
	# Install the pyrender copy bundled with the app on every start.
	# NOTE(review): the exit status of the pip command is ignored, so a failed
	# install only shows up later as an import error.
	os.system('pip install /home/user/app/pyrender')
	# Also make the bundled pyrender sources importable directly.
	sys.path.append('/home/user/app/pyrender')

	from hamer.configs import get_config
	from hamer.datasets.vitdet_dataset import (DEFAULT_MEAN, DEFAULT_STD,
	ViTDetDataset)
	from hamer.models import HAMER
	from hamer.utils import recursive_to
	from hamer.utils.renderer import Renderer, cam_crop_to_full

	def extract_tar() -> None:
	    """Unpack ``mmdet_configs/configs.tar`` into ``mmdet_configs`` once.

	    Does nothing when ``mmdet_configs/configs`` already exists. Paths are
	    relative to the current working directory.

	    Raises:
	        FileNotFoundError: if the archive is missing and the configs have
	            not been extracted yet.
	        tarfile.TarError: if the archive cannot be read or, when the
	            ``data`` filter is active, contains a rejected member.
	    """
	    if Path('mmdet_configs/configs').exists():
	        return
	    with tarfile.open('mmdet_configs/configs.tar') as archive:
	        # Use the safe extraction filter where the running Python provides
	        # it; it refuses members that would land outside the destination
	        # and avoids the "no filter" DeprecationWarning on Python 3.12+.
	        if hasattr(tarfile, 'data_filter'):
	            archive.extractall('mmdet_configs', filter='data')
	        else:
	            archive.extractall('mmdet_configs')

	# Run at import time so that mmdet_configs/configs exists before the rest of
	# this module executes; the call returns immediately when it is already there.
	extract_tar()

	#from vitpose_model import DetModel

	#try:
	# import detectron2
	#except:
	# import os
	# os.system('pip install --upgrade pip')
	# os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

	#try:
	# from vitpose_model import ViTPoseModel
	#except:
	# os.system('pip install -v -e /home/user/app/vendor/ViTPose')
	# from vitpose_model import ViTPoseModel
	from vitpose_model import ViTPoseModel

	# Directory that receives the exported .obj meshes.
	OUT_FOLDER = 'demo_out'
	os.makedirs(OUT_FOLDER, exist_ok=True)

	# Setup HaMeR model
	# RGB colour (0-1 floats) used for the rendered and exported hand meshes.
	LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)
	DEFAULT_CHECKPOINT = '_DATA/hamer_ckpts/checkpoints/hamer.ckpt'
	# Prefer the GPU when one is visible, otherwise fall back to the CPU.
	device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
	# The config file lives two directory levels above the checkpoint file.
	model_cfg = str(Path(DEFAULT_CHECKPOINT).parent.parent / 'model_config.yaml')
	model_cfg = get_config(model_cfg)
	# Override some config values, to crop bbox correctly
	if (model_cfg.MODEL.BACKBONE.TYPE == 'vit') and ('BBOX_SHAPE' not in model_cfg.MODEL):
	    # The config must be unfrozen before it can be modified.
	    model_cfg.defrost()
	    assert model_cfg.MODEL.IMAGE_SIZE == 256, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
	    model_cfg.MODEL.BBOX_SHAPE = [192, 256]
	    model_cfg.freeze()
	# strict=False: mismatching checkpoint keys are tolerated when loading.
	model = HAMER.load_from_checkpoint(DEFAULT_CHECKPOINT, strict=False, cfg=model_cfg).to(device)
	model.eval()


	# Load detector
	#from detectron2.config import LazyConfig

	#from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy

	#detectron2_cfg = LazyConfig.load(f"vendor/detectron2/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep.py")
	#detectron2_cfg.train.init_checkpoint = "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl"
	#for i in range(3):
	# detectron2_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
	#detector = DefaultPredictor_Lazy(detectron2_cfg)

	# Setup the renderer, sharing the face list of the model's MANO layer.
	renderer = Renderer(model_cfg, faces=model.mano.faces)

	# Object detector: YOLOv5 ('yolov5x6') loaded through torch.hub from the
	# 'ultralytics/yolov5' repository. It replaces the earlier mmdet-based
	# DetModel, kept below for reference.
	# NOTE(review): no branch or tag is pinned in the repo string, so the code
	# that gets loaded can change between restarts.
	#det_model = DetModel()
	det_model = torch.hub.load('ultralytics/yolov5', 'yolov5x6')

	# Keypoint detector; infer() reads the hand keypoints from its predictions.
	cpm = ViTPoseModel(device)

	import numpy as np

	def _hand_bbox_from_keypoints(keyp, min_conf=0.5, min_valid=3):
	    """Return a tight box around the confident keypoints of one hand.

	    Columns 0 and 1 of ``keyp`` are used as coordinates and column 2 as a
	    confidence score. The result is ``[min0, min1, max0, max1]`` over the
	    keypoints whose confidence exceeds ``min_conf``, or ``None`` unless
	    more than ``min_valid`` keypoints qualify.
	    """
	    valid = keyp[:, 2] > min_conf
	    if valid.sum() <= min_valid:
	        return None
	    xs = keyp[valid, 0]
	    ys = keyp[valid, 1]
	    return [xs.min(), ys.min(), xs.max(), ys.max()]


	def infer(in_pil_img, in_threshold=0.4, out_pil_img=None):
	    """Reconstruct the hands visible in an image and render them over it.

	    Pipeline: detect boxes of class 0 with ``det_model``, estimate keypoints
	    inside them with ``cpm``, derive one box per confidently detected hand,
	    run the HaMeR ``model`` on those boxes, export one ``.obj`` per hand into
	    ``OUT_FOLDER`` and composite a render of all hands onto the input.

	    Args:
	        in_pil_img: input image; converted with ``np.array`` and treated as
	            an RGB array (assumes 3 channels -- TODO confirm for non-RGB
	            uploads). ``None`` is accepted and yields an empty result.
	        in_threshold: minimum detector score for a box to be kept.
	        out_pil_img: unused on input; kept for interface compatibility.

	    Returns:
	        ``(image, mesh_paths)`` where ``image`` is a ``PIL.Image`` with the
	        rendered hands overlaid and ``mesh_paths`` lists the exported
	        ``.obj`` files, or ``(None, [])`` when nothing could be reconstructed.
	    """
	    import uuid

	    # Gradio passes None when the button is pressed without an image.
	    if in_pil_img is None:
	        return None, []

	    rgb_image = np.array(in_pil_img)
	    # Each detection row is indexed as [:4] box, [4] score, [5] class index.
	    det_out = det_model(rgb_image).xyxy[0]
	    # Convert RGB to BGR
	    open_cv_image = rgb_image[:, :, ::-1].copy()

	    scores = det_out[:, 4]
	    det_classes = det_out[:, 5]
	    # Class 0 is presumably "person" (COCO label order) -- verify against
	    # the detector's class list.
	    valid_idx = (det_classes == 0) & (scores > in_threshold)
	    pred_bboxes = det_out[valid_idx, :4].cpu().numpy()
	    pred_scores = scores[valid_idx].cpu().numpy()
	    if len(pred_bboxes) == 0:
	        # Nothing to run the keypoint model on.
	        return None, []

	    # Detect human keypoints for each person
	    vitposes_out = cpm.predict_pose(
	        open_cv_image,
	        [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
	    )

	    # Use hands based on hand keypoint detections
	    bboxes = []
	    is_right = []
	    for vitposes in vitposes_out:
	        keypoints = vitposes['keypoints']
	        # The last 42 keypoints are the hands: 21 left, then 21 right.
	        hands = ((0, keypoints[-42:-21]), (1, keypoints[-21:]))
	        for right_flag, hand_keyp in hands:
	            # Rejecting not confident detections (this could be improved)
	            bbox = _hand_bbox_from_keypoints(hand_keyp)
	            if bbox is not None:
	                bboxes.append(bbox)
	                is_right.append(right_flag)

	    if not bboxes:
	        return None, []

	    boxes = np.stack(bboxes)
	    right = np.stack(is_right)

	    # Run HaMeR on all detected hands
	    dataset = ViTDetDataset(model_cfg, open_cv_image, boxes, right)
	    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)

	    all_verts = []
	    all_cam_t = []
	    all_right = []
	    all_mesh_paths = []
	    render_res = None
	    scaled_focal_length = None

	    # Random prefix so that concurrent requests do not overwrite each
	    # other's mesh files.
	    temp_name = uuid.uuid4().hex

	    for batch in dataloader:
	        batch = recursive_to(batch, device)
	        with torch.no_grad():
	            out = model(batch)

	        # +1 for right hands, -1 for left hands.
	        multiplier = 2 * batch['right'] - 1
	        pred_cam = out['pred_cam']
	        pred_cam[:, 1] = multiplier * pred_cam[:, 1]
	        box_center = batch["box_center"].float()
	        box_size = batch["box_size"].float()
	        img_size = batch["img_size"].float()
	        scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
	        pred_cam_t = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()
	        # Every sample comes from the same input image; keep the last
	        # entry as the render resolution for the final composite.
	        render_res = img_size[-1]

	        for n in range(batch['img'].shape[0]):
	            person_id = int(batch['personid'][n])
	            verts = out['pred_vertices'][n].detach().cpu().numpy()
	            hand_is_right = batch['right'][n].cpu().numpy()
	            # Flip the first vertex coordinate for left hands (factor -1).
	            verts[:, 0] = (2 * hand_is_right - 1) * verts[:, 0]
	            cam_t = pred_cam_t[n]

	            all_verts.append(verts)
	            all_cam_t.append(cam_t)
	            all_right.append(hand_is_right)

	            # Save the mesh to disk so it can be offered for download.
	            tmesh = renderer.vertices_to_trimesh(verts, cam_t.copy(), LIGHT_BLUE, is_right=hand_is_right)
	            mesh_path = os.path.join(OUT_FOLDER, f'{temp_name}_{person_id}.obj')
	            tmesh.export(mesh_path)
	            all_mesh_paths.append(mesh_path)

	    if not all_verts:
	        return None, []

	    # Render front view
	    misc_args = dict(
	        mesh_base_color=LIGHT_BLUE,
	        scene_bg_color=(1, 1, 1),
	        focal_length=scaled_focal_length,
	    )
	    cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=render_res, is_right=all_right, **misc_args)

	    # Overlay image: alpha-blend the render onto the RGB input (0-1 floats).
	    input_img = open_cv_image.astype(np.float32)[:, :, ::-1] / 255.0
	    alpha = cam_view[:, :, 3:]
	    input_img_overlay = input_img * (1 - alpha) + cam_view[:, :, :3] * alpha

	    # convert to PIL image
	    out_pil_img = Image.fromarray((input_img_overlay * 255).astype(np.uint8))

	    return out_pil_img, all_mesh_paths


	# Gradio UI: input image on the left, rendered overlay and downloadable
	# meshes on the right, then the threshold slider and the trigger button.
	with gr.Blocks(title="HaMeR", css=".gradio-container") as demo:

	    gr.HTML("""<div style="font-weight:bold; text-align:center; font-size: 30px;">HaMeR</div>""")
	    gr.HTML("""<div style="text-align:left; font-size: 20px;">Demo for HaMeR. You can drop an image at the top-left panel
	(or select one of the examples) and you will get the 3D reconstructions of the detected hands on the right.
	You can also download the .obj files for each hand reconstruction.</div>""")

	    with gr.Row():
	        with gr.Column():
	            input_image = gr.Image(label="Input image", type="pil")
	        with gr.Column():
	            output_image = gr.Image(label="Reconstructions", type="pil")
	            output_meshes = gr.File(label="3D meshes")

	    gr.HTML("""<br/>""")

	    with gr.Row():
	        # The slider value is always passed to infer(), so 0.6 (not the
	        # function's own default of 0.4) is the effective default here.
	        threshold = gr.Slider(0, 1.0, value=0.6, label='Detection Threshold')
	        send_btn = gr.Button("Infer")
	        send_btn.click(fn=infer, inputs=[input_image, threshold], outputs=[output_image, output_meshes])

	    # Clickable sample images that fill the input panel.
	    example_images = gr.Examples([
	        ['/home/user/app/assets/test1.jpg'],
	        ['/home/user/app/assets/test2.jpg'],
	        ['/home/user/app/assets/test3.jpg'],
	        ['/home/user/app/assets/test5.jpg'],
	    ],
	        inputs=input_image)


	#demo.queue()
	demo.launch(debug=True)



	### EOF ###