Spaces:

Pie31415
/

rome

Build error

App Files Files Community

Pie31415 committed on Jan 6, 2023

Commit

9887bdf

•

1 Parent(s): 57885e4

implemented video inference

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +72 -6
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Rome
-emoji: 💩
 colorFrom: purple
 colorTo: green
 sdk: gradio

 ---
 title: Rome
+emoji: 😂
 colorFrom: purple
 colorTo: green
 sdk: gradio

app.py CHANGED Viewed

@@ -1,7 +1,16 @@
 import sys
 import torch
-import gradio as gr
 import pickle
 from easydict import EasyDict as edict
 from huggingface_hub import hf_hub_download
@@ -11,6 +20,7 @@ sys.path.append('./DECA')
 from rome.infer import Infer
 from rome.src.utils.processing import process_black_shape, tensor2image
 # loading models ---- create model repo
 default_modnet_path = hf_hub_download('Pie31415/rome', 'modnet_photographic_portrait_matting.ckpt')
@@ -128,8 +138,64 @@ def image_inference(
                                   out['render_masked'].cpu(), out['pred_target_shape_img'][0].cpu()], dim=2))
     return res[..., ::-1]
-def video_inference():
-    pass
 with gr.Blocks() as demo:
     gr.Markdown("# **<p align='center'>ROME: Realistic one-shot mesh-based head avatars</p>**")
@@ -151,8 +217,8 @@ with gr.Blocks() as demo:
         image_button = gr.Button("Predict")
     with gr.Tab("Video Inference"):
         with gr.Row():
-            source_video = gr.Video(label="source video", )
-            driver_image_for_vid = gr.Image(type="pil", label="driver image", show_label=True)
         video_output = gr.Image()
         video_button = gr.Button("Predict")
@@ -168,6 +234,6 @@ with gr.Blocks() as demo:
     )
     image_button.click(image_inference, inputs=[source_img, driver_img], outputs=image_output)
-    video_button.click(None, inputs=[source_video, driver_image_for_vid], outputs=video_output)
 demo.launch()

 import sys
 import torch
 import pickle
+import cv2
+import gradio as gr
+import numpy as np
+from PIL import Image
+from collections import defaultdict
+from glob import glob
+from matplotlib import pyplot as plt
+from matplotlib import animation
 from easydict import EasyDict as edict
 from huggingface_hub import hf_hub_download
 from rome.infer import Infer
 from rome.src.utils.processing import process_black_shape, tensor2image
+from rome.src.utils.visuals import mask_errosion
 # loading models ---- create model repo
 default_modnet_path = hf_hub_download('Pie31415/rome', 'modnet_photographic_portrait_matting.ckpt')
                                   out['render_masked'].cpu(), out['pred_target_shape_img'][0].cpu()], dim=2))
     return res[..., ::-1]
+def extract_frames(driver_vid):
+    """Decode every frame of a video file into a list of RGB PIL images.
+
+    Args:
+        driver_vid: Path to a video file (e.g. an mp4) readable by
+            ``cv2.VideoCapture``.
+
+    Returns:
+        A list of ``PIL.Image.Image`` frames in decode order. The list is
+        empty when no frame could be read from ``driver_vid``.
+    """
+    image_frames = []
+    vid = cv2.VideoCapture(driver_vid)
+    try:
+        while True:
+            success, img = vid.read()
+            if not success:
+                break
+            # OpenCV decodes frames as BGR; convert so PIL sees RGB order.
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            image_frames.append(Image.fromarray(img))
+    finally:
+        # Always free the decoder/file handle, even if a conversion raises.
+        vid.release()
+    return image_frames
+def video_inference(source_img, driver_vid):
+    """Re-render the source avatar for sampled frames of a driver video.
+
+    Args:
+        source_img: PIL image of the person to animate (the UI passes a
+            ``gr.Image(type="pil")`` value).
+        driver_vid: Path to the uploaded driver video.
+
+    Returns:
+        A ``matplotlib.animation.FuncAnimation`` that cycles through the
+        rendered frames.
+
+    Raises:
+        ValueError: If no frame could be decoded from ``driver_vid``.
+    """
+    # Drive the avatar with the uploaded video itself, not a local folder.
+    image_frames = extract_frames(driver_vid)
+    if not image_frames:
+        raise ValueError("could not read any frames from the driver video")
+
+    resulted_imgs = defaultdict(list)
+    mask_hard_threshold = 0.5
+    # Only the first 1/20th of the clip is considered (every 4th frame of it);
+    # clamp to 1 so short clips still yield at least one rendered frame.
+    N = max(len(image_frames) // 20, 1)
+    # Source-image information is taken from the first evaluate() call and
+    # handed back on later calls.
+    # NOTE(review): assumes infer.evaluate accepts None for
+    # source_information_for_reuse and returns a 'source_information' entry
+    # -- TODO confirm against rome.infer.Infer.
+    source_information = None
+    for i in range(0, N, 4):
+        new_out = infer.evaluate(source_img, image_frames[i],
+                                 source_information_for_reuse=source_information)
+        if source_information is None:
+            source_information = new_out.get('source_information')
+        mask_pred = (new_out['pred_target_unet_mask'].cpu() > mask_hard_threshold).float()
+        mask_pred = mask_errosion(mask_pred[0].float().numpy() * 255)
+        # Keep the prediction inside the mask; everything outside becomes 1
+        # (white, assuming images are in the [0, 1] range).
+        render = new_out['pred_target_img'].cpu() * (mask_pred) + (1 - mask_pred)
+        normals = process_black_shape(((new_out['pred_target_normal'][0].cpu() + 1) / 2 * mask_pred + (1 - mask_pred) ) )
+        normals[normals==0.5]=1.
+        resulted_imgs['res_normal'].append(tensor2image(normals))
+        resulted_imgs['res_mesh_images'].append(tensor2image(new_out['pred_target_shape_img'][0]))
+        resulted_imgs['res_renders'].append(tensor2image(render[0]))
+
+    video = np.array(resulted_imgs['res_renders'])
+    fig = plt.figure()
+    # The last axis is reversed before display (presumably BGR -> RGB; verify
+    # against tensor2image's channel order).
+    im = plt.imshow(video[0,:,:,::-1])
+    plt.axis('off')
+    plt.close() # this is required to not display the generated image
+
+    def init():
+        im.set_data(video[0,:,:,::-1])
+
+    def animate(i):
+        im.set_data(video[i,:,:,::-1])
+        return im
+
+    anim = animation.FuncAnimation(fig, animate, init_func=init,
+                                frames=video.shape[0], interval=30)
+    # NOTE(review): the click handler routes this value to a gr.Image output;
+    # confirm that component can render a FuncAnimation, otherwise export the
+    # animation to a file format the component accepts.
+    return anim
 with gr.Blocks() as demo:
     gr.Markdown("# **<p align='center'>ROME: Realistic one-shot mesh-based head avatars</p>**")
         image_button = gr.Button("Predict")
     with gr.Tab("Video Inference"):
         with gr.Row():
+            source_img2 = gr.Image(type="pil", label="source image", show_label=True)
+            driver_vid = gr.Video(label="driver video")
         video_output = gr.Image()
         video_button = gr.Button("Predict")
     )
     image_button.click(image_inference, inputs=[source_img, driver_img], outputs=image_output)
+    video_button.click(video_inference, inputs=[source_img2, driver_vid], outputs=video_output)
 demo.launch()

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ matplotlib
 pillow
 https://download.pytorch.org/whl/cu101/torch-1.6.0%2Bcu101-cp38-cp38-linux_x86_64.whl
 https://download.pytorch.org/whl/cu101/torchvision-0.7.0%2Bcu101-cp38-cp38-linux_x86_64.whl
-easydict

 pillow
 https://download.pytorch.org/whl/cu101/torch-1.6.0%2Bcu101-cp38-cp38-linux_x86_64.whl
 https://download.pytorch.org/whl/cu101/torchvision-0.7.0%2Bcu101-cp38-cp38-linux_x86_64.whl
+easydict
+opencv-python-headless