Spaces: Build error

zejunyang committed • Commit 3e99418 • Parent(s): fa7d98a

update

Files changed:
- app.py (+39 -35)
- src/utils/crop_face_single.py (+31 -21)
- src/utils/frame_interpolation.py (+17 -38)

app.py
CHANGED
@@ -98,10 +98,11 @@ vis = FaceMeshVisualizer()
 
 frame_inter_model = init_frame_interpolation_model()
 
-@spaces.GPU(duration=
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=
+@spaces.GPU(duration=300)
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 
@@ -161,8 +162,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     # [transforms.Resize((height, width)), transforms.ToTensor()]
     # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
-    args_L = min(args_L,
-    for pose_image_np in pose_images[: args_L :
+    args_L = min(args_L, 150)
+    for pose_image_np in pose_images[: args_L : fi_step]:
         # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         # pose_tensor_list.append(pose_transform(pose_image_pil))
         pose_image_np = cv2.resize(pose_image_np, (width, height))
@@ -183,19 +184,21 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
         cfg,
         generator=generator,
     ).videos
+
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
 
-
-
-
-
-
-
-
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=fps,
+    )
 
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
 
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
 
     stream = ffmpeg.input(save_path)
     audio = ffmpeg.input(input_audio)
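The audio2video hunks above replace the old save-every-frame-to-disk round trip: the pipeline now denoises only every fi_step-th pose frame, batch_images_interpolation_tool fills the gaps in memory on the video tensor, and save_videos_grid writes the silent clip once. The diff shows the two ffmpeg.input() calls but not the muxing call that produces the final file; below is a minimal sketch of that step with ffmpeg-python, where the helper name and codec flags are assumptions rather than code from the repo.

```python
import ffmpeg  # ffmpeg-python


def mux_audio(silent_video_path: str, audio_path: str) -> str:
    """Attach the driving audio to the silent render (hypothetical helper)."""
    out_path = silent_video_path.replace("_noaudio.mp4", ".mp4")
    video_stream = ffmpeg.input(silent_video_path)
    audio_stream = ffmpeg.input(audio_path)
    # Copy the video stream as-is and encode the audio to AAC; these flags are
    # a guess at what app.py does after the two ffmpeg.input() calls shown above.
    ffmpeg.output(
        video_stream.video,
        audio_stream.audio,
        out_path,
        vcodec="copy",
        acodec="aac",
    ).run(overwrite_output=True)
    return out_path
```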
@@ -204,9 +207,10 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
 
     return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil
 
-@spaces.GPU(duration=
-def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
+@spaces.GPU(duration=300)
+def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 
@@ -248,11 +252,9 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     pose_trans_list = []
     verts_list = []
     bs_list = []
-    src_tensor_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L,
-    for src_image_pil in source_images[: args_L : step*
-        src_tensor_list.append(pose_transform(src_image_pil))
+    args_L = min(args_L, 150*step)
+    for src_image_pil in source_images[: args_L : step*fi_step]:
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         frame_height, frame_width, _ = src_img_np.shape
         src_img_result = lmk_extractor(src_img_np)
@@ -308,19 +310,21 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
         cfg,
         generator=generator,
     ).videos
+
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
 
-
-
-
-
-
-
-
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=src_fps,
+    )
 
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
 
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
 
     audio_output = f'{save_dir}/audio_from_video.aac'
     # extract audio
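video2video gets the same treatment, with the length cap expressed in source frames (150*step) and the sampling stride widened to step*fi_step. A quick check of the resulting frame budget; step is defined earlier in video2video and is not part of this diff, so the value 1 below is only illustrative.

```python
# Frame budget under the new defaults (length=60, fi_step=3).
fi_step = 3   # denoise only every fi_step-th frame (added in this commit)
length = 60   # new default for -L
step = 1      # stride defined earlier in video2video; illustrative value only

args_L = min(length * step, 150 * step)               # cap added in this commit
denoised = len(range(0, args_L, step * fi_step))      # frames the pipeline denoises
restored = (denoised - 1) * (fi_step - 1) + denoised  # after FILM in-betweening

print(args_L, denoised, restored)  # 60 20 58
```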
@@ -353,7 +357,7 @@ description = r"""
 """
 
 tips = r"""
-
+Here is an accelerated version of AniPortrait. Due to limitations in computing power, the wait time will be quite long. Please utilize the source code to experience the full performance.
 """
 
 with gr.Blocks() as demo:
@@ -372,10 +376,10 @@ with gr.Blocks() as demo:
 
         with gr.Row():
             a2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-            a2v_step_slider = gr.Slider(minimum=5, maximum=
+            a2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
         with gr.Row():
-            a2v_length = gr.Slider(minimum=0, maximum=
+            a2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
             a2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
         a2v_botton = gr.Button("Generate", variant="primary")
@@ -400,10 +404,10 @@ with gr.Blocks() as demo:
 
         with gr.Row():
             v2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-            v2v_step_slider = gr.Slider(minimum=5, maximum=
+            v2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
         with gr.Row():
-            v2v_length = gr.Slider(minimum=0, maximum=
+            v2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
             v2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
         v2v_botton = gr.Button("Generate", variant="primary")
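The UI hunks tighten the slider ranges (steps 5-30 with default 20, length capped at 150 with default 60), in line with the 300-second GPU window requested by the decorator earlier in the diff. The click wiring itself is outside this diff; the self-contained sketch below shows how such widgets typically feed a handler, using placeholder component and function names rather than app.py's.

```python
# Illustrative Gradio wiring; names here are placeholders, not app.py's components.
import gradio as gr


def fake_generate(size, steps, length, seed):
    return f"size={size}, steps={steps}, length={length}, seed={seed}"


with gr.Blocks() as sketch:
    with gr.Row():
        size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
        step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
    with gr.Row():
        length_slider = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L)")
        seed_number = gr.Number(value=42, label="Seed (--seed)")
    run_button = gr.Button("Generate", variant="primary")
    result = gr.Textbox(label="Result")
    # Slider/number values are passed positionally to the handler on click.
    run_button.click(
        fn=fake_generate,
        inputs=[size_slider, step_slider, length_slider, seed_number],
        outputs=result,
    )

if __name__ == "__main__":
    sketch.launch()
```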
src/utils/crop_face_single.py
CHANGED
@@ -20,26 +20,36 @@ def crop_face(img, lmk_extractor, expand=1.5):
 
     width = x_max - x_min
     height = y_max - y_min
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if width*height >= W*H*0.15:
+        if W == H:
+            return img
+        size = min(H, W)
+        offset = int((max(H, W) - size)/2)
+        if size == H:
+            return img[:, offset:-offset]
+        else:
+            return img[offset:-offset, :]
+    else:
+        center_x = x_min + width / 2
+        center_y = y_min + height / 2
+
+        width *= expand
+        height *= expand
+
+        size = max(width, height)
+
+        x_min = int(center_x - size / 2)
+        x_max = int(center_x + size / 2)
+        y_min = int(center_y - size / 2)
+        y_max = int(center_y + size / 2)
+
+        top = max(0, -y_min)
+        bottom = max(0, y_max - img.shape[0])
+        left = max(0, -x_min)
+        right = max(0, x_max - img.shape[1])
+        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
+
+        cropped_img = img[y_min + top:y_max + top, x_min + left:x_max + left]
 
     return cropped_img
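The rewritten branch above expands the landmark box to a square and, instead of clamping it to the image, zero-pads the frame by exactly the overshoot before slicing, so the crop always keeps its intended size. A small standalone check of that arithmetic with synthetic numbers (not values from the repo):

```python
import cv2
import numpy as np

# Synthetic 480x640 frame and an expanded 140x140 face box that sticks out
# 40 px above the top edge.
img = np.zeros((480, 640, 3), dtype=np.uint8)
x_min, x_max, y_min, y_max = 500, 640, -40, 100

top = max(0, -y_min)                    # 40
bottom = max(0, y_max - img.shape[0])   # 0
left = max(0, -x_min)                   # 0
right = max(0, x_max - img.shape[1])    # 0
padded = cv2.copyMakeBorder(img, top, bottom, left, right,
                            cv2.BORDER_CONSTANT, value=0)

crop = padded[y_min + top:y_max + top, x_min + left:x_max + left]
print(crop.shape)  # (140, 140, 3): the full box, never clipped
```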
src/utils/frame_interpolation.py
CHANGED
@@ -1,37 +1,32 @@
+# Adapted from https://github.com/dajes/frame-interpolation-pytorch
 import os
 import cv2
 import numpy as np
 import torch
 import bisect
 import shutil
+import pdb
+from tqdm import tqdm
 
 def init_frame_interpolation_model():
     print("Initializing frame interpolation model")
     checkpoint_name = os.path.join("./pretrained_model/film_net_fp16.pt")
 
-    model = torch.load(checkpoint_name, map_location='cpu')
+    model = torch.jit.load(checkpoint_name, map_location='cpu')
     model.eval()
     model = model.half()
     model = model.to(device="cuda")
     return model
 
 
-def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
-
-
-
-
-    input_img_list = os.listdir(input_file)
-    input_img_list.sort()
-
-    for idx in range(len(input_img_list)-1):
-        img1 = cv2.imread(os.path.join(input_file, input_img_list[idx]))
-        img2 = cv2.imread(os.path.join(input_file, input_img_list[idx+1]))
+def batch_images_interpolation_tool(input_tensor, model, inter_frames=1):
+
+    video_tensor = []
+    frame_num = input_tensor.shape[2] # bs, channel, frame, height, width
 
-
-
-
-        image2 = torch.from_numpy(image2).unsqueeze(0).permute(0, 3, 1, 2)
+    for idx in tqdm(range(frame_num-1)):
+        image1 = input_tensor[:,:,idx]
+        image2 = input_tensor[:,:,idx+1]
 
         results = [image1, image2]
 
@@ -66,25 +61,9 @@ def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
             results.insert(insert_position, prediction.clamp(0, 1).cpu().float())
             del remains[step]
 
-
-
-
-
-
-
-    final_frames = []
-    final_img_list = os.listdir(image_save_dir)
-    final_img_list.sort()
-    for item in final_img_list:
-        final_frames.append(cv2.imread(os.path.join(image_save_dir, item)))
-    w, h = final_frames[0].shape[1::-1]
-    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
-    video_save_dir = input_file + '.mp4'
-    writer = cv2.VideoWriter(video_save_dir, fourcc, fps, (w, h))
-    for frame in final_frames:
-        writer.write(frame)
-    writer.release()
-
-    shutil.rmtree(image_save_dir)
-
-    return video_save_dir
+        for sub_idx in range(len(results)-1):
+            video_tensor.append(results[sub_idx].unsqueeze(2))
+
+    video_tensor.append(input_tensor[:,:,-1].unsqueeze(2))
+    video_tensor = torch.cat(video_tensor, dim=2)
+    return video_tensor
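After the rewrite, batch_images_interpolation_tool consumes the diffusion output directly as a (bs, channel, frame, height, width) tensor and returns (F - 1) * (inter_frames + 1) + 1 frames. The sketch below reproduces only that frame bookkeeping; a naive linear blend stands in for the FILM model calls and for the bisect-based gap selection that live in the unchanged middle of the function.

```python
import torch


def naive_interpolation(input_tensor: torch.Tensor, inter_frames: int = 1) -> torch.Tensor:
    """Frame bookkeeping only; a linear blend replaces the FILM model."""
    video_tensor = []
    frame_num = input_tensor.shape[2]  # bs, channel, frame, height, width
    for idx in range(frame_num - 1):
        image1 = input_tensor[:, :, idx]
        image2 = input_tensor[:, :, idx + 1]
        results = [image1, image2]
        for k in range(1, inter_frames + 1):
            t = k / (inter_frames + 1)
            results.insert(-1, (1 - t) * image1 + t * image2)  # stand-in frame
        for sub_idx in range(len(results) - 1):                # same tail handling as the diff
            video_tensor.append(results[sub_idx].unsqueeze(2))
    video_tensor.append(input_tensor[:, :, -1].unsqueeze(2))
    return torch.cat(video_tensor, dim=2)


x = torch.rand(1, 3, 5, 8, 8)                # 5 frames in
y = naive_interpolation(x, inter_frames=2)   # app.py passes fi_step - 1 = 2
print(y.shape)                               # torch.Size([1, 3, 13, 8, 8])
```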