File size: 39,484 Bytes
545659d
 
82e8e66
545659d
123829d
545659d
 
 
98ce923
545659d
722a74e
 
545659d
a0e06d8
6925966
 
cabfe86
545659d
 
 
f6913f3
 
f80dbe2
c400da7
ec6e24b
2868f8e
f80dbe2
ac2a5c7
6925966
ac2a5c7
 
 
52145da
6925966
 
f6913f3
 
 
 
 
 
545659d
10b0245
 
123829d
10b0245
123829d
2f95367
3f4959e
3bf2d11
2abae72
0c01bdf
5e79f53
 
545659d
 
 
 
98ce923
545659d
 
123829d
0501944
545659d
cbdb616
 
545659d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfe56e1
545659d
 
 
 
 
 
 
 
 
 
 
 
645ddd1
 
4696931
 
545659d
 
 
 
 
f4cc18c
 
fdbb8b2
 
 
545659d
fdbb8b2
545659d
fdbb8b2
 
545659d
fdbb8b2
 
f6913f3
 
ac2a5c7
545659d
 
 
 
ac2a5c7
016503f
 
545659d
096998b
545659d
 
82e8e66
98ce923
545659d
 
 
 
 
f28cd01
e3d2855
4be74db
58be8ed
 
 
764046f
58be8ed
 
 
 
 
545659d
fdbb8b2
 
545659d
 
 
 
8f7fb40
7c00bd8
047eeea
ceec5fa
047eeea
f6913f3
 
 
4be74db
545659d
fdbb8b2
f6913f3
55cb886
 
2f0b977
432443b
f1855ec
722a74e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec0082a
722a74e
 
 
 
 
 
 
 
 
9d7a705
722a74e
7139a06
 
722a74e
010a3dc
 
722a74e
f0fccf1
 
8ee9d42
 
aa63800
 
8ee9d42
 
394988b
 
ec0082a
3b5d5e3
ec0082a
394988b
 
 
 
 
 
7cca730
8ee9d42
 
f6ef0c9
9352299
722a74e
893bff0
2e36d14
893bff0
e673b34
3d0dbc5
2868f8e
 
cc4061c
 
b60e99e
 
83c5a54
 
b60e99e
83c5a54
b60e99e
ea2584d
cc4061c
ec0082a
cc4061c
 
 
2240766
cc4061c
 
 
2240766
cc4061c
 
 
 
 
 
2240766
cc4061c
 
 
2240766
cc4061c
 
 
 
 
2240766
 
a0e06d8
722a74e
 
77b4467
722a74e
564b181
a0e06d8
722a74e
7c00bd8
 
3dbec8a
a510b3e
7c00bd8
ea2584d
3dbec8a
 
 
7c00bd8
3dbec8a
ea2584d
3dbec8a
 
ea2584d
7c00bd8
 
3dbec8a
7c00bd8
ca79504
545659d
ed652cd
 
 
fdc333a
0267daf
2f0b977
c400da7
 
09f2f01
683d0ff
e45f457
8722746
f1855ec
0267daf
bfb2083
 
 
 
ed652cd
545659d
 
 
 
 
 
 
 
 
 
 
5e79f53
545659d
dddfbb8
 
a70a18d
545659d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e79f53
 
 
545659d
7c8f6b4
545659d
 
 
 
 
c3b945b
ed652cd
f7824e3
5a7c81b
bfb2083
 
432443b
c3b945b
 
be68ce1
6925966
 
2ea14e3
7b864fb
e0cd9be
1d0244a
9730921
 
7c8f6b4
9730921
65a7485
d78ecb0
 
e7dfbcf
 
d78ecb0
 
9730921
d78ecb0
6c52b98
 
1d0244a
7b864fb
6c76b19
d78ecb0
9270d86
 
d78ecb0
6c76b19
feaa805
 
6c76b19
 
9730921
 
1d0244a
 
04a4806
1d0244a
 
bc44871
 
a510b3e
d78ecb0
9730921
d72f154
 
 
7c8f6b4
6c52b98
 
7c8f6b4
 
bb56c78
e0cd9be
4248460
e0cd9be
f70e270
 
 
e0cd9be
 
7b864fb
e0cd9be
84d1dc0
1880eb2
42d57dc
1880eb2
8e3e24b
1880eb2
 
42d57dc
8e3e24b
 
1880eb2
 
0660cb3
1880eb2
 
 
0660cb3
1880eb2
 
 
 
 
 
 
 
 
11e9611
 
2ea14e3
b5942fe
00c313f
7afa34d
 
 
 
 
b2c79c0
a756ea6
 
 
b2c79c0
 
 
ada96e4
b2c79c0
 
 
ada96e4
b2c79c0
 
 
ada96e4
b2c79c0
 
 
ada96e4
b2c79c0
 
 
ada96e4
b2c79c0
 
 
ada96e4
b2c79c0
 
 
 
 
7afa34d
b2c79c0
 
ebb3c24
7afa34d
b2c79c0
 
ebb3c24
7afa34d
b2c79c0
 
ebb3c24
7afa34d
b2c79c0
 
ebb3c24
7afa34d
b2c79c0
 
ebb3c24
7afa34d
b2c79c0
 
ebb3c24
7afa34d
b2c79c0
 
 
 
 
7afa34d
b2c79c0
 
ebb3c24
bfb2083
b2c79c0
 
ebb3c24
ada96e4
b2c79c0
 
ebb3c24
ada96e4
b2c79c0
 
ebb3c24
ada96e4
b2c79c0
 
ebb3c24
ada96e4
b2c79c0
 
ebb3c24
ada96e4
b2c79c0
 
a756ea6
 
 
7afa34d
b2c79c0
 
7afa34d
2ea14e3
00c313f
26f32f4
822a192
26f32f4
 
 
96a5963
822a192
d1f1ca8
f1855ec
d1f1ca8
 
 
 
 
 
 
 
11e9611
f9b1af2
 
 
 
 
 
3d0dbc5
f9b1af2
 
d766a5d
3d0dbc5
f9b1af2
 
66ab5da
 
d010386
66ab5da
 
 
f9b1af2
3d0dbc5
586cf48
d766a5d
f9b1af2
 
ad73ab4
3d0dbc5
2b9e0bc
16a64c6
2b9e0bc
16a64c6
 
96a5963
 
16a64c6
0ef3ae5
16a64c6
96a5963
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16a64c6
 
2b9e0bc
f9b1af2
96ee80f
c900035
f9b1af2
 
 
c900035
659dc3a
 
96ee80f
 
659dc3a
f9b1af2
 
 
659dc3a
 
 
96ee80f
201037e
bb61f71
f9b1af2
 
 
bb61f71
3bb651f
 
 
 
 
 
94a6927
ba45271
369cda2
d766a5d
 
 
7ac500f
d766a5d
 
 
 
7ac500f
 
 
a88f03a
 
3bb651f
 
 
 
a547f8e
 
 
 
c596c24
29a6d28
9af81d7
c3b945b
d5d810a
ec6e24b
df7ff25
13d584d
df7ff25
59f0d02
cabfe86
cd2fa70
 
13d584d
 
59f0d02
cabfe86
cd2fa70
 
13d584d
 
 
 
 
 
 
 
ec6e24b
59f0d02
545659d
3dbec8a
545659d
fed911d
545659d
f1855ec
e673b34
545659d
d694431
545659d
d5d810a
3bf2cbc
f1855ec
545659d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
import gradio as gr
import cv2
from PIL import Image
import numpy as np
from transformers import pipeline
import os
import torch
import torch.nn.functional as F
from torchvision import transforms
from torchvision.transforms import Compose
import trimesh
from geometry import create_triangles
import tempfile
from functools import partial
import spaces
from zipfile import ZipFile
import json

from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
from moviepy.editor import *

# Module-level state shared between the UI callbacks defined below.

# Index of the currently selected frame: written by select_frame(), read by
# make_video() and get_mesh().
frame_selected = 0
# make_video() replaces this with the list of original frame file names;
# select_frame() writes the current editor value back into the selected slot.
masks = []
# NOTE(review): not assigned by any code visible in this part of the file --
# confirm it is still needed.
locations = []
# Point clouds accumulated by get_mesh(); reset there when loadall is False.
mesh = []

def zip_files(files_in, files_out):
    """Bundle original and depth-map frame files into ``depth_result.zip``.

    Every file is stored flat under its base name, so two inputs sharing a
    base name end up as duplicate archive entries. The archive is (re)created
    in the current working directory.

    Args:
        files_in: Paths of the original frame images.
        files_out: Paths of the generated depth-map images.

    Returns:
        The archive path, ``"depth_result.zip"``.
    """
    archive_name = "depth_result.zip"
    with ZipFile(archive_name, "w") as archive:
        for path in (*files_in, *files_out):
            # basename understands the platform's path separators, which a
            # hard-coded split("/") does not.
            archive.write(path, os.path.basename(path))
    return archive_name

def create_video(frames, fps, type):
    """Encode ``frames`` into ``<type>_result.mp4`` and return that file name.

    Args:
        frames: Frame sequence handed to ``ImageSequenceClip``.
        fps: Frame rate used both for the clip and for the written file.
        type: Prefix of the output file name (e.g. ``"depth"``).

    Returns:
        The name of the written video, relative to the working directory.
    """
    print("building video result")
    sequence = ImageSequenceClip(frames, fps=fps)
    output_name = type + "_result.mp4"
    sequence.write_videofile(output_name, fps=fps)
    return output_name

@torch.no_grad()
def predict_depth(model, image):
    """Call ``model`` on ``image`` with autograd disabled and return its ``"depth"`` entry."""
    prediction = model(image)
    return prediction["depth"]

@spaces.GPU
def make_video(video_path, outdir='./vis_video_depth', encoder='vits'):
    """Estimate a depth map for every frame of a video and package the results.

    ``video_path`` may be a single video file, a ``.txt`` file listing one
    video path per line, or a directory of videos (hidden files are skipped).
    Only the first video of the resulting list is processed, because the
    function returns from inside the per-file loop.

    Each frame is written to the working directory as ``f<N>.jpg`` next to its
    grayscale depth map ``f<N>_dmap.jpg``; the depth maps are then encoded as
    a video and all frames are zipped.

    Args:
        video_path: File, list file, or directory as described above.
        outdir: Unused; kept for backward compatibility.
        encoder: One of ``"vits"``, ``"vitb"``, ``"vitl"``; anything else falls
            back to ``"vits"``.

    Returns:
        ``(video_path, zip_path, frame_files, preview_file)`` where
        ``frame_files`` is an array of the original frame file names followed
        by the depth-map file names, and ``preview_file`` is the original
        frame at the currently selected index (clamped to the new frame
        count). If the input file exceeds 128 MB, the input path alone is
        returned instead. If no input files are found, ``None`` is returned.

    Raises:
        ValueError: If no frame could be read from the video.
    """
    global frame_selected
    global masks

    if encoder not in ["vitl", "vitb", "vits"]:
        encoder = "vits"
    mapper = {"vits": "small", "vitb": "base", "vitl": "large"}

    to_tensor_transform = transforms.ToTensor()

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Run the pipeline on the detected device; without `device` it stays on CPU.
    depth_anything = pipeline(
        task="depth-estimation",
        model=f"nielsr/depth-anything-{mapper[encoder]}",
        device=DEVICE,
    )

    if os.path.isfile(video_path):
        if video_path.endswith('txt'):
            # One video path per line; blank lines are ignored.
            with open(video_path, 'r') as f:
                filenames = [line.strip() for line in f.read().splitlines() if line.strip()]
        else:
            filenames = [video_path]
    else:
        filenames = sorted(
            os.path.join(video_path, name)
            for name in os.listdir(video_path)
            if not name.startswith('.')
        )

    for k, filename in enumerate(filenames):
        file_size = os.path.getsize(filename)/1024/1024
        if file_size > 128.0:
            print(f'File size of {filename} larger than 128Mb, sorry!')
            # NOTE(review): this returns a single value while the success path
            # returns four -- confirm the caller copes with that.
            return filename
        print('Progress {:}/{:},'.format(k+1, len(filenames)), 'Processing', filename)

        orig_frames = []
        depth_frames = []
        raw_video = cv2.VideoCapture(filename)
        try:
            frame_width = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
            # Some containers report 0 fps; the encoder needs at least 1.
            frame_rate = max(int(raw_video.get(cv2.CAP_PROP_FPS)), 1)
            cframes = int(raw_video.get(cv2.CAP_PROP_FRAME_COUNT))
            print(f'frames: {cframes}, fps: {frame_rate}')

            count = 0
            while raw_video.isOpened():
                ret, raw_frame = raw_video.read()
                if not ret:
                    break

                # Convert straight from the uint8 BGR frame; a float round trip
                # (/255 then *255) can truncate values when cast back to uint8.
                frame_pil = Image.fromarray(cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB))

                depth = to_tensor_transform(predict_depth(depth_anything, frame_pil))
                depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]

                # Stretch to 0..255; a constant map would otherwise divide by zero.
                depth_min = depth.min()
                depth_range = depth.max() - depth_min
                if depth_range > 0:
                    depth = (depth - depth_min) / depth_range * 255.0
                else:
                    depth = torch.zeros_like(depth)

                depth = depth.cpu().numpy().astype(np.uint8)
                # Colour map -> gray -> 3-channel gray image.
                depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_BONE)
                depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGBA2GRAY)
                depth_color = cv2.cvtColor(depth_gray, cv2.COLOR_GRAY2BGR)

                # Remove white border around map: paint near-white pixels black.
                white_lo = np.array([250, 250, 250])
                white_hi = np.array([255, 255, 255])
                mask = cv2.inRange(depth_color, white_lo, white_hi)
                depth_color[mask > 0] = (0, 0, 0)

                # Frames land in the working directory; create_video() and
                # zip_files() pick them up from there by name.
                cv2.imwrite(f"f{count}.jpg", raw_frame)
                orig_frames.append(f"f{count}.jpg")
                cv2.imwrite(f"f{count}_dmap.jpg", depth_color)
                depth_frames.append(f"f{count}_dmap.jpg")
                count += 1
        finally:
            # Release the capture even when a frame fails to process.
            raw_video.release()

        if not orig_frames:
            raise ValueError(f'No frames could be read from {filename}')

        final_vid = create_video(depth_frames, frame_rate, "depth")
        final_zip = zip_files(orig_frames, depth_frames)
        cv2.destroyAllWindows()

        masks = orig_frames
        # The selection may stem from an earlier, longer video; keep it in range.
        preview_index = min(frame_selected, len(masks) - 1)
        return final_vid, final_zip, np.concatenate((orig_frames, depth_frames), axis=0), masks[preview_index]

def depth_edges_mask(depth):
    """Flag the pixels of a depth map where the depth changes sharply.

    Args:
        depth: 2D numpy array of shape (H, W).

    Returns:
        2D boolean numpy array of shape (H, W) that is True wherever the
        gradient magnitude of ``depth`` exceeds 0.05.
    """
    # np.gradient yields one derivative array per axis (rows first, then columns).
    grad_rows, grad_cols = np.gradient(depth)
    magnitude = np.sqrt(grad_rows ** 2 + grad_cols ** 2)
    return magnitude > 0.05

def pano_depth_to_world_points(depth):
    """
    360 depth to world points.

    The given 2D depth map is treated as an equirectangular projection of a
    spherical image, with ``255 - depth`` used as the radius of each pixel.
    longitude : 0 to 2*pi across the columns
    latitude : 0 to pi down the rows

    Returns a two-element list ``[pts3d, uv]``. Both arrays start with one
    all-zero row followed by one row per pixel in row-major order:
    ``pts3d`` holds ``(x, y, z)`` and ``uv`` holds ``(longitude, latitude)``.
    """
    height, width = depth.shape[0], depth.shape[1]

    # Convert depth to radius
    radius = (255 - depth.flatten())

    lon, lat = np.meshgrid(
        np.linspace(0, np.pi*2, width),
        np.linspace(0, np.pi, height),
    )
    lon = lon.flatten()
    lat = lat.flatten()

    # Convert to cartesian coordinates
    x = radius * np.cos(lon) * np.sin(lat)
    y = radius * np.cos(lat)
    z = radius * np.sin(lon) * np.sin(lat)

    # Both outputs keep a leading zero row ahead of the per-pixel rows.
    pts3d = np.concatenate(([[0, 0, 0]], np.stack([x, y, z], axis=1)), axis=0)
    uv = np.concatenate(([[0, 0]], np.stack([lon, lat], axis=1)), axis=0)

    return [pts3d, uv]

def rgb2gray(rgb):
    """Collapse the first three channels of ``rgb`` into one gray channel.

    Each channel is weighted by 0.333; any further channel (e.g. alpha) is
    ignored.
    """
    channel_weights = [0.333, 0.333, 0.333]
    color_channels = rgb[..., :3]
    return np.dot(color_channels, channel_weights)

def get_mesh(image, blur_data, loadall):
    """Build a coloured point cloud for the selected frame and export it as GLB.

    The frame is chosen by the module-level ``frame_selected`` index. The
    colour image is blurred per depth level with ``blur_image``, the depth map
    is projected to 3D with ``pano_depth_to_world_points``, and the resulting
    point cloud is appended to the module-level ``mesh`` list, whose whole
    content is exported.

    Args:
        image: Sequence whose items carry an image array at index 0. The
            first half is treated as colour frames and the second half as the
            matching depth maps -- presumably the gallery value produced by
            ``make_video``; verify against the caller.
        blur_data: Whitespace-separated blur kernel sizes, see ``blur_image``.
        loadall: When False, previously accumulated point clouds are dropped
            so that only the current frame is exported.

    Returns:
        Path of the written ``.glb`` file (a temporary file that is not
        deleted automatically).
    """
    global mesh
    # Kept as an equality test so non-bool values behave exactly as before.
    if loadall == False:
        mesh = []
    fnum = frame_selected
    half = int(len(image)/2)

    # Pair the selected entry with its counterpart in the other half.
    if fnum < len(image)/2:
        color_img = image[fnum][0]
        depth_img = image[fnum + half][0]
    else:
        color_img = image[fnum - half][0]
        depth_img = image[fnum][0]
    blur_img = blur_image(color_img, depth_img, blur_data)
    gdepth = rgb2gray(depth_img)

    print('depth to gray - ok')
    pts3d = pano_depth_to_world_points(gdepth)[0]
    print('radius from depth - ok')

    # One vertex per pixel, coloured with the (blurred) RGB value of the image.
    verts = pts3d.reshape(-1, 3)
    rgba = cv2.cvtColor(blur_img, cv2.COLOR_RGB2RGBA)
    colors = rgba.reshape(-1, 4)
    # The point list starts with an extra origin vertex; give it a
    # transparent gray so vertices and colours stay aligned.
    clrs = np.concatenate(([[128, 128, 128, 0]], colors), axis=0)

    mesh.append(trimesh.PointCloud(verts, colors=clrs))

    scene = trimesh.Scene(mesh)
    print('mesh - ok')

    # Save as glb. Only the name is needed; close the handle before exporting
    # so it is not leaked (and the path can be reopened on every platform).
    with tempfile.NamedTemporaryFile(suffix='.glb', delete=False) as glb_file:
        glb_path = glb_file.name
    scene.export(glb_path)
    print('file - ok')
    return glb_path

def blur_image(image, depth, blur_data):
    """Blur ``image`` with a kernel size that depends on the depth of each pixel.

    Args:
        image: Colour image to blur; it is not modified.
        depth: 3-channel depth image aligned with ``image``.
        blur_data: Whitespace-separated Gaussian kernel sizes. At least 256
            entries are read; entry ``j`` is applied to depth level
            ``255 - j``. Each must be a kernel size ``cv2.GaussianBlur``
            accepts.

    Returns:
        A copy of ``image`` in which the pixels of every depth level are
        replaced by their blurred counterpart.
    """
    blur_a = blur_data.split()
    print(f'blur data {blur_data}')

    blur_frame = image.copy()
    last_kernel = None
    blur = None
    for j in range(256):
        i = 255 - j
        # Pixels whose three depth channels all lie in [i, i+1]. The bounds
        # are inclusive, so neighbouring levels overlap and the later
        # iteration wins.
        blur_lo = np.array([i, i, i])
        blur_hi = np.array([i+1, i+1, i+1])
        blur_mask = cv2.inRange(depth, blur_lo, blur_hi)

        kernel = int(blur_a[j])
        print(f'kernel size {kernel}')
        # The source image never changes, so a run of equal kernel sizes can
        # share one blurred image instead of re-blurring the whole frame.
        if kernel != last_kernel:
            blur = cv2.GaussianBlur(image, (kernel, kernel), 0)
            last_kernel = kernel

        blur_frame[blur_mask > 0] = blur[blur_mask > 0]

    return blur_frame

def loadurl(url):
    """Return ``url`` unchanged; forwards the URL textbox value to the video input."""
    video_source = url
    return video_source

def select_frame(v, evt: gr.SelectData):
    """Store the current value for the old selection and switch to the clicked frame.

    ``v`` is written into ``masks`` at the previously selected index, then
    ``frame_selected`` follows ``evt.index``.

    Returns:
        ``(masks[frame_selected], frame_selected)`` for the new selection.
    """
    global frame_selected
    global masks

    # Keep whatever was shown for the frame we are about to leave.
    masks[frame_selected] = v

    clicked = evt.index
    if clicked != frame_selected:
        frame_selected = clicked

    return masks[frame_selected], frame_selected

def align_rows(evt: gr.EventData):
    """Return a Gallery configured with one column per entry in ``masks``."""
    column_count = int(len(masks))
    return gr.Gallery(columns=column_count)


# Custom stylesheet passed to gr.Blocks(css=css) below: caps the height of the
# elements with these ids relative to the viewport.
# NOTE(review): no component visible in this part of the file sets these
# elem ids -- confirm they are still in use.
css = """
#img-display-container {
    max-height: 100vh;
    }
#img-display-input {
    max-height: 80vh;
    }
#img-display-output {
    max-height: 80vh;
    }
"""

# Markdown rendered at the top of the page through gr.Markdown.
# The two trailing spaces at the end of the description lines are part of the
# string (Markdown hard line breaks); do not strip them.
title = "# Depth Anything Video Demo"
description = """Depth Anything on full video files.  
Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details.  
Mesh rendering from [ZoeDepth](https://huggingface.co/spaces/shariqfarooq/ZoeDepth) ([github](https://github.com/isl-org/ZoeDepth/tree/main/ui))."""

# Image preprocessing pipeline, chained via Compose: resize, normalise, then
# PrepareForNet.
# NOTE(review): the exact semantics of Resize / PrepareForNet come from helpers
# defined outside this section - verify there before changing any parameter.
transform = Compose([
        Resize(
            # Target size 518x518 with the aspect ratio kept; 'lower_bound'
            # presumably means neither side ends up below the target.
            width=518,
            height=518,
            resize_target=False,
            keep_aspect_ratio=True,
            # Output sides are constrained to multiples of 14 (presumably the
            # encoder's patch size - TODO confirm).
            ensure_multiple_of=14,
            resize_method='lower_bound',
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        # Per-channel mean/std; these values match the commonly used ImageNet
        # statistics.
        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        PrepareForNet(),
])

# @torch.no_grad()
# def predict_depth(model, image):
#     return model(image)

# Build the Gradio UI; every component and event binding of the app is
# declared inside this Blocks context.
with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Video Depth Prediction demo")

    with gr.Row():
        # Left column: video source, frame gallery, mask editor and Submit.
        with gr.Column():
            input_url = gr.Textbox(value="./examples/streetview.mp4", label="URL")
            input_video = gr.Video(label="Input Video", format="mp4")
            # Editing the URL box forwards its text unchanged (loadurl) to the
            # video component.
            input_url.change(fn=loadurl, inputs=[input_url], outputs=[input_video])
            output_frame = gr.Gallery(label="Frames", type='numpy', preview=True, columns=6)
            # When the gallery content changes, align_rows returns a Gallery
            # whose column count equals the number of stored masks.
            output_frame.change(fn=align_rows, inputs=None, outputs=[output_frame])
            # Editor for the mask of the currently selected frame; the brush is
            # limited to five grey levels from black to white.
            output_mask = gr.ImageEditor(interactive=True, transforms=(None,), eraser=gr.Eraser(), brush=gr.Brush(colors=['black', 'darkgray', 'gray', 'lightgray', 'white']), layers=True)
            submit = gr.Button("Submit")
        # Right column: model choice, processing outputs and the 3D viewer
        # together with its HTML/JS controls.
        with gr.Column():
            # Dropdown entries are (label, value) pairs; the value is what
            # handlers receive.
            model_type = gr.Dropdown([("small", "vits"), ("base", "vitb"), ("large", "vitl")], type="value", value="vits", label='Model Type')
            processed_video = gr.Video(label="Output Video", format="mp4")
            processed_zip = gr.File(label="Output Archive")
            # elem_id "model3D" is referenced by the inline JavaScript of the
            # HTML controls below to locate the viewer canvas.
            result = gr.Model3D(label="3D Mesh", clear_color=[0.5, 0.5, 0.5, 0.0], camera_position=[0, 90, 0], interactive=True, elem_id="model3D")   
            svg_in = gr.HTML(value="""<svg id='svg_in' height='32' width='256' viewBox='0 0 256 32' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' style='touch-action:none;background-color:#808080;' onpointerdown='
              try{
                if (document.getElementById(\"pl\").getAttribute(\"points\").length < 256) {
                  var pts = \"\";
                  for (var i=0; i<256; i++) {
                    pts += i+\",0 \";
                  }
                  document.getElementById(\"pl\").setAttribute(\"points\", pts.slice(0,-1)); 
                  var xold = 0;
                  var yold = 0;
                  var x = 0;
                  var y = 0;
                  function lerp(y1, y2, mu) { return y1*(1-mu)+y2*mu; }
                  
                  this.onpointermove = function(event) {
                    if (this.title != \"\") {
                      x = parseInt(event.clientX - this.getBoundingClientRect().x);
                      y = parseInt(event.clientY - this.getBoundingClientRect().y);
                      if (x < 0) { x = 0; } else if (x > 255) { x = 255; }
                      if (y < 0) { y = 0; } else if (y > 31) { y = 31; }
                      var pl_a = document.getElementById(\"pl\").getAttribute(\"points\").split(\" \");
                      
                      for (var i=Math.min(xold, x)+1; i<Math.max(xold, x); i++) {
                        pl_a[i] = x+\",\"+parseInt(lerp( yold, y, (i-xold)/(x-xold) ));
                      }
                      pl_a[x] = x+\",\"+y;
                      xold = x;
                      yold = y;
                      document.getElementById(\"pl\").setAttribute(\"points\", pl_a.join(\" \"));
                    }
                  }
                  this.onpointerup = function(event) {
                    var pl_a = document.getElementById(\"pl\").getAttribute(\"points\").replace(/\d+,/g, \"\").split(\" \"); 
                    for (var i=0; i<pl_a.length; i++) {
                      pl_a[i] = parseInt(pl_a[i]) * 2 + 1;
                    }
                    document.getElementsByTagName(\"textarea\")[1].value = pl_a.join(\" \");
                    var evt = document.createEvent(\"Event\");
                    evt.initEvent(\"input\", true, false);
                    document.getElementsByTagName(\"textarea\")[1].dispatchEvent(evt);
                    this.title = \"\";
                  }
                  this.onpointerleave = function(event) {
                    this.title = \"\";
                  }
                  this.onpointerdown = function(event) {
                    xold = parseInt(event.clientX - this.getBoundingClientRect().x);
                    yold = parseInt(event.clientY - this.getBoundingClientRect().y);
                    this.title = xold+\",\"+yold;
                  }
                }
              }catch(e){alert(e);}
              '>
                <defs>
                  <linearGradient id='lg' x1='0%' x2='100%' y1='0%' y2='0%'>
                    <stop offset='0%' stop-color='white'/>
                    <stop offset='100%' stop-color='black'/>
                  </linearGradient>
                </defs>
                <polyline id='pl' points='-3,0 0,15 255,15 258,0' stroke='url(#lg)' fill='none' stroke-width='3' stroke-linejoin='round'/>
              </svg>""")
            average = gr.HTML(value="""<label for='average'>Average</label><input id='average' type='range' style='width:256px;height:1em;' value='1' min='1' max='15' step='2' onclick='
              var pts_a = document.getElementsByTagName(\"textarea\")[1].value.split(\" \");
              for (var i=0; i<256; i++) {
                var avg = 0;
                var div = this.value;
                for (var j = i-parseInt(this.value/2); j <= i+parseInt(this.value/2); j++) {
                  if (pts_a[j]) {
                    avg += parseInt(pts_a[j]);
                  } else {
                    div--;
                  }
                }
                pts_a[i] = parseInt((avg / div - 1) / 2) * 2 + 1;
              }
              document.getElementsByTagName(\"textarea\")[1].value = pts_a.join(\" \");
              for (var i=0; i<pts_a.length; i++) {
                pts_a[i] = i+\",\"+parseInt((pts_a[i] - 1) / 2);
              }
              document.getElementById(\"pl\").setAttribute(\"points\", pts_a.join(\" \"));

              var evt = document.createEvent(\"Event\");
              evt.initEvent(\"input\", true, false);
              document.getElementsByTagName(\"textarea\")[1].dispatchEvent(evt);
            ' oninput='
              this.parentNode.childNodes[2].innerText = this.value;
            '/><span>1</span>""")
            with gr.Accordion(label="Blur levels", open=False):
                blur_in = gr.Textbox(value="", label="Kernel size", show_label=False)
            with gr.Accordion(label="Locations", open=False):
                offset = gr.HTML(value="""<input type='text' id='kbrd' onkeydown='
                if (BABYLON) {
                  if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
                    var evt = document.createEvent(\"Event\");
                    evt.initEvent(\"click\", true, false);
                    document.getElementById(\"reset_cam\").dispatchEvent(evt);
                  } 
                  event.preventDefault();
                  if (BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotationQuaternion) {
                    BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotationQuaternion = null;
                  }
                  switch(event.key) {
                    case \"w\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.y += 1;
                      this.value = \"w ⬆ x\";
                      break;
                    case \"x\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.y -= 1;
                      this.value = \"w ⬇ x\";
                      break;
                    case \"a\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.z -= 1;
                      this.value = \"a ⬅ d\";
                      break;
                    case \"d\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.z += 1;
                      this.value = \"a ➡ d\";
                      break;
                    case \"e\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.x -= 1;
                      this.value = \"z ↗ e\";
                      break;
                    case \"z\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.x += 1;
                      this.value = \"z ↙ e\";
                      break;
                    case \"s\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.x = 0;
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.y = 0;
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].position.z = 0;
                      this.value = \"\";
                      break;
                    case \"t\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.z += Math.PI/256;
                      this.value = \"t 🔃 b\";
                      break;
                    case \"b\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.z -= Math.PI/256;
                      this.value = \"t 🔃 b\";
                      break;
                    case \"f\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.y -= Math.PI/256;
                      this.value = \"f 🔁 h\";
                      break;
                    case \"h\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.y += Math.PI/256;
                      this.value = \"f 🔁 h\";
                      break;
                    case \"y\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.x -= Math.PI/256;
                      this.value = \"v 🔄 y\";
                      break;
                    case \"v\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.x += Math.PI/256;
                      this.value = \"v 🔄 y\";
                      break;
                    case \"g\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.x = 0;
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.y = 0;
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].rotation.z = 0;
                      this.value = \"\";
                      break;
                    case \"i\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.y *= 256/255;
                      this.value = \"i ↕ ,\";
                      break;
                    case \",\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.y /= 256/255;
                      this.value = \"i ↕ ,\";
                      break;
                    case \"j\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.z /= 256/255;
                      this.value = \"j ↔ l\";
                      break;
                    case \"l\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.z *= 256/255;
                      this.value = \"j ↔ l\";
                      break;
                    case \"o\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.x /= 256/255;
                      this.value = \"m ⤢ o\";
                      break;
                    case \"m\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.x *= 256/255;
                      this.value = \"m ⤢ o\";
                      break;
                    case \"k\":
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.x = 1;
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.y = 1;
                      BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].scaling.z = 1;
                      this.value = \"\";
                      break;
                    default:
                      this.value = \"\"; 
                  }
                }
                ' style='color:auto;background-color:transparent;border:1px solid lightgray;'/><pre id='keymap'>
`  1  2  3  4  5  6  7  8  9  0  -  =  
       W  E     T  Y     I  O     {  }
     A-`S´-D  F-`G´-H  J-`K´-L  ;  '
      Z´ X̀     V´ B̀     M´ `,  .  /
      <a id='move' href='#'>move</a>    <a id='rotate' href='#'>rotate</a>    <a id='scale' href='#'>scale</a>
                </pre>""")
                selected = gr.Number(elem_id="fnum", value=0, minimum=0, maximum=256, interactive=False)
                output_frame.select(fn=select_frame, inputs=[output_mask], outputs=[output_mask, selected], show_progress='hidden')
                example_coords = """[
                  {"latLng": { "lat": 50.07379596793083, "lng": 14.437146122950555 } },
                  {"latLng": { "lat": 50.073799567020004, "lng": 14.437146774240507 } },
                  {"latLng": { "lat": 50.07377647505558, "lng": 14.437161000659017 } },
                  {"latLng": { "lat": 50.07379496839027, "lng": 14.437148958238538 } },
                  {"latLng": { "lat": 50.073823157821664, "lng": 14.437124189538856 } }
                ]"""
                coords = gr.JSON(elem_id="coords", value=example_coords, label="Precise coordinates", show_label=False)      
                
            html = gr.HTML(value="""<label for='zoom'>Zoom</label><input id='zoom' type='range' style='width:256px;height:1em;' value='0.8' min='0.157' max='1.57' step='0.001' oninput='
              if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
                var evt = document.createEvent(\"Event\");
                evt.initEvent(\"click\", true, false);
                document.getElementById(\"reset_cam\").dispatchEvent(evt);
              } 
              BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].material.pointSize = Math.ceil(Math.log2(Math.PI/this.value));
              BABYLON.Engine.LastCreatedScene.activeCamera.fov = this.value;
              this.parentNode.childNodes[2].innerText = BABYLON.Engine.LastCreatedScene.activeCamera.fov;

              document.getElementById(\"model3D\").getElementsByTagName(\"canvas\")[0].style.filter = \"blur(\" + BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].material.pointSize/2.0*Math.sqrt(2.0) + \"px)\";
            '/><span>0.8</span>""")
            camera = gr.HTML(value="""<a href='#' id='reset_cam' onclick='
              if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
                BABYLON.Engine.LastCreatedScene.activeCamera.metadata = { 
                  screenshot: true,
                  pipeline: new BABYLON.DefaultRenderingPipeline(\"default\", true, BABYLON.Engine.LastCreatedScene, [BABYLON.Engine.LastCreatedScene.activeCamera]) 
                }
              } 
              BABYLON.Engine.LastCreatedScene.activeCamera.radius = 0;
              BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].material.pointSize = Math.ceil(Math.log2(Math.PI/document.getElementById(\"zoom\").value));
              BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.samples = 4; 
              BABYLON.Engine.LastCreatedScene.activeCamera.fov = document.getElementById(\"zoom\").value;
              BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.contrast = document.getElementById(\"contrast\").value;
              BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.exposure = document.getElementById(\"exposure\").value;
              
              document.getElementById(\"model3D\").getElementsByTagName(\"canvas\")[0].style.filter = \"blur(\" + BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1].material.pointSize/2.0*Math.sqrt(2.0) + \"px)\";
try {
              if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager) {
                BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager = new BABYLON.GizmoManager(BABYLON.Engine.LastCreatedScene, 12);
                
                BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.positionGizmoEnabled = true;
                BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.rotationGizmoEnabled = false;
                BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.scaleGizmoEnabled = false;
                BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.boundingBoxGizmoEnabled = false;
              
                BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.usePointerToAttachGizmos = false;
                document.getElementById(\"move\").onclick = function(event) {
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.positionGizmoEnabled = true;
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.rotationGizmoEnabled = false;
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.scaleGizmoEnabled = false;
                }
                document.getElementById(\"rotate\").onclick = function(event) {
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.positionGizmoEnabled = false;
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.rotationGizmoEnabled = true;
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.scaleGizmoEnabled = false;
                }
                document.getElementById(\"scale\").onclick = function(event) {
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.positionGizmoEnabled = false;
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.rotationGizmoEnabled = false;
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.scaleGizmoEnabled = true;
                }
              }
              BABYLON.Engine.LastCreatedScene.activeCamera.metadata.gizmoManager.attachToMesh(BABYLON.Engine.LastCreatedScene.getNodes()[parseInt(document.getElementById(\"fnum\").getElementsByTagName(\"input\")[0].value)+1]);
} catch(e) {alert(e)}
            '>reset camera</a>""")
            contrast = gr.HTML(value="""<label for='contrast'>Contrast</label><input id='contrast' type='range' style='width:256px;height:1em;' value='2.0' min='0' max='2' step='0.001' oninput='
              if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
                var evt = document.createEvent(\"Event\");
                evt.initEvent(\"click\", true, false);
                document.getElementById(\"reset_cam\").dispatchEvent(evt);
              } 
              BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.contrast = this.value;
              this.parentNode.childNodes[2].innerText = BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.contrast;
            '/><span>2.0</span>""")
            exposure = gr.HTML(value="""<label for='exposure'>Exposure</label><input id='exposure' type='range' style='width:256px;height:1em;' value='0.5' min='0' max='2' step='0.001' oninput='
              if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
                var evt = document.createEvent(\"Event\");
                evt.initEvent(\"click\", true, false);
                document.getElementById(\"reset_cam\").dispatchEvent(evt);
              } 
              BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.exposure = this.value;
              this.parentNode.childNodes[2].innerText = BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.exposure;
            '/><span>0.5</span>""")
            canvas = gr.HTML(value="""<a href='#' onclick='
              if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
                var evt = document.createEvent(\"Event\");
                evt.initEvent(\"click\", true, false);
                document.getElementById(\"reset_cam\").dispatchEvent(evt);
              } 
              BABYLON.Engine.LastCreatedScene.activeCamera.metadata.screenshot = true;

              BABYLON.Engine.LastCreatedScene.getEngine().onEndFrameObservable.add(function() {
                if (BABYLON.Engine.LastCreatedScene.activeCamera.metadata.screenshot === true) {
                  BABYLON.Engine.LastCreatedScene.activeCamera.metadata.screenshot = false;
                  try {
                    BABYLON.Tools.CreateScreenshotUsingRenderTarget(BABYLON.Engine.LastCreatedScene.getEngine(), BABYLON.Engine.LastCreatedScene.activeCamera, 
                      { precision: 1.0 }, (durl) => { 
                        var cnvs = document.getElementById(\"model3D\").getElementsByTagName(\"canvas\")[0]; //.getContext(\"webgl2\");
                        var svgd = `<svg id=\"svg_out\" viewBox=\"0 0 ` + cnvs.width + ` ` + cnvs.height + `\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">
                          <defs>
                            <filter id=\"blur\" x=\"0\" y=\"0\" xmlns=\"http://www.w3.org/2000/svg\">
                              <feGaussianBlur in=\"SourceGraphic\" stdDeviation=\"` + BABYLON.Engine.LastCreatedScene.getNodes()[1].material.pointSize/2.0*Math.sqrt(2.0) + `\" />
                            </filter>
                          </defs>
                          <image filter=\"url(#blur)\" id=\"svg_img\" x=\"0\" y=\"0\" width=\"` + cnvs.width + `\" height=\"` + cnvs.height + `\" xlink:href=\"` + durl + `\"/>
                        </svg>`;
                        document.getElementById(\"cnv_out\").width = cnvs.width;
                        document.getElementById(\"cnv_out\").height = cnvs.height;
                        document.getElementById(\"img_out\").src = \"data:image/svg+xml;base64,\" + btoa(svgd);        
                      }
                    );
                  } catch(e) { alert(e); }
                  // https://forum.babylonjs.com/t/best-way-to-save-to-jpeg-snapshots-of-scene/17663/11
                }
              });
            '/>snapshot</a><br/><img src='' id='img_out' onload='
              var ctxt = document.getElementById(\"cnv_out\").getContext(\"2d\");
              ctxt.drawImage(this, 0, 0); 
            '/><br/>
            <canvas id='cnv_out'/>""")
            # Checkbox value is passed to get_mesh as its third input when
            # Render is clicked.
            load_all = gr.Checkbox(label="Load all")
            # Clicking Render calls get_mesh and shows its result in the 3D viewer.
            render = gr.Button("Render")
    
    def on_submit(uploaded_video, model_type="vits", coordinates=None):
        """Centre the frame coordinates and run depth estimation on the video.

        Args:
            uploaded_video: Video handed to make_video.
            model_type: Encoder name forwarded to make_video as ``encoder``.
                Defaults to "vits" (the initial value of the Model Type
                dropdown) so the handler can also be called with the video
                alone.
            coordinates: JSON text or an already-parsed list of location
                entries. Each entry may be ``{"location": {"latLng": {...}}}``,
                ``{"latLng": {...}}`` (the shape used by example_coords) or a
                bare ``{"lat": ..., "lng": ...}`` mapping. When missing, blank
                or empty, example_coords is used instead.

        Returns:
            The tuple returned by make_video with the centred locations list
            appended as its last element.

        Raises:
            KeyError: If an entry carries no "lat" or "lng" value.

        Side effects:
            Rebinds the module-level ``locations`` to the centred list and
            prints it.
        """
        global locations

        def _lat_lng(entry):
            # Unwrap the optional "location" and "latLng" envelopes.
            entry = entry.get("location", entry)
            return entry.get("latLng", entry)

        parsed = coordinates
        if isinstance(parsed, str):
            # A blank string counts as "no coordinates" instead of failing to parse.
            parsed = json.loads(parsed) if parsed.strip() else None
        if not parsed:
            # Nothing usable supplied (None, "", []): fall back to the built-in
            # example. This also avoids dividing by zero below.
            parsed = json.loads(example_coords)

        points = [_lat_lng(entry) for entry in parsed]
        count = len(points)
        mean_lat = sum(point["lat"] for point in points) / count
        mean_lng = sum(point["lng"] for point in points) / count

        # Express every location relative to the mean position. New dicts are
        # built so the caller's parsed input is not mutated in place.
        locations = [
            {**point, "lat": point["lat"] - mean_lat, "lng": point["lng"] - mean_lng}
            for point in points
        ]
        print(locations)

        # Process the video; make_video's result is expected to be a tuple,
        # since it is concatenated with a one-element tuple below.
        output_video_path = make_video(uploaded_video, encoder=model_type)

        return output_video_path + (locations,)

    # Submit runs on_submit and distributes its result over the output video,
    # archive, frame gallery, mask editor and coordinates components.
    submit.click(
        on_submit,
        inputs=[input_video, model_type, coords],
        outputs=[processed_video, processed_zip, output_frame, output_mask, coords],
    )
    # Render calls get_mesh with the gallery frames, the blur levels and the
    # "Load all" flag, and shows the result in the 3D viewer.
    render.click(
        partial(get_mesh),
        inputs=[output_frame, blur_in, load_all],
        outputs=[result],
    )

    # Offer every file found in ./examples, in sorted order, as an example.
    example_files = [
        os.path.join('examples', filename)
        for filename in sorted(os.listdir('examples'))
    ]

    # NOTE(review): only input_video is listed as an input here, so on_submit
    # is presumably invoked with a single argument for examples - confirm.
    examples = gr.Examples(
        examples=example_files,
        inputs=[input_video],
        outputs=[processed_video, processed_zip, output_frame, output_mask, coords],
        fn=on_submit,
        cache_examples=True,
    )
    

# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    # Enable the request queue, then launch the server.
    demo.queue().launch()