import os
import time

import cv2
import imutils
import torch
import timm
import einops
import tqdm
import numpy as np
import gradio as gr

from cotracker.utils.visualizer import Visualizer

# Directory containing this script; example videos and rendered results are
# resolved relative to it.
_APP_DIR = os.path.dirname(__file__)
_RESULTS_DIR = os.path.join(_APP_DIR, "results")


def parse_video(video_file):
    """Decode every readable frame of a video into a single RGB array.

    Args:
        video_file: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to decode.

    Returns:
        A NumPy array made by stacking all decoded frames along a new first
        axis, with each frame converted from OpenCV's BGR order to RGB.

    Raises:
        ValueError: If no frame could be read (missing, empty or
            undecodable file).
    """
    capture = cv2.VideoCapture(video_file)
    frames = []
    try:
        while True:
            got_frame, frame = capture.read()
            if frame is not None:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            if not got_frame:
                break
    finally:
        # Always free the decoder handle, even if colour conversion fails.
        capture.release()

    if not frames:
        # np.stack([]) would raise a ValueError with an unhelpful message;
        # raise the same exception type but say which input was the problem.
        raise ValueError(f"No frames could be read from video: {video_file!r}")
    return np.stack(frames)


def cotracker_demo(
    input_video,
    grid_size: int = 10,
    grid_query_frame: int = 0,
    backward_tracking: bool = False,
    tracks_leave_trace: bool = False
):
    """Track a regular grid of points through a video and render the result.

    Args:
        input_video: Video source handed to :func:`parse_video`.
        grid_size: Passed to the model as ``grid_size``; also selects the
            line width used for drawing (thicker lines for sparser grids).
        grid_query_frame: Frame index at which the grid is queried. Clamped
            to the last frame of the video.
        backward_tracking: Passed to the model as ``backward_tracking``.
        tracks_leave_trace: When true the visualizer is configured with
            ``tracks_leave_trace=-1``, otherwise ``0``.

    Returns:
        Path of the rendered video,
        ``<script dir>/results/<millisecond timestamp>_pred_track.mp4``.

    Raises:
        ValueError: Propagated from :func:`parse_video` when the video
            contains no readable frames.
    """
    load_video = parse_video(input_video)
    # Do not query past the end of short clips.
    grid_query_frame = min(len(load_video) - 1, grid_query_frame)
    # Move the channel axis in front of the spatial axes and add a leading
    # batch axis. NOTE(review): assumes frames are (H, W, C), giving
    # (1, T, C, H, W) - confirm against the model's expected layout.
    load_video = torch.from_numpy(load_video).permute(0, 3, 1, 2)[None].float()

    model = torch.hub.load("facebookresearch/co-tracker", "cotracker_w8")
    if torch.cuda.is_available():
        model = model.cuda()
        load_video = load_video.cuda()

    pred_tracks, pred_visibility = model(
        load_video,
        grid_size=grid_size,
        grid_query_frame=grid_query_frame,
        backward_tracking=backward_tracking
    )

    # Sparser grids get thicker lines so individual tracks stay visible.
    if grid_size < 10:
        linewidth = 4
    elif grid_size < 20:
        linewidth = 3
    else:
        linewidth = 2

    vis = Visualizer(
        save_dir=_RESULTS_DIR,
        grayscale=False,
        pad_value=100,
        fps=10,
        linewidth=linewidth,
        show_first_frame=5,
        tracks_leave_trace=-1 if tracks_leave_trace else 0,
    )

    # Millisecond timestamp used as the output file stem.
    filename = str(round(time.time() * 1000))
    vis.visualize(
        load_video.cpu(),
        tracks=pred_tracks.cpu(),
        visibility=pred_visibility.cpu(),
        filename=filename,
        query_frame=grid_query_frame,
    )
    return os.path.join(_RESULTS_DIR, f"{filename}_pred_track.mp4")


# Example clips offered in the UI.
apple = os.path.join(_APP_DIR, "videos", "apple.mp4")
bear = os.path.join(_APP_DIR, "videos", "bear.mp4")
paragliding_launch = os.path.join(_APP_DIR, "videos", "paragliding-launch.mp4")
paragliding = os.path.join(_APP_DIR, "videos", "paragliding.mp4")

# User-facing introduction shown under the title.
# NOTE(review): "CoTracker" and "GitHub Repo" read like link text without a
# target - confirm whether markup was lost from this string.
_DESCRIPTION = """

Welcome to CoTracker! This space demonstrates point (pixel) tracking in videos. \
Points are sampled on a regular grid and are tracked jointly.

To get started, simply upload your .mp4 video in landscape orientation or click on one of the example videos to load them. The shorter the video, the faster the processing. We recommend submitting short videos of length 2-7 seconds.

The total number of grid points is the square of Grid Size.
To specify the starting frame for tracking, adjust Grid Query Frame. Tracks will be visualized only after the selected frame.
Use Backward Tracking to track points from the selected frame in both directions.
Check Visualize Track Traces to visualize traces of all the tracked points.

For more details, check out our GitHub Repo ⭐

"""

app = gr.Interface(
    title="🎨 CoTracker: It is Better to Track Together",
    description=_DESCRIPTION,
    fn=cotracker_demo,
    inputs=[
        gr.Video(type="file", label="Input video", interactive=True),
        gr.Slider(minimum=1, maximum=30, step=1, value=10, label="Grid Size"),
        # `value=` (not `default=`) sets the initial position, matching the
        # Grid Size slider above.
        gr.Slider(minimum=0, maximum=30, step=1, value=0, label="Grid Query Frame"),
        gr.Checkbox(label="Backward Tracking"),
        gr.Checkbox(label="Visualize Track Traces"),
    ],
    outputs=gr.Video(label="Video with predicted tracks"),
    examples=[
        [apple, 10, 0, False, False],
        [apple, 20, 30, True, False],
        [bear, 10, 0, False, False],
        [paragliding, 10, 0, False, False],
        [paragliding_launch, 10, 0, False, False],
    ],
    cache_examples=False,
    allow_flagging=False,
)
app.queue(max_size=20, concurrency_count=2).launch(debug=True)