Spaces: Running on Zero

Commit 4ad47a9 by sindhuhegde: "Update app"
1 Parent(s): 8f3cd14

Files changed:
- app.py (+105, -78)
- preprocess/inference_preprocess.py (+1, -1)
app.py
CHANGED
@@ -16,6 +16,7 @@ from utils.inference_utils import *
 from sync_models.gestsync_models import *
 from tqdm import tqdm
 from glob import glob
+from scipy.io.wavfile import write
 import mediapipe as mp
 from protobuf_to_dict import protobuf_to_dict
 import warnings
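Note: `scipy.io.wavfile.write` is presumably imported here for saving audio arrays to disk elsewhere in the app. For reference, a minimal sketch of how it is used (file name and tone are illustrative):

    import numpy as np
    from scipy.io.wavfile import write

    sample_rate = 16000                                    # 16 kHz, as used by the sync models
    t = np.arange(sample_rate) / sample_rate               # one second of samples
    tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
    write("tone.wav", sample_rate, tone)                   # illustrative output path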
@@ -25,7 +26,7 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.filterwarnings("ignore", category=UserWarning)
 
 # Initialize global variables
-CHECKPOINT_PATH = "model_rgb.pth"
+CHECKPOINT_PATH = "model_rgb.pth"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 use_cuda = torch.cuda.is_available()
 batch_size = 12
@@ -195,13 +196,14 @@ def resample_video(video_file, video_fname, result_folder):
     video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
 
     # Resample the video to 25 fps
+    # status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i {} -q:v 1 -filter:v fps=25 {}'.format(video_file, video_file_25fps), shell=True)
+    status = subprocess.call("ffmpeg -hide_banner -loglevel panic -y -i {} -c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}".format(video_file, video_file_25fps), shell=True)
+    if status != 0:
+        msg = "Oops! Could not resample the video to 25 FPS. Please check the input video and try again."
+        return None, msg
     print('Resampled the video to 25 fps: {}'.format(video_file_25fps))
-    call(cmd)
 
-    return video_file_25fps
+    return video_file_25fps, "success"
 
 def load_checkpoint(path, model):
     '''
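Note: the new command re-encodes losslessly (`-crf 0`) while forcing 25 fps, and the function now returns a `(path, status)` tuple instead of a bare path. A minimal sketch of the pattern this gives callers (paths are illustrative):

    import subprocess

    def resample_to_25fps(src, dst):
        # Lossless H.264 re-encode while forcing 25 fps, mirroring the command in the diff
        cmd = ("ffmpeg -hide_banner -loglevel panic -y -i {} "
               "-c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}").format(src, dst)
        if subprocess.call(cmd, shell=True) != 0:
            return None, "resample failed"
        return dst, "success"

    # Caller checks the status instead of assuming success
    path, status = resample_to_25fps("input.mp4", "input_25fps.mp4")  # illustrative paths
    if status != "success":
        print(status)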
@@ -418,7 +420,7 @@ def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, wi
 
     return input_frames, num_frames, orig_masked_frames, "success"
 
-def load_spectrograms(wav_file, num_frames, window_frames=25, stride=4):
+def load_spectrograms(wav_file, num_frames=None, window_frames=25, stride=4):
 
     '''
     This function extracts the spectrogram from the audio file
@@ -448,8 +450,9 @@ def load_spectrograms(wav_file, num_frames, window_frames=25, stride=4):
     orig_spec = spec
     spec = np.array([spec[i:i+(window_frames*stride), :] for i in range(0, spec.shape[0], stride) if (i+(window_frames*stride) <= spec.shape[0])])
 
-    if
-        spec
+    if num_frames is not None:
+        if len(spec) != num_frames:
+            spec = spec[:num_frames]
     frame_diff = np.abs(len(spec) - num_frames)
     if frame_diff > 60:
         print("The input video and audio length do not match - The results can be unreliable! Please check the input video.")
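Note: the new guard makes `num_frames` optional and trims the stacked spectrogram windows when a target length is given; roughly:

    import numpy as np

    def trim_windows(spec_windows, num_frames=None):
        # spec_windows: array of shape (num_windows, window_len, mel_bins)
        if num_frames is not None and len(spec_windows) != num_frames:
            spec_windows = spec_windows[:num_frames]  # drop trailing windows
        return spec_windows

    windows = np.zeros((130, 100, 80))        # dummy spectrogram windows
    print(trim_windows(windows, 125).shape)   # (125, 100, 80)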
@@ -590,8 +593,9 @@ def generate_video(frames, audio_file, video_fname):
         return None, msg
 
     video_output = video_fname + '.mp4'
-    status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 -shortest %s' %
+    status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -c:v libx264 -preset veryslow -crf 18 -pix_fmt yuv420p -strict -2 -q:v 1 -shortest %s' %
+                             (audio_file, no_sound_video, video_output), shell=True)
+
 if status != 0:
     msg = "Oops! Could not generate the video. Please check the input video and try again."
     return None, msg
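Note: `-crf 18` in the mux command above is a near-visually-lossless H.264 quality setting, while the resample step earlier uses `-crf 0` (mathematically lossless); both trade encode speed (`-preset veryslow`) for quality.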
@@ -599,7 +603,7 @@ def generate_video(frames, audio_file, video_fname):
     os.remove(fname)
     os.remove(no_sound_video)
 
-    return video_output
+    return video_output, "success"
 
 def sync_correct_video(video_path, frames, wav_file, offset, result_folder, sample_rate=16000, fps=25):
 
@@ -637,9 +641,11 @@ def sync_correct_video(video_path, frames, wav_file, offset, result_folder, sample_rate=16000, fps=25):
     corrected_frames = corrected_frames[:len(frames)-np.abs(offset)]
 
     corrected_video_path = os.path.join(result_folder, "result_sync_corrected")
-    video_output = generate_video(corrected_frames, wav_file, corrected_video_path)
+    video_output, status = generate_video(corrected_frames, wav_file, corrected_video_path)
+    if status != "success":
+        return None, status
 
-    return video_output
+    return video_output, "success"
 
 
 def load_masked_input_frames(test_videos, spec, wav_file, scene_num, result_folder):
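Note: these two hunks show the commit's recurring pattern: helpers now return a `(value, status)` pair and callers propagate the failure message instead of assuming success. A minimal sketch of the pattern (function names are illustrative stand-ins):

    def mux_audio():                      # stand-in for generate_video
        return "out.mp4", "success"

    def correct_sync():                   # stand-in for sync_correct_video
        video, status = mux_audio()
        if status != "success":
            return None, status           # propagate the error message unchanged
        return video, "success"

    print(correct_sync())                 # ('out.mp4', 'success')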
@@ -661,23 +667,26 @@ def load_masked_input_frames(test_videos, spec, wav_file, scene_num, result_folder):
     all_frames, all_orig_frames = [], []
     for video_num, video in enumerate(test_videos):
 
+        print("Processing video: ", video)
+
         # Load the video frames
         frames, status = load_video_frames(video)
         if status != "success":
             return None, None, status
+        print("Successfully loaded the video frames")
 
         # Extract the keypoints from the frames
         kp_dict, status = get_keypoints(frames)
         if status != "success":
             return None, None, status
+        print("Successfully extracted the keypoints")
 
         # Mask the frames using the keypoints extracted from the frames and prepare the input to the model
         masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict)
         if status != "success":
             return None, None, status
+        print("Successfully loaded the masked frames")
 
-        input_masked_vid_path = os.path.join(result_folder, "input_masked_scene_{}_speaker_{}".format(scene_num, video_num))
-        generate_video(orig_masked_frames, wav_file, input_masked_vid_path)
 
         # Check if the length of the input frames is equal to the length of the spectrogram
         if spec.shape[2]!=masked_frames.shape[0]:
@@ -691,6 +700,7 @@ def load_masked_input_frames(test_videos, spec, wav_file, scene_num, result_folder):
         # Transpose the frames to the correct format
         frames = np.transpose(masked_frames, (4, 0, 1, 2, 3))
         frames = torch.FloatTensor(np.array(frames)).unsqueeze(0)
+        print("Successfully converted the frames to tensor")
 
         all_frames.append(frames)
         all_orig_frames.append(orig_masked_frames)
@@ -830,24 +840,29 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
         - video_output (string) : Path of the output video
     '''
 
+    try:
+        output_frames = []
+        for i in range(len(input_frames)):
+
+            # If the active speaker is found, draw a bounding box around the active speaker
+            if i in output_tracks:
+                bbox = output_tracks[i]
+                x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
+                out = cv2.rectangle(input_frames[i].copy(), (x1, y1), (x2, y2), color=[0, 255, 0], thickness=3)
+            else:
+                out = input_frames[i]
 
+            output_frames.append(out)
 
+        # Generate the output video
+        output_video_fname = os.path.join(result_folder, "result_active_speaker_det")
+        video_output, status = generate_video(output_frames, wav_file, output_video_fname)
+        if status != "success":
+            return None, status
+    except Exception as e:
+        return None, f"Error: {str(e)}"
 
-    return video_output
+    return video_output, "success"
 
 def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 
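Note: the overlay logic in the new `save_video` body boils down to `cv2.rectangle` on a copy of each frame; a self-contained sketch with a dummy frame and box:

    import numpy as np
    import cv2

    frame = np.zeros((270, 480, 3), dtype=np.uint8)   # dummy frame
    bbox = [100, 50, 220, 200]                        # illustrative x1, y1, x2, y2
    x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
    out = cv2.rectangle(frame.copy(), (x1, y1), (x2, y2), color=[0, 255, 0], thickness=3)
    print(out.shape)                                  # (270, 480, 3)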
@@ -878,8 +893,12 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Resample the video to 25 fps if it is not already 25 fps
         print("FPS of video: ", fps)
         if fps!=25:
-            vid_path = resample_video(vid_path_processed, "preprocessed_video_25fps", result_folder_input)
+            vid_path, status = resample_video(vid_path_processed, "preprocessed_video_25fps", result_folder_input)
+            if status != "success":
+                return status, None
+            orig_vid_path_25fps, status = resample_video(video_path, "input_video_25fps", result_folder_input)
+            if status != "success":
+                return status, None
         else:
             vid_path = vid_path_processed
             orig_vid_path_25fps = video_path
@@ -978,7 +997,9 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         print("Predicted offset: ", pred_offset)
 
         # Generate sync-corrected video
-        video_output = sync_correct_video(video_path, orig_frames, wav_file, pred_offset, result_folder_output, sample_rate=16000, fps=fps)
+        video_output, status = sync_correct_video(video_path, orig_frames, wav_file, pred_offset, result_folder_output, sample_rate=16000, fps=fps)
+        if status != "success":
+            return status, None
         print("Successfully generated the video:", video_output)
 
         return f"Predicted offset: {pred_offset}", video_output
@@ -1005,14 +1026,14 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
     if global_speaker=="per-frame-prediction" and num_avg_frames<25:
         msg = "Number of frames to average need to be set to a minimum of 25 frames. Atleast 1-second context is needed for the model. Please change the num_avg_frames and try again..."
-        return
+        return msg, None
 
     # Read the video
     try:
         vr = VideoReader(video_path, ctx=cpu(0))
     except:
         msg = "Oops! Could not load the input video file"
-        return
+        return msg, None
 
     # Get the FPS of the video
     fps = vr.get_avg_fps()
@@ -1020,29 +1041,28 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
     # Resample the video to 25 FPS if the original video is of a different frame-rate
     if fps!=25:
-        test_video_25fps = resample_video(video_path, video_fname, result_folder_input)
+        test_video_25fps, status = resample_video(video_path, video_fname, result_folder_input)
+        if status != "success":
+            return status, None
     else:
         test_video_25fps = video_path
 
     # Load the video frames
     orig_frames, status = load_video_frames(test_video_25fps)
     if status != "success":
-        return
-    print("Successfully loaded the frames")
+        return status, None
 
     # Extract and save the audio file
     orig_wav_file, status = extract_audio(video_path, result_folder)
     if status != "success":
-        return
-    print("Successfully loaded the spectrograms")
+        return status, None
 
     # Pre-process and extract per-speaker tracks in each scene
     print("Pre-processing the input video...")
     status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
     if status != 0:
-        return
+        return "Error in pre-processing the input video, please check the input video and try again...", None
+
     # Load the tracks file saved during pre-processing
     with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
         tracks = pickle.load(file)
@@ -1056,7 +1076,6 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
             track_dict[scene_num][i] = {}
             for frame_num, bbox in zip(tracks[scene_num][i]['track']['frame'], tracks[scene_num][i]['track']['bbox']):
                 track_dict[scene_num][i][frame_num] = bbox
-    print("Successfully loaded the extracted person-tracks")
 
     # Get the total number of scenes
     test_scenes = os.listdir("{}/crops".format(result_folder_input))
@@ -1065,7 +1084,6 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
     # Load the trained model
     model = Transformer_RGB()
     model = load_checkpoint(CHECKPOINT_PATH, model)
-    print("Successfully loaded the model")
 
     # Compute the active speaker in each scene
     output_tracks = {}
@@ -1076,20 +1094,21 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
         if len(test_videos)<=1:
             msg = "To detect the active speaker, at least 2 visible speakers are required for each scene! Please check the input video and try again..."
-            return
+            return msg, None
 
         # Load the audio file
         audio_file = glob(os.path.join("{}/crops".format(result_folder_input), "scene_{}".format(str(scene_num)), "*.wav"))[0]
         spec, _, status = load_spectrograms(audio_file, window_frames=25)
         if status != "success":
-            return
+            return status, None
         spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0,1,2,4,3)
+        print("Successfully loaded the spectrograms")
 
         # Load the masked input frames
         all_masked_frames, all_orig_masked_frames, status = load_masked_input_frames(test_videos, spec, audio_file, scene_num, result_folder_input)
         if status != "success":
-            return
+            return status, None
+        print("Successfully loaded the masked input frames")
 
         # Prepare the audio and video sequences for the model
         audio_sequences = torch.cat([spec[:, :, i] for i in range(spec.size(2))], dim=0)
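Note: the `torch.cat` over the window dimension above folds the per-window spectrograms into the batch dimension; a shape walk-through with illustrative sizes:

    import torch

    spec = torch.zeros(1, 1, 10, 80, 100)   # (batch, channel, windows, mels, time), sizes illustrative
    audio_sequences = torch.cat([spec[:, :, i] for i in range(spec.size(2))], dim=0)
    print(audio_sequences.shape)             # torch.Size([10, 1, 80, 100])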
@@ -1105,6 +1124,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
             else:
                 video_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=False)
                 all_video_embs.append(video_emb)
+        print("Successfully extracted GestSync embeddings")
 
         # Predict the active speaker in each scene
         if global_speaker=="per-frame-prediction":
@@ -1134,7 +1154,7 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
             if len(predictions) != frame_pred:
                 msg = "Predicted frames {} and input video frames {} do not match!!".format(len(predictions), frame_pred)
-                return
+                return msg, None
 
             active_speakers[start:end] = predictions[0:]
 
@@ -1154,21 +1174,19 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
                     output_tracks[frame] = track_dict[scene_num][label][frame]
 
         # Save the output video
-        print("Generating active-speaker detection output video...")
         video_output, status = save_video(output_tracks, orig_frames.copy(), orig_wav_file, result_folder_output)
         if status != "success":
-            return
+            return status, None
         print("Successfully saved the output video: ", video_output)
 
-        return
+        return "success", video_output
 
     except Exception as e:
-        return
+        return f"Error: {str(e)}", None
 
 if __name__ == "__main__":
 
+
     # Custom CSS and HTML
     custom_css = """
     <style>
@@ -1291,7 +1309,7 @@ if __name__ == "__main__":
             gr.update(visible=True),   # submit_button
             gr.update(visible=True),   # clear_button
             gr.update(visible=False),  # sync_examples
-            gr.update(visible=True)
+            gr.update(visible=True)    # asd_examples
         )
 
     def clear_inputs():
@@ -1303,16 +1321,16 @@ if __name__ == "__main__":
         else:
             return process_video_activespeaker(video_input, global_speaker, num_avg_frames)
 
     # Define paths to sample videos
     sync_sample_videos = [
-        "samples/sync_sample_1.mp4",
-        "samples/sync_sample_2.mp4",
+        ["samples/sync_sample_1.mp4"],
+        ["samples/sync_sample_2.mp4"],
+        ["samples/sync_sample_3.mp4"]
     ]
 
     asd_sample_videos = [
-        "samples/asd_sample_1.mp4",
-        "samples/asd_sample_2.mp4"
+        ["samples/asd_sample_1.mp4"],
+        ["samples/asd_sample_2.mp4"]
    ]
 
    # Define Gradio interface
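Note: each sample path is now wrapped in its own list because `gr.Dataset` (used below) expects one row per sample, with one value per component; with a single video component, each row is a one-element list.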
@@ -1356,32 +1374,40 @@ if __name__ == "__main__":
         # Add a gap before examples
         gr.HTML('<div class="examples-holder"></div>')
 
+
         # Add examples that only populate the video input
-        sync_examples = gr.
-            fn=None,
-            cache_examples=False,
+        sync_examples = gr.Dataset(
+            samples=sync_sample_videos,
+            components=[video_input],
+            type="values",
             visible=False
         )
 
-        asd_examples = gr.
-            fn=None,
-            cache_examples=False,
+        asd_examples = gr.Dataset(
+            samples=asd_sample_videos,
+            components=[video_input],
+            type="values",
             visible=False
         )
 
         demo_choice.change(
             fn=toggle_demo,
             inputs=demo_choice,
-            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker, result_text, output_video, submit_button, clear_button, sync_examples
+            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker, result_text, output_video, submit_button, clear_button, sync_examples, asd_examples]
         )
 
+        sync_examples.select(
+            fn=lambda x: gr.update(value=x[0], visible=True),
+            inputs=sync_examples,
+            outputs=video_input
+        )
+
+        asd_examples.select(
+            fn=lambda x: gr.update(value=x[0], visible=True),
+            inputs=asd_examples,
+            outputs=video_input
+        )
+
         submit_button.click(
             fn=process_video,
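Note: a minimal, self-contained sketch of the `gr.Dataset` + `.select()` wiring introduced above (sample paths illustrative; assumes a Gradio version where passing the dataset as `inputs` hands the selected row to the handler):

    import gradio as gr

    with gr.Blocks() as demo:
        video_input = gr.Video(label="Input video")
        examples = gr.Dataset(
            samples=[["samples/clip_a.mp4"], ["samples/clip_b.mp4"]],  # one row per sample
            components=[video_input],
            type="values",
        )
        # Clicking a row copies its first field into the video component
        examples.select(
            fn=lambda x: gr.update(value=x[0], visible=True),
            inputs=examples,
            outputs=video_input,
        )

    demo.launch()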
@@ -1394,5 +1420,6 @@ if __name__ == "__main__":
             inputs=[],
             outputs=[demo_choice, video_input, global_speaker, num_avg_frames, apply_preprocess, result_text, output_video]
         )
+
     # Launch the interface
-    demo.launch(allowed_paths=["."],
+    demo.launch(allowed_paths=["."], share=True)
preprocess/inference_preprocess.py
CHANGED
@@ -165,7 +165,7 @@ def crop_video(opt, track, cropfile, tight_scale=1):
 def inference_video(opt, padding=0):
     videofile = os.path.join(opt.avi_dir, 'video.avi')
     vidObj = cv2.VideoCapture(videofile)
-    yolo_model = YOLO("
+    yolo_model = YOLO("yolov9m.pt")
     global dets, fidx
     dets = []
     fidx = 0
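Note: `YOLO("yolov9m.pt")` refers to an Ultralytics YOLOv9 checkpoint that the library downloads on first use; a minimal usage sketch (image path illustrative):

    from ultralytics import YOLO

    model = YOLO("yolov9m.pt")      # weights are fetched automatically on first run
    results = model("frame.jpg")    # illustrative input image
    for box in results[0].boxes:    # detections feed the track/crop pipeline
        print(box.xyxy, box.conf, box.cls)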
|