Spaces:

BilalSardar
/

Video_LipReading

Runtime error

App Files Files Community

Video_LipReading / app.py

BilalSardar

Update app.py

ce80742 about 1 year ago

raw

history blame contribute delete

4.26 kB

	import gradio as gr
	import tensorflow as tf
	from typing import List
	import os
	import cv2
	from tensorflow.keras.models import Sequential
	from tensorflow.keras.optimizers import legacy
	from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
	from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler



	def convert_mp4_to_mpg(input_file, output_file, fps=None):
	    """
	    Convert an MP4 video file to an MPG video file using OpenCV.

	    Frames are copied one by one from the input to the output without any
	    resizing; the output uses the input's frame width and height.

	    Args:
	        input_file (str): Path to the input MP4 file.
	        output_file (str): Path to the output MPG file.
	        fps (float, optional): Frame rate to write. When omitted, the frame
	            rate reported by the input is used, falling back to 30.0 when
	            OpenCV cannot report a usable value.

	    Returns:
	        None

	    Raises:
	        Exception: If the input file cannot be opened or the output writer
	            cannot be created.
	    """
	    cap = cv2.VideoCapture(input_file)
	    out = None
	    try:
	        # Check if the video file was opened successfully
	        if not cap.isOpened():
	            raise Exception(f"Could not open video file: {input_file}")

	        if fps is None:
	            source_fps = cap.get(cv2.CAP_PROP_FPS)
	            # The comparison also rejects NaN, which some backends return.
	            fps = source_fps if source_fps and source_fps > 0 else 30.0

	        frame_size = (
	            int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
	            int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
	        )

	        # Define the codec and create a VideoWriter object
	        fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # Use 'mpg2' codec for MPG files
	        out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)
	        if not out.isOpened():
	            # Without this check a missing codec silently produces no output.
	            raise Exception(f"Could not create video file: {output_file}")

	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                break
	            out.write(frame)
	    finally:
	        # Release the video objects even when an error interrupted the copy
	        cap.release()
	        if out is not None:
	            out.release()

	def load_video(video_path):
	    """
	    Load a video, crop every frame to a fixed region and normalise it.

	    Each decoded frame is converted to a single channel with
	    ``tf.image.rgb_to_grayscale`` and cropped to rows 190:236 and columns
	    80:220, i.e. 46x140 pixels, which matches the spatial size in the
	    model's ``input_shape``. The stack of frames is then shifted by its
	    mean and divided by its standard deviation.

	    Args:
	        video_path (str): Path to a video file readable by OpenCV.

	    Returns:
	        A float32 tensor of normalised frames.

	    Raises:
	        ValueError: If the video cannot be opened or yields no frames.
	    """
	    cap = cv2.VideoCapture(video_path)
	    frames = []
	    try:
	        if not cap.isOpened():
	            raise ValueError(f"Could not open video file: {video_path}")

	        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
	            ret, frame = cap.read()
	            if not ret:
	                # The reported frame count can exceed what is decodable;
	                # stop instead of passing None to TensorFlow.
	                break
	            frame = tf.image.rgb_to_grayscale(frame)
	            frames.append(frame[190:236, 80:220, :])
	    finally:
	        cap.release()

	    if not frames:
	        raise ValueError(f"No frames could be read from video file: {video_path}")

	    # NOTE(review): the mean is taken over the un-cast frames while the std
	    # is taken over float32 frames. This is left exactly as it was because
	    # the loaded weights presumably depend on this preprocessing - confirm
	    # against the training code before changing it.
	    mean = tf.math.reduce_mean(frames)
	    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
	    return tf.cast((frames - mean), tf.float32) / std

	def load_data(path: tf.Tensor):
	    """
	    Resolve a video path held in a string tensor and load its frames.

	    If the decoded path points at an existing file, that file is loaded
	    directly. Otherwise the bare file name (text before the first dot of
	    the last path component) is looked up as ``data/s1/<name>.mpg``.

	    Args:
	        path: String tensor holding the video path; ``path.numpy()`` must
	            return bytes.

	    Returns:
	        The normalised frame tensor produced by ``load_video``.
	    """
	    path = bytes.decode(path.numpy())

	    # Previously only '\\' was treated as a separator, so POSIX paths worked
	    # only because an absolute path made os.path.join discard 'data/s1'.
	    # Using an existing file directly makes that behaviour explicit.
	    if os.path.isfile(path):
	        return load_video(path)

	    # Accept both Windows and POSIX separators when extracting the name.
	    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
	    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
	    return load_video(video_path)


	# Characters the model can emit; each one is a separate vocabulary entry.
	vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

	# Forward lookup: character -> integer id (empty string is the OOV token).
	char_to_num = tf.keras.layers.StringLookup(oov_token="", vocabulary=vocab)

	# Inverse lookup: integer id -> character, built from the forward
	# layer's vocabulary so both directions share the same ids.
	num_to_char = tf.keras.layers.StringLookup(
	    invert=True,
	    oov_token="",
	    vocabulary=char_to_num.get_vocabulary(),
	)

	# Network definition: three Conv3D/ReLU/MaxPool3D stages, a per-frame
	# flatten, two bidirectional LSTM layers with dropout, and a softmax over
	# the vocabulary size plus one extra class.
	model = Sequential([
	    # Input: 75 frames of 46x140 single-channel images.
	    Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'),
	    Activation('relu'),
	    MaxPool3D((1,2,2)),

	    Conv3D(256, 3, padding='same'),
	    Activation('relu'),
	    MaxPool3D((1,2,2)),

	    Conv3D(75, 3, padding='same'),
	    Activation('relu'),
	    MaxPool3D((1,2,2)),

	    # Flatten each time step separately so the time axis is preserved.
	    TimeDistributed(Flatten()),

	    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
	    Dropout(.5),

	    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
	    Dropout(.5),

	    Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'),
	])
	model.summary()

	# Use the same legacy optimizer class that was used during training;
	# swap legacy.Adam for a different legacy optimizer if training used one.
	optimizer = legacy.Adam(learning_rate=0.001)

	model.compile(
	    loss='categorical_crossentropy',
	    metrics=['accuracy'],
	    optimizer=optimizer,
	)

	# Restore the trained weights from the checkpoint stored under model/.
	model.load_weights('model/checkpoint')

	def Predict(Video):
	    """
	    Run the lip-reading model on a video and return the decoded text.

	    Args:
	        Video (str): Path to the video file supplied by the interface.

	    Returns:
	        str: The greedy CTC decoding of the model output for the video.

	    Raises:
	        ValueError: If no video was supplied.
	    """
	    if not Video:
	        # Fail with a readable message instead of an opaque tensor
	        # conversion error when the input is empty.
	        raise ValueError("No video was provided.")

	    # NOTE: no MP4 -> MPG conversion is performed here; the file is handed
	    # to load_data as-is.
	    sample = load_data(tf.convert_to_tensor(Video))
	    yhat = model.predict(tf.expand_dims(sample, axis=0))

	    # Take the sequence length from the prediction's time axis rather than
	    # hard-coding it, so it always agrees with the model output.
	    decoded = tf.keras.backend.ctc_decode(
	        yhat, input_length=[yhat.shape[1]], greedy=True
	    )[0][0].numpy()
	    result = [
	        tf.strings.reduce_join([num_to_char(word) for word in sentence])
	        for sentence in decoded
	    ]
	    return result[0].numpy().decode('utf-8')


	# Web UI: one video input, one text output, backed by Predict.
	interface = gr.Interface(
	    title='Video Lip Reading',
	    description="""Currently, it runs on MPG files. There is one present in the files by the name of 'bbaf2n.mpg', which results in 'bin blue at F two now.' """,
	    fn=Predict,
	    inputs="video",
	    outputs="text",
	)

	# Start the app with debug output enabled.
	interface.launch(debug=True)