# Provenance: uploaded by BilalSardar - commit ce80742 ("Update app.py")
import gradio as gr
import tensorflow as tf
from typing import List
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
def convert_mp4_to_mpg(input_file, output_file, fps=30.0):
    """Convert an MP4 video file to an MPG video file using OpenCV.

    Frames are copied one by one from the input to the output without
    resizing; the output uses the frame width and height reported by the
    input capture.

    Args:
        input_file (str): Path to the input MP4 file.
        output_file (str): Path to the output MPG file.
        fps (float): Frame rate passed to the video writer. Defaults to
            30.0, the value that used to be hard-coded.

    Returns:
        None

    Raises:
        Exception: If the input file cannot be opened or the output writer
            cannot be created.
    """
    cap = cv2.VideoCapture(input_file)
    out = None
    try:
        # Check if the video file was opened successfully
        if not cap.isOpened():
            raise Exception(f"Could not open video file: {input_file}")

        frame_size = (
            int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        )

        # Define the codec and create a VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # Use 'mpg2' codec for MPG files
        out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)
        if not out.isOpened():
            # Without this check a failed writer silently drops every frame.
            raise Exception(f"Could not open video writer for: {output_file}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
    finally:
        # Release the video objects even when an error interrupts the copy.
        cap.release()
        if out is not None:
            out.release()
def load_video(video_path):
    """Read a video, crop every frame to a fixed window and standardise it.

    Each decoded frame is converted with ``tf.image.rgb_to_grayscale`` and
    cropped to rows 190:236 and columns 80:220. The stacked frames are then
    shifted by their mean and divided by their standard deviation.

    Args:
        video_path (str): Path of the video file to read.

    Returns:
        A float32 tensor of the normalised, cropped frames, stacked along
        the first axis in reading order.

    Raises:
        ValueError: If no frame could be read from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # A failed read yields no usable frame; stop instead of
                # handing it to TensorFlow.
                break
            # NOTE(review): OpenCV is documented to return BGR frames while
            # rgb_to_grayscale assumes RGB; left unchanged because the
            # loaded weights presumably expect this exact preprocessing.
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        cap.release()

    if not frames:
        raise ValueError(f"Could not read any frames from video: {video_path}")

    # The mean is taken on the frames as read, the std after a float32 cast;
    # this arithmetic is deliberately kept as it was.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
def load_data(path: str):
    """Load the preprocessed frames for the video identified by ``path``.

    The file stem of ``path`` is looked up as ``data/s1/<stem>.mpg``. When
    that file does not exist but ``path`` itself does, ``path`` is loaded
    directly.

    Args:
        path: Location of the video. May be a TensorFlow string tensor
            (anything exposing ``.numpy()``), ``bytes`` or ``str``; both
            ``/`` and ``\\`` are accepted as directory separators.

    Returns:
        The tensor produced by ``load_video`` for the resolved file.
    """
    # Callers pass a string tensor; plain str/bytes are accepted as well so
    # the ``str`` annotation holds.
    raw = path.numpy() if hasattr(path, 'numpy') else path
    if isinstance(raw, bytes):
        raw = raw.decode()

    # Normalise Windows separators so the stem is extracted the same way on
    # every platform (splitting on '\\' alone left the directory part in
    # place on POSIX).
    base_name = os.path.basename(raw.replace('\\', '/'))
    file_name = os.path.splitext(base_name)[0]

    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    if not os.path.exists(video_path) and os.path.exists(raw):
        # On POSIX an absolute input path used to be opened directly, because
        # os.path.join discards earlier components when a later one is
        # absolute. This fallback keeps such inputs (e.g. uploads) working.
        video_path = raw

    return load_video(video_path)
# Symbols handled by the lookup layers, one vocabulary entry per character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward lookup: character -> integer id (empty string is the OOV token).
char_to_num = tf.keras.layers.StringLookup(oov_token="", vocabulary=vocab)

# Inverse lookup built from the forward layer's vocabulary: id -> character.
num_to_char = tf.keras.layers.StringLookup(
    invert=True,
    oov_token="",
    vocabulary=char_to_num.get_vocabulary(),
)
# Network definition: three Conv3D -> ReLU -> MaxPool3D stages (pooling only
# over the two spatial axes), a per-time-step flatten, two bidirectional
# LSTMs each followed by dropout, and a per-time-step softmax.
model = Sequential([
    Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    Conv3D(256, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    Conv3D(75, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    TimeDistributed(Flatten()),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    # One unit per vocabulary entry plus one extra class
    # (presumably the CTC blank - TODO confirm).
    Dense(
        char_to_num.vocabulary_size() + 1,
        kernel_initializer='he_normal',
        activation='softmax',
    ),
])
model.summary()

# Swap legacy.Adam for whichever legacy optimizer was used during training.
optimizer = legacy.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Restore the trained weights from the checkpoint stored under model/.
model.load_weights('model/checkpoint')
def Predict(Video):
    """Run the lip-reading model on a video and return the decoded text.

    Args:
        Video: Path of the video file, as delivered by the Gradio "video"
            input component.

    Returns:
        str: The greedy CTC decoding of the model output for the video.

    Raises:
        ValueError: If no video was supplied.
    """
    if not Video:
        # Submitting the form without a file hands over an empty value; fail
        # with a readable message instead of an obscure tensor-conversion error.
        raise ValueError("No video provided. Please upload a video file.")

    # NOTE: convert_mp4_to_mpg is not invoked here (the call was previously
    # commented out); the input is loaded as given.
    sample = load_data(tf.convert_to_tensor(Video))
    yhat = model.predict(tf.expand_dims(sample, axis=0))

    # One sequence length per batch item, taken from the prediction itself
    # rather than repeating the model's fixed time dimension as a literal.
    input_length = [yhat.shape[1]] * yhat.shape[0]
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=input_length, greedy=True
    )[0][0].numpy()

    # Only the first (and only) batch item is returned, so decode just that.
    sentence = tf.strings.reduce_join(num_to_char(decoded[0]))
    return sentence.numpy().decode('utf-8')
# Text shown to the user in the web UI.
_INTERFACE_TITLE = 'Video Lip Reading'
_INTERFACE_DESCRIPTION = """Currently, it runs on MPG files. There is one present in the files by the name of 'bbaf2n.mpg', which results in 'bin blue at F two now.' """

# Gradio front end: a single video input routed through Predict to a text output.
interface = gr.Interface(
    fn=Predict,
    inputs="video",
    outputs="text",
    title=_INTERFACE_TITLE,
    description=_INTERFACE_DESCRIPTION,
)

# Start serving the app with Gradio's debug mode switched on.
interface.launch(debug=True)