Spaces:
Runtime error
Runtime error
import gradio as gr | |
import tensorflow as tf | |
from typing import List | |
import os | |
import cv2 | |
from tensorflow.keras.models import Sequential | |
from tensorflow.keras.optimizers import legacy | |
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten | |
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler | |
def convert_mp4_to_mpg(input_file, output_file):
    """
    Convert an MP4 video file to an MPG video file using OpenCV.

    Frames are copied one at a time from the input to the output, keeping the
    input's resolution and frame rate.

    Args:
        input_file (str): Path to the input MP4 file.
        output_file (str): Path to the output MPG file.

    Returns:
        None

    Raises:
        Exception: If the input video cannot be opened, or if the output
            writer cannot be created for ``output_file``.
    """
    cap = cv2.VideoCapture(input_file)
    try:
        # Check if the video file was opened successfully
        if not cap.isOpened():
            raise Exception(f"Could not open video file: {input_file}")

        frame_size = (
            int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        )

        # Keep the source frame rate so playback speed is preserved; fall back
        # to the previous fixed 30 fps when the container does not report one
        # (the comparison also rejects NaN).
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30.0

        # Define the codec and create a VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # Use 'mpg2' codec for MPG files
        out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)
        try:
            # Fail loudly rather than silently producing no output file.
            if not out.isOpened():
                raise Exception(f"Could not open video writer for: {output_file}")

            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                out.write(frame)
        finally:
            out.release()
    finally:
        # Release the capture even if opening or copying failed.
        cap.release()
def load_video(video_path):
    """
    Load a video as a normalised stack of cropped grayscale frames.

    Each frame is converted with ``tf.image.rgb_to_grayscale`` and cropped to
    rows 190:236 and columns 80:220. The stacked frames are then shifted by
    their mean and divided by their standard deviation.

    Args:
        video_path (str): Path to the video file to read.

    Returns:
        A float32 tensor holding the normalised frames.

    Raises:
        ValueError: If no frame could be read from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is actually
                # decodable; stop instead of passing None downstream.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        cap.release()

    if not frames:
        raise ValueError(f"Could not read any frames from video: {video_path}")

    # NOTE(review): the mean is taken over the frames in their original dtype
    # while the std is taken over a float32 cast. This is left exactly as it
    # was so the preprocessing stays consistent with whatever the checkpoint
    # was trained on -- confirm before changing.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
def load_data(path: str):
    """
    Load the preprocessed frames for the clip identified by ``path``.

    The file name (without directory or extension) is looked up as
    ``data/s1/<name>.mpg``. If that bundled clip does not exist, the video at
    ``path`` itself is loaded instead.

    Args:
        path: Object whose ``.numpy()`` returns the path as bytes. Despite the
            ``str`` annotation, the only caller in this file passes the result
            of ``tf.convert_to_tensor``.

    Returns:
        The normalised frame tensor produced by ``load_video``.
    """
    path = bytes.decode(path.numpy())

    # Split on both separators so the base name is extracted correctly on
    # Windows and POSIX alike (previously only '\\' was handled).
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]

    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    if not os.path.exists(video_path):
        # No bundled clip with this name: read the supplied file directly.
        video_path = path

    return load_video(video_path)
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "] | |
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="") | |
num_to_char = tf.keras.layers.StringLookup( | |
vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True | |
) | |
# Network definition: three Conv3D/ReLU/MaxPool3D stages, a per-timestep
# flatten, two bidirectional LSTM layers with dropout, and a softmax over the
# vocabulary plus one extra class.
model = Sequential([
    # Input: 75 frames of 46x140 single-channel images.
    Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'),
    Activation('relu'),
    MaxPool3D((1,2,2)),

    Conv3D(256, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1,2,2)),

    Conv3D(75, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1,2,2)),

    # Flatten the spatial dimensions separately for every timestep.
    TimeDistributed(Flatten()),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'),
])
model.summary()

# Use the same legacy optimizer that was used during training.
optimizer = legacy.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Restore the trained weights from the checkpoint.
model.load_weights('model/checkpoint')
def Predict(Video):
    """
    Run the lip-reading model on a video and return the decoded text.

    Args:
        Video: Path of the video, as supplied by the Gradio "video" input.

    Returns:
        str: The greedy CTC decoding of the model output for the clip.
    """
    # The MP4 -> MPG conversion step is not performed here; the path is
    # handed to load_data unchanged.
    frames = load_data(tf.convert_to_tensor(Video))

    # The model expects a leading batch dimension.
    batch = tf.expand_dims(frames, axis=0)
    yhat = model.predict(batch)

    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()

    # Map each decoded index of the first (only) sentence back to its
    # character and join them into a single string.
    characters = [num_to_char(token) for token in decoded[0]]
    sentence = tf.strings.reduce_join(characters)
    return sentence.numpy().decode('utf-8')
# Gradio front end: takes a video, shows the predicted sentence as text.
interface = gr.Interface(
    fn=Predict,
    inputs="video",
    outputs="text",
    title='Video Lip Reading',
    description="""Currently, it runs on MPG files. There is one present in the files by the name of 'bbaf2n.mpg', which results in 'bin blue at F two now.' """,
)

# Start the web app with debug output enabled.
interface.launch(debug=True)