"""Gradio demo that runs a Conv3D + BiLSTM lip-reading model on a video clip.

The script builds the network, loads trained weights from ``model/checkpoint``
and serves a Gradio interface whose callback is :func:`Predict`.
"""

import os
from typing import List

import cv2
import gradio as gr
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.layers import (
    Conv3D,
    LSTM,
    Dense,
    Dropout,
    Bidirectional,
    MaxPool3D,
    Activation,
    Reshape,
    SpatialDropout3D,
    BatchNormalization,
    TimeDistributed,
    Flatten,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy


def convert_mp4_to_mpg(input_file, output_file, fps=30.0):
    """
    Convert an MP4 video file to an MPG video file using OpenCV.

    Args:
        input_file (str): Path to the input MP4 file.
        output_file (str): Path to the output MPG file.
        fps (float): Frame rate written to the output file. Defaults to 30.0.

    Returns:
        None

    Raises:
        OSError: If the input cannot be opened or the writer cannot be created.
    """
    cap = cv2.VideoCapture(input_file)
    out = None
    try:
        # Check if the video file was opened successfully
        if not cap.isOpened():
            raise OSError(f"Could not open video file: {input_file}")

        # Output frames keep the size of the input frames.
        frame_size = (
            int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        )

        # Define the codec and create a VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # Use 'mpg2' codec for MPG files
        out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)
        if not out.isOpened():
            # Without this check every write() below would silently do nothing.
            raise OSError(f"Could not open video writer for: {output_file}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
    finally:
        # Release the video objects even if reading or writing failed.
        cap.release()
        if out is not None:
            out.release()


def load_video(video_path):
    """Read a video, crop every frame and return the normalised frame stack.

    Each frame is converted to a single channel and cropped to rows 190-235
    and columns 80-219 (46 x 140), which matches the spatial part of the
    model's ``input_shape``.

    Args:
        video_path (str): Path of the video file to read.

    Returns:
        A float32 tensor holding the cropped frames, shifted by their mean and
        divided by their standard deviation.

    Raises:
        OSError: If the video cannot be opened.
        ValueError: If no frame could be read from the video.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open video file: {video_path}")

        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed the number of frames that
                # can actually be decoded; stop instead of processing None.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from: {video_path}")

    # NOTE(review): the mean is computed and subtracted *before* the cast to
    # float32, i.e. in whatever dtype the decoded frames have (presumably
    # uint8). This is left exactly as it was on purpose: the checkpoint loaded
    # below was presumably trained with this preprocessing - confirm before
    # changing it.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std


def load_data(path: tf.Tensor):
    """Load the preprocessed frames for the clip named by ``path``.

    Only the base name of ``path`` (without directory, up to the first dot) is
    used; the frames are read from ``data/s1/<name>.mpg``.

    Args:
        path: Scalar string tensor holding a file path. Both ``/`` and ``\\``
            are accepted as directory separators.

    Returns:
        The tensor produced by :func:`load_video` for the resolved file.
    """
    path = bytes.decode(path.numpy())
    # Normalise Windows separators so the same code works for either style.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    return load_video(video_path)


# Character vocabulary and the lookups between characters and integer ids.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# Network: three Conv3D/ReLU/MaxPool3D stages, then two bidirectional LSTMs
# with dropout, then a per-timestep softmax over the vocabulary (+1 class).
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'))

model.summary()

# Replace legacy.Adam with the appropriate legacy optimizer you used during training
optimizer = legacy.Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Loading weights
model.load_weights('model/checkpoint')


def Predict(Video):
    """Gradio callback: return the decoded text for the given video.

    Args:
        Video (str): Path of the video supplied through the interface. Only
            its base name is used; see :func:`load_data`.

    Returns:
        str: Text obtained by greedy CTC decoding of the model output.

    Raises:
        ValueError: If no video path was supplied.
    """
    if not Video:
        raise ValueError("No video was provided.")

    sample = load_data(tf.convert_to_tensor(Video))
    yhat = model.predict(tf.expand_dims(sample, axis=0))
    # Use the time dimension of the prediction itself as the CTC input length
    # so it cannot drift from the model definition.
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=[yhat.shape[1]], greedy=True
    )[0][0].numpy()
    result = [
        tf.strings.reduce_join([num_to_char(word) for word in sentence])
        for sentence in decoded
    ]
    return result[0].numpy().decode('utf-8')


interface = gr.Interface(
    fn=Predict,
    inputs="video",
    outputs="text",
    title='Video Lip Reading',
    description=(
        "Currently, it runs on MPG files. There is one present in the files "
        "by the name of 'bbaf2n.mpg', which results in "
        "'bin blue at F two now.' "
    ),
)

interface.launch(debug=True)