"""Gradio demo: predict the spoken sentence from a video of a speaker's mouth.

The script converts an uploaded video to MPEG, crops and normalises its
frames, runs them through a Conv3D + bidirectional-LSTM network whose weights
are loaded from ``model/checkpoint``, and greedily CTC-decodes the output
into text.
"""

import os
from typing import List

import cv2
import gradio as gr
import tensorflow as tf
from moviepy.editor import VideoFileClip
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy

# Shape of one model input sample: (frames, height, width, channels).
# The height/width equal the size of the crop taken in load_video
# (rows 190:236 -> 46, columns 80:220 -> 140).
INPUT_FRAMES = 75
FRAME_HEIGHT = 46
FRAME_WIDTH = 140


def convert_mp4_to_mpg(input_file, output_file="output.mpg"):
    """Re-encode ``input_file`` as an MPEG-2 video.

    Args:
        input_file: Path of the video to convert.
        output_file: Destination path. Defaults to ``"output.mpg"`` in the
            current working directory.

    Returns:
        The path the converted video was written to.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file '{input_file}' not found.")

    video_clip = VideoFileClip(input_file)
    try:
        video_clip.write_videofile(output_file, codec='mpeg2video')
    finally:
        # Release the reader even when encoding fails.
        video_clip.close()
    return output_file


def load_video(video_path):
    """Read a video and return its cropped, normalised grayscale frames.

    Each frame is converted to grayscale and cropped to rows 190:236 and
    columns 80:220. The stacked frames are then shifted by their mean and
    divided by their standard deviation.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A float32 tensor with one entry per decoded frame.

    Raises:
        ValueError: If no frame could be decoded from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is decodable;
                # stop instead of passing a missing frame to TensorFlow.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from '{video_path}'.")

    # NOTE(review): the mean is computed and subtracted in the frames' own
    # dtype, before the cast to float32. This is kept exactly as it was so the
    # inputs stay consistent with whatever preprocessing the checkpoint was
    # trained with -- confirm before changing.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std


def load_data(path: str):
    """Load the frames of the ``data/s1`` video named by a path tensor.

    Only the file stem of ``path`` is used; the video itself is always read
    from ``data/s1/<stem>.mpg``.

    Args:
        path: Scalar string tensor holding a file path (its ``.numpy()``
            value is decoded to ``str``).

    Returns:
        The normalised frames produced by ``load_video``.
    """
    path = bytes.decode(path.numpy())
    # Accept both Windows ('\\') and POSIX ('/') separators.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    frames = load_video(video_path)
    return frames


def _fit_to_model_input(frames):
    """Truncate or zero-pad ``frames`` along axis 0 to ``INPUT_FRAMES`` entries.

    Raises:
        ValueError: If the per-frame shape is not
            ``(FRAME_HEIGHT, FRAME_WIDTH, 1)``, which happens when the source
            video is too small for the fixed crop window.
    """
    expected = (FRAME_HEIGHT, FRAME_WIDTH, 1)
    actual = tuple(frames.shape[1:])
    if actual != expected:
        raise ValueError(
            f"Cropped frames have shape {actual}, expected {expected}; "
            "the video resolution is too small for the crop window."
        )

    frames = frames[:INPUT_FRAMES]
    missing = INPUT_FRAMES - int(frames.shape[0])
    if missing > 0:
        # NOTE(review): short clips are padded with zero frames; presumably
        # this matches how short samples were padded in training -- confirm.
        frames = tf.pad(frames, [[0, missing], [0, 0], [0, 0], [0, 0]])
    return frames


# Character vocabulary and the forward / inverse lookup layers used to map
# between characters and integer ids.
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# Network: three Conv3D/ReLU/MaxPool3D stages, per-frame flatten, two
# bidirectional LSTMs with dropout, and a softmax over the vocabulary.
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

# One output class more than the lookup vocabulary (presumably the CTC blank).
model.add(Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))

model.summary()

optimizer = legacy.Adam(learning_rate=0.001)  # Replace legacy.Adam with the appropriate legacy optimizer you used during training
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Loading weights
model.load_weights('model/checkpoint')


def Predict(Video):
    """Return the text predicted for the video at path ``Video``.

    The video is converted to MPEG, loaded and normalised, fitted to the
    model's fixed input length, passed through the model, and the output is
    greedily CTC-decoded into a string.

    Args:
        Video: Path of the video file supplied by the Gradio video input.

    Returns:
        The decoded sentence as a ``str``.

    Raises:
        ValueError: If no video was supplied, no frames could be decoded, or
            the frames are too small for the crop window.
        FileNotFoundError: If ``Video`` does not exist.
    """
    if not Video:
        raise ValueError("No video was provided.")

    # Read the file that was just written. Going through load_data would
    # redirect the path to data/s1/ and miss the converted upload.
    mpg_path = convert_mp4_to_mpg(Video)
    sample = _fit_to_model_input(load_video(mpg_path))

    yhat = model.predict(tf.expand_dims(sample, axis=0))
    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[INPUT_FRAMES], greedy=True)[0][0].numpy()
    result = [tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
    return result[0].numpy().decode('utf-8')


interface = gr.Interface(fn=Predict, inputs="video", outputs="text", title='Video Lip Reading')
interface.launch(debug=True)