# Video lip-reading demo: a 3D-CNN + BiLSTM model served through a Gradio interface.
import os
from typing import List

import gradio as gr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import (
    Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation,
    Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
)
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler

# Vocabulary and lookup layers for converting between characters and integer ids.
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)


def load_alignments(path: str) -> List[str]:
    """Read a GRID-style .align file and return the encoded character sequence (silence tokens dropped)."""
    with open(path, 'r') as f:
        lines = f.readlines()
    tokens = []
    for line in lines:
        line = line.split()
        if line[2] != 'sil':
            tokens = [*tokens, ' ', line[2]]
    return char_to_num(
        tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), [-1])
    )[1:]


def load_data(path: str):
    """Load the video frames and the matching alignment for a given clip path."""
    path = bytes.decode(path.numpy())
    # file_name = path.split('/')[-1].split('.')[0]  # POSIX-style paths
    # File name splitting for Windows
    file_name = path.split('\\')[-1].split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)  # load_video() is not defined in this snippet; see the sketch at the end of this listing
    alignments = load_alignments(alignment_path)
    return frames, alignments


# Model: three Conv3D blocks over the 75-frame stack, followed by two bidirectional
# LSTMs and a softmax over the character vocabulary (+1 for the CTC blank token).
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'))

model.summary()

# Replace legacy.Adam with the appropriate legacy optimizer you used during training.
# (Compiling is not strictly required for inference, but is kept here for completeness.)
optimizer = legacy.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Load the trained weights
model.load_weights('/content/models/checkpoint')


def Predict(Video):
    """Run the model on an uploaded video and decode the output with greedy CTC decoding."""
    sample = load_data(tf.convert_to_tensor(Video))
    yhat = model.predict(tf.expand_dims(sample[0], axis=0))
    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
    result = [tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
    return result[0].numpy().decode('utf-8')


# gr.inputs/gr.outputs namespaces are deprecated; use the top-level components instead.
interface = gr.Interface(
    fn=Predict,
    inputs=gr.Video(),
    outputs=gr.Textbox(label='Generated Output'),
    title='Video Lip Reading',
)
interface.launch(debug=True)
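# ---------------------------------------------------------------------------
# NOTE: load_video() is called in load_data() above but is not defined in this
# snippet. The function below is a minimal sketch of what such a loader could
# look like (assumptions: opencv-python is installed, frames are converted to
# grayscale, a fixed mouth-region crop matching the model's (46, 140) spatial
# input is applied, and the clip is normalised per-video). The crop
# coordinates are placeholders, not the author's values; substitute the crop
# used during training, and define this function above load_data() when
# assembling the script.
import cv2


def load_video(path: str):
    """Read a clip, crop the mouth region, and return normalised float32 frames."""
    cap = cv2.VideoCapture(path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            break
        frame = tf.image.rgb_to_grayscale(frame)
        frames.append(frame[190:236, 80:220, :])  # placeholder crop -> (46, 140, 1)
    cap.release()

    # Normalise the whole clip to zero mean and unit variance.
    frames = tf.cast(frames, tf.float32)
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(frames)
    return (frames - mean) / std
# ---------------------------------------------------------------------------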