BilalSardar committed on
Commit
56c7749
1 Parent(s): bfac2f7

Create app.py

Files changed (1)
  1. app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ from typing import List
+
+ import gradio as gr
+ import tensorflow as tf
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.optimizers import legacy
+ from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
+ from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
+
+ def load_alignments(path: str) -> List[str]:
+     # Parse an .align transcript: drop 'sil' (silence) entries and map the spoken words to character indices
+     with open(path, 'r') as f:
+         lines = f.readlines()
+     tokens = []
+     for line in lines:
+         line = line.split()
+         if line[2] != 'sil':
+             tokens = [*tokens, ' ', line[2]]
+     return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]
+
+ def load_data(path: str):
+     path = bytes.decode(path.numpy())
+     #file_name = path.split('/')[-1].split('.')[0]
+     # File name splitting for windows
+     file_name = path.split('\\')[-1].split('.')[0]
+     video_path = os.path.join('data', 's1', f'{file_name}.mpg')
+     alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
+     frames = load_video(video_path)  # load_video is not defined in this file; see the sketch below
+     alignments = load_alignments(alignment_path)
+
+     return frames, alignments
+
+
+ vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
+ char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
+ num_to_char = tf.keras.layers.StringLookup(
+     vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
+ )
+
+ # 3D CNN + bidirectional LSTM stack over clips of 75 frames sized 46x140 (single channel)
+ model = Sequential()
+ model.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
+ model.add(Activation('relu'))
+ model.add(MaxPool3D((1, 2, 2)))
+
+ model.add(Conv3D(256, 3, padding='same'))
+ model.add(Activation('relu'))
+ model.add(MaxPool3D((1, 2, 2)))
+
+ model.add(Conv3D(75, 3, padding='same'))
+ model.add(Activation('relu'))
+ model.add(MaxPool3D((1, 2, 2)))
+
+ model.add(TimeDistributed(Flatten()))
+
+ model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
+ model.add(Dropout(.5))
+
+ model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
+ model.add(Dropout(.5))
+
+ model.add(Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'))  # +1 output class for the CTC blank token
+ model.summary()
+
+ optimizer = legacy.Adam(learning_rate=0.001)  # Replace legacy.Adam with the appropriate legacy optimizer you used during training
+
+ model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
+
+ # Loading weights
+ model.load_weights('/content/models/checkpoint')
+
+ def Predict(Video):
+     # Run the model on one uploaded video and greedily CTC-decode the predicted character sequence
+     sample = load_data(tf.convert_to_tensor(Video))
+     yhat = model.predict(tf.expand_dims(sample[0], axis=0))
+     decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
+     result = [tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
+     return result[0].numpy().decode('utf-8')
+
+
+ interface = gr.Interface(fn=Predict,
+                          inputs="video",
+                          outputs=[gr.inputs.Textbox(label='Generated Output')],
+                          title='Video Lip Reading')
+
+
+ interface.launch(debug=True)
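
Note: load_data calls load_video, but that helper is neither defined nor imported in app.py, so prediction will fail with a NameError unless it is supplied. Below is a minimal sketch of what such a helper could look like for this lip-reading pipeline, assuming OpenCV frame reading, a fixed 46x140 mouth-region crop, and per-clip mean/std normalization; the crop coordinates are illustrative assumptions, not taken from this commit.

# Hypothetical sketch only: not part of commit 56c7749.
import cv2
import tensorflow as tf

def load_video(path: str) -> tf.Tensor:
    # Read every frame of the clip (the model above expects 75 frames per video)
    cap = cv2.VideoCapture(path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = tf.image.rgb_to_grayscale(frame)   # (H, W, 1)
        frames.append(frame[190:236, 80:220, :])   # assumed mouth crop -> (46, 140, 1)
    cap.release()

    # Standardize the whole clip to zero mean / unit variance
    frames = tf.cast(tf.stack(frames), tf.float32)
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(frames)
    return (frames - mean) / std

With a helper like this defined above load_data, the returned (frames, 46, 140, 1) tensor matches the layout the model expects, and Predict's tf.expand_dims call supplies the batch dimension.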