BilalSardar committed on
Commit
56c7749
1 Parent(s): bfac2f7

Create app.py

Files changed (1)
  1. app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ from typing import List
+
+ import gradio as gr
+ import tensorflow as tf
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.optimizers import legacy
+ from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
+ from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
+
+ def load_alignments(path: str) -> List[str]:
+     # Parse an .align transcript: drop 'sil' (silence) entries and map the spoken words to character indices
+     with open(path, 'r') as f:
+         lines = f.readlines()
+     tokens = []
+     for line in lines:
+         line = line.split()
+         if line[2] != 'sil':
+             tokens = [*tokens, ' ', line[2]]
+     return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]
+
+ def load_data(path: str):
+     path = bytes.decode(path.numpy())
+     #file_name = path.split('/')[-1].split('.')[0]
+     # File name splitting for windows
+     file_name = path.split('\\')[-1].split('.')[0]
+     video_path = os.path.join('data', 's1', f'{file_name}.mpg')
+     alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
+     frames = load_video(video_path)  # load_video is not defined in this file; see the sketch below
+     alignments = load_alignments(alignment_path)
+
+     return frames, alignments
+
+
+ vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
+ char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
+ num_to_char = tf.keras.layers.StringLookup(
+     vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
+ )
+
+ # 3D CNN + bidirectional LSTM stack over clips of 75 frames sized 46x140 (single channel)
+ model = Sequential()
+ model.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
+ model.add(Activation('relu'))
+ model.add(MaxPool3D((1, 2, 2)))
+
+ model.add(Conv3D(256, 3, padding='same'))
+ model.add(Activation('relu'))
+ model.add(MaxPool3D((1, 2, 2)))
+
+ model.add(Conv3D(75, 3, padding='same'))
+ model.add(Activation('relu'))
+ model.add(MaxPool3D((1, 2, 2)))
+
+ model.add(TimeDistributed(Flatten()))
+
+ model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
+ model.add(Dropout(.5))
+
+ model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
+ model.add(Dropout(.5))
+
+ model.add(Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'))  # +1 output class for the CTC blank token
+ model.summary()
+
+ optimizer = legacy.Adam(learning_rate=0.001)  # Replace legacy.Adam with the appropriate legacy optimizer you used during training
+
+ model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
+
+ # Loading weights
+ model.load_weights('/content/models/checkpoint')
+
+ def Predict(Video):
+     # Run the model on one uploaded video and greedily CTC-decode the predicted character sequence
+     sample = load_data(tf.convert_to_tensor(Video))
+     yhat = model.predict(tf.expand_dims(sample[0], axis=0))
+     decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
+     result = [tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
+     return result[0].numpy().decode('utf-8')
+
+
+ interface = gr.Interface(fn=Predict,
+                          inputs="video",
+                          outputs=[gr.inputs.Textbox(label='Generated Output')],
+                          title='Video Lip Reading')
+
+
+ interface.launch(debug=True)
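
Note: load_data calls load_video, but that helper is neither defined nor imported in app.py, so prediction will fail with a NameError unless it is supplied. Below is a minimal sketch of what such a helper could look like for this lip-reading pipeline, assuming OpenCV frame reading, a fixed 46x140 mouth-region crop, and per-clip mean/std normalization; the crop coordinates are illustrative assumptions, not taken from this commit.

# Hypothetical sketch only: not part of commit 56c7749.
import cv2
import tensorflow as tf

def load_video(path: str) -> tf.Tensor:
    # Read every frame of the clip (the model above expects 75 frames per video)
    cap = cv2.VideoCapture(path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = tf.image.rgb_to_grayscale(frame)   # (H, W, 1)
        frames.append(frame[190:236, 80:220, :])   # assumed mouth crop -> (46, 140, 1)
    cap.release()

    # Standardize the whole clip to zero mean / unit variance
    frames = tf.cast(tf.stack(frames), tf.float32)
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(frames)
    return (frames - mean) / std

With a helper like this defined above load_data, the returned (frames, 46, 140, 1) tensor matches the layout the model expects, and Predict's tf.expand_dims call supplies the batch dimension.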