File size: 4,255 Bytes
56c7749
 
d6cfc3d
f066e4f
e830b84
56c7749
 
 
 
 
f066e4f
67dcfd1
05dda97
 
 
67dcfd1
05dda97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67dcfd1
f066e4f
 
 
 
 
 
 
 
 
 
 
 
 
c97767e
 
 
 
 
 
 
 
56c7749
 
c97767e
f066e4f
56c7749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2a0cd0
56c7749
 
a069e9c
 
f066e4f
56c7749
 
 
 
 
 
 
d84bc8b
ce80742
 
56c7749
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
import tensorflow as tf
from typing import List
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler



def convert_mp4_to_mpg(input_file, output_file):
    """
    Convert an MP4 video file to an MPG video file using OpenCV.

    Args:
        input_file (str): Path to the input MP4 file.
        output_file (str): Path to the output MPG file.

    Raises:
        Exception: If the input video file cannot be opened.

    Returns:
        None
    """
    cap = cv2.VideoCapture(input_file)

    # Check if the video file was opened successfully
    if not cap.isOpened():
        raise Exception(f"Could not open video file: {input_file}")

    out = None
    try:
        # Preserve the source frame rate; cap.get returns 0.0 when the
        # container does not report one, so fall back to the old 30 fps.
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Define the codec and create a VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # Use 'mpg2' codec for MPG files
        out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
    finally:
        # Release the video objects even if decoding/writing fails partway.
        cap.release()
        if out is not None:
            out.release()

def load_video(video_path):
    """
    Load a video, crop the mouth region from every frame, and return the
    frames standardised to zero mean / unit variance.

    Args:
        video_path (str): Path to the input video file.

    Returns:
        tf.Tensor: Float32 tensor of normalised grayscale crops, one per
        decoded frame, each cropped to rows 190:236 and columns 80:220.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            # CAP_PROP_FRAME_COUNT can overestimate the number of decodable
            # frames; stop instead of passing None to rgb_to_grayscale.
            break
        frame = tf.image.rgb_to_grayscale(frame)
        # Fixed crop of the speaker's mouth region (dataset-specific).
        frames.append(frame[190:236, 80:220, :])
    cap.release()

    # Standardise: subtract the mean and divide by the standard deviation.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std

def load_data(path: str):
    """
    Resolve a path tensor to the corresponding .mpg clip under data/s1
    and return its preprocessed frames.

    Args:
        path: A TensorFlow string tensor holding the video file path
            (bytes), as produced by tf.convert_to_tensor.

    Returns:
        tf.Tensor: Normalised frame tensor from load_video.
    """
    path = bytes.decode(path.numpy())
    # Normalise both Windows '\\' and POSIX '/' separators before taking
    # the stem, so this works on either platform (the original handled
    # only backslashes).
    file_name = os.path.splitext(path.replace('\\', '/').rsplit('/', 1)[-1])[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    frames = load_video(video_path)
    return frames


# Character inventory the model was trained on: lowercase letters,
# apostrophe, '?', '!', digits 1-9, and space.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward and inverse lookups between characters and integer token ids;
# the empty oov_token reserves index 0 for out-of-vocabulary / blank.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# Build the lip-reading network. The architecture must match the trained
# checkpoint exactly, so do not alter layer order or hyperparameters.
# Input shape: 75 frames of 46x140 single-channel (grayscale) crops.
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
model.add(Activation('relu'))
# Pool only spatially (1,2,2) — the 75-frame time axis is preserved.
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

# Flatten each frame's feature map into a vector: (75, features).
model.add(TimeDistributed(Flatten()))

# Two stacked bidirectional LSTMs over the 75-step sequence.
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

# Per-frame softmax over vocabulary + 1 extra class (CTC blank).
model.add(Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))
model.summary()

optimizer = legacy.Adam(learning_rate=0.001)  # Replace legacy.Adam with the appropriate legacy optimizer you used during training

# NOTE(review): the loss here looks inconsistent with the CTC decoding used
# at inference (training presumably used a CTC loss) — harmless for
# inference-only use, since compile settings don't affect predict().
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

#Loading weights 
model.load_weights('model/checkpoint')

def Predict(Video):
    """
    Run the lip-reading model on one video and return the decoded sentence.

    Args:
        Video: Path to the input clip (expected to already be an MPG file).

    Returns:
        str: The greedily CTC-decoded transcript.
    """
    # NOTE: the MP4 -> MPG conversion step is currently disabled:
    # convert_mp4_to_mpg(Video,'output.mpg')
    frames = load_data(tf.convert_to_tensor(Video))
    batch = tf.expand_dims(frames, axis=0)
    probs = model.predict(batch)
    # Greedy CTC decode; every clip in this dataset is exactly 75 frames.
    decoded = tf.keras.backend.ctc_decode(probs, input_length=[75], greedy=True)[0][0].numpy()
    sentences = []
    for sequence in decoded:
        characters = [num_to_char(token) for token in sequence]
        sentences.append(tf.strings.reduce_join(characters))
    return sentences[0].numpy().decode('utf-8')


# Web UI: upload a video, receive the predicted transcript as text.
interface = gr.Interface(
    fn=Predict,
    inputs="video",
    outputs="text",
    title='Video Lip Reading',
    description="""Currently, it runs on MPG files. There is one present in the files by the name of 'bbaf2n.mpg', which results in 'bin blue at F two now.' """,
)

# debug=True surfaces server-side exceptions in the console while testing.
interface.launch(debug=True)