File size: 4,255 Bytes
56c7749
 
d6cfc3d
f066e4f
e830b84
56c7749
 
 
 
 
f066e4f
67dcfd1
05dda97
 
 
67dcfd1
05dda97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67dcfd1
f066e4f
 
 
 
 
 
 
 
 
 
 
 
 
c97767e
 
 
 
 
 
 
 
56c7749
 
c97767e
f066e4f
56c7749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2a0cd0
56c7749
 
a069e9c
 
f066e4f
56c7749
 
 
 
 
 
 
d84bc8b
ce80742
 
56c7749
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
import tensorflow as tf
from typing import List
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler



def convert_mp4_to_mpg(input_file, output_file):
    """
    Convert an MP4 video file to an MPG video file using OpenCV.

    Args:
        input_file (str): Path to the input MP4 file.
        output_file (str): Path to the output MPG file.

    Raises:
        Exception: If the input video file cannot be opened.

    Returns:
        None
    """
    cap = cv2.VideoCapture(input_file)

    # Check if the video file was opened successfully
    if not cap.isOpened():
        raise Exception(f"Could not open video file: {input_file}")

    out = None
    try:
        # Preserve the source frame rate; cap.get returns 0.0 when the
        # container does not report one, so fall back to the old 30 fps.
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Define the codec and create a VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # Use 'mpg2' codec for MPG files
        out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
    finally:
        # Release the video objects even if decoding/writing fails partway.
        cap.release()
        if out is not None:
            out.release()

def load_video(video_path):
    """
    Load a video, crop the mouth region from every frame, and return the
    frames standardised to zero mean / unit variance.

    Args:
        video_path (str): Path to the input video file.

    Returns:
        tf.Tensor: Float32 tensor of normalised grayscale crops, one per
        decoded frame, each cropped to rows 190:236 and columns 80:220.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            # CAP_PROP_FRAME_COUNT can overestimate the number of decodable
            # frames; stop instead of passing None to rgb_to_grayscale.
            break
        frame = tf.image.rgb_to_grayscale(frame)
        # Fixed crop of the speaker's mouth region (dataset-specific).
        frames.append(frame[190:236, 80:220, :])
    cap.release()

    # Standardise: subtract the mean and divide by the standard deviation.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std

def load_data(path: str):
    """
    Resolve a path tensor to the corresponding .mpg clip under data/s1
    and return its preprocessed frames.

    Args:
        path: A TensorFlow string tensor holding the video file path
            (bytes), as produced by tf.convert_to_tensor.

    Returns:
        tf.Tensor: Normalised frame tensor from load_video.
    """
    path = bytes.decode(path.numpy())
    # Normalise both Windows '\\' and POSIX '/' separators before taking
    # the stem, so this works on either platform (the original handled
    # only backslashes).
    file_name = os.path.splitext(path.replace('\\', '/').rsplit('/', 1)[-1])[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    frames = load_video(video_path)
    return frames


# Character inventory the model was trained on: lowercase letters,
# apostrophe, '?', '!', digits 1-9, and space.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward and inverse lookups between characters and integer token ids;
# the empty oov_token reserves index 0 for out-of-vocabulary / blank.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# Build the lip-reading network. The architecture must match the trained
# checkpoint exactly, so do not alter layer order or hyperparameters.
# Input shape: 75 frames of 46x140 single-channel (grayscale) crops.
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
model.add(Activation('relu'))
# Pool only spatially (1,2,2) — the 75-frame time axis is preserved.
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

# Flatten each frame's feature map into a vector: (75, features).
model.add(TimeDistributed(Flatten()))

# Two stacked bidirectional LSTMs over the 75-step sequence.
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

# Per-frame softmax over vocabulary + 1 extra class (CTC blank).
model.add(Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))
model.summary()

optimizer = legacy.Adam(learning_rate=0.001)  # Replace legacy.Adam with the appropriate legacy optimizer you used during training

# NOTE(review): the loss here looks inconsistent with the CTC decoding used
# at inference (training presumably used a CTC loss) — harmless for
# inference-only use, since compile settings don't affect predict().
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

#Loading weights 
model.load_weights('model/checkpoint')

def Predict(Video):
    """
    Run the lip-reading model on one video and return the decoded sentence.

    Args:
        Video: Path to the input clip (expected to already be an MPG file).

    Returns:
        str: The greedily CTC-decoded transcript.
    """
    # NOTE: the MP4 -> MPG conversion step is currently disabled:
    # convert_mp4_to_mpg(Video,'output.mpg')
    frames = load_data(tf.convert_to_tensor(Video))
    batch = tf.expand_dims(frames, axis=0)
    probs = model.predict(batch)
    # Greedy CTC decode; every clip in this dataset is exactly 75 frames.
    decoded = tf.keras.backend.ctc_decode(probs, input_length=[75], greedy=True)[0][0].numpy()
    sentences = []
    for sequence in decoded:
        characters = [num_to_char(token) for token in sequence]
        sentences.append(tf.strings.reduce_join(characters))
    return sentences[0].numpy().decode('utf-8')


# Web UI: upload a video, receive the predicted transcript as text.
interface = gr.Interface(
    fn=Predict,
    inputs="video",
    outputs="text",
    title='Video Lip Reading',
    description="""Currently, it runs on MPG files. There is one present in the files by the name of 'bbaf2n.mpg', which results in 'bin blue at F two now.' """,
)

# debug=True surfaces server-side exceptions in the console while testing.
interface.launch(debug=True)