import gradio as gr
import numpy as np
from PIL import Image
from pydub import AudioSegment
from mutagen.mp3 import MP3
# Import the clip classes needed to build and edit video clips
from moviepy.editor import AudioFileClip, ImageSequenceClip, VideoFileClip

# Load the hosted interfaces once at startup instead of on every request
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
latentdiffusion = gr.Interface.load("spaces/multimodalart/latentdiffusion")


def resize(img_list):
    print("** inside resize **")
    print('Entity images generated by the multimodal interface are:', img_list)
    resize_img_list = []
    for item in img_list:
        # Normalize the mode so every frame has the same shape for moviepy
        im = Image.open(item).convert('RGB')
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is its replacement
        im_resized = im.resize((256, 256), Image.LANCZOS)
        resize_img_list.append(np.array(im_resized))
    print('Type of elements in the image list:', type(resize_img_list[0]))
    return resize_img_list


def merge_audio_video(entities_num, resize_img_list, text_input):
    print("** inside merge_audio_video **")
    print('Type of image list variable:', type(resize_img_list))
    print('Type of elements in the image list:', type(resize_img_list[0]))

    # Convert the input text to speech using Facebook's FastSpeech2 model from the HF hub
    speech = text2speech(text_input)
    print('Back in merge_audio_video')
    print('Type of speech variable:', type(speech))
    print('Audio file:', speech)

    # The TTS interface returns a flac file; convert it to mp3
    flac_audio = AudioSegment.from_file(speech, "flac")
    print('Converting flac format to mp3 using AudioSegment object:', type(flac_audio))
    flac_audio.export("audio.mp3", format="mp3")
    print('flac audio converted to mp3 audio')

    # Get the audio clip's duration in whole seconds
    audio_length = int(MP3("audio.mp3").info.length)
    print('Audio length is:', audio_length)

    # Spread the entity images evenly over the narration: fps = images / seconds.
    # max(..., 1) guards against a zero duration for very short audio.
    fps = entities_num / max(audio_length, 1)
    fps = round(fps, 5)
    print('Based on the number of entity images and the audio length, FPS is set to:', fps)

    # String the list of images into a silent video clip and write it to disk
    clip = ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')
    print('video clip created successfully from images')

    # Load the video and audio files and merge them
    print('Starting video and audio merge')
    videoclip = VideoFileClip('my_vid_tmp.mp4')
    print('loading video clip')
    audioclip = AudioFileClip('audio.mp3')
    print('loading mp3-format audio')
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged successfully')

    # Report duration and frame rate of the merged clip
    # (mergedclip.fps is the frame rate, not a frame count)
    print('Getting duration and frame rate of merged video file')
    print('duration is:', mergedclip.duration)
    print('frame rate is:', mergedclip.fps)
    return mergedclip


def text2speech(text):
    print('** inside text2speech **')
    print('Loading the model through:', type(fastspeech))
    print(fastspeech)
    speech = fastspeech(text)
    print('Type of variable in which file is stored:', type(speech))
    print('Audio file generated:', speech)
    return speech


def engine(text_input):
    print("** inside engine **")
    # Extract named entities from the input text
    entities = ner(text_input)
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)

    # Generate one image per entity with multimodalart's latent-diffusion Space
    img_list = []
    for ent in entities:
        img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
        img_list.append(img)
    print('img_list size:', len(img_list))

    # Resize all generated images to the same size
    resize_img_list = resize(img_list)
    print('back from resize into engine')

    # Build the video from the images and merge it with the narration audio
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    print('Merged clip type:', type(mergedclip))
    print('Writing the merged video clip to a video file')
    # write_videofile is the current moviepy API; to_videofile is a deprecated alias
    mergedclip.write_videofile('mergedvideo.mp4')
    print('mergedvideo.mp4 created')
    print('################################ Single Run Completed ##############################')
    return 'mergedvideo.mp4'


app = gr.Interface(
    engine,
    gr.inputs.Textbox(lines=5, label="Input Text"),
    gr.outputs.Video(type=None, label='Final Merged video'),
    # The original description string was truncated in the source; this is a placeholder summary
    description="Generates a short video from text: narrates the input and shows an image per named entity.",
)
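# A minimal launch guard, an addition not present in the truncated source: when the
# script is run directly this starts the Gradio server; hosted platforms such as
# Hugging Face Spaces also execute the file as __main__, so the app launches there too.
if __name__ == "__main__":
    app.launch()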