|
import os |
|
import sys |
|
import gradio as gr |
|
|
|
# Target device name. Nothing else in the visible part of this file reads it.
device = "cuda"

# ---------------------------------------------------------------------------
# One-time environment bootstrap. These commands run at import time, every
# time the script starts.
# ---------------------------------------------------------------------------

# Fetch the Wav2Lip sources and the S3FD face-detector weights into the
# location inside the clone where the curl command below writes them.
os.system('git clone https://github.com/Rudrabha/Wav2Lip.git')
os.system('curl -o ./Wav2Lip/face_detection/detection/sfd/s3fd.pth https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth')

os.system('pip3 install moviepy')

# "-y" is required: without it pip asks for confirmation and, in a
# non-interactive session, either blocks or aborts without uninstalling.
os.system('pip3 uninstall -y numpy')

# The requirement must be quoted. Unquoted, the shell parses ">=1.17.1" as an
# output redirection into a file named "=1.17.1" and pip only sees "numpy".
os.system('pip3 install "numpy>=1.17.1"')

os.system('pip3 install speechRecognition')
os.system('pip3 install gtts')
os.system('pip3 install googletrans==3.1.0a0')
os.system('pip install numba==0.48')
os.system('pip3 install transformers')
|
|
|
# User-facing text displayed by the Gradio interface.
title = "Automatic translation and dubbing for Indic Languages"

# The pipeline in this file translates from English (src='en') into the
# selected language, so the listed languages are dubbing targets, not the
# languages spoken in the uploaded video.
description = "A demo application to translate English videos and dub them into Tamil, Hindi, Bengali and Telugu"

article = "Official Repo: https://github.com/Rudrabha/Wav2Lip"
|
|
|
# UI language label -> language code, used both as the googletrans ``dest``
# and as the gTTS ``lang``.
_LANGUAGE_CODES = {
    "Hindi": "hi",
    "Tamil": "ta",
    "Bengali": "bn",
    "Telugu": "te",
}


def _extract_audio(video, audio_path):
    """Write the audio track of the file at *video* to *audio_path*."""
    try:
        from moviepy.editor import VideoFileClip
    except ImportError:
        # moviepy 2.x removed the ``moviepy.editor`` namespace.
        from moviepy import VideoFileClip

    clip = VideoFileClip(video)
    try:
        if clip.audio is None:
            raise ValueError("The uploaded video has no audio track to translate")
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release the reader resources held by the clip.
        clip.close()


def _transcribe(audio_path):
    """Return the lower-cased wav2vec2 transcription of the audio at *audio_path*."""
    from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
    import torch
    import librosa

    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

    # librosa resamples to 16 kHz while loading, so no further resampling
    # step is needed before handing the samples to the processor.
    speech, _ = librosa.load(audio_path, sr=16000)
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"]

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_values)["logits"]

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0]).lower()


def _synthesize_translation(text, lang_code, slow, audio_path):
    """Translate English *text* to *lang_code* and save the spoken result to *audio_path*."""
    from googletrans import Translator
    from gtts import gTTS

    translation = Translator().translate(text, src="en", dest=lang_code)
    # ``slow`` is a gTTS option (speaking rate), not a translation option.
    gTTS(translation.text, lang=lang_code, slow=slow).save(audio_path)


def inference(language, speed, voice, video):
    """Dub an uploaded video into the selected language and lip-sync it.

    Steps: extract the audio track, transcribe it, translate the transcript
    from English into ``language``, synthesise speech for the translation,
    then run the Wav2Lip ``inference.py`` script on the video and new audio.

    Args:
        language: One of "Hindi", "Tamil", "Bengali", "Telugu".
        speed: "Slow" selects slow speech synthesis; any other value uses
            the normal rate.
        voice: Accepted for interface compatibility; currently not used.
        video: Path of the uploaded video file.

    Returns:
        The path "./results/result_voice.mp4".

    Raises:
        ValueError: If ``language`` is unsupported, the video has no audio
            track, or nothing could be transcribed.
        subprocess.CalledProcessError: If the Wav2Lip script exits non-zero.
    """
    import subprocess

    # Fail fast, before any expensive work, instead of silently reusing a
    # stale audio file from an earlier request.
    lang_code = _LANGUAGE_CODES.get(language)
    if lang_code is None:
        raise ValueError(
            "Unsupported language {!r}; expected one of {}".format(
                language, ", ".join(sorted(_LANGUAGE_CODES))
            )
        )

    # Always bound, even when no speed option was selected.
    slow = speed == "Slow"

    _extract_audio(video, "audio.wav")

    os.system('pip3 install pydub')
    os.system('pip3 install transformers==4.11.3 soundfile sentencepiece torchaudio librosa')

    text = _transcribe("audio.wav")
    if not text.strip():
        raise ValueError("No speech could be transcribed from the uploaded video")

    audio = "input_audio.wav"
    # NOTE(review): gTTS is documented to produce MP3 data, yet the file is
    # named ".wav" as in the original code -- confirm the Wav2Lip script
    # copes with that.
    _synthesize_translation(text, lang_code, slow, audio)

    # Move the contents of the Wav2Lip clone into the working directory,
    # where inference.py is invoked from below. The exit status is ignored,
    # as before.
    os.system('mv ./Wav2Lip/* .')

    # Argument list instead of a shell string: the uploaded file path may
    # contain spaces or shell metacharacters. ``sys.executable`` runs the
    # script with the same interpreter as this app.
    subprocess.run(
        [
            sys.executable,
            "inference.py",
            "--checkpoint_path",
            "./wav2lip_gan.pth",
            "--face",
            video,
            "--audio",
            audio,
        ],
        check=True,
    )

    return "./results/result_voice.mp4"
|
|
|
# Assemble the Gradio UI: three radio selectors and a video upload are passed
# to `inference`, and its result is shown through a "video" output.
iface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Radio(
            ["Tamil", "Hindi", "Bengali", "Telugu"],
            label="Enter language to translate to",
        ),
        gr.Radio(
            ["Slow", "Fast"],
            label="Enter speaking speed",
        ),
        gr.Radio(
            ["Male", "Female"],
            label="Enter preferred voice",
        ),
        gr.Video(
            format="mp4",
            sources="upload",
            label="Video to be Translated",
        ),
    ],
    outputs=["video"],
    title=title,
    description=description,
    article=article,
)

# Start serving the interface.
iface.launch()