karin.brisker committed
Commit 7630e84
1 Parent(s): 7e27fe1
app.py ADDED
@@ -0,0 +1,27 @@
+ import gradio as gr
+ import os
+ from main import LANGS, Pipeline
+
+ # Instantiate the pipeline before building the interface so the cached example can call it at startup.
+ pipeline = Pipeline()
+
+
+ def video_identity(video, source_language="English", target_language="Spanish"):
+     video_path = pipeline(video, "sample", source_language, target_language)
+     return video_path
+
+
+ demo = gr.Interface(video_identity,
+                     inputs=[gr.Video(),
+                             gr.components.Dropdown(label="Source Language", choices=LANGS),
+                             gr.components.Dropdown(label="Target Language", choices=LANGS),
+                             ],
+                     outputs="playable_video",
+                     examples=[[os.path.join(os.path.dirname(__file__), "sample/iPhone_14_Pro.mp4"),
+                                "English", "Spanish"]],
+                     cache_examples=True,
+                     title="Video Subtitler Demo 🍿🍿🍿",
+                     description="This demo is a proof of concept for a video subtitler.")
+
+
+ demo.launch()
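Note: with the packages from requirements.txt installed, this demo runs locally with `python app.py`. A minimal sketch of an explicit launch call, in case the host or port needs overriding (these are Gradio's own documented parameters and defaults, not values set in this commit):

    demo.launch(server_name="127.0.0.1", server_port=7860)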
audio_to_transcript.py ADDED
@@ -0,0 +1,53 @@
+ import os
+ from typing import Dict
+
+ import torch
+ import whisper
+ from whisper.utils import get_writer
+
+ import numpy as np  # used only to count model parameters
+
+ from utils import log
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ class TranscribeAudio:
+     def __init__(self):
+         self.model = whisper.load_model("base", device=device)
+         log(
+             f"Model is {'multilingual' if self.model.is_multilingual else 'English-only'} "
+             f"and has {sum(np.prod(p.shape) for p in self.model.parameters()):,} parameters."
+         )
+         self.options = {"max_line_width": 20, "max_line_count": 3, "highlight_words": True}
+
+     def transcribe(self, audio_file_path: str, language: str = "en") -> Dict:
+         log(f"Transcribing {audio_file_path} in {language}")
+         options = dict(language=language, beam_size=5, best_of=5)
+         transcribe_options = dict(task="transcribe", **options)
+         result = self.model.transcribe(audio_file_path, **transcribe_options)
+         return result
+
+     def save_output(self, transcript_output: Dict, audio_file_path: str) -> str:
+         filename, ext = os.path.splitext(audio_file_path)
+         directory = os.path.dirname(filename)
+         log(f"Saving transcript to {filename}.srt and {filename}.vtt")
+         # Save as an SRT file
+         srt_writer = get_writer("srt", directory)
+         srt_writer(transcript_output, audio_file_path, self.options)
+
+         # Save as a VTT file
+         vtt_writer = get_writer("vtt", directory)
+         vtt_writer(transcript_output, audio_file_path, self.options)
+
+         return f"{filename}.vtt"
+
+     def __call__(self, audio_file_path: str, output_dir: str, input_language: str = "en") -> str:
+         transcript = self.transcribe(audio_file_path, input_language)
+         transcript_path = self.save_output(transcript, audio_file_path)
+         return transcript_path
+
+
+ if __name__ == '__main__':
+     transcribe_audio = TranscribeAudio()
+     transcribe_audio('sample/iPhone_14_Pro.mp3', 'sample')
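For context, `TranscribeAudio.transcribe` returns openai-whisper's standard result dict, which is what the SRT/VTT writers above consume. A small sketch of inspecting it (the file path matches the sample used elsewhere in this commit; the printed values are illustrative):

    result = TranscribeAudio().transcribe("sample/iPhone_14_Pro.mp3", "en")
    print(result["language"])            # language code used or detected, e.g. "en"
    print(result["text"][:80])           # full transcript as a single string
    segment = result["segments"][0]      # per-segment timings become the subtitle cues
    print(segment["start"], segment["end"], segment["text"])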
main.py ADDED
@@ -0,0 +1,60 @@
+ import argparse
+ import json
+ import os
+ import subprocess
+
+ from audio_to_transcript import TranscribeAudio
+ from translator import MyTranslator
+ from utils import log
+ from video_to_audio_converter import VideoToAudioConverter
+
+ with open('resources/languages.json', 'r') as f:
+     code2lang = json.load(f)
+
+ # language code lookup by language name
+ lang2code = {language: code for code, language in code2lang.items()}
+
+ LANGS = sorted(lang2code.keys())
+
+
+ class Pipeline:
+     def __init__(self):
+         self.video_to_audio = VideoToAudioConverter()
+         self.audio_to_text = TranscribeAudio()
+         self.translator = MyTranslator()
+
+     def __call__(self, video_path: str, output_path: str, input_language: str, output_language: str):
+         filename, ext = os.path.splitext(video_path)
+         output_video_path = f"{filename}_{output_language}_output.mp4"
+
+         audio_path = self.video_to_audio.convert(video_path)
+         subtitle_path = self.audio_to_text(audio_path, output_path, input_language)
+         # Translate the subtitles only when both languages are given and they differ.
+         if input_language and output_language and input_language != output_language:
+             subtitle_path = self.translator.translate(subtitle_path, lang2code[input_language],
+                                                       lang2code[output_language])
+         log(f"Embedding subtitles into the input video and saving the result to {output_video_path}")
+         # Use ffmpeg to burn the subtitles into the input video and create the output file
+
+         subtitles_cmd = ["ffmpeg", "-y", "-i", video_path, "-vf", f"subtitles={subtitle_path}", "-c:a", "copy",
+                          output_video_path]
+
+         subprocess.run(subtitles_cmd, check=True)
+         return output_video_path
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument("video", type=str,
+                         help="path of the video to transcribe")
+     parser.add_argument("--output_dir", "-o", type=str,
+                         default=".", help="directory to save the outputs")
+     parser.add_argument("--input_language", type=str, default=None, choices=LANGS,
+                         help="language spoken in the video; omit to perform language detection")
+     parser.add_argument("--output_language", type=str, default=None, choices=LANGS,
+                         help="language to translate the subtitles into")
+
+     args = parser.parse_args()
+     pipeline = Pipeline()
+     pipeline(args.video, args.output_dir, args.input_language, args.output_language)
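For reference, a command-line run of the pipeline defined above might look like this (the sample path is the one app.py ships as its cached example; language names must be keys of lang2code, i.e. values from resources/languages.json):

    python main.py sample/iPhone_14_Pro.mp4 --output_dir sample --input_language English --output_language Spanish

which would write the subtitled video to sample/iPhone_14_Pro_Spanish_output.mp4.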
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ ffmpeg_python==0.2.0
+ googletrans==3.1.0a0
+ gradio==3.27.0
+ numpy==1.23.5
+ openai_whisper==20230314
+ torch==2.0.0
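Note that these are Python packages only: main.py and video_to_audio_converter.py shell out to the `ffmpeg` binary (and ffmpeg.probe relies on ffprobe), so ffmpeg itself must also be installed and on PATH; the ffmpeg_python package merely wraps it. A typical environment setup might be:

    pip install -r requirements.txt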
resources/languages.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "en": "English",
+   "zh": "Chinese",
+   "de": "German",
+   "es": "Spanish",
+   "ru": "Russian",
+   "ko": "Korean",
+   "fr": "French",
+   "ja": "Japanese",
+   "pt": "Portuguese",
+   "tr": "Turkish",
+   "pl": "Polish",
+   "ca": "Catalan",
+   "nl": "Dutch",
+   "ar": "Arabic",
+   "sv": "Swedish",
+   "it": "Italian",
+   "id": "Indonesian",
+   "fi": "Finnish",
+   "he": "Hebrew",
+   "uk": "Ukrainian",
+   "no": "Norwegian",
+   "th": "Thai",
+   "la": "Latin"
+ }
translator.py ADDED
@@ -0,0 +1,31 @@
+ import os
+
+ from googletrans import Translator
+
+ from utils import log
+
+
+ class MyTranslator:
+     def __init__(self):
+         self.translator = Translator()
+
+     def translate(self, text_file_path, source_language, target_language):
+         # Open the input file and read its contents
+         with open(text_file_path, 'r') as f:
+             input_text = f.read()
+
+         filename, ext = os.path.splitext(text_file_path)
+         output_file_path = f"{filename}_translated{ext}"
+         log(f"Translating text from {source_language} to {target_language} and saving to {output_file_path}")
+         # Translate the text from the source language to the target language
+         output_text = self.translator.translate(input_text, src=source_language, dest=target_language).text
+         # Write the translated text to the output file
+         with open(output_file_path, 'w') as f:
+             f.write(output_text)
+
+         return output_file_path
+
+
+ if __name__ == '__main__':
+     translator = MyTranslator()
+     translation_path = translator.translate('sample/iPhone_14_Pro.vtt', 'en', 'es')
utils.py ADDED
@@ -0,0 +1,6 @@
+ from datetime import datetime
+
+
+ def log(message):
+     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+     print(f'[{timestamp}] {message}')
video_to_audio_converter.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ import subprocess
+
+ import ffmpeg
+
+ from utils import log
+
+
+ class VideoToAudioConverter:
+     @staticmethod
+     def convert(path_to_video: str, output_ext="mp3") -> str:
+         """Converts a video file to audio by invoking the `ffmpeg` command via subprocess,
+         then sanity-checks that the audio duration roughly matches the video duration."""
+         log("Converting video to audio")
+         filename, ext = os.path.splitext(path_to_video)
+         subprocess.call(["ffmpeg",
+                          "-y",
+                          "-i",
+                          path_to_video,
+                          f"{filename}.{output_ext}"],
+                         stdout=subprocess.DEVNULL,
+                         stderr=subprocess.STDOUT)
+
+         video_length = float(ffmpeg.probe(path_to_video)['format']['duration'])
+         audio_length = float(ffmpeg.probe(f"{filename}.{output_ext}")['format']['duration'])
+         if video_length - audio_length > 1:
+             raise Exception("Conversion failed")
+         return f"{filename}.{output_ext}"
+
+
+ if __name__ == '__main__':
+     video_to_audio_converter = VideoToAudioConverter()
+     video_to_audio_converter.convert('sample/iPhone_14_Pro.mp4')
+     if os.path.exists('sample/iPhone_14_Pro.mp3'):
+         log("File converted successfully")
+     else:
+         log("File conversion failed")