karin.brisker committed
Commit 7630e84
1 Parent(s): 7e27fe1
app.py ADDED
@@ -0,0 +1,27 @@
+ import gradio as gr
+ import os
+ from main import LANGS, Pipeline
+
+ # Instantiate the pipeline before building the interface so the cached example can call it at startup.
+ pipeline = Pipeline()
+
+
+ def video_identity(video, source_language="English", target_language="Spanish"):
+     video_path = pipeline(video, "sample", source_language, target_language)
+     return video_path
+
+
+ demo = gr.Interface(video_identity,
+                     inputs=[gr.Video(),
+                             gr.components.Dropdown(label="Source Language", choices=LANGS),
+                             gr.components.Dropdown(label="Target Language", choices=LANGS),
+                             ],
+                     outputs="playable_video",
+                     examples=[[os.path.join(os.path.dirname(__file__), "sample/iPhone_14_Pro.mp4"),
+                                "English", "Spanish"]],
+                     cache_examples=True,
+                     title="Video Subtitler Demo 🍿🍿🍿",
+                     description="This demo is a proof of concept for a video subtitler.")
+
+
+ demo.launch()
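Note: with the packages from requirements.txt installed, this demo runs locally with `python app.py`. A minimal sketch of an explicit launch call, in case the host or port needs overriding (these are Gradio's own documented parameters and defaults, not values set in this commit):

    demo.launch(server_name="127.0.0.1", server_port=7860)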
audio_to_transcript.py ADDED
@@ -0,0 +1,53 @@
+ import os
+ from typing import Dict
+
+ import torch
+ import whisper
+ from whisper.utils import get_writer
+
+ import numpy as np  # used only to count model parameters
+
+ from utils import log
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ class TranscribeAudio:
+     def __init__(self):
+         self.model = whisper.load_model("base", device=device)
+         log(
+             f"Model is {'multilingual' if self.model.is_multilingual else 'English-only'} "
+             f"and has {sum(np.prod(p.shape) for p in self.model.parameters()):,} parameters."
+         )
+         self.options = {"max_line_width": 20, "max_line_count": 3, "highlight_words": True}
+
+     def transcribe(self, audio_file_path: str, language: str = "en") -> Dict:
+         log(f"Transcribing {audio_file_path} in {language}")
+         options = dict(language=language, beam_size=5, best_of=5)
+         transcribe_options = dict(task="transcribe", **options)
+         result = self.model.transcribe(audio_file_path, **transcribe_options)
+         return result
+
+     def save_output(self, transcript_output: Dict, audio_file_path: str) -> str:
+         filename, ext = os.path.splitext(audio_file_path)
+         directory = os.path.dirname(filename)
+         log(f"Saving transcript to {filename}.srt and {filename}.vtt")
+         # Save as an SRT file
+         srt_writer = get_writer("srt", directory)
+         srt_writer(transcript_output, audio_file_path, self.options)
+
+         # Save as a VTT file
+         vtt_writer = get_writer("vtt", directory)
+         vtt_writer(transcript_output, audio_file_path, self.options)
+
+         return f"{filename}.vtt"
+
+     def __call__(self, audio_file_path: str, output_dir: str, input_language: str = "en") -> str:
+         transcript = self.transcribe(audio_file_path, input_language)
+         transcript_path = self.save_output(transcript, audio_file_path)
+         return transcript_path
+
+
+ if __name__ == '__main__':
+     transcribe_audio = TranscribeAudio()
+     transcribe_audio('sample/iPhone_14_Pro.mp3', 'sample')
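For context, `TranscribeAudio.transcribe` returns openai-whisper's standard result dict, which is what the SRT/VTT writers above consume. A small sketch of inspecting it (the file path matches the sample used elsewhere in this commit; the printed values are illustrative):

    result = TranscribeAudio().transcribe("sample/iPhone_14_Pro.mp3", "en")
    print(result["language"])            # language code used or detected, e.g. "en"
    print(result["text"][:80])           # full transcript as a single string
    segment = result["segments"][0]      # per-segment timings become the subtitle cues
    print(segment["start"], segment["end"], segment["text"])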
main.py ADDED
@@ -0,0 +1,60 @@
+ import argparse
+ import json
+ import os
+ import subprocess
+
+ from audio_to_transcript import TranscribeAudio
+ from translator import MyTranslator
+ from utils import log
+ from video_to_audio_converter import VideoToAudioConverter
+
+ with open('resources/languages.json', 'r') as f:
+     code2lang = json.load(f)
+
+ # language code lookup by language name
+ lang2code = {language: code for code, language in code2lang.items()}
+
+ LANGS = sorted(lang2code.keys())
+
+
+ class Pipeline:
+     def __init__(self):
+         self.video_to_audio = VideoToAudioConverter()
+         self.audio_to_text = TranscribeAudio()
+         self.translator = MyTranslator()
+
+     def __call__(self, video_path: str, output_path: str, input_language: str, output_language: str):
+         filename, ext = os.path.splitext(video_path)
+         output_video_path = f"{filename}_{output_language}_output.mp4"
+
+         audio_path = self.video_to_audio.convert(video_path)
+         subtitle_path = self.audio_to_text(audio_path, output_path, input_language)
+         # Translate the subtitles only when both languages are given and they differ.
+         if input_language and output_language and input_language != output_language:
+             subtitle_path = self.translator.translate(subtitle_path, lang2code[input_language],
+                                                       lang2code[output_language])
+         log(f"Embedding subtitles into the input video and saving the result to {output_video_path}")
+         # Use ffmpeg to burn the subtitles into the input video and create the output file
+
+         subtitles_cmd = ["ffmpeg", "-y", "-i", video_path, "-vf", f"subtitles={subtitle_path}", "-c:a", "copy",
+                          output_video_path]
+
+         subprocess.run(subtitles_cmd, check=True)
+         return output_video_path
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument("video", type=str,
+                         help="path of the video to transcribe")
+     parser.add_argument("--output_dir", "-o", type=str,
+                         default=".", help="directory to save the outputs")
+     parser.add_argument("--input_language", type=str, default=None, choices=LANGS,
+                         help="language spoken in the video; omit to perform language detection")
+     parser.add_argument("--output_language", type=str, default=None, choices=LANGS,
+                         help="language to translate the subtitles into")
+
+     args = parser.parse_args()
+     pipeline = Pipeline()
+     pipeline(args.video, args.output_dir, args.input_language, args.output_language)
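For reference, a command-line run of the pipeline defined above might look like this (the sample path is the one app.py ships as its cached example; language names must be keys of lang2code, i.e. values from resources/languages.json):

    python main.py sample/iPhone_14_Pro.mp4 --output_dir sample --input_language English --output_language Spanish

which would write the subtitled video to sample/iPhone_14_Pro_Spanish_output.mp4.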
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ ffmpeg_python==0.2.0
+ googletrans==3.1.0a0
+ gradio==3.27.0
+ numpy==1.23.5
+ openai_whisper==20230314
+ torch==2.0.0
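Note that these are Python packages only: main.py and video_to_audio_converter.py shell out to the `ffmpeg` binary (and ffmpeg.probe relies on ffprobe), so ffmpeg itself must also be installed and on PATH; the ffmpeg_python package merely wraps it. A typical environment setup might be:

    pip install -r requirements.txt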
resources/languages.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "en": "English",
+   "zh": "Chinese",
+   "de": "German",
+   "es": "Spanish",
+   "ru": "Russian",
+   "ko": "Korean",
+   "fr": "French",
+   "ja": "Japanese",
+   "pt": "Portuguese",
+   "tr": "Turkish",
+   "pl": "Polish",
+   "ca": "Catalan",
+   "nl": "Dutch",
+   "ar": "Arabic",
+   "sv": "Swedish",
+   "it": "Italian",
+   "id": "Indonesian",
+   "fi": "Finnish",
+   "he": "Hebrew",
+   "uk": "Ukrainian",
+   "no": "Norwegian",
+   "th": "Thai",
+   "la": "Latin"
+ }
translator.py ADDED
@@ -0,0 +1,31 @@
+ import os
+
+ from googletrans import Translator
+
+ from utils import log
+
+
+ class MyTranslator:
+     def __init__(self):
+         self.translator = Translator()
+
+     def translate(self, text_file_path, source_language, target_language):
+         # Open the input file and read its contents
+         with open(text_file_path, 'r') as f:
+             input_text = f.read()
+
+         filename, ext = os.path.splitext(text_file_path)
+         output_file_path = f"{filename}_translated{ext}"
+         log(f"Translating text from {source_language} to {target_language} and saving to {output_file_path}")
+         # Translate the text from the source language to the target language
+         output_text = self.translator.translate(input_text, src=source_language, dest=target_language).text
+         # Write the translated text to the output file
+         with open(output_file_path, 'w') as f:
+             f.write(output_text)
+
+         return output_file_path
+
+
+ if __name__ == '__main__':
+     translator = MyTranslator()
+     translation_path = translator.translate('sample/iPhone_14_Pro.vtt', 'en', 'es')
utils.py ADDED
@@ -0,0 +1,6 @@
+ from datetime import datetime
+
+
+ def log(message):
+     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+     print(f'[{timestamp}] {message}')
video_to_audio_converter.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ import subprocess
+
+ import ffmpeg
+
+ from utils import log
+
+
+ class VideoToAudioConverter:
+     @staticmethod
+     def convert(path_to_video: str, output_ext="mp3") -> str:
+         """Converts a video file to audio by invoking the `ffmpeg` command via subprocess,
+         then sanity-checks that the audio duration roughly matches the video duration."""
+         log("Converting video to audio")
+         filename, ext = os.path.splitext(path_to_video)
+         subprocess.call(["ffmpeg",
+                          "-y",
+                          "-i",
+                          path_to_video,
+                          f"{filename}.{output_ext}"],
+                         stdout=subprocess.DEVNULL,
+                         stderr=subprocess.STDOUT)
+
+         video_length = float(ffmpeg.probe(path_to_video)['format']['duration'])
+         audio_length = float(ffmpeg.probe(f"{filename}.{output_ext}")['format']['duration'])
+         if video_length - audio_length > 1:
+             raise Exception("Conversion failed")
+         return f"{filename}.{output_ext}"
+
+
+ if __name__ == '__main__':
+     video_to_audio_converter = VideoToAudioConverter()
+     video_to_audio_converter.convert('sample/iPhone_14_Pro.mp4')
+     if os.path.exists('sample/iPhone_14_Pro.mp3'):
+         log("File converted successfully")
+     else:
+         log("File conversion failed")