whisper-webui-translate

Running

App Files Community

aadnk committed on Mar 27, 2023

Commit

c90f138

•

1 Parent(s): c963436

Ensure progress bar works for multiple files

Browse files

Files changed (2) hide show

app.py +23 -6
src/source.py +22 -12

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ import numpy as np
 import torch
 from src.config import ApplicationConfig
-from src.hooks.whisperProgressHook import ProgressListener, create_progress_listener_handle
 from src.modelCache import ModelCache
 from src.source import get_audio_source_collection
 from src.vadParallel import ParallelContext, ParallelTranscription
@@ -135,9 +135,17 @@ class WhisperTranscriber:
                 outputDirectory = self.output_dir if self.output_dir is not None else downloadDirectory
                 # Execute whisper
                 for source in sources:
                     source_prefix = ""
                     if (len(sources) > 1):
                         # Prefix (minimum 2 digits)
@@ -145,10 +153,18 @@ class WhisperTranscriber:
                         source_prefix = str(source_index).zfill(2) + "_"
                         print("Transcribing ", source.source_path)
                     # Transcribe
-                    result = self.transcribe_file(model, source.source_path, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, progress, **decodeOptions)
                     filePrefix = slugify(source_prefix + source.get_short_name(), allow_unicode=True)
                     source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory)
                     if len(sources) > 1:
@@ -209,19 +225,20 @@ class WhisperTranscriber:
     def transcribe_file(self, model: WhisperContainer, audio_path: str, language: str, task: str = None, vad: str = None,
                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1,
-                        progress: gr.Progress = None, **decodeOptions: dict):
         initial_prompt = decodeOptions.pop('initial_prompt', None)
         if ('task' in decodeOptions):
             task = decodeOptions.pop('task')
         # Callable for processing an audio file
         whisperCallable = model.create_callback(language, task, initial_prompt, **decodeOptions)
-        # A listener that will report progress to Gradio
-        progressListener = self._create_progress_listener(progress)
         # The results
         if (vad == 'silero-vad'):
             # Silero VAD where non-speech gaps are transcribed

 import torch
 from src.config import ApplicationConfig
+from src.hooks.whisperProgressHook import ProgressListener, SubTaskProgressListener, create_progress_listener_handle
 from src.modelCache import ModelCache
 from src.source import get_audio_source_collection
 from src.vadParallel import ParallelContext, ParallelTranscription
                 outputDirectory = self.output_dir if self.output_dir is not None else downloadDirectory
+                # Progress
+                total_duration = sum([source.get_audio_duration() for source in sources])
+                current_progress = 0
+                # A listener that will report progress to Gradio
+                root_progress_listener = self._create_progress_listener(progress)
                 # Execute whisper
                 for source in sources:
                     source_prefix = ""
+                    source_audio_duration = source.get_audio_duration()
                     if (len(sources) > 1):
                         # Prefix (minimum 2 digits)
                         source_prefix = str(source_index).zfill(2) + "_"
                         print("Transcribing ", source.source_path)
+                    scaled_progress_listener = SubTaskProgressListener(root_progress_listener,
+                                                   base_task_total=total_duration,
+                                                   sub_task_start=current_progress,
+                                                   sub_task_total=source_audio_duration)
                     # Transcribe
+                    result = self.transcribe_file(model, source.source_path, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, scaled_progress_listener, **decodeOptions)
                     filePrefix = slugify(source_prefix + source.get_short_name(), allow_unicode=True)
+                    # Update progress
+                    current_progress += source_audio_duration
                     source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory)
                     if len(sources) > 1:
     def transcribe_file(self, model: WhisperContainer, audio_path: str, language: str, task: str = None, vad: str = None,
                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1,
+                        progressListener: ProgressListener = None, **decodeOptions: dict):
         initial_prompt = decodeOptions.pop('initial_prompt', None)
+        if progressListener is None:
+            # Default progress listener
+            progressListener = ProgressListener()
         if ('task' in decodeOptions):
             task = decodeOptions.pop('task')
         # Callable for processing an audio file
         whisperCallable = model.create_callback(language, task, initial_prompt, **decodeOptions)
         # The results
         if (vad == 'silero-vad'):
             # Silero VAD where non-speech gaps are transcribed

src/source.py CHANGED Viewed

@@ -12,15 +12,22 @@ from src.download import ExceededMaximumDuration, download_url
 MAX_FILE_PREFIX_LENGTH = 17
 class AudioSource:
-    def __init__(self, source_path, source_name = None):
         self.source_path = source_path
         self.source_name = source_name
         # Load source name if not provided
         if (self.source_name is None):
             file_path = pathlib.Path(self.source_path)
             self.source_name = file_path.name
     def get_full_name(self):
         return self.source_name
@@ -53,18 +60,21 @@ def get_audio_source_collection(urlData: str, multipleFiles: List, microphoneDat
         if (microphoneData is not None):
             output.append(AudioSource(microphoneData))
-        total_duration = 0
-        # Calculate total audio length. We do this even if input_audio_max_duration
-        # is disabled to ensure that all the audio files are valid.
-        for source in output:
-            audioDuration = ffmpeg.probe(source.source_path)["format"]["duration"]
-            total_duration += float(audioDuration)
-        # Ensure the total duration of the audio is not too long
-        if input_audio_max_duration > 0:
-            if float(total_duration) > input_audio_max_duration:
-                raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=input_audio_max_duration, message="Video(s) is too long")
     # Return a list of audio sources
     return output

 MAX_FILE_PREFIX_LENGTH = 17
 class AudioSource:
+    def __init__(self, source_path, source_name = None, audio_duration = None):
         self.source_path = source_path
         self.source_name = source_name
+        self._audio_duration = audio_duration
         # Load source name if not provided
         if (self.source_name is None):
             file_path = pathlib.Path(self.source_path)
             self.source_name = file_path.name
+    def get_audio_duration(self):
+        if self._audio_duration is None:
+            self._audio_duration = float(ffmpeg.probe(self.source_path)["format"]["duration"])
+        return self._audio_duration
     def get_full_name(self):
         return self.source_name
         if (microphoneData is not None):
             output.append(AudioSource(microphoneData))
+    total_duration = 0
+    # Calculate total audio length. We do this even if input_audio_max_duration
+    # is disabled to ensure that all the audio files are valid.
+    for source in output:
+        audioDuration = ffmpeg.probe(source.source_path)["format"]["duration"]
+        total_duration += float(audioDuration)
+        # Save audio duration
+        source._audio_duration = float(audioDuration)
+    # Ensure the total duration of the audio is not too long
+    if input_audio_max_duration > 0:
+        if float(total_duration) > input_audio_max_duration:
+            raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=input_audio_max_duration, message="Video(s) is too long")
     # Return a list of audio sources
     return output