whisper-webui-translate

Sleeping

App Files Community

aadnk committed on Sep 22, 2022

Commit

7ce6041

•

1 Parent(s): 05a2178

Limit audio files to 120s

Browse files

Files changed (2) hide show

app.py +20 -1
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -4,9 +4,14 @@ import gradio as gr
 from utils import write_vtt
 import whisper
 #import os
 #os.system("pip install git+https://github.com/openai/whisper.git")
 LANGUAGES = [
     "English",
     "Chinese",
@@ -116,6 +121,13 @@ def greet(modelName, languageName, uploadFile, microphoneData, task):
     selectedLanguage = languageName.lower() if len(languageName) > 0 else None
     selectedModel = modelName if modelName is not None else "base"
     model = model_cache.get(selectedModel, None)
     if not model:
@@ -130,7 +142,14 @@ def greet(modelName, languageName, uploadFile, microphoneData, task):
     return result["text"], segmentStream.read()
-demo = gr.Interface(fn=greet, description="Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.", inputs=[
     gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
     gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
     gr.Audio(source="upload", type="filepath", label="Upload Audio"),

 from utils import write_vtt
 import whisper
+import ffmpeg
 #import os
 #os.system("pip install git+https://github.com/openai/whisper.git")
+# Limitations (set to -1 to disable)
+INPUT_AUDIO_MAX_DURATION = 60 # seconds
 LANGUAGES = [
     "English",
     "Chinese",
     selectedLanguage = languageName.lower() if len(languageName) > 0 else None
     selectedModel = modelName if modelName is not None else "base"
+    if INPUT_AUDIO_MAX_DURATION > 0:
+        # Calculate audio length
+        audioDuration = ffmpeg.probe(source)["format"]["duration"]
+        if float(audioDuration) > INPUT_AUDIO_MAX_DURATION:
+            return ("[ERROR]: Maximum audio file length is " + str(INPUT_AUDIO_MAX_DURATION) + "s, file was " + str(audioDuration) + "s"), "[ERROR]"
     model = model_cache.get(selectedModel, None)
     if not model:
     return result["text"], segmentStream.read()
+ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
+ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
+ui_description += " as well as speech translation and language identification. "
+if INPUT_AUDIO_MAX_DURATION > 0:
+    ui_description += "\n\n" + "Max audio file length: " + str(INPUT_AUDIO_MAX_DURATION) + " s"
+demo = gr.Interface(fn=greet, description=ui_description, inputs=[
     gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
     gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
     gr.Audio(source="upload", type="filepath", label="Upload Audio"),

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 git+https://github.com/openai/whisper.git
-transformers

 git+https://github.com/openai/whisper.git
+transformers
+ffmpeg-python==0.2.0