jiedong-yang committed on
Commit ba842e8 • 2 Parent(s): c6d324f 0a4a5ed

Merge branch 'main' of https://huggingface.co/spaces/jiedong-yang/Speech-Summarization-with-Whisper into main

Files changed (2)
  1. app.py +16 -13
  2. utils.py +21 -0
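
The subtlest change in this merge is to `load_model` in app.py: the hardcoded `.en` suffix moves out of the f-string and into the dropdown values, so the lowercased dropdown string must itself be a complete Whisper checkpoint name. A quick check of that contract, as a sketch assuming the standard openai-whisper package (`whisper.available_models()` lists the valid names):

    import whisper

    # The dropdown values in the diff below ('Tiny.en', 'Base.en') now carry
    # the language suffix themselves, so lowercasing alone yields a valid
    # checkpoint name for whisper.load_model().
    for name in ('Tiny.en', 'Base.en'):
        assert name.lower() in whisper.available_models()  # 'tiny.en', 'base.en'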
app.py CHANGED
@@ -10,6 +10,8 @@ from wordcloud import WordCloud, STOPWORDS
 from scipy.io.wavfile import write
 from espnet2.bin.tts_inference import Text2Speech
 
+from utils import *
+
 # load whisper model for ASR and BART for summarization
 asr_model = whisper.load_model('base.en')
 summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
@@ -23,7 +25,7 @@ def load_model(name: str):
     :return:
     """
     global asr_model
-    asr_model = whisper.load_model(f"{name.lower()}.en")
+    asr_model = whisper.load_model(f"{name.lower()}")
     return name
 
 
@@ -127,7 +129,7 @@ def text_to_speech(text: str, out_path="data/short_speech.wav"):
     return out_path
 
 
-demo = gr.Blocks(title="Speech Summarization")
+demo = gr.Blocks(css=demo_css, title="Speech Summarization")
 
 demo.encrypt = False
 
@@ -143,10 +145,7 @@ with demo:
     1. Type in a youtube URL or upload an audio file
     2. Generate transcription with Whisper (English Only)
     3. Summarize the transcribed speech
-    4. Generate summary's speech with ESPNet model
-
-    model references:
-    - [Whisper](https://github.com/openai/whisper), [ESPNet](https://github.com/espnet/espnet_model_zoo)
+    4. Generate summary speech with the ESPNet model
     """)
 
     # data preparation
@@ -161,17 +160,13 @@
 
     url.change(audio_from_url, inputs=url, outputs=speech)
 
-    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
-                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
-                           inputs=[url])
-
     # ASR
     text = gr.Textbox(label="Transcription", placeholder="transcription")
 
     with gr.Row():
-        default_values = dict(model='Base', bs=5, bo=5) if torch.cuda.is_available() \
-            else dict(model='Tiny', bs=1, bo=1)
-        model_options = gr.Dropdown(['Tiny', 'Base'], value=default_values['model'], label="models")
+        default_values = dict(model='Base.en', bs=5, bo=5) if torch.cuda.is_available() \
+            else dict(model='Tiny.en', bs=1, bo=1)
+        model_options = gr.Dropdown(['Tiny.en', 'Base.en'], value=default_values['model'], label="models")
         model_options.change(load_model, inputs=model_options, outputs=model_options)
 
         beam_size_slider = gr.Slider(1, 10, value=default_values['bs'], step=1, label="param: beam_size")
@@ -202,6 +197,14 @@
 
     text.change(wordcloud_func, inputs=text, outputs=image)
 
+    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
+                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
+                           inputs=url, outputs=text,
+                           fn=lambda x: speech_to_text(audio_from_url(x)),
+                           cache_examples=True)
+
+    gr.HTML(footer_html)
+
 
 if __name__ == '__main__':
     demo.launch()
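
The relocated `gr.Examples` block does more than move to the bottom of the layout: because it now passes `fn` and `outputs` with `cache_examples=True`, Gradio runs the full URL-to-transcription pipeline once per example at build time and then serves the stored result when a visitor clicks an example, instead of re-downloading and re-transcribing. A minimal sketch of that pattern, assuming Gradio 3.x and a stand-in `transcribe` function in place of the app's `speech_to_text(audio_from_url(...))` chain:

    import gradio as gr

    def transcribe(url: str) -> str:
        # Stand-in for speech_to_text(audio_from_url(url)) in app.py.
        return f"transcript of {url}"

    with gr.Blocks() as demo:
        url = gr.Textbox(label="YouTube URL")
        text = gr.Textbox(label="Transcription")
        url.change(transcribe, inputs=url, outputs=text)

        # With fn/outputs given and cache_examples=True, each example is
        # evaluated once at startup; clicking it replays the cached output.
        gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8"],
                    inputs=url, outputs=text,
                    fn=transcribe, cache_examples=True)

    demo.launch()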
utils.py ADDED
@@ -0,0 +1,21 @@
+demo_css = """
+.footer {
+    margin-bottom: 40px;
+    margin-top: 45px;
+    text-align: center;
+    border-bottom: 1px solid #e5e5e5;
+}
+"""
+
+footer_html = """
+<div class="footer">
+    <p>Whisper by <a href="https://github.com/openai/whisper"
+        style="text-decoration: underline;"
+        target="_blank">OpenAI</a> - BART by <a href="https://huggingface.co/facebook/bart-large-cnn"
+        style="text-decoration: underline;"
+        target="_blank">Facebook</a> - Conformer by <a href="https://github.com/espnet/espnet_model_zoo"
+        style="text-decoration: underline;"
+        target="_blank">ESPNet</a>
+    </p>
+</div>
+"""