Spaces:

jiedong-yang
/

Speech-Summarization-with-Whisper

Runtime error

App Files Files Community

Jiedong Yang commited on Oct 25, 2022

Commit

22bdedf

•

1 Parent(s): 4591b06

Upload 2 files

Browse files

Files changed (2) hide show

app.py +143 -0
requirements.txt +16 -0

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import os
+import pafy
+import time
+import whisper
+import validators
+import gradio as gr
+from wordcloud import WordCloud, STOPWORDS
+# load whisper model for ASR and BART for summarization
+asr_model = whisper.load_model('base.en')
+summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
+def audio_from_url(url, dst_dir='data', name=None, format='wav'):
+    """ Download video from url and save the audio from video
+    :param url: str, the video url
+    :param dst_dir: destination directory for save audio
+    :param name: audio file's name, if none, assign the name as the video's title
+    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
+    :return: path of audio
+    """
+    if not validators.url(url):
+        return None
+    os.makedirs(dst_dir, exist_ok=True)
+    # download audio
+    video = pafy.new(url)
+    path = os.path.join(dst_dir, f"audio.{format}")
+    os.remove(path)
+    os.system(f"yt-dlp -f 'ba' -x --audio-format {format} {url}  -o {path} --quiet")
+    return path
+def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
+    """ ASR inference with Whisper
+    :param audio:
+    :param beam_size:
+    :param best_of:
+    :param language:
+    :return:
+    """
+    result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of)
+    return result['text']
+def text_summarization(text):
+    return summarizer(text)
+def wordcloud_func(text: str, out_path='wordcloud_output.png'):
+    """ generate wordcloud based on text
+    :param text:
+    :param out_path:
+    :return:
+    """
+    if len(text) == 0:
+        return None
+    stopwords = STOPWORDS
+    wc = WordCloud(
+        background_color='white',
+        stopwords=stopwords,
+        height=600,
+        width=600
+    )
+    wc.generate(text)
+    wc.to_file(out_path)
+    return out_path
+demo = gr.Blocks(title="Speech Summarization")
+demo.encrypt = False
+with demo:
+    # demo description
+    gr.Markdown("""
+    ## Speech Summarization with Whisper
+    This space is intended to summarize a speech, a short one or long one, to save us sometime.
+    1. Type in a youtube URL or upload an audio file
+    2. Generate transcription with Whisper (Currently English Only)
+    3. Summarize the transcribed speech
+    4. A little wordcloud for you as well
+    """)
+    # data preparation
+    with gr.Row():
+        with gr.Column():
+            url = gr.Textbox(label="URL", placeholder="video url")
+            url_btn = gr.Button("clear")
+            url_btn.click(lambda x: '', inputs=url, outputs=url)
+        speech = gr.Audio(label="Speech", type="filepath")
+        url.change(audio_from_url, inputs=url, outputs=speech)
+    # ASR
+    text = gr.Textbox(label="Transcription", placeholder="transcription")
+    with gr.Row():
+        beam_size_slider = gr.Slider(1, 10, value=5, step=1, label="param: beam_size")
+        best_of_slider = gr.Slider(1, 10, value=5, step=1, label="param: best_of")
+    with gr.Row():
+        asr_clr_btn = gr.Button("clear")
+        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
+        asr_btn = gr.Button("Recognize Speech")
+        asr_btn.click(speech_to_text, inputs=[speech, beam_size_slider, best_of_slider], outputs=text)
+    # summarization
+    summary = gr.Textbox(label="Summarization")
+    with gr.Row():
+        sum_clr_btn = gr.Button("clear")
+        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
+        sum_btn = gr.Button("Summarize")
+        sum_btn.click(text_summarization, inputs=text, outputs=summary)
+    # wordcloud
+    image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
+    text.change(wordcloud_func, inputs=text, outputs=image)
+    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
+                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
+                           inputs=[url])
+if __name__ == '__main__':
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+fastapi==0.85.1
+ffmpeg-python==0.2.0
+gradio==3.6
+huggingface-hub==0.9.1
+matplotlib==3.6.1
+pafy==0.5.5
+pandas==1.5.0
+tokenizers==0.12.1
+torch==1.12.1
+tqdm==4.64.1
+transformers==4.22.1
+validators==0.20.0
+whisper @ git+https://github.com/openai/whisper.git@8cf36f3508c9acd341a45eb2364239a3d81458b9
+wordcloud==1.8.2.2
+youtube-dl==2021.12.17
+yt-dlp==2022.10.4