Spaces:

jiedong-yang
/

Speech-Summarization-with-Whisper

Runtime error

App Files Files Community

jiedong-yang commited on Nov 2, 2022

Commit

dddc03b

•

1 Parent(s): d27921b

Upload app.py

Browse files

Files changed (1) hide show

app.py +209 -0

app.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import os
+import re
+import torch
+import whisper
+import validators
+import gradio as gr
+from wordcloud import WordCloud, STOPWORDS
+from scipy.io.wavfile import write
+from espnet2.bin.tts_inference import Text2Speech
+from utils import *
+# load whisper model for ASR and BART for summarization
+asr_model = whisper.load_model('base.en')
+summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
+tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan")
+def load_model(name: str):
+    """
+    :param name: model options, tiny or base only, for quick inference
+    :return:
+    """
+    global asr_model
+    asr_model = whisper.load_model(f"{name.lower()}")
+    return name
+def audio_from_url(url, dst_dir='data', name=None, format='wav'):
+    """ Download video from url and save the audio from video
+    :param url: str, the video url
+    :param dst_dir: destination directory for save audio
+    :param name: audio file's name, if none, assign the name as the video's title
+    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
+    :return: path of audio
+    """
+    if not validators.url(url):
+        return None
+    os.makedirs(dst_dir, exist_ok=True)
+    # download audio
+    path = os.path.join(dst_dir, f"audio.{format}")
+    if os.path.exists(path):
+        os.remove(path)
+    os.system(f"yt-dlp -f 'ba' -x --audio-format {format} {url}  -o {path} --quiet")
+    return path
+def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
+    """ ASR inference with Whisper
+    :param audio: filepath
+    :param beam_size: beam search parameter
+    :param best_of: number of best results
+    :param language: Currently English only
+    :return: transcription
+    """
+    result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of, fp16=False)
+    return result['text']
+def text_summarization(text):
+    return summarizer(text)
+def wordcloud_func(text: str, out_path='data/wordcloud_output.png'):
+    """ generate wordcloud based on text
+    :param text: transcription
+    :param out_path: filepath
+    :return: filepath
+    """
+    if len(text) == 0:
+        return None
+    stopwords = STOPWORDS
+    wc = WordCloud(
+        background_color='white',
+        stopwords=stopwords,
+        height=600,
+        width=600
+    )
+    wc.generate(text)
+    wc.to_file(out_path)
+    return out_path
+def normalize_dollars(text):
+    """ text normalization for '$'
+    :param text:
+    :return:
+    """
+    def expand_dollars(m):
+        match = m.group(1)
+        parts = match.split(' ')
+        parts.append('dollars')
+        return ' '.join(parts)
+    units = ['hundred', 'thousand', 'million', 'billion', 'trillion']
+    _dollars_re = re.compile(fr"\$([0-9\.\,]*[0-9]+ (?:{'|'.join(units)}))")
+    return re.sub(_dollars_re, expand_dollars, text)
+def text_to_speech(text: str, out_path="data/short_speech.wav"):
+    # espnet tts model process '$1.4 trillion' as 'one point four dollar trillion'
+    # use this function to fix this issue
+    text = normalize_dollars(text)
+    output = tts_model(text)
+    write(out_path, 22050, output['wav'].numpy())
+    return out_path
+demo = gr.Blocks(css=demo_css, title="Speech Summarization")
+demo.encrypt = False
+with demo:
+    # demo description
+    gr.Markdown("""
+    ## Speech Summarization with Whisper
+    This space is intended to summarize a speech, a short one or long one, to save us sometime
+    (runs faster with GPU inference). Check the example links provided below:
+    [3 mins speech](https://www.youtube.com/watch?v=DuX4K4eeTz8),
+    [13 mins speech](https://www.youtube.com/watch?v=nepOSEGHHCQ)
+    1. Type in a youtube URL or upload an audio file
+    2. Generate transcription with Whisper (English Only)
+    3. Summarize the transcribed speech
+    4. Generate summary speech with the ESPNet model
+    """)
+    # data preparation
+    with gr.Row():
+        with gr.Column():
+            url = gr.Textbox(label="URL", placeholder="video url")
+            url_btn = gr.Button("clear")
+            url_btn.click(lambda x: '', inputs=url, outputs=url)
+        speech = gr.Audio(label="Speech", type="filepath")
+        url.change(audio_from_url, inputs=url, outputs=speech)
+    # ASR
+    text = gr.Textbox(label="Transcription", placeholder="transcription")
+    with gr.Row():
+        default_values = dict(model='Base.en', bs=5, bo=5) if torch.cuda.is_available() \
+            else dict(model='Tiny.en', bs=1, bo=1)
+        model_options = gr.Dropdown(['Tiny.en', 'Base.en'], value=default_values['model'], label="models")
+        model_options.change(load_model, inputs=model_options, outputs=model_options)
+        beam_size_slider = gr.Slider(1, 10, value=default_values['bs'], step=1, label="param: beam_size")
+        best_of_slider = gr.Slider(1, 10, value=default_values['bo'], step=1, label="param: best_of")
+    with gr.Row():
+        asr_clr_btn = gr.Button("clear")
+        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
+        asr_btn = gr.Button("Recognize Speech")
+        asr_btn.click(speech_to_text, inputs=[speech, beam_size_slider, best_of_slider], outputs=text)
+    # summarization
+    summary = gr.Textbox(label="Summarization")
+    with gr.Row():
+        sum_clr_btn = gr.Button("clear")
+        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
+        sum_btn = gr.Button("Summarize")
+        sum_btn.click(text_summarization, inputs=text, outputs=summary)
+    with gr.Row():
+        # wordcloud
+        image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
+        with gr.Column():
+            tts = gr.Audio(label="Short Speech", type="filepath")
+            tts_btn = gr.Button("Read Summary")
+            tts_btn.click(text_to_speech, inputs=summary, outputs=tts)
+    text.change(wordcloud_func, inputs=text, outputs=image)
+    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
+                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
+                           fn=lambda x: speech_to_text(audio_from_url(x)),
+                           inputs=url, outputs=text, cache_examples=True)
+    gr.HTML(footer_html)
+if __name__ == '__main__':
+    demo.launch()