Spaces:

jiedong-yang
/

Speech-Summarization-with-Whisper

Runtime error

File size: 4,062 Bytes

import os
import subprocess
import time

import gradio as gr
import pafy
import validators
import whisper
from wordcloud import WordCloud, STOPWORDS

# Load both models once, at import time, so every UI callback reuses them:
# - asr_model: Whisper 'base.en' checkpoint, used by speech_to_text
# - summarizer: facebook/bart-large-cnn loaded through Gradio from the
#   'huggingface' source, used by text_summarization
asr_model = whisper.load_model('base.en')
summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')


def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """ Download the audio track of an online video with yt-dlp

    The url comes from a user-editable textbox, so yt-dlp is started from an
    argument list instead of a shell command string: characters such as ``&``
    or ``;`` inside the url are handed to yt-dlp literally and are never
    interpreted by a shell.

    :param url: str, the video url
    :param dst_dir: destination directory for the saved audio, created if missing
    :param name: currently unused, the file is always saved as ``audio.<format>``
    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
    :return: path of audio, or None when the url is not valid or the download
        did not produce the audio file
    """

    if not validators.url(url):
        return None

    os.makedirs(dst_dir, exist_ok=True)

    # the same file name is reused for every download, so drop any previous
    # result first; a leftover file must not be mistaken for a fresh download
    path = os.path.join(dst_dir, f"audio.{format}")
    if os.path.exists(path):
        os.remove(path)

    # download audio: best audio stream, extracted and converted to `format`
    command = [
        "yt-dlp",
        "-f", "ba",
        "-x",
        "--audio-format", format,
        url,
        "-o", path,
        "--quiet",
    ]

    try:
        subprocess.run(command, check=False)
    except OSError:
        # yt-dlp is not installed or could not be started
        return None

    # only report a path that really exists, so callers never receive a
    # reference to a missing file after a failed download
    if not os.path.exists(path):
        return None

    return path


def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
    """ ASR inference with Whisper

    :param audio: audio to transcribe, forwarded to ``asr_model.transcribe``;
        the UI passes the file path of the Speech component, which is None
        when no audio has been provided
    :param beam_size: forwarded to ``transcribe`` as ``beam_size``
    :param best_of: forwarded to ``transcribe`` as ``best_of``
    :param language: forwarded to ``transcribe`` as ``language``
    :return: str, the transcribed text, or '' when there is no audio
    """

    # nothing to transcribe yet (e.g. the button was clicked before any audio
    # was uploaded or downloaded): return an empty transcription instead of
    # failing inside Whisper
    if audio is None or (isinstance(audio, str) and not audio):
        return ''

    # fp16 is explicitly disabled for inference
    result = asr_model.transcribe(
        audio,
        language=language,
        beam_size=beam_size,
        best_of=best_of,
        fp16=False,
    )

    return result['text']


def text_summarization(text):
    """ Summarize text with the loaded summarizer

    :param text: str, the text to summarize (the transcription in the UI)
    :return: the summarizer's output for text, or '' when text is empty or
        contains only whitespace
    """

    # skip the summarizer call when there is nothing to summarize, the same
    # way wordcloud_func skips empty text
    if not text or not text.strip():
        return ''

    return summarizer(text)


def wordcloud_func(text: str, out_path='wordcloud_output.png'):
    """ generate wordcloud based on text

    :param text: str, the text whose words are drawn
    :param out_path: file path the wordcloud image is written to
    :return: out_path, or None when text holds no word to draw (missing,
        empty, whitespace only, or nothing left after stopword filtering)
    """

    # this runs on every change of the transcription box, including when it
    # is cleared, so missing or blank text is an expected input
    if not text or not text.strip():
        return None

    wc = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,
        height=600,
        width=600,
    )

    try:
        wc.generate(text)
    except ValueError:
        # WordCloud raises ValueError when it has no word to plot, e.g. when
        # the text consists only of stopwords
        return None

    wc.to_file(out_path)

    return out_path


# Build the Gradio Blocks UI. Components are created, and their event
# handlers wired, in source order inside the `with demo:` context below.
demo = gr.Blocks(title="Speech Summarization")

# NOTE(review): sets an `encrypt` attribute on the Blocks object; whether it
# still has any effect depends on the installed Gradio version - confirm.
demo.encrypt = False

with demo:
    # demo description
    gr.Markdown("""
    ## Speech Summarization with Whisper
    This space is intended to summarize a speech, a short one or long one, to save us sometime.
    1. Type in a youtube URL or upload an audio file
    2. Generate transcription with Whisper (Currently English Only)
    3. Summarize the transcribed speech
    4. A little wordcloud for you as well
    """)

    # data preparation: URL textbox (with its clear button) next to the audio input
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="URL", placeholder="video url")

            # reset the URL textbox to an empty string
            url_btn = gr.Button("clear")
            url_btn.click(lambda x: '', inputs=url, outputs=url)

        # audio is handed to the callbacks as a file path (type="filepath")
        speech = gr.Audio(label="Speech", type="filepath")

        # every change of the URL text calls audio_from_url; its return value
        # (a path or None) is written to the Speech component
        url.change(audio_from_url, inputs=url, outputs=speech)

    # example URLs that fill the URL textbox
    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
                           inputs=[url])

    # ASR
    text = gr.Textbox(label="Transcription", placeholder="transcription")

    # slider values are passed to speech_to_text as beam_size and best_of
    with gr.Row():
        beam_size_slider = gr.Slider(1, 10, value=5, step=1, label="param: beam_size")
        best_of_slider = gr.Slider(1, 10, value=5, step=1, label="param: best_of")

    with gr.Row():
        # reset the transcription textbox to an empty string
        asr_clr_btn = gr.Button("clear")
        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
        # transcribe the current Speech audio into the transcription textbox
        asr_btn = gr.Button("Recognize Speech")
        asr_btn.click(speech_to_text, inputs=[speech, beam_size_slider, best_of_slider], outputs=text)

    # summarization
    summary = gr.Textbox(label="Summarization")

    with gr.Row():
        # reset the summary textbox to an empty string
        sum_clr_btn = gr.Button("clear")
        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
        # summarize the transcription into the summary textbox
        sum_btn = gr.Button("Summarize")
        sum_btn.click(text_summarization, inputs=text, outputs=summary)

    # wordcloud
    # NOTE(review): `.style(...)` is presumably tied to the Gradio version this
    # app was written for - confirm it is still available before upgrading.
    image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)

    # regenerate the wordcloud image whenever the transcription text changes
    text.change(wordcloud_func, inputs=text, outputs=image)
    

# Launch the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    demo.launch()