Spaces:

jiedong-yang
/

Speech-Summarization-with-Whisper

Runtime error

App Files Files Community

jiedong-yang committed on Nov 2, 2022

Commit

d27921b

•

1 Parent(s): 26ba01d

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -209

app.py DELETED Viewed

@@ -1,209 +0,0 @@
-import os
-import re
-import torch
-import whisper
-import validators
-import gradio as gr
-from wordcloud import WordCloud, STOPWORDS
-from scipy.io.wavfile import write
-from espnet2.bin.tts_inference import Text2Speech
-from utils import *
-# Module-level models, loaded once when this file is imported:
-#   asr_model  - Whisper 'base.en' checkpoint used for speech recognition
-#                (rebound by load_model when another size is selected)
-#   summarizer - facebook/bart-large-cnn, loaded through gr.Interface.load
-#   tts_model  - ESPnet Text2Speech checkpoint used by text_to_speech
-asr_model = whisper.load_model('base.en')
-summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
-tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan")
-def load_model(name: str):
-    """ Replace the module-level Whisper model with the checkpoint called ``name``
-    :param name: model options, tiny or base only, for quick inference; it is
-        lower-cased before being passed to ``whisper.load_model``
-    :return: ``name`` unchanged (the UI writes it back to the model dropdown)
-    """
-    # rebinding the global makes speech_to_text use the newly loaded model
-    global asr_model
-    asr_model = whisper.load_model(f"{name.lower()}")
-    return name
-def audio_from_url(url, dst_dir='data', name=None, format='wav'):
-    """ Download video from url and save the audio from video
-    :param url: str, the video url
-    :param dst_dir: destination directory for the saved audio, created if missing
-    :param name: currently unused; the file is always saved as ``audio.<format>``
-    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
-    :return: path of audio, or None when ``url`` does not pass ``validators.url``
-    """
-    # reject anything that is not a well-formed URL before invoking yt-dlp
-    if not validators.url(url):
-        return None
-    os.makedirs(dst_dir, exist_ok=True)
-    # download audio: the target path is always the same, so remove any stale file first
-    path = os.path.join(dst_dir, f"audio.{format}")
-    if os.path.exists(path):
-        os.remove(path)
-    # NOTE(review): url and path are interpolated into the shell command unquoted,
-    # and the exit status of os.system is ignored, so the returned path may not
-    # exist if yt-dlp fails. Consider subprocess.run with an argument list.
-    os.system(f"yt-dlp -f 'ba' -x --audio-format {format} {url}  -o {path} --quiet")
-    return path
-def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
-    """ ASR inference with Whisper, using the module-level ``asr_model``
-    :param audio: filepath of the audio to transcribe
-    :param beam_size: beam search parameter, forwarded to ``transcribe``
-    :param best_of: number of best results, forwarded to ``transcribe``
-    :param language: Currently English only
-    :return: transcription, i.e. the 'text' entry of the ``transcribe`` result
-    """
-    # fp16=False is passed explicitly; presumably so inference also works on CPU - confirm
-    result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of, fp16=False)
-    return result['text']
-def text_summarization(text):
-    """ Summarize text with the module-level ``summarizer``
-    :param text: text to summarize
-    :return: whatever ``summarizer`` returns for ``text``
-    """
-    return summarizer(text)
-def wordcloud_func(text: str, out_path='data/wordcloud_output.png'):
-    """ generate wordcloud based on text and save it as an image
-    :param text: transcription
-    :param out_path: filepath the image is written to
-    :return: filepath, or None when ``text`` is empty
-    """
-    # nothing to draw for an empty transcription
-    if len(text) == 0:
-        return None
-    # use the stopword set shipped with the wordcloud package
-    stopwords = STOPWORDS
-    wc = WordCloud(
-        background_color='white',
-        stopwords=stopwords,
-        height=600,
-        width=600
-    )
-    wc.generate(text)
-    # NOTE(review): the directory of out_path is not created here; presumably it
-    # must already exist - confirm
-    wc.to_file(out_path)
-    return out_path
-def normalize_dollars(text):
-    """ text normalization for '$'
-    Rewrites amounts such as '$1.4 trillion' to '1.4 trillion dollars'. Only amounts
-    followed by a single space and one of the scale words in ``units`` are changed.
-    :param text: input text
-    :return: text with the matching dollar amounts rewritten
-    """
-    def expand_dollars(m):
-        # group(1) is the amount plus scale word, without the leading '$'
-        match = m.group(1)
-        parts = match.split(' ')
-        parts.append('dollars')
-        return ' '.join(parts)
-    units = ['hundred', 'thousand', 'million', 'billion', 'trillion']
-    # '$', then digits with optional '.' or ',' separators, one space, and a scale word
-    _dollars_re = re.compile(fr"\$([0-9\.\,]*[0-9]+ (?:{'|'.join(units)}))")
-    return re.sub(_dollars_re, expand_dollars, text)
-def text_to_speech(text: str, out_path="data/short_speech.wav"):
-    """ Synthesize speech for text with the module-level ``tts_model``
-    :param text: text to read aloud
-    :param out_path: filepath the WAV file is written to
-    :return: ``out_path``
-    """
-    # the espnet tts model reads '$1.4 trillion' as 'one point four dollar trillion',
-    # so dollar amounts are rewritten with normalize_dollars before synthesis
-    text = normalize_dollars(text)
-    output = tts_model(text)
-    # the waveform is written with a sample rate of 22050
-    write(out_path, 22050, output['wav'].numpy())
-    return out_path
-# Build the Gradio Blocks UI. demo_css and footer_html are not defined in this
-# file; presumably they come from `from utils import *` - confirm.
-demo = gr.Blocks(css=demo_css, title="Speech Summarization")
-demo.encrypt = False
-with demo:
-    # demo description
-    gr.Markdown("""
-    ## Speech Summarization with Whisper
-    This space is intended to summarize a speech, a short one or long one, to save us sometime
-    (runs faster with GPU inference). Check the example links provided below:
-    [3 mins speech](https://www.youtube.com/watch?v=DuX4K4eeTz8),
-    [13 mins speech](https://www.youtube.com/watch?v=nepOSEGHHCQ)
-    1. Type in a youtube URL or upload an audio file
-    2. Generate transcription with Whisper (English Only)
-    3. Summarize the transcribed speech
-    4. Generate summary speech with the ESPNet model
-    """)
-    # data preparation: a URL box with its clear button, next to the audio input
-    with gr.Row():
-        with gr.Column():
-            url = gr.Textbox(label="URL", placeholder="video url")
-            url_btn = gr.Button("clear")
-            # the lambda ignores its input and returns '' to empty the URL box
-            url_btn.click(lambda x: '', inputs=url, outputs=url)
-        speech = gr.Audio(label="Speech", type="filepath")
-        # a change of the URL box runs audio_from_url and sends its result to the audio input
-        url.change(audio_from_url, inputs=url, outputs=speech)
-    # ASR
-    text = gr.Textbox(label="Transcription", placeholder="transcription")
-    with gr.Row():
-        # defaults depend on hardware: Base.en with beam_size/best_of 5 when CUDA is
-        # available, otherwise Tiny.en with beam_size/best_of 1
-        default_values = dict(model='Base.en', bs=5, bo=5) if torch.cuda.is_available() \
-            else dict(model='Tiny.en', bs=1, bo=1)
-        model_options = gr.Dropdown(['Tiny.en', 'Base.en'], value=default_values['model'], label="models")
-        # selecting a model reloads the global Whisper model through load_model
-        model_options.change(load_model, inputs=model_options, outputs=model_options)
-        beam_size_slider = gr.Slider(1, 10, value=default_values['bs'], step=1, label="param: beam_size")
-        best_of_slider = gr.Slider(1, 10, value=default_values['bo'], step=1, label="param: best_of")
-    with gr.Row():
-        asr_clr_btn = gr.Button("clear")
-        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
-        asr_btn = gr.Button("Recognize Speech")
-        # language is not wired to the UI, so speech_to_text uses its default 'en'
-        asr_btn.click(speech_to_text, inputs=[speech, beam_size_slider, best_of_slider], outputs=text)
-    # summarization
-    summary = gr.Textbox(label="Summarization")
-    with gr.Row():
-        sum_clr_btn = gr.Button("clear")
-        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
-        sum_btn = gr.Button("Summarize")
-        sum_btn.click(text_summarization, inputs=text, outputs=summary)
-    with gr.Row():
-        # wordcloud
-        image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
-        with gr.Column():
-            # speech synthesized from the summary text
-            tts = gr.Audio(label="Short Speech", type="filepath")
-            tts_btn = gr.Button("Read Summary")
-            tts_btn.click(text_to_speech, inputs=summary, outputs=tts)
-    # regenerate the wordcloud whenever the transcription text changes
-    text.change(wordcloud_func, inputs=text, outputs=image)
-    # example URLs: each one is downloaded and transcribed with the default
-    # speech_to_text parameters; cache_examples=True is passed to gr.Examples
-    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
-                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
-                           fn=lambda x: speech_to_text(audio_from_url(x)),
-                           inputs=url, outputs=text, cache_examples=True)
-    gr.HTML(footer_html)
-# launch the app only when this file is run as a script
-if __name__ == '__main__':
-    demo.launch()