Spaces:
Runtime error
Runtime error
File size: 4,005 Bytes
22bdedf 18d2fb0 22bdedf e978023 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import os
import time
import whisper
import validators
import gradio as gr
from wordcloud import WordCloud, STOPWORDS
# Identifiers of the two models used by the app, kept in one place.
ASR_MODEL_NAME = 'base.en'
SUMMARIZER_REPO = "facebook/bart-large-cnn"

# Whisper performs the speech recognition; the BART summarizer is loaded
# through Gradio with src='huggingface'.
asr_model = whisper.load_model(ASR_MODEL_NAME)
summarizer = gr.Interface.load(SUMMARIZER_REPO, src='huggingface')
def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """ Download a video with yt-dlp and keep only its audio track

    :param url: str, the video url
    :param dst_dir: destination directory for the audio file (created if missing)
    :param name: audio file's base name (without extension); defaults to "audio"
    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
    :return: path of the audio file, or None if the url is invalid or the
        download did not produce a file
    """
    import subprocess

    if not validators.url(url):
        return None
    os.makedirs(dst_dir, exist_ok=True)

    base_name = name if name else "audio"
    path = os.path.join(dst_dir, f"{base_name}.{format}")
    # Drop any stale file so that a failed download is not mistaken for success.
    if os.path.exists(path):
        os.remove(path)

    # An argument list (no shell) keeps characters such as '&', ';' or spaces
    # in the url from being interpreted by a shell; '--' ends option parsing
    # so the url can never be read as a yt-dlp flag.
    command = [
        "yt-dlp",
        "-f", "ba",
        "-x",
        "--audio-format", format,
        "-o", path,
        "--quiet",
        "--",
        url,
    ]
    try:
        completed = subprocess.run(command, check=False)
    except OSError:
        # yt-dlp is not installed or cannot be executed.
        return None

    if completed.returncode != 0 or not os.path.exists(path):
        return None
    return path
def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
    """ ASR inference with Whisper

    :param audio: audio input handed to ``asr_model.transcribe`` (the UI passes
        a file path); None or empty means no audio is available
    :param beam_size: forwarded to ``asr_model.transcribe`` as ``beam_size``
    :param best_of: forwarded to ``asr_model.transcribe`` as ``best_of``
    :param language: forwarded to ``asr_model.transcribe`` as ``language``
    :return: the transcribed text, or '' when no audio was supplied
    """
    # The audio widget yields None when nothing was uploaded or the download
    # failed; return an empty transcription instead of crashing the model.
    if not audio:
        return ''
    result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of)
    return result['text']
def text_summarization(text):
    """ Summarize text with the loaded summarizer

    :param text: text to summarize; may be empty or None
    :return: whatever ``summarizer`` returns for the text, or '' when there is
        nothing to summarize
    """
    # Skip the summarizer call entirely for missing or blank input.
    if not text or not text.strip():
        return ''
    return summarizer(text)
def wordcloud_func(text: str, out_path='wordcloud_output.png'):
    """ generate wordcloud based on text

    :param text: text to visualise; may be empty or None
    :param out_path: file path the rendered image is written to
    :return: ``out_path`` on success, or None when there is nothing to draw
    """
    # None, empty and whitespace-only input contain no words at all.
    if not text or not text.strip():
        return None
    wc = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,
        height=600,
        width=600,
    )
    try:
        wc.generate(text)
    except ValueError:
        # WordCloud raises ValueError when no word is left to plot
        # (e.g. the text consists only of stop words).
        return None
    wc.to_file(out_path)
    return out_path
def _blank(_):
    """Return an empty string; shared callback behind every "clear" button."""
    return ''


demo = gr.Blocks(title="Speech Summarization")
demo.encrypt = False

# NOTE(review): the nesting of the layout below was reconstructed from the
# statement order -- confirm the Row/Column grouping against the running app.
with demo:
    # Introductory text shown at the top of the page.
    gr.Markdown("""
## Speech Summarization with Whisper
This space is intended to summarize a speech, a short one or long one, to save us sometime.
1. Type in a youtube URL or upload an audio file
2. Generate transcription with Whisper (Currently English Only)
3. Summarize the transcribed speech
4. A little wordcloud for you as well
""")

    # Input: a video URL (its audio is fetched on change) or an audio file.
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="URL", placeholder="video url")
            url_btn = gr.Button("clear")
            url_btn.click(_blank, inputs=url, outputs=url)
        speech = gr.Audio(label="Speech", type="filepath")
        url.change(audio_from_url, inputs=url, outputs=speech)

    # Speech recognition: transcription box, decoding sliders, action buttons.
    text = gr.Textbox(label="Transcription", placeholder="transcription")
    with gr.Row():
        beam_size_slider = gr.Slider(1, 10, value=5, step=1, label="param: beam_size")
        best_of_slider = gr.Slider(1, 10, value=5, step=1, label="param: best_of")
    with gr.Row():
        asr_clr_btn = gr.Button("clear")
        asr_clr_btn.click(_blank, inputs=text, outputs=text)
        asr_btn = gr.Button("Recognize Speech")
        asr_btn.click(
            speech_to_text,
            inputs=[speech, beam_size_slider, best_of_slider],
            outputs=text,
        )

    # Summary of the transcription.
    summary = gr.Textbox(label="Summarization")
    with gr.Row():
        sum_clr_btn = gr.Button("clear")
        sum_clr_btn.click(_blank, inputs=summary, outputs=summary)
        sum_btn = gr.Button("Summarize")
        sum_btn.click(text_summarization, inputs=text, outputs=summary)

    # The word cloud is redrawn whenever the transcription changes.
    image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
    text.change(wordcloud_func, inputs=text, outputs=image)

    # Sample URLs that fill the URL box.
    examples = gr.Examples(
        examples=[
            "https://www.youtube.com/watch?v=DuX4K4eeTz8",
            "https://www.youtube.com/watch?v=nepOSEGHHCQ",
        ],
        inputs=[url],
    )

if __name__ == '__main__':
    demo.launch()