import os
import subprocess
import time
from typing import Optional

import pafy
import whisper
import validators
import gradio as gr
from wordcloud import WordCloud, STOPWORDS

# load whisper model for ASR and BART for summarization
asr_model = whisper.load_model('base.en')
summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')


def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """
    Download the audio track of an online video with yt-dlp.

    :param url: str, the video url
    :param dst_dir: destination directory for the saved audio (created if missing)
    :param name: base file name (without extension); defaults to "audio" when None
    :param format: audio format passed to yt-dlp, such as 'wav', 'mp3'. WAV is preferred.
    :return: path of the audio file, or None if the url is invalid or the
        download did not produce the expected file
    """
    if not url or not validators.url(url):
        return None

    os.makedirs(dst_dir, exist_ok=True)

    path = os.path.join(dst_dir, f"{name or 'audio'}.{format}")
    # Remove any stale file so a failed download cannot return old audio.
    if os.path.exists(path):
        os.remove(path)

    # Pass the arguments as a list (no shell): the url comes from the UI and
    # may contain shell metacharacters such as '&' or ';'.
    command = [
        "yt-dlp", "-f", "ba", "-x",
        "--audio-format", format,
        url,
        "-o", path,
        "--quiet",
    ]
    try:
        completed = subprocess.run(command, check=False)
    except OSError:
        # yt-dlp is not installed or could not be started.
        return None

    if completed.returncode != 0 or not os.path.exists(path):
        return None
    return path


def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
    """
    ASR inference with Whisper

    :param audio: path of the audio file to transcribe (None/empty when nothing is loaded)
    :param beam_size: beam size forwarded to ``asr_model.transcribe``
    :param best_of: ``best_of`` value forwarded to ``asr_model.transcribe``
    :param language: language code forwarded to ``asr_model.transcribe``
    :return: the transcribed text, or '' when no audio was given
    """
    if not audio:
        return ''

    result = asr_model.transcribe(
        audio,
        language=language,
        # UI sliders may deliver numeric values that are not ints.
        beam_size=int(beam_size),
        best_of=int(best_of),
        fp16=False,
    )
    return result['text']


def text_summarization(text):
    """
    Summarize text with the loaded BART summarizer.

    :param text: text to summarize
    :return: the summarizer output, or '' when the text is empty/blank
    """
    if not text or not text.strip():
        return ''
    return summarizer(text)


def wordcloud_func(text: str, out_path='wordcloud_output.png') -> Optional[str]:
    """
    generate wordcloud based on text

    :param text: text to build the wordcloud from
    :param out_path: path of the image file to write
    :return: out_path, or None when the text contains no usable words
    """
    if not text or not text.strip():
        return None

    wc = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,
        height=600,
        width=600
    )
    try:
        wc.generate(text)
    except ValueError:
        # WordCloud raises ValueError when no word is left after filtering
        # (e.g. the text consists of stopwords only).
        return None

    wc.to_file(out_path)
    return out_path


demo = gr.Blocks(title="Speech Summarization")
demo.encrypt = False

# NOTE(review): the nesting of rows/columns below was reconstructed from the
# statement order; confirm against the intended layout.
with demo:
    # demo description
    gr.Markdown("""
    ## Speech Summarization with Whisper
    This space is intended to summarize a speech, a short one or long one, to save us sometime.
    1. Type in a youtube URL or upload an audio file
    2. Generate transcription with Whisper (Currently English Only)
    3. Summarize the transcribed speech
    4. A little wordcloud for you as well
    """)

    # data preparation
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="URL", placeholder="video url")

            url_btn = gr.Button("clear")
            url_btn.click(lambda x: '', inputs=url, outputs=url)

        speech = gr.Audio(label="Speech", type="filepath")

        # typing a url downloads its audio into the Speech component
        url.change(audio_from_url, inputs=url, outputs=speech)

    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
                           inputs=[url])

    # ASR
    text = gr.Textbox(label="Transcription", placeholder="transcription")

    with gr.Row():
        beam_size_slider = gr.Slider(1, 10, value=5, step=1, label="param: beam_size")
        best_of_slider = gr.Slider(1, 10, value=5, step=1, label="param: best_of")

    with gr.Row():
        asr_clr_btn = gr.Button("clear")
        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
        asr_btn = gr.Button("Recognize Speech")
        asr_btn.click(speech_to_text, inputs=[speech, beam_size_slider, best_of_slider], outputs=text)

    # summarization
    summary = gr.Textbox(label="Summarization")

    with gr.Row():
        sum_clr_btn = gr.Button("clear")
        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
        sum_btn = gr.Button("Summarize")
        sum_btn.click(text_summarization, inputs=text, outputs=summary)

    # wordcloud
    image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)

    # regenerate the wordcloud whenever the transcription changes
    text.change(wordcloud_func, inputs=text, outputs=image)

if __name__ == '__main__':
    demo.launch()