Spaces:

jiedong-yang
/

Speech-Summarization-with-Whisper

Runtime error

Speech-Summarization-with-Whisper / app.py

Jiedong Yang

Update app.py

263d74d almost 2 years ago

4.06 kB

	import os
	import pafy
	import time
	import whisper
	import validators
	import gradio as gr

	from wordcloud import WordCloud, STOPWORDS

	# Load both models once at import time; the functions below reuse these
	# module-level objects on every call.
	#   - asr_model: Whisper 'base.en' checkpoint, used for speech recognition (ASR)
	#   - summarizer: 'facebook/bart-large-cnn' loaded through Gradio from the
	#     Hugging Face hub, used for summarization
	asr_model = whisper.load_model('base.en')
	summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')


	def audio_from_url(url, dst_dir='data', name=None, format='wav'):
	    """ Download the audio track of an online video with yt-dlp

	    :param url: str, the video url
	    :param dst_dir: destination directory for the saved audio (created if missing)
	    :param name: base name of the audio file without extension, 'audio' if none
	    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
	    :return: path of audio, or None if the url is not valid or the download failed
	    """
	    # function-scope import so this block brings its own dependency
	    import subprocess

	    if not url or not validators.url(url):
	        return None

	    os.makedirs(dst_dir, exist_ok=True)

	    path = os.path.join(dst_dir, f"{name or 'audio'}.{format}")

	    # drop the file left by a previous download so a failed run can never
	    # hand back stale audio
	    if os.path.exists(path):
	        os.remove(path)

	    # An argument list (no shell) keeps characters such as '&' or ';' in the
	    # url from being interpreted as shell syntax.
	    command = [
	        "yt-dlp",
	        "-f", "ba",
	        "-x",
	        "--audio-format", format,
	        url,
	        "-o", path,
	        "--quiet",
	    ]
	    try:
	        completed = subprocess.run(command, check=False)
	    except OSError:
	        # yt-dlp executable is missing or cannot be started
	        return None

	    if completed.returncode != 0 or not os.path.exists(path):
	        return None

	    return path


	def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
	    """ ASR inference with Whisper

	    :param audio: audio to transcribe, forwarded to the model's transcribe();
	        the UI in this file passes a file path
	    :param beam_size: beam size forwarded to transcribe()
	    :param best_of: number of candidates forwarded to transcribe()
	    :param language: language code forwarded to transcribe()
	    :return: str, the transcribed text ('' when no audio was provided)
	    """

	    # Nothing recorded/uploaded yet: return an empty transcription instead of
	    # letting the model fail on a missing input.
	    if audio is None or (isinstance(audio, str) and not audio):
	        return ''

	    result = asr_model.transcribe(
	        audio,
	        language=language,
	        # NOTE(review): UI sliders may deliver these as floats; the decoder
	        # presumably expects integers, so normalise them here.
	        beam_size=int(beam_size),
	        best_of=int(best_of),
	        fp16=False,
	    )

	    return result['text']


	def text_summarization(text):
	    """ Summarize text with the module-level summarizer

	    :param text: str, the text to summarize
	    :return: the summarizer's output, or '' when there is no text to summarize
	    """

	    # skip the model call when the transcription is empty or only whitespace
	    if not text or not text.strip():
	        return ''

	    return summarizer(text)


	def wordcloud_func(text: str, out_path='wordcloud_output.png'):
	    """ generate wordcloud based on text

	    :param text: str, the text the wordcloud is built from
	    :param out_path: path of the image file to write
	    :return: out_path, or None when the text yields no words to draw
	    """

	    # None, empty and whitespace-only text all mean "nothing to draw"
	    if not text or not text.strip():
	        return None

	    wc = WordCloud(
	        background_color='white',
	        stopwords=STOPWORDS,
	        height=600,
	        width=600,
	    )

	    try:
	        wc.generate(text)
	    except ValueError:
	        # WordCloud raises ValueError when no word is left to plot
	        # (e.g. the text consists of stopwords only)
	        return None

	    wc.to_file(out_path)

	    return out_path


	# Gradio Blocks UI: url / audio input -> transcription -> summary -> wordcloud.
	# NOTE(review): the nesting of the Row/Column containers below was
	# reconstructed from a copy of this file that had lost its indentation;
	# confirm the layout against the running app.
	demo = gr.Blocks(title="Speech Summarization")

	demo.encrypt = False

	with demo:
	    # introduction shown at the top of the page
	    gr.Markdown("""
	## Speech Summarization with Whisper
	This space is intended to summarize a speech, a short one or long one, to save us sometime.
	1. Type in a youtube URL or upload an audio file
	2. Generate transcription with Whisper (Currently English Only)
	3. Summarize the transcribed speech
	4. A little wordcloud for you as well
	""")

	    # --- input: a video url or an audio file ---
	    with gr.Row():
	        with gr.Column():
	            url = gr.Textbox(label="URL", placeholder="video url")

	            url_btn = gr.Button("clear")
	            url_btn.click(lambda x: '', inputs=url, outputs=url)

	        speech = gr.Audio(label="Speech", type="filepath")

	        # every change of the url box fetches the audio and fills the player
	        url.change(audio_from_url, inputs=url, outputs=speech)

	    examples = gr.Examples(
	        examples=[
	            "https://www.youtube.com/watch?v=DuX4K4eeTz8",
	            "https://www.youtube.com/watch?v=nepOSEGHHCQ",
	        ],
	        inputs=[url],
	    )

	    # --- speech recognition ---
	    text = gr.Textbox(label="Transcription", placeholder="transcription")

	    with gr.Row():
	        beam_size_slider = gr.Slider(1, 10, value=5, step=1, label="param: beam_size")
	        best_of_slider = gr.Slider(1, 10, value=5, step=1, label="param: best_of")

	    with gr.Row():
	        asr_clr_btn = gr.Button("clear")
	        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
	        asr_btn = gr.Button("Recognize Speech")
	        asr_btn.click(
	            speech_to_text,
	            inputs=[speech, beam_size_slider, best_of_slider],
	            outputs=text,
	        )

	    # --- summary of the transcription ---
	    summary = gr.Textbox(label="Summarization")

	    with gr.Row():
	        sum_clr_btn = gr.Button("clear")
	        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
	        sum_btn = gr.Button("Summarize")
	        sum_btn.click(text_summarization, inputs=text, outputs=summary)

	    # --- wordcloud, redrawn whenever the transcription changes ---
	    image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)

	    text.change(wordcloud_func, inputs=text, outputs=image)


	# start the app only when this file is run as a script
	if __name__ == "__main__":
	    demo.launch()