import os
import re
import torch
import whisper
import validators
import gradio as gr
from wordcloud import WordCloud, STOPWORDS
from scipy.io.wavfile import write
from espnet2.bin.tts_inference import Text2Speech
from utils import *

# load the Whisper model for ASR, BART for summarization, and an ESPnet model for TTS
default_model = 'base.en' if torch.cuda.is_available() else 'tiny.en'
asr_model = whisper.load_model(default_model)
summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan")


def load_model(name: str):
    """ Reload the Whisper ASR model
    :param name: model name; 'Tiny.en' or 'Base.en' only, to keep inference quick
    :return: the selected model name, echoed back to the dropdown
    """
    global asr_model
    asr_model = whisper.load_model(f"{name.lower()}")
    return name
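
# Illustrative usage sketch (not part of the app flow): the dropdown passes its value
# straight into load_model, which lowercases it before handing it to whisper.load_model.
# >>> load_model("Base.en")   # reloads the ASR model as whisper's 'base.en'
# 'Base.en'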


def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """ Download a video from a URL and save its audio track
    :param url: str, the video URL
    :param dst_dir: destination directory for the saved audio
    :param name: audio file name; currently unused, the file is always saved as 'audio.<format>'
    :param format: audio format, e.g. 'wav' or 'mp3'; WAV is preferred
    :return: path of the audio file, or None if the URL is invalid
    """
    if not validators.url(url):
        return None

    os.makedirs(dst_dir, exist_ok=True)

    # download the audio track only and convert it to the requested format
    path = os.path.join(dst_dir, f"audio.{format}")
    if os.path.exists(path):
        os.remove(path)
    os.system(f"yt-dlp -f 'ba' -x --audio-format {format} {url} -o {path} --quiet")

    return path
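
# Illustrative usage sketch (assumes yt-dlp and ffmpeg are available on PATH): download
# the audio track of one of the example videos into data/audio.wav.
# >>> wav_path = audio_from_url("https://www.youtube.com/watch?v=DuX4K4eeTz8")
# >>> wav_path
# 'data/audio.wav'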


def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
    """ ASR inference with Whisper
    :param audio: path to the audio file
    :param beam_size: beam width for beam-search decoding
    :param best_of: number of candidates considered when sampling
    :param language: transcription language; currently English only
    :return: the transcription text
    """
    result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of, fp16=False)

    return result['text']
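
# Illustrative usage sketch: transcribe the downloaded audio with the current Whisper
# model; larger beam_size / best_of values trade speed for decoding accuracy.
# >>> transcript = speech_to_text("data/audio.wav", beam_size=5, best_of=5)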


def text_summarization(text):
    return summarizer(text)


def wordcloud_func(text: str, out_path='data/wordcloud_output.png'):
    """ Generate a word cloud image from the text
    :param text: transcription
    :param out_path: path for the output image
    :return: path of the image, or None if the text is empty
    """
    if len(text) == 0:
        return None

    stopwords = STOPWORDS

    wc = WordCloud(
        background_color='white',
        stopwords=stopwords,
        height=600,
        width=600
    )

    wc.generate(text)
    wc.to_file(out_path)

    return out_path
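
# Illustrative usage sketch: the transcription textbox feeds this function through
# text.change below, producing a 600x600 PNG for the gr.Image component.
# >>> img_path = wordcloud_func("the quick brown fox jumps over the lazy dog")
# >>> img_path
# 'data/wordcloud_output.png'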


def normalize_dollars(text):
    """ Text normalization for '$' amounts, e.g. '$1.4 trillion' -> '1.4 trillion dollars'
    :param text: input text
    :return: text with dollar amounts spelled out
    """
    def expand_dollars(m):
        match = m.group(1)
        parts = match.split(' ')
        parts.append('dollars')
        return ' '.join(parts)

    units = ['hundred', 'thousand', 'million', 'billion', 'trillion']
    _dollars_re = re.compile(fr"\$([0-9\.\,]*[0-9]+ (?:{'|'.join(units)}))")

    return re.sub(_dollars_re, expand_dollars, text)
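
# Illustrative example of the normalization above: only amounts followed by a unit word
# (hundred ... trillion) are rewritten; plain figures such as '$5' pass through unchanged.
# >>> normalize_dollars("The bill costs $1.4 trillion over ten years.")
# 'The bill costs 1.4 trillion dollars over ten years.'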


def text_to_speech(text: str, out_path="data/short_speech.wav"):
    # the ESPnet TTS model reads '$1.4 trillion' as 'one point four dollar trillion',
    # so normalize dollar amounts before synthesis
    text = normalize_dollars(text)

    output = tts_model(text)
    write(out_path, 22050, output['wav'].numpy())

    return out_path
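
# Illustrative usage sketch: synthesize a short summary at 22.05 kHz, the sample rate
# used by the LJSpeech-based ESPnet model loaded above.
# >>> speech_path = text_to_speech("The speech covers three main budget items.")
# >>> speech_path
# 'data/short_speech.wav'

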
demo = gr.Blocks(css=demo_css, title="Speech Summarization")
demo.encrypt = False
with demo:
    # demo description
    gr.Markdown("""
    ## Speech Summarization with Whisper

    This space summarizes a speech, short or long, to save us some time
    (it runs faster with GPU inference). Check the example links provided below:
    [3 mins speech](https://www.youtube.com/watch?v=DuX4K4eeTz8),
    [13 mins speech](https://www.youtube.com/watch?v=nepOSEGHHCQ)

    1. Type in a YouTube URL or upload an audio file
    2. Generate the transcription with Whisper (English only)
    3. Summarize the transcribed speech
    4. Generate the summary speech with the ESPnet model
    """)
# data preparation
with gr.Row():
with gr.Column():
url = gr.Textbox(label="URL", placeholder="video url")
url_btn = gr.Button("clear")
url_btn.click(lambda x: '', inputs=url, outputs=url)
speech = gr.Audio(label="Speech", type="filepath")
url.change(audio_from_url, inputs=url, outputs=speech)

    # ASR
    text = gr.Textbox(label="Transcription", placeholder="transcription")
    with gr.Row():
        # capitalize the default so it matches the dropdown choices
        model_options = gr.Dropdown(['Tiny.en', 'Base.en'], value=default_model.capitalize(), label="models")
        model_options.change(load_model, inputs=model_options, outputs=model_options)

        beam_size_slider = gr.Slider(1, 10, value=5, step=1, label="param: beam_size")
        best_of_slider = gr.Slider(1, 10, value=5, step=1, label="param: best_of")

    with gr.Row():
        asr_clr_btn = gr.Button("clear")
        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
        asr_btn = gr.Button("Recognize Speech")
        asr_btn.click(speech_to_text, inputs=[speech, beam_size_slider, best_of_slider], outputs=text)

    # summarization
    summary = gr.Textbox(label="Summarization")
    with gr.Row():
        sum_clr_btn = gr.Button("clear")
        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
        sum_btn = gr.Button("Summarize")
        sum_btn.click(text_summarization, inputs=text, outputs=summary)

    with gr.Row():
        # wordcloud
        image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)

        with gr.Column():
            tts = gr.Audio(label="Short Speech", type="filepath")
            tts_btn = gr.Button("Read Summary")
            tts_btn.click(text_to_speech, inputs=summary, outputs=tts)

    text.change(wordcloud_func, inputs=text, outputs=image)

    examples = gr.Examples(
        examples=[
            "https://www.youtube.com/watch?v=DuX4K4eeTz8",
            "https://www.youtube.com/watch?v=nepOSEGHHCQ"
        ],
        inputs=url, outputs=text,
        fn=lambda x: speech_to_text(audio_from_url(x)),
        cache_examples=True
    )

    gr.HTML(footer_html)


if __name__ == '__main__':
    demo.launch()