Spaces:
Runtime error
Runtime error
File size: 6,316 Bytes
22bdedf 8973ffd cb253f3 22bdedf 8973ffd 22bdedf 8973ffd 22bdedf 7d23a88 8973ffd 7d23a88 22bdedf 18d2fb0 22bdedf 7d23a88 8973ffd 22bdedf 263d74d 22bdedf 8973ffd 22bdedf 8973ffd 22bdedf 8973ffd 22bdedf 7d23a88 db54719 e6e2169 7d23a88 22bdedf b529762 22bdedf b529762 cb253f3 22bdedf 09182d9 22bdedf cb253f3 7d23a88 cb253f3 22bdedf 8973ffd 22bdedf 7d23a88 22bdedf e978023 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import os
import re
import torch
import whisper
import validators
import gradio as gr
from wordcloud import WordCloud, STOPWORDS
from scipy.io.wavfile import write
from espnet2.bin.tts_inference import Text2Speech
# Load the three models once at import time so every Gradio callback can reuse them:
# Whisper for ASR, BART (loaded through the Hugging Face hub) for summarization,
# and an ESPnet FastSpeech2 + HiFi-GAN model for text-to-speech.
# `asr_model` is rebound later by load_model() when another Whisper size is selected.
asr_model = whisper.load_model('base.en')
summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan")
def load_model(name: str):
    """Replace the module-level Whisper model with the English-only variant of *name*.

    :param name: model size, tiny or base only (for quick inference); case-insensitive
    :return: ``name`` unchanged, so the caller can echo it back to the UI
    """
    global asr_model
    # English-only checkpoints carry the ".en" suffix, e.g. "tiny.en"
    checkpoint = name.lower() + ".en"
    asr_model = whisper.load_model(checkpoint)
    return name
def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """Download a video from *url* with yt-dlp and keep only its audio track.

    :param url: str, the video url
    :param dst_dir: destination directory for the audio file (created if missing)
    :param name: currently unused; the file is always saved as ``audio.<format>``
    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
    :return: path of audio, or None when the url is empty/invalid or the download
        did not produce a file
    """
    import subprocess  # local import keeps this function self-contained

    # An empty textbox (or None) is not worth handing to the validator.
    if not url or not validators.url(url):
        return None

    os.makedirs(dst_dir, exist_ok=True)

    path = os.path.join(dst_dir, f"audio.{format}")
    # Drop the file left by a previous request so a stale recording is never returned.
    if os.path.exists(path):
        os.remove(path)

    # Pass an argument list and bypass the shell: characters such as '&', ';' or
    # '$(...)' inside a URL must reach yt-dlp verbatim instead of being
    # interpreted (or executed) by the shell.
    command = [
        "yt-dlp", "-f", "ba", "-x",
        "--audio-format", format,
        url,
        "-o", path,
        "--quiet",
    ]
    try:
        subprocess.run(command, check=False)
    except OSError:
        # yt-dlp is missing or not executable
        return None

    # Only report a path that really exists; a failed download yields None.
    return path if os.path.exists(path) else None
def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
    """ASR inference with Whisper.

    :param audio: filepath of the recording; None or '' when nothing was provided
    :param beam_size: beam search parameter
    :param best_of: number of best results
    :param language: Currently English only
    :return: transcription, or '' when there is no audio to transcribe
    """
    # The audio component hands over None when no file is loaded; return an
    # empty transcription instead of failing inside Whisper.
    if not audio:
        return ''

    result = asr_model.transcribe(
        audio,
        language=language,
        beam_size=beam_size,
        best_of=best_of,
        fp16=False,
    )
    return result['text']
def text_summarization(text):
    """Summarize *text* with the module-level ``summarizer``.

    :param text: transcription to summarize
    :return: the summarizer's output for ``text``, or '' when text is empty/blank
    """
    # Nothing to summarize: skip the model call, as wordcloud_func does for
    # empty transcriptions.
    if not text or not text.strip():
        return ''
    return summarizer(text)
def wordcloud_func(text: str, out_path='data/wordcloud_output.png'):
    """Generate a word-cloud image from *text*.

    :param text: transcription
    :param out_path: filepath of the image to write
    :return: filepath, or None when there is nothing to plot
    """
    # None, '' and whitespace-only input cannot produce a cloud.
    if not text or not text.strip():
        return None

    wc = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,
        height=600,
        width=600,
    )
    try:
        wc.generate(text)
    except ValueError:
        # wordcloud raises ValueError when no word is left to plot
        # (e.g. the text consists of stop words only)
        return None

    # The output directory is otherwise only created by audio_from_url, so make
    # sure it exists when the audio was uploaded directly.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    wc.to_file(out_path)
    return out_path
def normalize_dollars(text):
    """Rewrite '$<amount> <unit>' as '<amount> <unit> dollars'.

    Only amounts directly followed by a magnitude word (hundred, thousand,
    million, billion, trillion) are rewritten, e.g. '$1.4 trillion' becomes
    '1.4 trillion dollars'. A bare '$5' is left untouched.

    :param text: text to normalize
    :return: text with the matched dollar amounts rewritten
    """
    magnitude_words = ('hundred', 'thousand', 'million', 'billion', 'trillion')
    alternation = '|'.join(magnitude_words)
    pattern = re.compile(fr"\$([0-9\.\,]*[0-9]+ (?:{alternation}))")

    def _append_dollars(hit):
        # group(1) is the amount plus its unit, without the leading '$'
        return f"{hit.group(1)} dollars"

    return pattern.sub(_append_dollars, text)
def text_to_speech(text: str, out_path="data/short_speech.wav"):
    """Synthesize *text* with the module-level TTS model and save it as a WAV file.

    :param text: text to read aloud
    :param out_path: filepath of the WAV file to write
    :return: filepath, or None when text is empty/blank
    """
    # Nothing to read: skip synthesis instead of feeding an empty string to the model.
    if not text or not text.strip():
        return None

    # espnet tts model process '$1.4 trillion' as 'one point four dollar trillion'
    # use this function to fix this issue
    text = normalize_dollars(text)
    output = tts_model(text)

    # The output directory is otherwise only created by audio_from_url, so make
    # sure it exists when the audio was uploaded directly.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): 22050 is presumably the sample rate of the LJSpeech model
    # loaded above - confirm it matches the model's own rate.
    write(out_path, 22050, output['wav'].numpy())
    return out_path
# Gradio UI: URL / upload -> transcription -> summary -> spoken summary + word cloud.
# NOTE(review): the nesting of the Row/Column containers is a best-effort layout;
# verify the rendered page looks as intended.
demo = gr.Blocks(title="Speech Summarization")
demo.encrypt = False

with demo:
    # demo description
    gr.Markdown("""
## Speech Summarization with Whisper
This space is intended to summarize a speech, a short one or long one, to save us sometime
(runs faster with GPU inference). Check the example links provided below:
[3 mins speech](https://www.youtube.com/watch?v=DuX4K4eeTz8),
[13 mins speech](https://www.youtube.com/watch?v=nepOSEGHHCQ)
1. Type in a youtube URL or upload an audio file
2. Generate transcription with Whisper (English Only)
3. Summarize the transcribed speech
4. Generate summary's speech with ESPNet model
model references:
- [Whisper](https://github.com/openai/whisper), [ESPNet](https://github.com/espnet/espnet_model_zoo)
""")

    # data preparation
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="URL", placeholder="video url")

            url_btn = gr.Button("clear")
            url_btn.click(lambda x: '', inputs=url, outputs=url)

        speech = gr.Audio(label="Speech", type="filepath")

        # every change of the URL box triggers a download into the audio component
        url.change(audio_from_url, inputs=url, outputs=speech)

    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
                           inputs=[url])

    # ASR
    text = gr.Textbox(label="Transcription", placeholder="transcription")

    with gr.Row():
        # Lighter decoding defaults on CPU. The model names must match the
        # dropdown choices exactly ('Tiny' / 'Base'); load_model() lowercases them.
        # NOTE(review): asr_model is loaded as 'base.en' at import time, so on CPU
        # the dropdown shows Tiny while Base stays loaded until the selection
        # changes - confirm whether the startup model should follow this default.
        if torch.cuda.is_available():
            default_values = dict(model='Base', bs=5, bo=5)
        else:
            default_values = dict(model='Tiny', bs=1, bo=1)

        model_options = gr.Dropdown(['Tiny', 'Base'], value=default_values['model'], label="models")
        model_options.change(load_model, inputs=model_options, outputs=model_options)

        beam_size_slider = gr.Slider(1, 10, value=default_values['bs'], step=1, label="param: beam_size")
        best_of_slider = gr.Slider(1, 10, value=default_values['bo'], step=1, label="param: best_of")

    with gr.Row():
        asr_clr_btn = gr.Button("clear")
        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
        asr_btn = gr.Button("Recognize Speech")
        asr_btn.click(speech_to_text, inputs=[speech, beam_size_slider, best_of_slider], outputs=text)

    # summarization
    summary = gr.Textbox(label="Summarization")

    with gr.Row():
        sum_clr_btn = gr.Button("clear")
        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
        sum_btn = gr.Button("Summarize")
        sum_btn.click(text_summarization, inputs=text, outputs=summary)

    with gr.Row():
        # wordcloud
        image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
        with gr.Column():
            tts = gr.Audio(label="Short Speech", type="filepath")
            tts_btn = gr.Button("Read Summary")
            tts_btn.click(text_to_speech, inputs=summary, outputs=tts)

    # redraw the word cloud whenever the transcription changes
    text.change(wordcloud_func, inputs=text, outputs=image)
if __name__ == '__main__':
    # start the Gradio server only when run as a script, not on import
    demo.launch()