Spaces:
Runtime error
Runtime error
Jiedong Yang
committed on
Commit
•
8973ffd
1
Parent(s):
0e371c3
Upload app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,21 @@
|
|
1 |
import os
|
|
|
2 |
import whisper
|
3 |
import validators
|
4 |
import gradio as gr
|
5 |
|
|
|
|
|
|
|
6 |
from wordcloud import WordCloud, STOPWORDS
|
7 |
|
|
|
|
|
|
|
8 |
# load whisper model for ASR and BART for summarization
|
9 |
asr_model = whisper.load_model('base.en')
|
10 |
summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
|
|
|
11 |
|
12 |
|
13 |
def load_model(name: str):
|
@@ -16,6 +24,7 @@ def load_model(name: str):
|
|
16 |
:param name: model options, tiny or base only, for quick inference
|
17 |
:return:
|
18 |
"""
|
|
|
19 |
asr_model = whisper.load_model(f"{name.lower()}.en")
|
20 |
return name
|
21 |
|
@@ -50,8 +59,8 @@ def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
|
|
50 |
:param audio: filepath
|
51 |
:param beam_size: beam search parameter
|
52 |
:param best_of: number of best results
|
53 |
-
:param language:
|
54 |
-
:return:
|
55 |
"""
|
56 |
|
57 |
result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of, fp16=False)
|
@@ -63,12 +72,12 @@ def text_summarization(text):
|
|
63 |
return summarizer(text)
|
64 |
|
65 |
|
66 |
-
def wordcloud_func(text: str, out_path='wordcloud_output.png'):
|
67 |
""" generate wordcloud based on text
|
68 |
|
69 |
-
:param text:
|
70 |
-
:param out_path:
|
71 |
-
:return:
|
72 |
"""
|
73 |
|
74 |
if len(text) == 0:
|
@@ -89,6 +98,37 @@ def wordcloud_func(text: str, out_path='wordcloud_output.png'):
|
|
89 |
return out_path
|
90 |
|
91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
demo = gr.Blocks(title="Speech Summarization")
|
93 |
|
94 |
demo.encrypt = False
|
@@ -149,8 +189,13 @@ with demo:
|
|
149 |
sum_btn = gr.Button("Summarize")
|
150 |
sum_btn.click(text_summarization, inputs=text, outputs=summary)
|
151 |
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
text.change(wordcloud_func, inputs=text, outputs=image)
|
156 |
|
|
|
1 |
import os
|
2 |
+
import re
|
3 |
import whisper
|
4 |
import validators
|
5 |
import gradio as gr
|
6 |
|
7 |
+
import nltk
|
8 |
+
nltk.download()
|
9 |
+
|
10 |
from wordcloud import WordCloud, STOPWORDS
|
11 |
|
12 |
+
from scipy.io.wavfile import write
|
13 |
+
from espnet2.bin.tts_inference import Text2Speech
|
14 |
+
|
15 |
# load whisper model for ASR and BART for summarization
|
16 |
asr_model = whisper.load_model('base.en')
|
17 |
summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
|
18 |
+
tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan")
|
19 |
|
20 |
|
21 |
def load_model(name: str):
|
|
|
24 |
:param name: model options, tiny or base only, for quick inference
|
25 |
:return:
|
26 |
"""
|
27 |
+
global asr_model
|
28 |
asr_model = whisper.load_model(f"{name.lower()}.en")
|
29 |
return name
|
30 |
|
|
|
59 |
:param audio: filepath
|
60 |
:param beam_size: beam search parameter
|
61 |
:param best_of: number of best results
|
62 |
+
:param language: Currently English only
|
63 |
+
:return: transcription
|
64 |
"""
|
65 |
|
66 |
result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of, fp16=False)
|
|
|
72 |
return summarizer(text)
|
73 |
|
74 |
|
75 |
+
def wordcloud_func(text: str, out_path='data/wordcloud_output.png'):
|
76 |
""" generate wordcloud based on text
|
77 |
|
78 |
+
:param text: transcription
|
79 |
+
:param out_path: filepath
|
80 |
+
:return: filepath
|
81 |
"""
|
82 |
|
83 |
if len(text) == 0:
|
|
|
98 |
return out_path
|
99 |
|
100 |
|
101 |
+
def normalize_dollars(text):
    """ text normalization for '$'

    Rewrites amounts of the form '$<number> <unit>' so that the word
    'dollars' follows the unit, e.g. '$1.4 trillion' becomes
    '1.4 trillion dollars'. Only amounts followed by a single space and one
    of the units hundred/thousand/million/billion/trillion are rewritten;
    any other text is returned unchanged.

    :param text: input text
    :return: text with the matching dollar amounts expanded
    """

    units = ['hundred', 'thousand', 'million', 'billion', 'trillion']
    # The trailing \b stops a unit that is only the prefix of a longer word
    # (e.g. '$5 millionaire') from being rewritten into 'dollarsaire'.
    dollars_re = re.compile(fr"\$([0-9\.\,]*[0-9]+ (?:{'|'.join(units)}))\b")

    # Drop the '$' and append ' dollars' after the captured '<number> <unit>'.
    return dollars_re.sub(r"\1 dollars", text)
|
118 |
+
|
119 |
+
|
120 |
+
def text_to_speech(text: str, out_path="data/short_speech.wav"):
    """ synthesize speech for text and save it as a wav file

    :param text: text to read aloud
    :param out_path: filepath of the wav file to write
    :return: filepath
    """

    # espnet tts model process '$1.4 trillion' as 'one point four dollar trillion'
    # use normalize_dollars to fix this issue
    text = normalize_dollars(text)

    output = tts_model(text)

    # The wav writer opens out_path directly and fails if the parent
    # directory is missing, so create it first.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): 22050 is presumably the sample rate of the loaded
    # tts model -- confirm before swapping in a different model.
    write(out_path, 22050, output['wav'].numpy())

    return out_path
|
130 |
+
|
131 |
+
|
132 |
demo = gr.Blocks(title="Speech Summarization")
|
133 |
|
134 |
demo.encrypt = False
|
|
|
189 |
sum_btn = gr.Button("Summarize")
|
190 |
sum_btn.click(text_summarization, inputs=text, outputs=summary)
|
191 |
|
192 |
+
with gr.Row():
|
193 |
+
# wordcloud
|
194 |
+
image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
|
195 |
+
with gr.Column():
|
196 |
+
tts = gr.Audio(label="Short Speech", type="filepath")
|
197 |
+
tts_btn = gr.Button("Read Summary")
|
198 |
+
tts_btn.click(text_to_speech, inputs=summary, outputs=tts)
|
199 |
|
200 |
text.change(wordcloud_func, inputs=text, outputs=image)
|
201 |
|