jiedong-yang committed on
Commit ba842e8 • 2 Parent(s): c6d324f 0a4a5ed

Merge branch 'main' of https://huggingface.co/spaces/jiedong-yang/Speech-Summarization-with-Whisper into main

Files changed (2)
  1. app.py +16 -13
  2. utils.py +21 -0
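
The subtlest change in this merge is to `load_model` in app.py: the hardcoded `.en` suffix moves out of the f-string and into the dropdown values, so the lowercased dropdown string must itself be a complete Whisper checkpoint name. A quick check of that contract, as a sketch assuming the standard openai-whisper package (`whisper.available_models()` lists the valid names):

    import whisper

    # The dropdown values in the diff below ('Tiny.en', 'Base.en') now carry
    # the language suffix themselves, so lowercasing alone yields a valid
    # checkpoint name for whisper.load_model().
    for name in ('Tiny.en', 'Base.en'):
        assert name.lower() in whisper.available_models()  # 'tiny.en', 'base.en'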
app.py CHANGED
@@ -10,6 +10,8 @@ from wordcloud import WordCloud, STOPWORDS
 from scipy.io.wavfile import write
 from espnet2.bin.tts_inference import Text2Speech
 
+from utils import *
+
 # load whisper model for ASR and BART for summarization
 asr_model = whisper.load_model('base.en')
 summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
@@ -23,7 +25,7 @@ def load_model(name: str):
     :return:
     """
     global asr_model
-    asr_model = whisper.load_model(f"{name.lower()}.en")
+    asr_model = whisper.load_model(f"{name.lower()}")
     return name
 
 
@@ -127,7 +129,7 @@ def text_to_speech(text: str, out_path="data/short_speech.wav"):
     return out_path
 
 
-demo = gr.Blocks(title="Speech Summarization")
+demo = gr.Blocks(css=demo_css, title="Speech Summarization")
 
 demo.encrypt = False
 
@@ -143,10 +145,7 @@ with demo:
     1. Type in a youtube URL or upload an audio file
     2. Generate transcription with Whisper (English Only)
     3. Summarize the transcribed speech
-    4. Generate summary's speech with ESPNet model
-
-    model references:
-    - [Whisper](https://github.com/openai/whisper), [ESPNet](https://github.com/espnet/espnet_model_zoo)
+    4. Generate summary speech with the ESPNet model
     """)
 
     # data preparation
@@ -161,17 +160,13 @@
 
     url.change(audio_from_url, inputs=url, outputs=speech)
 
-    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
-                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
-                           inputs=[url])
-
     # ASR
     text = gr.Textbox(label="Transcription", placeholder="transcription")
 
     with gr.Row():
-        default_values = dict(model='Base', bs=5, bo=5) if torch.cuda.is_available() \
-            else dict(model='Tiny', bs=1, bo=1)
-        model_options = gr.Dropdown(['Tiny', 'Base'], value=default_values['model'], label="models")
+        default_values = dict(model='Base.en', bs=5, bo=5) if torch.cuda.is_available() \
+            else dict(model='Tiny.en', bs=1, bo=1)
+        model_options = gr.Dropdown(['Tiny.en', 'Base.en'], value=default_values['model'], label="models")
         model_options.change(load_model, inputs=model_options, outputs=model_options)
 
         beam_size_slider = gr.Slider(1, 10, value=default_values['bs'], step=1, label="param: beam_size")
@@ -202,6 +197,14 @@
 
     text.change(wordcloud_func, inputs=text, outputs=image)
 
+    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
+                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
+                           inputs=url, outputs=text,
+                           fn=lambda x: speech_to_text(audio_from_url(x)),
+                           cache_examples=True)
+
+    gr.HTML(footer_html)
+
 
 if __name__ == '__main__':
     demo.launch()
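
The relocated `gr.Examples` block does more than move to the bottom of the layout: because it now passes `fn` and `outputs` with `cache_examples=True`, Gradio runs the full URL-to-transcription pipeline once per example at build time and then serves the stored result when a visitor clicks an example, instead of re-downloading and re-transcribing. A minimal sketch of that pattern, assuming Gradio 3.x and a stand-in `transcribe` function in place of the app's `speech_to_text(audio_from_url(...))` chain:

    import gradio as gr

    def transcribe(url: str) -> str:
        # Stand-in for speech_to_text(audio_from_url(url)) in app.py.
        return f"transcript of {url}"

    with gr.Blocks() as demo:
        url = gr.Textbox(label="YouTube URL")
        text = gr.Textbox(label="Transcription")
        url.change(transcribe, inputs=url, outputs=text)

        # With fn/outputs given and cache_examples=True, each example is
        # evaluated once at startup; clicking it replays the cached output.
        gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8"],
                    inputs=url, outputs=text,
                    fn=transcribe, cache_examples=True)

    demo.launch()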
utils.py ADDED
@@ -0,0 +1,21 @@
+demo_css = """
+.footer {
+    margin-bottom: 40px;
+    margin-top: 45px;
+    text-align: center;
+    border-bottom: 1px solid #e5e5e5;
+}
+"""
+
+footer_html = """
+<div class="footer">
+    <p>Whisper by <a href="https://github.com/openai/whisper"
+        style="text-decoration: underline;"
+        target="_blank">OpenAI</a> - BART by <a href="https://huggingface.co/facebook/bart-large-cnn"
+        style="text-decoration: underline;"
+        target="_blank">Facebook</a> - Conformer by <a href="https://github.com/espnet/espnet_model_zoo"
+        style="text-decoration: underline;"
+        target="_blank">ESPNet</a>
+    </p>
+</div>
+"""