Jiedong Yang committed on
Commit
8973ffd
•
1 Parent(s): 0e371c3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -8
app.py CHANGED
@@ -1,13 +1,21 @@
1
  import os
 
2
  import whisper
3
  import validators
4
  import gradio as gr
5
 
 
 
 
6
  from wordcloud import WordCloud, STOPWORDS
7
 
 
 
 
8
  # load whisper model for ASR and BART for summarization
9
  asr_model = whisper.load_model('base.en')
10
  summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
 
11
 
12
 
13
  def load_model(name: str):
@@ -16,6 +24,7 @@ def load_model(name: str):
16
  :param name: model options, tiny or base only, for quick inference
17
  :return:
18
  """
 
19
  asr_model = whisper.load_model(f"{name.lower()}.en")
20
  return name
21
 
@@ -50,8 +59,8 @@ def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
50
  :param audio: filepath
51
  :param beam_size: beam search parameter
52
  :param best_of: number of best results
53
- :param language:
54
- :return:
55
  """
56
 
57
  result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of, fp16=False)
@@ -63,12 +72,12 @@ def text_summarization(text):
63
  return summarizer(text)
64
 
65
 
66
- def wordcloud_func(text: str, out_path='wordcloud_output.png'):
67
  """ generate wordcloud based on text
68
 
69
- :param text:
70
- :param out_path:
71
- :return:
72
  """
73
 
74
  if len(text) == 0:
@@ -89,6 +98,37 @@ def wordcloud_func(text: str, out_path='wordcloud_output.png'):
89
  return out_path
90
 
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  demo = gr.Blocks(title="Speech Summarization")
93
 
94
  demo.encrypt = False
@@ -149,8 +189,13 @@ with demo:
149
  sum_btn = gr.Button("Summarize")
150
  sum_btn.click(text_summarization, inputs=text, outputs=summary)
151
 
152
- # wordcloud
153
- image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
 
 
 
 
 
154
 
155
  text.change(wordcloud_func, inputs=text, outputs=image)
156
 
 
1
  import os
2
+ import re
3
  import whisper
4
  import validators
5
  import gradio as gr
6
 
7
+ import nltk
8
+ nltk.download()
9
+
10
  from wordcloud import WordCloud, STOPWORDS
11
 
12
+ from scipy.io.wavfile import write
13
+ from espnet2.bin.tts_inference import Text2Speech
14
+
15
  # load whisper model for ASR and BART for summarization
16
  asr_model = whisper.load_model('base.en')
17
  summarizer = gr.Interface.load("facebook/bart-large-cnn", src='huggingface')
18
+ tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan")
19
 
20
 
21
  def load_model(name: str):
 
24
  :param name: model options, tiny or base only, for quick inference
25
  :return:
26
  """
27
+ global asr_model
28
  asr_model = whisper.load_model(f"{name.lower()}.en")
29
  return name
30
 
 
59
  :param audio: filepath
60
  :param beam_size: beam search parameter
61
  :param best_of: number of best results
62
+ :param language: Currently English only
63
+ :return: transcription
64
  """
65
 
66
  result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of, fp16=False)
 
72
  return summarizer(text)
73
 
74
 
75
+ def wordcloud_func(text: str, out_path='data/wordcloud_output.png'):
76
  """ generate wordcloud based on text
77
 
78
+ :param text: transcription
79
+ :param out_path: filepath
80
+ :return: filepath
81
  """
82
 
83
  if len(text) == 0:
 
98
  return out_path
99
 
100
 
101
def normalize_dollars(text):
    """Rewrite '$<number> <unit>' so the currency word follows the unit.

    For example ``'$1.4 trillion'`` becomes ``'1.4 trillion dollars'``.
    Only amounts followed by one of the scale words hundred, thousand,
    million, billion or trillion are rewritten; all other text, including
    plain amounts such as ``'$20'``, is returned unchanged.

    :param text: input text
    :return: text with the matching dollar amounts expanded
    """
    units = ['hundred', 'thousand', 'million', 'billion', 'trillion']
    # The trailing word boundary stops a unit from matching the prefix of a
    # longer word ('$5 millionaire' must not become '5 million dollarsaire').
    dollars_re = re.compile(fr"\$([0-9\.\,]*[0-9]+ (?:{'|'.join(units)}))\b")

    # Drop the '$' and append the spoken currency word after the unit.
    return dollars_re.sub(lambda m: f"{m.group(1)} dollars", text)
118
+
119
+
120
def text_to_speech(text: str, out_path="data/short_speech.wav"):
    """Synthesize speech for the given text and save it as a WAV file.

    :param text: text to read aloud
    :param out_path: filepath of the WAV file to write
    :return: filepath of the written file, or None when text is blank
    """
    # Nothing to read, e.g. the button was pressed before any text exists;
    # skip the model call instead of feeding it an empty string.
    if not text or not text.strip():
        return None

    # The ESPnet TTS model reads '$1.4 trillion' as
    # 'one point four dollar trillion'; normalize the text to avoid that.
    text = normalize_dollars(text)

    output = tts_model(text)

    # The default path points into a sub-directory; create it on first use
    # so writing the file does not fail.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Prefer the sampling rate reported by the model over a hard-coded one;
    # 22050 (the previous fixed value) remains the fallback.
    sample_rate = getattr(tts_model, "fs", 22050)
    write(out_path, sample_rate, output['wav'].numpy())

    return out_path
130
+
131
+
132
  demo = gr.Blocks(title="Speech Summarization")
133
 
134
  demo.encrypt = False
 
189
  sum_btn = gr.Button("Summarize")
190
  sum_btn.click(text_summarization, inputs=text, outputs=summary)
191
 
192
+ with gr.Row():
193
+ # wordcloud
194
+ image = gr.Image(label="wordcloud", show_label=False).style(height=400, width=400)
195
+ with gr.Column():
196
+ tts = gr.Audio(label="Short Speech", type="filepath")
197
+ tts_btn = gr.Button("Read Summary")
198
+ tts_btn.click(text_to_speech, inputs=summary, outputs=tts)
199
 
200
  text.change(wordcloud_func, inputs=text, outputs=image)
201