Jiedong Yang committed on
Commit
22bdedf
•
1 Parent(s): 4591b06

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +143 -0
  2. requirements.txt +16 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pafy
3
+ import time
4
+ import whisper
5
+ import validators
6
+ import gradio as gr
7
+
8
+ from wordcloud import WordCloud, STOPWORDS
9
+
10
# Both models are created once, at import time, as module-level objects:
# Whisper ('base.en') for speech recognition and BART (loaded through
# Gradio with src='huggingface') for summarization.
asr_model = whisper.load_model('base.en')
summarizer = gr.Interface.load(
    "facebook/bart-large-cnn",
    src='huggingface',
)
13
+
14
+
15
def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """Download the audio track of an online video with yt-dlp.

    :param url: str, the video url
    :param dst_dir: destination directory for the audio file (created if missing)
    :param name: audio file's name without extension; 'audio' is used when None
    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
    :return: path of the audio file, or None when the url is empty or invalid
        or when the download did not produce a file
    :raises OSError: if the ``yt-dlp`` executable cannot be started
    """
    # Function-scope import: subprocess is standard library and only needed here.
    import subprocess

    # The URL textbox change handler also runs when the box is cleared, so
    # reject empty input before handing it to the validator.
    if not url or not validators.url(url):
        return None

    os.makedirs(dst_dir, exist_ok=True)
    path = os.path.join(dst_dir, f"{name or 'audio'}.{format}")

    # Drop a stale file left by a previous download. On the first run there
    # is nothing to delete, and that must not be an error.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

    # Pass an argument list rather than a shell string: URLs routinely
    # contain '&', ';' and other shell metacharacters.
    command = [
        "yt-dlp",
        "-f", "ba",
        "-x",
        "--audio-format", format,
        "-o", path,
        "--quiet",
        url,
    ]
    completed = subprocess.run(command, check=False)

    # Report a failed download as "no audio" instead of returning a path
    # that does not exist.
    if completed.returncode != 0 or not os.path.isfile(path):
        return None

    return path
37
+
38
+
39
def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
    """ASR inference with the module-level Whisper model.

    :param audio: audio input forwarded to ``asr_model.transcribe``; None
        means no audio is available
    :param beam_size: ``beam_size`` option forwarded to ``transcribe``
    :param best_of: ``best_of`` option forwarded to ``transcribe``
    :param language: ``language`` option forwarded to ``transcribe``
    :return: the ``'text'`` entry of the transcription result, or an empty
        string when ``audio`` is None
    """
    # No audio yet: return an empty transcription instead of passing None
    # into the model.
    if audio is None:
        return ''

    result = asr_model.transcribe(
        audio,
        language=language,
        beam_size=beam_size,
        best_of=best_of,
    )

    return result['text']
52
+
53
+
54
def text_summarization(text):
    """Summarize text with the module-level ``summarizer``.

    :param text: text to summarize
    :return: the value returned by ``summarizer(text)``, or an empty string
        when ``text`` is None, empty or only whitespace
    """
    # Nothing to summarize: skip the model call entirely.
    if not text or not text.strip():
        return ''

    return summarizer(text)
56
+
57
+
58
def wordcloud_func(text: str, out_path='wordcloud_output.png'):
    """Generate a word cloud image from text.

    :param text: text to build the word cloud from
    :param out_path: file path the image is written to
    :return: ``out_path``, or None when ``text`` is None, blank, or leaves no
        words to plot
    """
    # Covers None, '' and whitespace-only input.
    if not text or not text.strip():
        return None

    wc = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,
        height=600,
        width=600,
    )

    try:
        wc.generate(text)
    except ValueError:
        # WordCloud raises ValueError when no words are left to plot, e.g.
        # when the text consists only of stop words.
        return None

    wc.to_file(out_path)

    return out_path
82
+
83
+
84
demo = gr.Blocks(title="Speech Summarization")

demo.encrypt = False

# NOTE(review): the Row/Column nesting below was reconstructed from a
# flattened listing that had lost its indentation - confirm the layout.
with demo:
    # Introductory text shown at the top of the page.
    gr.Markdown("""
    ## Speech Summarization with Whisper
    This space is intended to summarize a speech, a short one or long one, to save us sometime.
    1. Type in a youtube URL or upload an audio file
    2. Generate transcription with Whisper (Currently English Only)
    3. Summarize the transcribed speech
    4. A little wordcloud for you as well
    """)

    # Input: either a video URL (audio is fetched from it) or an audio file.
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="URL", placeholder="video url")

            url_btn = gr.Button("clear")
            url_btn.click(lambda x: '', inputs=url, outputs=url)

        speech = gr.Audio(label="Speech", type="filepath")

        # Every change of the URL box triggers a download into the audio widget.
        url.change(audio_from_url, inputs=url, outputs=speech)

    # Speech recognition: transcription box, decoding parameters, buttons.
    text = gr.Textbox(label="Transcription", placeholder="transcription")

    with gr.Row():
        beam_size_slider = gr.Slider(
            1, 10, value=5, step=1, label="param: beam_size"
        )
        best_of_slider = gr.Slider(
            1, 10, value=5, step=1, label="param: best_of"
        )

    with gr.Row():
        asr_clr_btn = gr.Button("clear")
        asr_clr_btn.click(lambda x: '', inputs=text, outputs=text)
        asr_btn = gr.Button("Recognize Speech")
        asr_btn.click(
            speech_to_text,
            inputs=[speech, beam_size_slider, best_of_slider],
            outputs=text,
        )

    # Summarization of the transcription.
    summary = gr.Textbox(label="Summarization")

    with gr.Row():
        sum_clr_btn = gr.Button("clear")
        sum_clr_btn.click(lambda x: '', inputs=summary, outputs=summary)
        sum_btn = gr.Button("Summarize")
        sum_btn.click(text_summarization, inputs=text, outputs=summary)

    # Word cloud image, regenerated whenever the transcription changes.
    image = gr.Image(label="wordcloud", show_label=False).style(
        height=400, width=400
    )

    text.change(wordcloud_func, inputs=text, outputs=image)

    # Example URLs offered for the URL textbox.
    examples = gr.Examples(
        examples=[
            "https://www.youtube.com/watch?v=DuX4K4eeTz8",
            "https://www.youtube.com/watch?v=nepOSEGHHCQ",
        ],
        inputs=[url],
    )

if __name__ == '__main__':
    demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.85.1
2
+ ffmpeg-python==0.2.0
3
+ gradio==3.6
4
+ huggingface-hub==0.9.1
5
+ matplotlib==3.6.1
6
+ pafy==0.5.5
7
+ pandas==1.5.0
8
+ tokenizers==0.12.1
9
+ torch==1.12.1
10
+ tqdm==4.64.1
11
+ transformers==4.22.1
12
+ validators==0.20.0
13
+ whisper @ git+https://github.com/openai/whisper.git@8cf36f3508c9acd341a45eb2364239a3d81458b9
14
+ wordcloud==1.8.2.2
15
+ youtube-dl==2021.12.17
16
+ yt-dlp==2022.10.4