John6666 committed on
Commit
fb314e8
•
1 Parent(s): cdc2410

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +187 -187
app.py CHANGED
@@ -1,187 +1,187 @@
1
- from nemo.collections.asr.models import EncDecCTCModelBPE
2
- from omegaconf import open_dict
3
- #import yt_dlp as youtube_dl
4
- import os
5
- import tempfile
6
- import torch
7
- import gradio as gr
8
- from pydub import AudioSegment
9
- import time
10
-
11
- device = "cuda" if torch.cuda.is_available() else "cpu"
12
- MODEL_NAME="ayymen/stt_zgh_fastconformer_ctc_small"
13
- YT_LENGTH_LIMIT_S=3600
14
-
15
- model = EncDecCTCModelBPE.from_pretrained(model_name=MODEL_NAME).to(device)
16
-
17
- with open_dict(model.cfg):
18
- model.cfg.decoding.strategy = "beam"
19
- model.cfg.decoding.beam.beam_size = 256 # Desired Beam Size
20
- model.cfg.decoding.beam.beam_alpha = 1.5 # Desired Beam Alpha
21
- model.cfg.decoding.beam.beam_beta = 1.5 # Desired Beam Beta
22
- model.cfg.decoding.beam.kenlm_path = "kenlm.bin" # Path to KenLM binary file
23
-
24
- model.change_decoding_strategy(model.cfg.decoding)
25
-
26
- model.eval()
27
-
28
- def get_transcripts(audio_path):
29
- audio = AudioSegment.from_file(audio_path)
30
- # check if audio is mono 16kHz
31
- if audio.channels != 1 or audio.frame_rate != 16000:
32
- audio = audio.set_channels(1).set_frame_rate(16000) # convert to mono 16kHz
33
- with tempfile.TemporaryDirectory() as tmpdirname:
34
- audio_path = os.path.join(tmpdirname, "audio.wav")
35
- audio.export(audio_path, format="wav")
36
- text = model.transcribe([audio_path])[0]
37
- else:
38
- text = model.transcribe([audio_path])[0]
39
- return text
40
-
41
- '''
42
- article = (
43
- "<p style='text-align: center'>"
44
- "<a href='https://huggingface.co/nvidia/parakeet-rnnt-1.1b' target='_blank'>🎙️ Learn more about Parakeet model</a> | "
45
- "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>📚 FastConformer paper</a> | "
46
- "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
47
- "</p>"
48
- )
49
- '''
50
-
51
- EXAMPLES = [
52
- ["135.wav"],
53
- ["common_voice_zgh_37837257.mp3"],
54
- ]
55
-
56
- """
57
- YT_EXAMPLES = [
58
- ["https://www.youtube.com/shorts/CSgTSE50MHY"],
59
- ["https://www.youtube.com/shorts/OxQtqOyAFLE"]
60
- ]
61
- """
62
-
63
- def _return_yt_html_embed(yt_url):
64
- video_id = yt_url.split("?v=")[-1]
65
- if "youtube.com/shorts/" in video_id:
66
- video_id = video_id.split("/")[-1]
67
- HTML_str = (
68
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
69
- " </center>"
70
- )
71
- return HTML_str
72
-
73
- def download_yt_audio(yt_url, filename):
74
- info_loader = youtube_dl.YoutubeDL()
75
-
76
- try:
77
- info = info_loader.extract_info(yt_url, download=False)
78
- except youtube_dl.utils.DownloadError as err:
79
- raise gr.Error(str(err))
80
-
81
- file_length = info["duration_string"]
82
- file_h_m_s = file_length.split(":")
83
- file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
84
-
85
- if len(file_h_m_s) == 1:
86
- file_h_m_s.insert(0, 0)
87
- if len(file_h_m_s) == 2:
88
- file_h_m_s.insert(0, 0)
89
- file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
90
-
91
- if file_length_s > YT_LENGTH_LIMIT_S:
92
- yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
93
- file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
94
- raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
95
-
96
- ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
97
-
98
- with youtube_dl.YoutubeDL(ydl_opts) as ydl:
99
- try:
100
- ydl.download([yt_url])
101
- except youtube_dl.utils.ExtractorError as err:
102
- raise gr.Error(str(err))
103
-
104
-
105
- def yt_transcribe(yt_url, max_filesize=75.0):
106
- html_embed_str = _return_yt_html_embed(yt_url)
107
-
108
- with tempfile.TemporaryDirectory() as tmpdirname:
109
- filepath = os.path.join(tmpdirname, "video.mp4")
110
- download_yt_audio(yt_url, filepath)
111
- audio = AudioSegment.from_file(filepath)
112
- audio = audio.set_channels(1).set_frame_rate(16000) # convert to mono 16kHz
113
- wav_filepath = os.path.join(tmpdirname, "audio.wav")
114
- audio.export(wav_filepath, format="wav")
115
- text = get_transcripts(wav_filepath)
116
-
117
- return html_embed_str, text
118
-
119
-
120
- demo = gr.Blocks()
121
-
122
- mf_transcribe = gr.Interface(
123
- fn=get_transcripts,
124
- inputs=[
125
- gr.Audio(sources="microphone", type="filepath")
126
- ],
127
- outputs="text",
128
- title="Transcribe Audio",
129
- description=(
130
- "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
131
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
132
- " of arbitrary length."
133
- ),
134
- allow_flagging="never",
135
- cache_examples=False,
136
- )
137
-
138
- file_transcribe = gr.Interface(
139
- fn=get_transcripts,
140
- inputs=[
141
- gr.Audio(sources="upload", type="filepath", label="Audio file"),
142
- ],
143
- outputs="text",
144
- examples=EXAMPLES,
145
- title="Transcribe Audio",
146
- description=(
147
- "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
148
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
149
- " of arbitrary length."
150
- ),
151
- allow_flagging="never",
152
- cache_examples=False,
153
- )
154
-
155
- """
156
- youtube_transcribe = gr.Interface(
157
- fn=yt_transcribe,
158
- inputs=[
159
- gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
160
- ],
161
- outputs=["html", "text"],
162
- examples=YT_EXAMPLES,
163
- title="Transcribe Audio",
164
- description=(
165
- "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
166
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
167
- " of arbitrary length."
168
- ),
169
- allow_flagging="never",
170
- )
171
- """
172
-
173
- with demo:
174
- gr.TabbedInterface(
175
- [
176
- mf_transcribe,
177
- file_transcribe,
178
- #youtube_transcribe
179
- ],
180
- [
181
- "Microphone",
182
- "Audio file",
183
- #"Youtube Video"
184
- ]
185
- )
186
-
187
- demo.launch()
 
1
+ from nemo.collections.asr.models import EncDecCTCModelBPE
2
+ from omegaconf import open_dict
3
+ #import yt_dlp as youtube_dl
4
+ import os
5
+ import tempfile
6
+ import torch
7
+ import gradio as gr
8
+ from pydub import AudioSegment
9
+ import time
10
+
11
# Run on the GPU whenever CUDA reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face checkpoint id loaded below and linked from the UI descriptions.
MODEL_NAME = "ayymen/stt_zgh_fastconformer_ctc_small"
# Upper bound, in seconds, compared against a video's length in download_yt_audio.
YT_LENGTH_LIMIT_S = 3600

model = EncDecCTCModelBPE.from_pretrained(model_name=MODEL_NAME)
model = model.to(device)

# Switch the decoder to beam search with a KenLM language model.
# The edits happen inside open_dict so that keys can be set on the config.
with open_dict(model.cfg):
    decoding_cfg = model.cfg.decoding
    decoding_cfg.strategy = "beam"
    decoding_cfg.beam.beam_size = 256  # Desired Beam Size
    decoding_cfg.beam.beam_alpha = 1.5  # Desired Beam Alpha
    decoding_cfg.beam.beam_beta = 1.5  # Desired Beam Beta
    decoding_cfg.beam.kenlm_path = "kenlm.bin"  # Path to KenLM binary file

# Hand the edited decoding section back to the model so it takes effect.
model.change_decoding_strategy(model.cfg.decoding)

model.eval()
27
+
28
def get_transcripts(audio_path):
    """Transcribe a single audio file with the module-level ``model``.

    Audio that is not already mono at 16 kHz is converted and written to a
    temporary WAV file, and that file is transcribed instead. Audio that
    already conforms is transcribed from ``audio_path`` directly.

    Args:
        audio_path: Path of the audio file to transcribe. An empty value
            (``None`` or ``""``) is rejected; Gradio audio inputs may supply
            ``None`` when nothing was recorded or uploaded.

    Returns:
        The first element of the list returned by ``model.transcribe``.

    Raises:
        gr.Error: If no audio path was supplied.
    """
    # Fail with a readable UI message instead of an opaque decoder traceback.
    if not audio_path:
        raise gr.Error("No audio provided. Please record or upload an audio file first.")

    audio = AudioSegment.from_file(audio_path)

    # Already mono 16 kHz: transcribe the file as given.
    if audio.channels == 1 and audio.frame_rate == 16000:
        return model.transcribe([audio_path])[0]

    # Otherwise convert to mono 16 kHz and transcribe the converted copy.
    audio = audio.set_channels(1).set_frame_rate(16000)
    with tempfile.TemporaryDirectory() as tmpdirname:
        wav_path = os.path.join(tmpdirname, "audio.wav")
        audio.export(wav_path, format="wav")
        # Transcription finishes before the temporary directory is removed.
        return model.transcribe([wav_path])[0]
40
+
41
+ '''
42
+ article = (
43
+ "<p style='text-align: center'>"
44
+ "<a href='https://huggingface.co/nvidia/parakeet-rnnt-1.1b' target='_blank'>🎙️ Learn more about Parakeet model</a> | "
45
+ "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>📚 FastConformer paper</a> | "
46
+ "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
47
+ "</p>"
48
+ )
49
+ '''
50
+
51
# Example inputs for the "Audio file" tab: one single-element row per file,
# matching the interface's single audio input.
EXAMPLES = [
    [example_file]
    for example_file in ("135.wav", "common_voice_zgh_37837257.mp3")
]
55
+
56
+ """
57
+ YT_EXAMPLES = [
58
+ ["https://www.youtube.com/shorts/CSgTSE50MHY"],
59
+ ["https://www.youtube.com/shorts/OxQtqOyAFLE"]
60
+ ]
61
+ """
62
+
63
+ def _return_yt_html_embed(yt_url):
64
+ video_id = yt_url.split("?v=")[-1]
65
+ if "youtube.com/shorts/" in video_id:
66
+ video_id = video_id.split("/")[-1]
67
+ HTML_str = (
68
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
69
+ " </center>"
70
+ )
71
+ return HTML_str
72
+
73
def download_yt_audio(yt_url, filename):
    """Download the video at ``yt_url`` to ``filename`` if it is short enough.

    The video's metadata is fetched first without downloading. Its
    ``duration_string`` is converted to seconds and compared against
    ``YT_LENGTH_LIMIT_S``; only videos within the limit are downloaded.

    Args:
        yt_url: URL of the video to download.
        filename: Output path template handed to the downloader (``outtmpl``).

    Raises:
        gr.Error: If metadata extraction or the download fails, or if the
            video is longer than ``YT_LENGTH_LIMIT_S`` seconds.
    """
    # NOTE(review): this function needs a module-level ``youtube_dl`` name, but
    # the ``import yt_dlp as youtube_dl`` line at the top of this file is
    # commented out. Calling it raises NameError until that import is restored.
    info_loader = youtube_dl.YoutubeDL()

    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # Fold "H:MM:SS", "M:SS" or "S" into seconds, most significant field first.
    file_length_s = 0
    for field in info["duration_string"].split(":"):
        file_length_s = file_length_s * 60 + int(field)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # Catch DownloadError here as well, matching the metadata step, so
            # download failures also reach the UI as a gr.Error.
            raise gr.Error(str(err)) from err
103
+
104
+
105
def yt_transcribe(yt_url, max_filesize=75.0):
    """Download a YouTube video, transcribe its audio and return an embed.

    Args:
        yt_url: URL of the YouTube video.
        max_filesize: Accepted for interface compatibility; not read here.

    Returns:
        A ``(html, text)`` tuple: the iframe snippet from
        ``_return_yt_html_embed`` and the transcript from ``get_transcripts``.
    """
    embed_html = _return_yt_html_embed(yt_url)

    # Everything downloaded or converted lives in a directory that is removed
    # when the block exits.
    with tempfile.TemporaryDirectory() as workdir:
        video_file = os.path.join(workdir, "video.mp4")
        wav_file = os.path.join(workdir, "audio.wav")

        download_yt_audio(yt_url, video_file)

        # convert to mono 16kHz
        mono_16k = AudioSegment.from_file(video_file).set_channels(1).set_frame_rate(16000)
        mono_16k.export(wav_file, format="wav")

        transcript = get_transcripts(wav_file)

    return embed_html, transcript
118
+
119
+
120
demo = gr.Blocks()

# Both tabs show the same blurb, so it is assembled once.
_description = (
    "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
    " of arbitrary length."
)

# Keyword arguments common to the microphone and file-upload interfaces.
_shared_interface_kwargs = dict(
    fn=get_transcripts,
    outputs="text",
    title="Transcribe Audio",
    description=_description,
    allow_flagging="never",
    cache_examples=False,
)

# Tab 1: record from the microphone.
mf_transcribe = gr.Interface(
    inputs=[gr.Audio(sources="microphone", type="filepath")],
    **_shared_interface_kwargs,
)

# Tab 2: upload an audio file (ships with the bundled examples).
file_transcribe = gr.Interface(
    inputs=[gr.Audio(sources="upload", type="filepath", label="Audio file")],
    examples=EXAMPLES,
    **_shared_interface_kwargs,
)
154
+
155
+ """
156
+ youtube_transcribe = gr.Interface(
157
+ fn=yt_transcribe,
158
+ inputs=[
159
+ gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
160
+ ],
161
+ outputs=["html", "text"],
162
+ examples=YT_EXAMPLES,
163
+ title="Transcribe Audio",
164
+ description=(
165
+ "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
166
+ f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
167
+ " of arbitrary length."
168
+ ),
169
+ allow_flagging="never",
170
+ )
171
+ """
172
+
173
# Interfaces and their tab labels, kept in matching order. The YouTube tab
# (youtube_transcribe / "Youtube Video") is currently disabled.
_tab_interfaces = [mf_transcribe, file_transcribe]
_tab_labels = ["Microphone", "Audio file"]

with demo:
    gr.TabbedInterface(_tab_interfaces, _tab_labels)

# Listen on all network interfaces rather than only on localhost.
demo.launch(server_name="0.0.0.0")