ashhadahsan committed on
Commit
8ae5c89
1 Parent(s): f34b85f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -80
app.py CHANGED
@@ -2,12 +2,20 @@ import numpy as np
2
  import streamlit as st
3
  from constants import WHISPER_MODELS, language_dict
4
  import streamlit as st
5
- from utils import translate_to_english, detect_language, write, read, get_key
 
 
 
 
 
 
 
6
  import whisperx as whisper
7
  import json
8
  import pandas as pd
9
  from pydub import AudioSegment
10
  import os
 
11
 
12
  if "btn1" not in st.session_state:
13
  st.session_state["btn1"] = False
@@ -52,14 +60,14 @@ with input:
52
  # on_change=disable_btn2,
53
  # disabled=st.session_state["btn1"],
54
  )
55
- # text_json = st.file_uploader(
56
- # label="Aligned JSON",
57
- # type=["json"],
58
- # help="Your aligned json file",
59
- # # disabled=st.session_state["btn2"],
60
- # # on_change=disable_btn1,
61
- # )
62
- text_json = None
63
 
64
  # st.markdown("""**model**""", unsafe_allow_html=True)
65
  model_name = st.selectbox(
@@ -137,16 +145,16 @@ with input:
137
  )
138
  else:
139
  temperature = [temperature]
140
- try:
141
- if len(temperature) == 0:
142
- st.error("Choose correct value for temperature")
143
- except:
144
- pass
145
- # st.write(temperature)
146
  submit = st.button("Submit", type="primary")
147
  with output:
148
  st.header("Output")
149
- import uuid
 
 
 
 
 
 
150
 
151
  name = str(uuid.uuid1())
152
  if submit:
@@ -157,46 +165,37 @@ with output:
157
  if audio_uploaded.name.endswith(".wav"):
158
  temp = AudioSegment.from_wav(audio_uploaded)
159
  temp.export(f"{name}.wav")
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- if audio_uploaded.name.endswith(".mp3"):
162
- temp = AudioSegment.from_wav(audio_uploaded)
163
- temp.export(f"{name}.wav")
164
 
165
- # audio_bytes = audio_uploaded.read()
166
- # st.audio(audio_bytes, format="audio/wav")
167
- if language == "":
168
  model = whisper.load_model(model_name)
169
- with st.spinner("Detecting language..."):
170
- detection = detect_language(f"{name}.wav", model)
171
- language = detection.get("detected_language")
172
- del model
173
- # st.write(language)
174
- if len(language) > 2:
175
- language = get_key(language)
176
- segments_pre = st.empty()
177
- segments_post = st.empty()
178
- segments_post_json = st.empty()
179
- segments_post2 = st.empty()
180
- trans = st.empty()
181
- lang = st.empty()
182
- if text_json is None:
183
- with st.spinner("Running ... "):
184
- decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
185
- model = whisper.load_model(model_name)
186
- with st.container():
187
- with st.spinner(f"Running with {model_name} model"):
188
- result = model.transcribe(
189
- f"{name}.wav",
190
- language=language,
191
- patience=patience,
192
- initial_prompt=initial_prompt,
193
- condition_on_previous_text=condition_on_previous_text,
194
- temperature=temperature,
195
- compression_ratio_threshold=compression_ratio_threshold,
196
- logprob_threshold=logprob_threshold,
197
- no_speech_threshold=no_speech_threshold,
198
- **decode,
199
- )
200
 
201
  if translate:
202
  result = translate_to_english(result, json=False)
@@ -213,21 +212,6 @@ with output:
213
  f"{name}.wav",
214
  device=device,
215
  )
216
-
217
- if text_json is not None:
218
- if translate:
219
- result = translate_to_english(text_json, json=True)
220
- with st.spinner("Running alignment model ..."):
221
- model_a, metadata = whisper.load_align_model(
222
- language_code=language, device=device
223
- )
224
-
225
- result_aligned = whisper.align(
226
- text_json, model_a, metadata, audio_uploaded.name, device
227
- )
228
-
229
- if text_json is None:
230
- words_segments = result_aligned["word_segments"]
231
  write(
232
  f"{name}.wav",
233
  dtype=transcription,
@@ -237,13 +221,40 @@ with output:
237
  trans.text_area(
238
  "transcription", trans_text, height=None, max_chars=None, key=None
239
  )
240
- segments_pre.text_area(
241
- "Segments before alignment",
242
- result["segments"],
243
- height=None,
244
- max_chars=None,
245
- key=None,
246
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  segments_post.text_area(
248
  "Word Segments after alignment",
249
  result_aligned["word_segments"],
@@ -251,10 +262,6 @@ with output:
251
  max_chars=None,
252
  key=None,
253
  )
254
- with open("segments.json", "w", encoding="utf-8") as f:
255
-
256
- json.dump(result_aligned["word_segments"], f, indent=False)
257
-
258
  segments_post2.text_area(
259
  "Segments after alignment",
260
  result_aligned["segments"],
@@ -265,4 +272,100 @@ with output:
265
  lang.text_input(
266
  "detected language", language_dict.get(language), disabled=True
267
  )
268
- os.remove(f"{name}.wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import streamlit as st
3
  from constants import WHISPER_MODELS, language_dict
4
  import streamlit as st
5
+ from utils import (
6
+ translate_to_english,
7
+ detect_language,
8
+ write,
9
+ read,
10
+ get_key,
11
+
12
+ )
13
  import whisperx as whisper
14
  import json
15
  import pandas as pd
16
  from pydub import AudioSegment
17
  import os
18
+ import uuid
19
 
20
  if "btn1" not in st.session_state:
21
  st.session_state["btn1"] = False
 
60
  # on_change=disable_btn2,
61
  # disabled=st.session_state["btn1"],
62
  )
63
+ text_json = st.file_uploader(
64
+ label="Aligned JSON",
65
+ type=["json"],
66
+ help="Your aligned json file (Only if you need to skip transcribe)",
67
+ # disabled=st.session_state["btn2"],
68
+ # on_change=disable_btn1,
69
+ )
70
+ # text_json = None
71
 
72
  # st.markdown("""**model**""", unsafe_allow_html=True)
73
  model_name = st.selectbox(
 
145
  )
146
  else:
147
  temperature = [temperature]
 
 
 
 
 
 
148
  submit = st.button("Submit", type="primary")
149
  with output:
150
  st.header("Output")
151
+
152
+ segments_pre = st.empty()
153
+ segments_post = st.empty()
154
+ segments_post_json = st.empty()
155
+ segments_post2 = st.empty()
156
+ trans = st.empty()
157
+ lang = st.empty()
158
 
159
  name = str(uuid.uuid1())
160
  if submit:
 
165
  if audio_uploaded.name.endswith(".wav"):
166
  temp = AudioSegment.from_wav(audio_uploaded)
167
  temp.export(f"{name}.wav")
168
+ if audio_uploaded.name.endswith(".mp3"):
169
+ temp = AudioSegment.from_wav(audio_uploaded)
170
+ temp.export(f"{name}.wav")
171
+ if language == "":
172
+ model = whisper.load_model(model_name)
173
+ with st.spinner("Detecting language..."):
174
+ detection = detect_language(f"{name}.wav", model)
175
+ language = detection.get("detected_language")
176
+ del model
177
+ if len(language) > 2:
178
+ language = get_key(language)
179
 
180
+ if text_json is None:
 
 
181
 
182
+ with st.spinner("Running ... "):
183
+ decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
 
184
  model = whisper.load_model(model_name)
185
+ with st.container():
186
+ with st.spinner(f"Running with {model_name} model"):
187
+ result = model.transcribe(
188
+ f"{name}.wav",
189
+ language=language,
190
+ patience=patience,
191
+ initial_prompt=initial_prompt,
192
+ condition_on_previous_text=condition_on_previous_text,
193
+ temperature=temperature,
194
+ compression_ratio_threshold=compression_ratio_threshold,
195
+ logprob_threshold=logprob_threshold,
196
+ no_speech_threshold=no_speech_threshold,
197
+ **decode,
198
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  if translate:
201
  result = translate_to_english(result, json=False)
 
212
  f"{name}.wav",
213
  device=device,
214
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  write(
216
  f"{name}.wav",
217
  dtype=transcription,
 
221
  trans.text_area(
222
  "transcription", trans_text, height=None, max_chars=None, key=None
223
  )
224
+ char_segments = []
225
+ word_segments = []
226
+
227
+ for x in range(len(result_aligned["segments"])):
228
+ word_segments.append(
229
+ {
230
+ "word-segments": result_aligned["segments"][x][
231
+ "word-segments"
232
+ ]
233
+ .fillna("")
234
+ .to_dict(orient="records")
235
+ }
236
+ )
237
+ char_segments.append(
238
+ {
239
+ "char-segments": result_aligned["segments"][x][
240
+ "char-segments"
241
+ ]
242
+ .fillna("")
243
+ .to_dict(orient="records")
244
+ }
245
+ )
246
+
247
+ for x in range(len(result_aligned["segments"])):
248
+
249
+ result_aligned["segments"][x]["word-segments"] = word_segments[x]
250
+ result_aligned["segments"][x]["char-segments"] = char_segments[x]
251
+ segments_pre.text_area(
252
+ "Segments before alignment",
253
+ result["segments"],
254
+ height=None,
255
+ max_chars=None,
256
+ key=None,
257
+ )
258
  segments_post.text_area(
259
  "Word Segments after alignment",
260
  result_aligned["word_segments"],
 
262
  max_chars=None,
263
  key=None,
264
  )
 
 
 
 
265
  segments_post2.text_area(
266
  "Segments after alignment",
267
  result_aligned["segments"],
 
272
  lang.text_input(
273
  "detected language", language_dict.get(language), disabled=True
274
  )
275
+ os.remove(f"{name}.wav")
276
+ if text_json is not None:
277
+ with st.spinner("Running ... "):
278
+
279
+ model = whisper.load_model(model_name)
280
+ json_filname = str(uuid.uuid1())
281
+ data = json.load(text_json)
282
+
283
+ # Close the uploaded file
284
+ text_json.close()
285
+
286
+ # Write the JSON data to a new file
287
+ with open(f"{json_filname}.json", "w") as outfile:
288
+ json.dump(data, outfile)
289
+
290
+ # with open("fold.json", "w", encoding="utf-8") as f:
291
+ # json.dump(text_json, f)
292
+ with open(f"{json_filname}.json", "r", encoding="utf-8") as f:
293
+ cont = json.load(f)
294
+
295
+ with st.spinner("Running alignment model ..."):
296
+ model_a, metadata = whisper.load_align_model(
297
+ language_code=language, device=device
298
+ )
299
+ result_aligned = whisper.align(
300
+ cont,
301
+ model_a,
302
+ metadata,
303
+ f"{name}.wav",
304
+ device=device,
305
+ )
306
+ words_segments = result_aligned["word_segments"]
307
+ write(
308
+ f"{name}.wav",
309
+ dtype=transcription,
310
+ result_aligned=result_aligned,
311
+ )
312
+ trans_text = read(f"{name}.wav", transcription)
313
+ char_segments = []
314
+ word_segments = []
315
+
316
+ for x in range(len(result_aligned["segments"])):
317
+ word_segments.append(
318
+ {
319
+ "word-segments": result_aligned["segments"][x][
320
+ "word-segments"
321
+ ]
322
+ .fillna("")
323
+ .to_dict(orient="records")
324
+ }
325
+ )
326
+ char_segments.append(
327
+ {
328
+ "char-segments": result_aligned["segments"][x][
329
+ "char-segments"
330
+ ]
331
+ .fillna("")
332
+ .to_dict(orient="records")
333
+ }
334
+ )
335
+
336
+ for x in range(len(result_aligned["segments"])):
337
+
338
+ result_aligned["segments"][x]["word-segments"] = word_segments[x]
339
+ result_aligned["segments"][x]["char-segments"] = char_segments[x]
340
+ trans.text_area(
341
+ "transcription", trans_text, height=None, max_chars=None, key=None
342
+ )
343
+ segments_pre.text_area(
344
+ "Segments before alignment",
345
+ cont,
346
+ height=None,
347
+ max_chars=None,
348
+ key=None,
349
+ )
350
+
351
+ segments_post.text_area(
352
+ "Word Segments after alignment",
353
+ result_aligned["word_segments"],
354
+ height=None,
355
+ max_chars=None,
356
+ key=None,
357
+ )
358
+
359
+ segments_post2.text_area(
360
+ "Segments after alignment",
361
+ result_aligned["segments"],
362
+ expanded=False
363
+ height=None,
364
+ max_chars=None,
365
+ key=None,
366
+ )
367
+ lang.text_input(
368
+ "detected language", language_dict.get(language), disabled=True
369
+ )
370
+ os.remove(f"{name}.wav")
371
+ os.remove(f"{json_filname}.json")