Update app.py
app.py
CHANGED
```diff
@@ -34,6 +34,12 @@ combined_models = []
 combined_models.extend(whisper_models)
 combined_models.extend(custom_models)
 
+usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers)
+usage = json.loads(usage.text)
+deepL_character_usage = str(usage['character_count'])
+print("deepL_character_usage")
+
+
 
 LANGUAGES = {
     "en": "English",
```
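This hunk queries DeepL's usage endpoint once at module load so the UI can show the remaining quota later. A minimal runnable sketch of that call in isolation; app.py builds `headers` elsewhere, so the auth-header construction below is an assumption (DeepL's free API authenticates with a `DeepL-Auth-Key` header):

```python
import json
import os

import requests

# Assumption: app.py defines `headers` elsewhere; the free DeepL API expects
# an Authorization header of the form "DeepL-Auth-Key <key>".
headers = {'Authorization': f'DeepL-Auth-Key {os.environ.get("DEEPL_API_KEY", "")}'}

# /v2/usage returns e.g. {"character_count": 123456, "character_limit": 500000}
usage = json.loads(
    requests.get('https://api-free.deepl.com/v2/usage', headers=headers).text)
deepL_character_usage = str(usage['character_count'])
print(f'deepL_character_usage: {deepL_character_usage}')
```

Note that the committed line prints the literal string "deepL_character_usage" rather than the value; the f-string above is presumably what was intended.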
```diff
@@ -217,7 +223,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
     1. Download youtube video with a given url
     2. Watch it in the first video component
     3. Run automatic speech recognition on the video using fast Whisper models
-    4. Translate the recognized transcriptions to 26 languages supported by deepL
+    4. Translate the recognized transcriptions to 26 languages supported by deepL (If free API usage for the month is not yet fully consumed)
     5. Download generated subtitles in .vtt and .srt formats
     6. Watch the the original video with generated subtitles
 
```
```diff
@@ -229,13 +235,19 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
         raise ValueError("Error no video input")
     print(video_file_path)
     try:
+
+
+
         _,file_ending = os.path.splitext(f'{video_file_path}')
         print(f'file enging is {file_ending}')
         print("starting conversion to wav")
         os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"')
         print("conversion to wav ready")
+
+    except Exception as e:
+        raise RuntimeError("Error Running inference with local model", e)
 
-
+    try:
 
         print("starting whisper c++")
         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
```
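The hunk above splits audio extraction into its own try/except. A sketch of the same ffmpeg step using `subprocess.run` instead of `os.system`, so a non-zero exit code actually raises instead of being silently ignored (flags identical to the diff: 16 kHz, mono, 16-bit PCM, the input format whisper.cpp expects):

```python
import os
import subprocess

def convert_to_wav(video_file_path: str) -> str:
    """Extract 16 kHz mono 16-bit PCM audio for whisper.cpp."""
    base, file_ending = os.path.splitext(video_file_path)
    wav_path = f'{base}.wav'
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', video_file_path,
             '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le', wav_path],
            check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Mirrors the hunk's error path.
        raise RuntimeError('Error Running inference with local model', e)
    return wav_path
```

Building the output path with `os.path.splitext` also avoids the diff's `str.replace(file_ending, ".wav")`, which would corrupt any path where the extension string happens to appear earlier in the name.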
```diff
@@ -249,7 +261,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
         os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
         print("starting whisper done with whisper")
     except Exception as e:
-        raise RuntimeError("Error
+        raise RuntimeError("Error running Whisper cpp model")
 
     try:
 
```
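Transcription itself shells out to the whisper.cpp `main` binary; `-osrt` makes it write `<input>.srt` next to the audio file. The same call as a sketch, with the f-string pieces pulled into named arguments (`source_languages` is the app's label-to-code mapping, defined elsewhere):

```python
import subprocess

def run_whisper_cpp(wav_path: str, lang_code: str, whisper_model: str) -> str:
    """Transcribe `wav_path` with whisper.cpp and return the generated .srt path."""
    cmd = ['./whisper.cpp/main', wav_path,
           '-t', '4',                                          # CPU threads
           '-l', lang_code,                                    # e.g. 'en', or 'auto'
           '-m', f'./whisper.cpp/models/ggml-{whisper_model}.bin',
           '-osrt']                                            # emit SubRip subtitles
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('Error running Whisper cpp model') from e
    return f'{wav_path}.srt'   # matches the srt_path built in the hunk above
```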
```diff
@@ -283,12 +295,27 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
             }
 
             df = pd.concat([df, pd.DataFrame(srt_to_df)])
+    except Exception as e:
+        print("Error creating srt df")
+
 
-
-
+    try:
+        usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers)
+        usage = json.loads(usage.text)
+        char_count = str(usage['character_count'])
+
+        print('Usage is at: ' + str(usage['character_count']) + ' characters')
+
+        if usage['character_count'] >= 490000:
+            print("USAGE CLOSE TO LIMIT")
 
     except Exception as e:
-
+        print('Error with DeepL API requesting usage count')
+
+
+    return df
+
+
 
 
 def translate_transcriptions(df, selected_translation_lang_2):
```
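After the .srt is parsed into a DataFrame, the new code re-queries `/v2/usage` and only logs a warning near the 500 000-character free-tier cap; the function returns the DataFrame either way. The same guard as a small helper (threshold 490 000 as in the diff; `headers` as assumed earlier):

```python
import json

import requests

def warn_if_near_deepl_limit(headers: dict, threshold: int = 490_000) -> None:
    """Log DeepL character usage; warn when close to the free-tier cap."""
    try:
        usage = json.loads(
            requests.get('https://api-free.deepl.com/v2/usage',
                         headers=headers).text)
        print(f"Usage is at: {usage['character_count']} characters")
        if usage['character_count'] >= threshold:
            print('USAGE CLOSE TO LIMIT')
    except Exception as e:
        print('Error with DeepL API requesting usage count', e)
```

The hunk also assigns a `char_count` variable that is never used afterwards; the helper drops it.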
```diff
@@ -316,20 +343,24 @@ def translate_transcriptions(df, selected_translation_lang_2):
 
     usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers)
     usage = json.loads(usage.text)
+    deepL_character_usage = str(usage['character_count'])
     try:
-        print('Usage is at: ' +
+        print('Usage is at: ' + deepL_character_usage + 'characters')
     except Exception as e:
         print(e)
 
-    if
-        print("
-
-    response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
+    if int(deepL_character_usage) <= 490000:
+        print("STILL CHARACTERS LEFT")
+        response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
 
-
-
-
-
+        # Print the response from the server
+        translated_sentences = json.loads(response.text)
+        translated_sentences = translated_sentences['translations'][0]['text'].split('\n')
+        df['translation'] = translated_sentences
+
+    else:
+        df['translation'] = df['text']
+
     except Exception as e:
         print("EXCEPTION WITH DEEPL API")
         print(e)
```
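The translation path sends every subtitle line in a single request and splits the response on newlines, so the translated rows line up 1:1 with the DataFrame; when the quota is spent, the original text is copied into the `translation` column instead. A sketch of that round trip, assuming `data` carries `text` and `target_lang` as in DeepL's `/v2/translate` form API (the `data` dict itself sits outside this hunk) and that no subtitle line contains a newline of its own:

```python
import json

import pandas as pd
import requests

def translate_or_passthrough(df: pd.DataFrame, target_lang: str,
                             headers: dict, chars_used: int) -> pd.DataFrame:
    """Fill df['translation'] via DeepL, or copy df['text'] when quota is spent."""
    if chars_used <= 490_000:
        data = {'text': '\n'.join(df['text']),  # one request for every row
                'target_lang': target_lang}     # e.g. 'DE', 'FI'
        response = requests.post('https://api-free.deepl.com/v2/translate',
                                 headers=headers, data=data)
        translated = json.loads(response.text)['translations'][0]['text']
        df['translation'] = translated.split('\n')  # row order is preserved
    else:
        df['translation'] = df['text']              # quota spent: pass through
    return df
```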
```diff
@@ -391,7 +422,7 @@ def translate_transcriptions(df, selected_translation_lang_2):
 
     print("SRT DONE")
     subtitle_files = ['subtitles.vtt','subtitles.srt']
-
+
     return df, subtitle_files
 
 # def burn_srt_to_video(srt_file, video_in):
```
```diff
@@ -467,6 +498,10 @@ demo = gr.Blocks(css='''
 .output-markdown {max-width: 65ch !important;}
 ''')
 demo.encrypt = False
+
+
+
+
 with demo:
     transcription_var = gr.Variable()
 
```
```diff
@@ -484,8 +519,9 @@ with demo:
 
         with gr.Column():
             gr.Markdown('''
-
-            (But please **consider using short videos** so others won't get queued)
+            ### 1. Copy any non-private Youtube video URL to box below or click one of the examples.
+            (But please **consider using short videos** so others won't get queued) <br>
+            Then press button "1. Download Youtube video"-button:
            ''')
         examples = gr.Examples(examples=
             [ "https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24",
```
```diff
@@ -509,13 +545,13 @@ with demo:
         with gr.Column():
             gr.Markdown('''
             ##### Here you can start the transcription and translation process.
-
-
+            Be aware that processing will last some time. With base model it is around 3x speed
+            **Please select source language** for better transcriptions. Using 'Let the model analyze' makes mistakes sometimes and may lead to bad transcriptions
             ''')
             selected_source_lang.render()
             selected_whisper_model.render()
             transcribe_btn = gr.Button("Step 2. Transcribe audio")
-            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)
+            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], [transcription_df])
 
 
     with gr.Row():
```
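The one functional change in this hunk wraps the click handler's output component in a list. Gradio also accepts a bare component, but the list form generalizes to multiple outputs. A self-contained sketch of the wiring, with placeholder components and a stub handler (names mirror the diff; the dropdown choices are illustrative):

```python
import gradio as gr
import pandas as pd

def speech_to_text(video_file_path, selected_source_lang, whisper_model):
    # Stand-in for the app's real function; returns the transcript table.
    return pd.DataFrame({'text': ['hello world']})

with gr.Blocks() as demo:
    video_in = gr.Video(label="Video file")
    selected_source_lang = gr.Dropdown(choices=["en", "fi"], label="Spoken language")
    selected_whisper_model = gr.Dropdown(choices=["base", "small"], label="Whisper model")
    transcription_df = gr.Dataframe(label="Transcription")
    transcribe_btn = gr.Button("Step 2. Transcribe audio")
    # Inputs and outputs are lists of components; the handler's return values
    # are mapped onto the outputs list in order.
    transcribe_btn.click(speech_to_text,
                         [video_in, selected_source_lang, selected_whisper_model],
                         [transcription_df])
```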
```diff
@@ -530,9 +566,15 @@ with demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown('''
-
+            ### PLEASE READ BELOW
             Here you will can translate transcriptions to 26 languages.
-            If spoken language is not in the list, translation might not work. In this case original transcriptions are used
+            If spoken language is not in the list, translation might not work. In this case original transcriptions are used.
+            ''')
+            gr.Markdown(f'''
+            DeepL API character usage:
+            {deepL_character_usage if deepL_character_usage is not None else ''}/500 000 characters
+            If usage is over 490 000 characters original transcriptions will be used for subtitles.
+            API usage resets on 5th of every month.
             ''')
             selected_translation_lang_2.render()
             translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
```
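The last hunk interpolates the module-level counter into the UI with `{deepL_character_usage if deepL_character_usage is not None else ''}`. As committed, the startup block can only ever leave that name bound to a string (or raise at import time if the request fails), so the `is not None` guard has no False branch. A defensive variant of the startup fetch that would make the guard meaningful; a sketch, not what the commit does, and the `headers` line is the same assumed form as above:

```python
import json
import os

import requests

headers = {'Authorization': f'DeepL-Auth-Key {os.environ.get("DEEPL_API_KEY", "")}'}  # assumed form

deepL_character_usage = None  # preset so the UI guard has a real False branch
try:
    usage = json.loads(
        requests.get('https://api-free.deepl.com/v2/usage', headers=headers).text)
    deepL_character_usage = str(usage['character_count'])
except Exception as e:
    print('Could not fetch DeepL usage at startup:', e)
```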