whisper-demo-mongolian

Runtime error

App Files Files Community

bayartsogt committed on Dec 21, 2022

Commit

ffa1767

•

1 Parent(s): 7a1ec76

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -83

app.py CHANGED Viewed

@@ -1,102 +1,111 @@
-import torch
 import gradio as gr
-import pytube as pt
 from transformers import pipeline
-from huggingface_hub import model_info
-MODEL_NAME = "bayartsogt/whisper-small-mn-12" #this always needs to stay in line 8 :D sorry for the hackiness
 # Language passed to the tokenizer's get_decoder_prompt_ids() call below.
-lang = "mn"
 # The bare string that follows is an unassigned note: it records the WER and
 # the kept character set for the whisper-small-mn-7 checkpoint.
-"""
-|                      model_id  |     WER |                           Keep Characters |
-| bayartsogt/whisper-small-mn-7  | 32.6469 | " абвгдеёжзийклмноөпрстуүфхцчшъыьэюя.,?!" |
-"""
 # Device handed to the pipeline: 0 when CUDA is available, otherwise "cpu".
 device = 0 if torch.cuda.is_available() else "cpu"
 # Speech-recognition pipeline used by transcribe() and yt_transcribe().
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=12, # did a little experiment looks like this is much better
-    device=device,
-)
 # Force the decoder prompt ids for language=lang and task="transcribe".
 pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
-def transcribe(microphone, file_upload):
     # Transcribe one audio input with `pipe` and return the resulting text.
     #
     # microphone / file_upload: values of the two Audio(type="filepath") inputs
     # of the interface that wraps this function; either one may be None.
     #
     # Returns the transcript, prefixed by a warning when both inputs were
     # given, or an "ERROR: ..." string when neither was given.
-    warn_output = ""
-    if (microphone is not None) and (file_upload is not None):
-        warn_output = (
-            "WARNING: You've uploaded an audio file and used the microphone. "
-            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-        )
-    elif (microphone is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
     # The microphone recording takes precedence over the uploaded file.
-    file = microphone if microphone is not None else file_upload
-    text = pipe(file)["text"]
-    return warn_output + text
-def _return_yt_html_embed(yt_url):
     # Build a centered <iframe> HTML snippet that embeds the video at `yt_url`.
     # The video id is taken as everything after the last "?v=" in the URL.
     # NOTE(review): a URL without "?v=" is used whole, and any query parameters
     # after the id are kept, so the embed src may be invalid for such URLs.
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
-    )
-    return HTML_str
-def yt_transcribe(yt_url):
     # Download the audio of the YouTube video at `yt_url` and transcribe it.
     # Returns a tuple (embed HTML, transcript text).
-    yt = pt.YouTube(yt_url)
-    html_embed_str = _return_yt_html_embed(yt_url)
     # Use the first audio-only stream and save it under a fixed relative name.
     # NOTE(review): every call writes the same "audio.mp3"; presumably
     # overlapping requests would overwrite each other's download — confirm.
-    stream = yt.streams.filter(only_audio=True)[0]
-    stream.download(filename="audio.mp3")
-    text = pipe("audio.mp3")["text"]
-    return html_embed_str, text
-demo = gr.Blocks()
 # First tab: microphone / uploaded-file transcription, handled by transcribe().
-mf_transcribe = gr.Interface(
-    fn=transcribe,
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Audio(source="upload", type="filepath", optional=True),
     ],
-    outputs="text",
-    layout="horizontal",
-    theme="huggingface",
-    title="Whisper Demo: Transcribe Audio",
-    description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
-    ),
-    allow_flagging="never",
-)
 # Second tab: YouTube URL transcription.
 # NOTE: this assignment rebinds the name `yt_transcribe` from the handler
 # function to the Interface object. The fn= argument is evaluated before the
 # rebinding, so the Interface still wraps the original function.
-yt_transcribe = gr.Interface(
-    fn=yt_transcribe,
-    inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
-    outputs=["html", "text"],
     layout="horizontal",
     theme="huggingface",
-    title="Whisper Demo: Transcribe YouTube",
     description=(
-        "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
-        " arbitrary length."
     ),
     allow_flagging="never",
 )
 # Place both interfaces as tabs inside the Blocks container, then start the app.
-with demo:
-    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
-demo.launch(enable_queue=True)

 import gradio as gr
+import numpy as np
+import time
+from pyannote.audio import Pipeline
+import librosa, torch
 from transformers import pipeline
+from utils import second_to_timecode, download_from_youtube
+MODEL_NAME = 'bayartsogt/whisper-small-mn-8'
 # Language passed to the tokenizer's get_decoder_prompt_ids() call below.
+lang = 'mn'
 # Chunk length, in seconds, given to the speech-recognition pipeline.
+chunk_length_s = 9
 # generator() runs the VAD model on speech turns longer than this.
+vad_activation_min_duration = 9 # sec
 # Device for the ASR pipeline and the waveform tensor: 0 with CUDA, else "cpu".
 device = 0 if torch.cuda.is_available() else "cpu"
 # Sample rate requested from librosa.load() in generator().
+SAMPLE_RATE = 16_000
+######## LOAD MODELS FROM HUB ########
 # NOTE(review): use_auth_token=True presumably relies on a Hugging Face token
 # already configured in the runtime environment — confirm.
+dia_model = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=True)
+vad_model = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=True)
+pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=chunk_length_s, device=device)
 # Force the decoder prompt ids for language=lang and task="transcribe".
 pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+print("----------> Loaded models <-----------")
+def generator(youtube_link, microphone, file_upload, num_speakers, max_duration, history):
+    """Transcribe one audio source turn by turn, yielding the growing transcript.
+
+    Exactly one of ``youtube_link``, ``microphone`` and ``file_upload`` must be
+    provided. The audio is loaded with librosa at ``SAMPLE_RATE`` (mono, at most
+    ``max_duration`` seconds), split into speech turns by ``dia_model``, and
+    each turn is transcribed with ``pipe``. Turns longer than
+    ``vad_activation_min_duration`` seconds are first split further with
+    ``vad_model`` and every resulting region is transcribed separately.
+
+    Args:
+        youtube_link: URL text; empty or None when unused.
+        microphone: path of the recorded audio, or None.
+        file_upload: path of the uploaded audio, or None.
+        num_speakers: forwarded to ``dia_model`` as ``num_speakers``.
+        max_duration: forwarded to ``librosa.load`` as ``duration``.
+        history: transcript carried in from the ``'state'`` input; None or an
+            empty value starts a fresh transcript.
+
+    Yields:
+        ``(history, history, None)`` after each transcribed segment, then
+        ``(history, history, 'transcript.sbv')`` once the file is written.
+
+    Raises:
+        ValueError: if zero or more than one source is provided.
+    """
+    # Count sources by truthiness, the same test used to pick one below, so a
+    # None link or an empty path can neither be miscounted as a real source nor
+    # pass validation and leave `path` unassigned.
+    provided = sum(bool(source) for source in (youtube_link, microphone, file_upload))
+    if provided != 1:
+        raise ValueError(f"Only one of the source should be given youtube_link={youtube_link}, microphone={microphone}, file_upload={file_upload}")
+    history = history or ""
+    if microphone:
+        path = microphone
+    elif file_upload:
+        path = file_upload
+    else:
+        path = download_from_youtube(youtube_link)
+    waveform, sampling_rate = librosa.load(path, sr=SAMPLE_RATE, mono=True, duration=max_duration)
+    print(waveform.shape, sampling_rate)
+    # Add a leading axis so the tensor is (1, num_samples) for the pyannote models.
+    waveform_tensor = torch.unsqueeze(torch.tensor(waveform), 0).to(device)
+    dia_result = dia_model({
+        "waveform": waveform_tensor,
+        "sample_rate": sampling_rate,
+    }, num_speakers=num_speakers)
+    for speech_turn, _track, speaker in dia_result.itertracks(yield_label=True):
+        print(f"{speech_turn.start:4.1f} {speech_turn.end:4.1f} {speaker}")
+        _start = int(sampling_rate * speech_turn.start)
+        _end = int(sampling_rate * speech_turn.end)
+        turn_duration = speech_turn.end - speech_turn.start
+        if turn_duration > vad_activation_min_duration:
+            print(f'audio duration {turn_duration} sec ----> activating VAD')
+            vad_output = vad_model({
+                'waveform': waveform_tensor[:, _start:_end],
+                'sample_rate': sampling_rate})
+            for vad_turn in vad_output.get_timeline().support():
+                # VAD times are relative to the slice, so offset by the turn start.
+                vad_start = _start + int(sampling_rate * vad_turn.start)
+                vad_end = _start + int(sampling_rate * vad_turn.end)
+                prediction = pipe(waveform[vad_start: vad_end])['text']
+                history += (
+                    f"{second_to_timecode(speech_turn.start + vad_turn.start)},"
+                    f"{second_to_timecode(speech_turn.start + vad_turn.end)}\n"
+                    f"{prediction}\n\n"
+                )
+                yield history, history, None
+        else:
+            prediction = pipe(waveform[_start: _end])['text']
+            history += (
+                f"{second_to_timecode(speech_turn.start)},{second_to_timecode(speech_turn.end)}\n"
+                f"{prediction}\n\n"
+            )
+        yield history, history, None
+    # https://support.google.com/youtube/answer/2734698?hl=en#zippy=%2Cbasic-file-formats%2Csubrip-srt-example%2Csubviewer-sbv-example
+    file_name = 'transcript.sbv'
+    # The transcript is Mongolian text: write UTF-8 explicitly rather than
+    # depending on the locale's default encoding.
+    with open(file_name, 'w', encoding='utf-8') as fp:
+        fp.write(history)
+    yield history, history, file_name
+demo = gr.Interface(
+    generator,
     # Inputs are listed in the order of generator()'s parameters: youtube_link,
     # microphone, file_upload, num_speakers, max_duration, history.
     inputs=[
+        gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL", optional=True),
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Audio(source="upload", type="filepath", optional=True),
+        gr.Number(value=1, label="Number of Speakers"),
+        gr.Number(value=120, label="Maximum Duration (Seconds)"),
         # 'state' supplies generator()'s `history` argument.
+        'state',
     ],
     # Matches the 3-tuples generator() yields: transcript text, updated state,
     # and the transcript file (None until the final yield).
+    outputs=['text', 'state', 'file'],
     layout="horizontal",
     theme="huggingface",
+    title="Transcribe Mongolian Whisper 🇲🇳",
     description=(
+        "Transcribe Youtube Video / Microphone / Uploaded File in Mongolian Whisper Model." + \
+        " | You can upload SubView file (`.sbv`) [to your youtube video](https://support.google.com/youtube/answer/2734698?hl=en#zippy=%2Cbasic-file-formats)." + \
+        " | Please REFRESH 🔄 the page after you transcribed!" + \
+        " | 🐦 [@_tsogoo_](https://twitter.com/_tsogoo_)" + \
+        " | 🤗 [@bayartsogt](https://huggingface.co/bayartsogt)" + \
+        ""
     ),
     allow_flagging="never",
 )
+# define queue - required for generators
+demo.queue()
+demo.launch()