Spaces:

ashhadahsan
/

whisperX

Running

App Files Files Community

ashhadahsan committed on Mar 10, 2023

Commit

8ae5c89

•

1 Parent(s): f34b85f

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -80

app.py CHANGED Viewed

@@ -2,12 +2,20 @@ import numpy as np
 import streamlit as st
 from constants import WHISPER_MODELS, language_dict
 import streamlit as st
-from utils import translate_to_english, detect_language, write, read, get_key
 import whisperx as whisper
 import json
 import pandas as pd
 from pydub import AudioSegment
 import os
 if "btn1" not in st.session_state:
     st.session_state["btn1"] = False
@@ -52,14 +60,14 @@ with input:
         # on_change=disable_btn2,
         # disabled=st.session_state["btn1"],
     )
-    # text_json = st.file_uploader(
-    #     label="Aligned JSON",
-    #     type=["json"],
-    #     help="Your aligned json file",
-    #     # disabled=st.session_state["btn2"],
-    #     # on_change=disable_btn1,
-    # )
-    text_json = None
     # st.markdown("""**model**""", unsafe_allow_html=True)
     model_name = st.selectbox(
@@ -137,16 +145,16 @@ with input:
         )
     else:
         temperature = [temperature]
-    try:
-        if len(temperature) == 0:
-            st.error("Choose correct value for temperature")
-    except:
-        pass
-    # st.write(temperature)
     submit = st.button("Submit", type="primary")
 with output:
     st.header("Output")
-    import uuid
     name = str(uuid.uuid1())
     if submit:
@@ -157,46 +165,37 @@ with output:
             if audio_uploaded.name.endswith(".wav"):
                 temp = AudioSegment.from_wav(audio_uploaded)
                 temp.export(f"{name}.wav")
-            if audio_uploaded.name.endswith(".mp3"):
-                temp = AudioSegment.from_wav(audio_uploaded)
-                temp.export(f"{name}.wav")
-            # audio_bytes = audio_uploaded.read()
-            # st.audio(audio_bytes, format="audio/wav")
-            if language == "":
                 model = whisper.load_model(model_name)
-                with st.spinner("Detecting language..."):
-                    detection = detect_language(f"{name}.wav", model)
-                    language = detection.get("detected_language")
-                    del model
-                    # st.write(language)
-            if len(language) > 2:
-                language = get_key(language)
-            segments_pre = st.empty()
-            segments_post = st.empty()
-            segments_post_json = st.empty()
-            segments_post2 = st.empty()
-            trans = st.empty()
-            lang = st.empty()
-            if text_json is None:
-                with st.spinner("Running ... "):
-                    decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
-                    model = whisper.load_model(model_name)
-                    with st.container():
-                        with st.spinner(f"Running with {model_name} model"):
-                            result = model.transcribe(
-                                f"{name}.wav",
-                                language=language,
-                                patience=patience,
-                                initial_prompt=initial_prompt,
-                                condition_on_previous_text=condition_on_previous_text,
-                                temperature=temperature,
-                                compression_ratio_threshold=compression_ratio_threshold,
-                                logprob_threshold=logprob_threshold,
-                                no_speech_threshold=no_speech_threshold,
-                                **decode,
-                            )
                 if translate:
                     result = translate_to_english(result, json=False)
@@ -213,21 +212,6 @@ with output:
                         f"{name}.wav",
                         device=device,
                     )
-            if text_json is not None:
-                if translate:
-                    result = translate_to_english(text_json, json=True)
-                with st.spinner("Running alignment model ..."):
-                    model_a, metadata = whisper.load_align_model(
-                        language_code=language, device=device
-                    )
-                    result_aligned = whisper.align(
-                        text_json, model_a, metadata, audio_uploaded.name, device
-                    )
-            if text_json is None:
-                words_segments = result_aligned["word_segments"]
                 write(
                     f"{name}.wav",
                     dtype=transcription,
@@ -237,13 +221,40 @@ with output:
                 trans.text_area(
                     "transcription", trans_text, height=None, max_chars=None, key=None
                 )
-                segments_pre.text_area(
-                    "Segments before alignment",
-                    result["segments"],
-                    height=None,
-                    max_chars=None,
-                    key=None,
-                )
             segments_post.text_area(
                 "Word Segments after alignment",
                 result_aligned["word_segments"],
@@ -251,10 +262,6 @@ with output:
                 max_chars=None,
                 key=None,
             )
-            with open("segments.json", "w", encoding="utf-8") as f:
-                json.dump(result_aligned["word_segments"], f, indent=False)
             segments_post2.text_area(
                 "Segments after alignment",
                 result_aligned["segments"],
@@ -265,4 +272,100 @@ with output:
             lang.text_input(
                 "detected language", language_dict.get(language), disabled=True
             )
-            os.remove(f"{name}.wav")

 import streamlit as st
 from constants import WHISPER_MODELS, language_dict
 import streamlit as st
+from utils import (
+    translate_to_english,
+    detect_language,
+    write,
+    read,
+    get_key,
+)
 import whisperx as whisper
 import json
 import pandas as pd
 from pydub import AudioSegment
 import os
+import uuid
 if "btn1" not in st.session_state:
     st.session_state["btn1"] = False
         # on_change=disable_btn2,
         # disabled=st.session_state["btn1"],
     )
+    # Optional upload of an existing transcript as JSON. When a file is
+    # provided the transcription step is skipped and only alignment runs;
+    # when it is None the audio is transcribed first.
+    text_json = st.file_uploader(
+        label="Aligned JSON",
+        type=["json"],
+        help="Your aligned json file (Only if you need to skip transcribe)",
+        # disabled=st.session_state["btn2"],
+        # on_change=disable_btn1,
+    )
+    # text_json = None
     # st.markdown("""**model**""", unsafe_allow_html=True)
     model_name = st.selectbox(
         )
     else:
         temperature = [temperature]
     submit = st.button("Submit", type="primary")
 with output:
     st.header("Output")
+    # Empty placeholders created before the submit check so that either
+    # processing branch can later render its result widgets into them.
+    segments_pre = st.empty()
+    segments_post = st.empty()
+    segments_post_json = st.empty()
+    segments_post2 = st.empty()
+    trans = st.empty()
+    lang = st.empty()
     name = str(uuid.uuid1())
     if submit:
             if audio_uploaded.name.endswith(".wav"):
                 temp = AudioSegment.from_wav(audio_uploaded)
                 temp.export(f"{name}.wav")
+        if audio_uploaded.name.endswith(".mp3"):
+            temp = AudioSegment.from_wav(audio_uploaded)
+            temp.export(f"{name}.wav")
+        # No language selected: load the chosen model only to run language
+        # detection on the converted WAV file, then drop the reference.
+        if language == "":
+            model = whisper.load_model(model_name)
+            with st.spinner("Detecting language..."):
+                detection = detect_language(f"{name}.wav", model)
+                language = detection.get("detected_language")
+                del model
+        # Values longer than two characters are mapped through get_key
+        # (presumably a language name -> short code lookup; confirm in utils).
+        # NOTE(review): if detection has no "detected_language" key, language
+        # is None here and len() will raise.
+        if len(language) > 2:
+            language = get_key(language)
+        if text_json is None:
+            with st.spinner("Running ... "):
+                decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
                 model = whisper.load_model(model_name)
+                with st.container():
+                    with st.spinner(f"Running with {model_name} model"):
+                        result = model.transcribe(
+                            f"{name}.wav",
+                            language=language,
+                            patience=patience,
+                            initial_prompt=initial_prompt,
+                            condition_on_previous_text=condition_on_previous_text,
+                            temperature=temperature,
+                            compression_ratio_threshold=compression_ratio_threshold,
+                            logprob_threshold=logprob_threshold,
+                            no_speech_threshold=no_speech_threshold,
+                            **decode,
+                        )
                 if translate:
                     result = translate_to_english(result, json=False)
                         f"{name}.wav",
                         device=device,
                     )
                 write(
                     f"{name}.wav",
                     dtype=transcription,
                 trans.text_area(
                     "transcription", trans_text, height=None, max_chars=None, key=None
                 )
+                char_segments = []
+                word_segments = []
+                for x in range(len(result_aligned["segments"])):
+                    word_segments.append(
+                        {
+                            "word-segments": result_aligned["segments"][x][
+                                "word-segments"
+                            ]
+                            .fillna("")
+                            .to_dict(orient="records")
+                        }
+                    )
+                    char_segments.append(
+                        {
+                            "char-segments": result_aligned["segments"][x][
+                                "char-segments"
+                            ]
+                            .fillna("")
+                            .to_dict(orient="records")
+                        }
+                    )
+                for x in range(len(result_aligned["segments"])):
+                    result_aligned["segments"][x]["word-segments"] = word_segments[x]
+                    result_aligned["segments"][x]["char-segments"] = char_segments[x]
+            # Show the segments returned by model.transcribe, i.e. the input
+            # to the alignment step.
+            segments_pre.text_area(
+                "Segments before alignment",
+                result["segments"],
+                height=None,
+                max_chars=None,
+                key=None,
+            )
             segments_post.text_area(
                 "Word Segments after alignment",
                 result_aligned["word_segments"],
                 max_chars=None,
                 key=None,
             )
             segments_post2.text_area(
                 "Segments after alignment",
                 result_aligned["segments"],
             lang.text_input(
                 "detected language", language_dict.get(language), disabled=True
             )
+            os.remove(f"{name}.wav")
+        if text_json is not None:
+            with st.spinner("Running ... "):
+                model = whisper.load_model(model_name)
+                json_filname = str(uuid.uuid1())
+                data = json.load(text_json)
+                # Close the uploaded file
+                text_json.close()
+                # Write the JSON data to a new file
+                with open(f"{json_filname}.json", "w") as outfile:
+                    json.dump(data, outfile)
+                # with open("fold.json", "w", encoding="utf-8") as f:
+                #     json.dump(text_json, f)
+                with open(f"{json_filname}.json", "r", encoding="utf-8") as f:
+                    cont = json.load(f)
+                with st.spinner("Running alignment model ..."):
+                    model_a, metadata = whisper.load_align_model(
+                        language_code=language, device=device
+                    )
+                    result_aligned = whisper.align(
+                        cont,
+                        model_a,
+                        metadata,
+                        f"{name}.wav",
+                        device=device,
+                    )
+                words_segments = result_aligned["word_segments"]
+                write(
+                    f"{name}.wav",
+                    dtype=transcription,
+                    result_aligned=result_aligned,
+                )
+                trans_text = read(f"{name}.wav", transcription)
+                char_segments = []
+                word_segments = []
+                for x in range(len(result_aligned["segments"])):
+                    word_segments.append(
+                        {
+                            "word-segments": result_aligned["segments"][x][
+                                "word-segments"
+                            ]
+                            .fillna("")
+                            .to_dict(orient="records")
+                        }
+                    )
+                    char_segments.append(
+                        {
+                            "char-segments": result_aligned["segments"][x][
+                                "char-segments"
+                            ]
+                            .fillna("")
+                            .to_dict(orient="records")
+                        }
+                    )
+                for x in range(len(result_aligned["segments"])):
+                    result_aligned["segments"][x]["word-segments"] = word_segments[x]
+                    result_aligned["segments"][x]["char-segments"] = char_segments[x]
+                trans.text_area(
+                    "transcription", trans_text, height=None, max_chars=None, key=None
+                )
+                segments_pre.text_area(
+                    "Segments before alignment",
+                    cont,
+                    height=None,
+                    max_chars=None,
+                    key=None,
+                )
+                segments_post.text_area(
+                    "Word Segments after alignment",
+                    result_aligned["word_segments"],
+                    height=None,
+                    max_chars=None,
+                    key=None,
+                )
+                segments_post2.text_area(
+                    "Segments after alignment",
+                    result_aligned["segments"],
+                    expanded=False
+                    height=None,
+                    max_chars=None,
+                    key=None,
+                )
+                lang.text_input(
+                    "detected language", language_dict.get(language), disabled=True
+                )
+                os.remove(f"{name}.wav")
+                os.remove(f"{json_filname}.json")