File size: 4,945 Bytes
bc94d2b
5487b58
913fbb4
2910afc
bc94d2b
 
 
 
 
913fbb4
bc94d2b
 
 
 
 
 
913fbb4
bc94d2b
 
 
 
 
 
 
 
 
 
 
913fbb4
 
 
 
 
 
bc94d2b
 
 
 
 
 
 
 
8b6c021
 
 
 
 
 
 
 
3fa4fb2
f36d376
3fa4fb2
 
 
f36d376
3fa4fb2
f36d376
3fa4fb2
f36d376
 
18922c4
8b6c021
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc94d2b
 
 
 
 
 
 
 
 
 
8b6c021
 
 
 
 
 
 
 
 
 
 
 
 
bc94d2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b6c021
bc94d2b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
import requests
import Levenshtein
import time
from io import BytesIO
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from audio_recorder_streamlit import audio_recorder

@st.cache_resource
def load_model():
    """Load the pretrained Arabic wav2vec2 processor and CTC model.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    can be reused instead of being rebuilt on every call.

    Returns:
        A ``(processor, model)`` tuple, both loaded from the same checkpoint.
    """
    checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )

# Module-level handles: transcribe_audio_hf reads `processor` and `model`
# directly, so they must exist before any transcription is requested.
processor, model = load_model()

def transcribe_audio_hf(audio_bytes):
    """Transcribe an in-memory audio clip with the module-level model.

    Args:
        audio_bytes: Raw bytes of an audio file; they are wrapped in a
            ``BytesIO`` and decoded by ``librosa.load`` at ``sr=16000``.

    Returns:
        The decoded text of the clip, stripped of surrounding whitespace.
    """
    waveform, rate = librosa.load(BytesIO(audio_bytes), sr=16000)
    features = processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        frame_logits = model(features.input_values).logits
    # Keep the highest-scoring token id at every position, then decode.
    best_ids = frame_logits.argmax(dim=-1)
    decoded = processor.batch_decode(best_ids)
    return decoded[0].strip()


def levenshtein_similarity(transcription1, transcription2):
    """Return a normalized Levenshtein similarity between two strings.

    The edit distance is divided by the length of the longer string and
    subtracted from 1, so 1.0 means identical and 0.0 means every position
    differs.

    Args:
        transcription1: First string to compare.
        transcription2: Second string to compare.

    Returns:
        The similarity as a float. Two empty strings are identical and
        therefore score 1.0.
    """
    max_len = max(len(transcription1), len(transcription2))
    # Both inputs empty: nothing to normalize by. Return early instead of
    # dividing by zero.
    if max_len == 0:
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score

def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
    """Transcribe both clips and score how similar the transcriptions are.

    Args:
        original_audio_bytes: Raw bytes of the reference recording.
        user_audio_bytes: Raw bytes of the recording to compare against it.

    Returns:
        A tuple ``(original_transcription, user_transcription, similarity)``
        where ``similarity`` is the result of ``levenshtein_similarity``.
    """
    # The original clip is transcribed first, then the user clip.
    reference_text, attempt_text = (
        transcribe_audio_hf(clip)
        for clip in (original_audio_bytes, user_audio_bytes)
    )
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score

st.title("Audio Transcription and Similarity Checker")

# The 'initialized' flag in session state decides which view is rendered;
# it starts out False the first time the key is missing.
if 'initialized' not in st.session_state:
    st.session_state['initialized'] = False

# Until the flag is set, show a single "initializer" recorder widget.
if not st.session_state['initialized']:
    st.write("Click the Loader below to initialize the audio recorders.")

    init_recorder_options = {
        "text": "Click to Initialize",
        "recording_color": "#e8b62c",
        "neutral_color": "#6aa36f",
        "pause_threshold": 0.2,
        # A play icon to signify starting the initialization.
        "icon_name": "play-circle",
        "icon_size": "4x",
        "auto_start": False,
    }

    # Any truthy result from the initializer recorder flips the flag.
    if audio_recorder(**init_recorder_options):
        st.session_state['initialized'] = True

def _record_or_upload(recorder_options, uploader_label, spinner_text):
    """Collect one audio clip from a recorder, falling back to a file upload.

    The uploader is only rendered when the recorder produced nothing. When a
    clip is available it is also rendered as an audio player.

    Args:
        recorder_options: Keyword arguments forwarded to ``audio_recorder``.
        uploader_label: Label of the fallback ``st.file_uploader`` widget.
        spinner_text: Message shown by ``st.spinner`` while the player renders.

    Returns:
        The clip's bytes, or a falsy value when no audio was provided.
    """
    clip = audio_recorder(**recorder_options)
    # Recorder output keeps the "audio/wav" label the player was always given.
    mime_type = "audio/wav"

    if not clip:
        uploaded = st.file_uploader(uploader_label, type=["wav", "mp3"])
        if uploaded:
            clip = uploaded.read()
            # Uploads may be MP3, so use the MIME type reported for the file
            # rather than always declaring WAV to the player.
            mime_type = uploaded.type or mime_type

    if clip:
        with st.spinner(spinner_text):
            st.audio(clip, format=mime_type)

    return clip


# If initialized, display the recorders
if st.session_state['initialized']:

    st.subheader("Record or Upload Original Audio")

    original_audio_bytes = _record_or_upload(
        {
            "text": "Click to Record Audio",
            "recording_color": "#e8b62c",
            "neutral_color": "#6aa36f",
            "pause_threshold": 30,
            "icon_name": "microphone",  # Any Font Awesome solid icon works here
            "icon_size": "4x",
        },
        "Or Upload Original Audio",
        "Processing original audio...",
    )

    st.subheader("Record or Upload User Audio")

    st.write("")

    # The user recorder is styled differently from the original recorder.
    user_audio_bytes = _record_or_upload(
        {
            "text": "Click to Record Audio",
            "recording_color": "#e86f6f",
            "neutral_color": "#6a6faf",
            "pause_threshold": 30,
            "icon_name": "user",  # Any Font Awesome solid icon works here
            "icon_size": "4x",
        },
        "Or Upload User Audio",
        "Processing user audio...",
    )

    # The test button is only offered once both clips are available.
    if original_audio_bytes and user_audio_bytes:
        if st.button("Perform Testing"):
            with st.spinner("Performing transcription and similarity testing..."):
                transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio_bytes, user_audio_bytes)

                # Display results
                st.markdown("---")
                st.subheader("Transcriptions and Similarity Score")
                st.write(f"**Original Transcription:** {transcription_original}")
                st.write(f"**User Transcription:** {transcription_user}")
                st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")

                if similarity_score > 0.8:  # Adjust the threshold as needed
                    st.success("The pronunciation is likely correct based on transcription similarity.")
                else:
                    st.error("The pronunciation may be incorrect based on transcription similarity.")