File size: 6,004 Bytes
913fbb4
5487b58
913fbb4
 
 
 
5487b58
913fbb4
5487b58
 
913fbb4
5487b58
 
913fbb4
5487b58
913fbb4
 
 
 
 
5487b58
 
 
 
913fbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5487b58
 
913fbb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import streamlit as st
import requests
import Levenshtein
from io import BytesIO
from audio_recorder_streamlit import audio_recorder

# Function to securely load the Hugging Face API token
@st.cache_resource
def load_hf_token():
    """Return the ``HF_API_KEY`` entry from Streamlit's secrets store.

    The function is wrapped in ``st.cache_resource``, so the lookup result
    is cached by Streamlit rather than re-read on every call.
    """
    token = st.secrets["HF_API_KEY"]
    return token

# Function to query the Hugging Face Inference API
def transcribe_audio_hf(audio_bytes):
    """
    Transcribes speech from an audio file using the Hugging Face Inference API.

    The raw audio bytes are POSTed to the hosted wav2vec2 Arabic model and the
    ``"text"`` field of the JSON reply is returned with surrounding whitespace
    removed.

    Args:
        audio_bytes (bytes): Audio data in bytes.
    Returns:
        str: The transcription of the speech in the audio file, or an empty
        string when the reply carries no usable ``"text"`` field (for example
        an error payload, a non-object JSON value, or a null text).
    Raises:
        requests.exceptions.Timeout: If the API does not answer within the
            timeout below.
        requests.exceptions.RequestException: On other transport failures.
        ValueError: If the response body is not valid JSON.
    """
    API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    # Seconds to wait for the API; without a timeout a stalled connection
    # would block the Streamlit script indefinitely.
    request_timeout = 60
    headers = {"Authorization": f"Bearer {load_hf_token()}"}
    response = requests.post(
        API_URL,
        headers=headers,
        data=audio_bytes,
        timeout=request_timeout,
    )
    payload = response.json()

    # The reply is not guaranteed to be a JSON object; only a dict supports
    # the "text" lookup, so anything else is treated as "no transcription".
    if not isinstance(payload, dict):
        return ""

    text = payload.get("text")
    # Guard against a missing or null "text" value before calling strip().
    return text.strip() if isinstance(text, str) else ""

def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    The edit distance is divided by the length of the longer transcription
    and subtracted from 1, so identical strings score 1.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.
    Returns:
        float: A normalized similarity score between 0 and 1, where 1 indicates
        identical transcriptions. Two empty transcriptions are identical and
        therefore score 1.0.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Both inputs are empty: they are identical, and normalizing by a
        # zero length would otherwise raise ZeroDivisionError.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score

def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
    """
    Transcribe both audio clips and score how closely the transcriptions match.

    The original clip is transcribed first, then the user's clip; the two
    texts are then compared with ``levenshtein_similarity``.

    Args:
        original_audio_bytes (bytes): Bytes of the original audio file.
        user_audio_bytes (bytes): Bytes of the user's audio file.
    Returns:
        tuple: ``(original transcription, user transcription, similarity score)``.
    """
    # The generator is consumed left to right, so the original clip is
    # transcribed before the user's clip.
    reference_text, attempt_text = (
        transcribe_audio_hf(clip)
        for clip in (original_audio_bytes, user_audio_bytes)
    )
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score

# Score above which the pronunciation is reported as likely correct.
SIMILARITY_THRESHOLD = 0.8  # Adjust the threshold as needed
# MIME type passed to st.audio when a clip carries no type of its own.
DEFAULT_AUDIO_FORMAT = "audio/wav"


def _read_upload(label):
    """Render a wav/mp3 file uploader and return ``(audio_bytes, mime_type)``.

    Returns ``(None, DEFAULT_AUDIO_FORMAT)`` while nothing has been uploaded.
    The MIME type comes from the uploaded file itself so that MP3 uploads are
    not played back labelled as WAV.
    """
    uploaded = st.file_uploader(label, type=["wav", "mp3"])
    if not uploaded:
        return None, DEFAULT_AUDIO_FORMAT
    return uploaded.read(), uploaded.type or DEFAULT_AUDIO_FORMAT


def _record_or_upload(recorder_key, upload_label, spinner_text):
    """Collect one clip from the recorder, falling back to a file uploader.

    The uploader is only rendered while the recorder has produced no audio.
    Any clip obtained is previewed with an audio player. Returns the clip
    bytes, or ``None`` when neither source provided audio.
    """
    audio_bytes = audio_recorder(key=recorder_key)
    audio_format = DEFAULT_AUDIO_FORMAT
    if not audio_bytes:
        audio_bytes, audio_format = _read_upload(upload_label)

    if audio_bytes:
        with st.spinner(spinner_text):
            st.audio(audio_bytes, format=audio_format)
    return audio_bytes


def _run_similarity_test(original_bytes, user_bytes):
    """Show the test button and, when clicked, the transcription results.

    Nothing is rendered until both clips are available. Shared by the
    "Upload" and "Record" input methods so both report results identically.
    """
    if not (original_bytes and user_bytes):
        return
    if not st.button("Perform Testing"):
        return

    with st.spinner("Performing transcription and similarity testing..."):
        transcription_original, transcription_user, similarity_score = (
            evaluate_audio_similarity(original_bytes, user_bytes)
        )

        # Display results
        st.markdown("---")
        st.subheader("Transcriptions and Similarity Score")
        st.write(f"**Original Transcription:** {transcription_original}")
        st.write(f"**User Transcription:** {transcription_user}")
        st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")

        if similarity_score > SIMILARITY_THRESHOLD:
            st.success("The pronunciation is likely correct based on transcription similarity.")
        else:
            st.error("The pronunciation may be incorrect based on transcription similarity.")


st.title("Audio Transcription and Similarity Checker")

# Choose between upload or record
st.sidebar.header("Input Method")
input_method = st.sidebar.selectbox("Choose Input Method", ["Upload", "Record"])

original_audio_bytes = None
user_audio_bytes = None

if input_method == "Upload":
    # Render both uploaders first so the players appear below them.
    original_audio_bytes, original_format = _read_upload("Upload Original Audio")
    user_audio_bytes, user_format = _read_upload("Upload User Audio")

    if original_audio_bytes is not None:
        st.audio(original_audio_bytes, format=original_format)
    if user_audio_bytes is not None:
        st.audio(user_audio_bytes, format=user_format)

    # Add a button to perform the test
    _run_similarity_test(original_audio_bytes, user_audio_bytes)

elif input_method == "Record":
    st.write("Record or Upload Original Audio")
    original_audio_bytes = _record_or_upload(
        "original_audio_recorder",
        "Or Upload Original Audio",
        "Processing original audio...",
    )

    st.write("Record or Upload User Audio")
    user_audio_bytes = _record_or_upload(
        "user_audio_recorder",
        "Or Upload User Audio",
        "Processing user audio...",
    )

    # Add a button to perform the test
    _run_similarity_test(original_audio_bytes, user_audio_bytes)