Spaces:
Sleeping
Sleeping
File size: 6,374 Bytes
bc94d2b 5487b58 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b be99e76 bc94d2b be99e76 bc94d2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import streamlit as st
import requests
import Levenshtein
from io import BytesIO
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from audio_recorder_streamlit import audio_recorder
@st.cache_resource
def load_model():
    """
    Load the pretrained Arabic Wav2Vec2 processor and CTC model.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the loaded objects instead of re-creating them on each script run.

    Returns:
        tuple: ``(processor, model)`` where ``processor`` is a
        ``Wav2Vec2Processor`` and ``model`` is a ``Wav2Vec2ForCTC``, both
        loaded from the same checkpoint.
    """
    checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    # The processor is loaded first, then the model, from the same checkpoint.
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )
# Module-level processor/model pair; transcribe_audio_hf reads both as globals.
processor, model = load_model()
def transcribe_audio_hf(audio_bytes):
    """
    Run speech recognition on in-memory audio with the module-level Wav2Vec2 model.

    Args:
        audio_bytes (bytes): Encoded audio data (the contents of an audio file).

    Returns:
        str: The decoded transcription with surrounding whitespace removed.
    """
    # librosa decodes the bytes and resamples to the 16 kHz rate requested here.
    waveform, rate = librosa.load(BytesIO(audio_bytes), sr=16000)

    features = processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: gradients are not needed.
    with torch.no_grad():
        frame_logits = model(features.input_values).logits

    # Greedy decoding: take the highest-scoring token id for each frame.
    best_ids = torch.argmax(frame_logits, dim=-1)
    decoded = processor.batch_decode(best_ids)
    return decoded[0].strip()
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    The edit distance is normalized by the length of the longer string, so
    the result is 1 for identical strings and 0 when every character differs.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.

    Returns:
        float: A normalized similarity score between 0 and 1, where 1
        indicates identical transcriptions. Two empty transcriptions are
        treated as identical and score 1.0.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Both strings are empty (e.g. the model heard only silence); without
        # this guard the normalization below would divide by zero.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score
def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
    """
    Transcribe a reference clip and a user clip, then score how alike the texts are.

    Args:
        original_audio_bytes (bytes): Bytes of the original audio file.
        user_audio_bytes (bytes): Bytes of the user's audio file.

    Returns:
        tuple: ``(original transcription, user transcription, similarity)``
        where the similarity is the value returned by ``levenshtein_similarity``.
    """
    # The original clip is transcribed first, then the user's clip.
    reference_text, attempt_text = (
        transcribe_audio_hf(clip)
        for clip in (original_audio_bytes, user_audio_bytes)
    )
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score
# Scores strictly above this value are reported as a likely-correct pronunciation.
SIMILARITY_THRESHOLD = 0.8  # Adjust the threshold as needed

# Fallback MIME type; also used for recorder output, as in the original player calls.
DEFAULT_AUDIO_MIME = "audio/wav"


def _read_upload(label):
    """Show a wav/mp3 uploader and return ``(audio_bytes, mime_type)``.

    Returns ``(None, None)`` while no file has been uploaded.
    """
    uploaded = st.file_uploader(label, type=["wav", "mp3"])
    if not uploaded:
        return None, None
    # getvalue() returns the whole buffer regardless of the current stream
    # position, so the bytes do not depend on whether the file was read before.
    # The upload's own MIME type keeps mp3 files from being labelled as wav.
    return uploaded.getvalue(), uploaded.type or DEFAULT_AUDIO_MIME


def _record_or_upload(heading, recorder_key, upload_label, spinner_text):
    """Collect one clip from the recorder, falling back to a file uploader.

    The uploader is only shown while the recorder has produced no audio.
    Returns the clip's bytes, or ``None`` if nothing has been provided yet.
    """
    st.write(heading)
    audio_bytes = audio_recorder(key=recorder_key, pause_threshold=30, icon_size='4x')
    mime_type = DEFAULT_AUDIO_MIME
    if not audio_bytes:
        audio_bytes, mime_type = _read_upload(upload_label)
    if audio_bytes:
        with st.spinner(spinner_text):
            st.audio(audio_bytes, format=mime_type)
    return audio_bytes


def _run_comparison(original_audio_bytes, user_audio_bytes):
    """Offer the test button once both clips exist and render the outcome."""
    if not (original_audio_bytes and user_audio_bytes):
        return
    # Add a button to perform the test
    if not st.button("Perform Testing"):
        return

    with st.spinner("Performing transcription and similarity testing..."):
        transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(
            original_audio_bytes, user_audio_bytes
        )

    # Display results
    st.markdown("---")
    st.subheader("Transcriptions and Similarity Score")
    st.write(f"**Original Transcription:** {transcription_original}")
    st.write(f"**User Transcription:** {transcription_user}")
    st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")

    if similarity_score > SIMILARITY_THRESHOLD:
        st.success("The pronunciation is likely correct based on transcription similarity.")
    else:
        st.error("The pronunciation may be incorrect based on transcription similarity.")


st.title("Audio Transcription and Similarity Checker")

# Choose between upload or record
st.sidebar.header("Input Method")
input_method = st.sidebar.selectbox("Choose Input Method", ["Upload", "Record"])

original_audio_bytes = None
user_audio_bytes = None

if input_method == "Upload":
    # Both uploaders are created before any player so the two upload widgets
    # stay adjacent, matching the original layout.
    original_audio_bytes, original_mime = _read_upload("Upload Original Audio")
    user_audio_bytes, user_mime = _read_upload("Upload User Audio")

    if original_audio_bytes:
        st.audio(original_audio_bytes, format=original_mime)
    if user_audio_bytes:
        st.audio(user_audio_bytes, format=user_mime)

elif input_method == "Record":
    original_audio_bytes = _record_or_upload(
        "Record or Upload Original Audio",
        "original_audio_recorder",
        "Or Upload Original Audio",
        "Processing original audio...",
    )
    user_audio_bytes = _record_or_upload(
        "Record or Upload User Audio",
        "user_audio_recorder",
        "Or Upload User Audio",
        "Processing user audio...",
    )

# Shared by both input methods; does nothing until both clips are available.
_run_comparison(original_audio_bytes, user_audio_bytes)
|