Spaces:
Building
Building
File size: 3,515 Bytes
2f36e0f 1bead67 2f36e0f af71291 2f36e0f af71291 2f36e0f 2424b7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import gradio as gr
import requests
import Levenshtein
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
def load_model():
    """Load the pretrained Arabic wav2vec2 CTC model and its feature processor.

    Returns:
        tuple: ``(Wav2Vec2Processor, Wav2Vec2ForCTC)`` fetched from the
        HuggingFace Hub (cached locally after the first download).
    """
    model_id = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    return (
        Wav2Vec2Processor.from_pretrained(model_id),
        Wav2Vec2ForCTC.from_pretrained(model_id),
    )

# Load once at import time so every request reuses the same weights.
processor, model = load_model()
def transcribe_audio_hf(audio_path):
    """
    Transcribes speech from an audio file using a pretrained Wav2Vec2 model.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        str: The transcription of the speech in the audio file.
    """
    # Wav2Vec2 expects 16 kHz input; librosa resamples while loading.
    waveform, rate = librosa.load(audio_path, sr=16000)
    features = processor(waveform, sampling_rate=rate, return_tensors="pt", padding=True)
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(features.input_values).logits
    # Greedy CTC decoding: pick the most likely token at each frame.
    token_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(token_ids)[0].strip()
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.

    Returns:
        float: A normalized similarity score between 0 and 1, where 1 indicates
        identical transcriptions.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Both transcriptions are empty: identical by definition. The original
        # code divided by max_len unconditionally and raised ZeroDivisionError here.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score
def evaluate_audio_similarity(original_audio, user_audio):
    """
    Compares the similarity between the transcription of an original audio file and a user's audio file.

    Args:
        original_audio (str): Path to the original audio file.
        user_audio (str): Path to the user's audio file.

    Returns:
        tuple: Transcriptions and Levenshtein similarity score.
    """
    # Transcribe both recordings with the same model, then score the text pair.
    reference_text, attempt_text = (
        transcribe_audio_hf(path) for path in (original_audio, user_audio)
    )
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score
def perform_testing(original_audio, user_audio):
    """
    Run the transcription-and-similarity pipeline on a pair of audio files.

    Args:
        original_audio (str | None): Path to the reference audio file, or None.
        user_audio (str | None): Path to the user's audio file, or None.

    Returns:
        tuple[str, str, str]: Markdown strings for the original transcription,
        the user transcription, and the formatted similarity score. If either
        input is missing, a prompt to upload both files is returned instead.
    """
    # Guard clause: the original implicitly returned None here, which left all
    # three Gradio output panels blank with no feedback for the user.
    if original_audio is None or user_audio is None:
        return "**Please upload both audio files.**", "", ""
    transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio, user_audio)
    return (
        f"**Original Transcription:** {transcription_original}",
        f"**User Transcription:** {transcription_user}",
        f"**Levenshtein Similarity Score:** {similarity_score:.2f}"
    )
# Gradio Interface
with gr.Blocks() as demo_app:
    gr.Markdown("# Audio Transcription and Similarity Checker")

    # Inputs: both widgets pass the callback a path to the uploaded file.
    reference_input = gr.Audio(label="Upload Original Audio", type="filepath")
    attempt_input = gr.Audio(label="Upload User Audio", type="filepath")
    run_button = gr.Button("Perform Testing")

    # Outputs: one markdown panel per value returned by perform_testing.
    reference_md = gr.Markdown()
    attempt_md = gr.Markdown()
    score_md = gr.Markdown()

    run_button.click(
        perform_testing,
        inputs=[reference_input, attempt_input],
        outputs=[reference_md, attempt_md, score_md],
    )

demo_app.launch(share=True)
|