File size: 3,505 Bytes
8835c0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import gradio as gr
import requests
import Levenshtein
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def load_model():
    """
    Load the pretrained Wav2Vec2 processor and CTC model for the Arabic checkpoint.
    Returns:
        tuple: ``(processor, model)`` built from the same checkpoint identifier.
    """
    checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    # Tuple elements are evaluated left to right: processor first, then the model.
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )

# Load the processor and model once at import time; transcribe_audio_hf reads these
# module-level names on every call instead of reloading the checkpoint.
processor, model = load_model()

def transcribe_audio_hf(audio_path):
    """
    Run the module-level Wav2Vec2 model on one audio file and return the decoded text.
    Args:
        audio_path (str): Path to the audio file.
    Returns:
        str: The decoded transcription with surrounding whitespace removed.
    """
    # Load the audio resampled to 16 kHz.
    waveform, rate = librosa.load(audio_path, sr=16000)
    features = processor(waveform, sampling_rate=rate, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = model(features.input_values).logits

    # Greedy decoding: pick the highest-scoring token at every frame.
    token_ids = torch.argmax(scores, dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0].strip()

def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.
    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.
    Returns:
        float: A normalized similarity score between 0 and 1, where 1 indicates
            identical transcriptions. Two empty transcriptions count as identical.
    """
    # Identical inputs are a perfect match by definition. Returning early also
    # covers the case of two empty strings, where the longest length is 0 and the
    # normalization below would divide by zero.
    if transcription1 == transcription2:
        return 1.0

    distance = Levenshtein.distance(transcription1, transcription2)
    # The inputs differ here, so at least one is non-empty and max_len >= 1.
    max_len = max(len(transcription1), len(transcription2))
    return 1 - distance / max_len  # Normalize to get similarity score

def evaluate_audio_similarity(original_audio, user_audio):
    """
    Transcribe two audio files and score how closely the transcriptions match.
    Args:
        original_audio (str): Path to the original audio file.
        user_audio (str): Path to the user's audio file.
    Returns:
        tuple: ``(original transcription, user transcription, similarity score)``.
    """
    # Transcribe in the same order as the arguments: original first, then user.
    reference_text, attempt_text = [
        transcribe_audio_hf(path) for path in (original_audio, user_audio)
    ]
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score

def perform_testing(original_audio, user_audio):
    """
    Gradio click handler: transcribe both recordings and format the results as Markdown.
    Args:
        original_audio (str | None): Path to the original audio file, or None if none was provided.
        user_audio (str | None): Path to the user's audio file, or None if none was provided.
    Returns:
        tuple: Three Markdown strings (original transcription, user transcription,
            similarity score). When either input is missing, the first string is a
            prompt asking for both files and the other two are empty.
    """
    if original_audio is None or user_audio is None:
        # The click handler is wired to three output components, so always return
        # three values; falling through would return None and leave them unfilled.
        return (
            "**Please provide both the original and the user audio before testing.**",
            "",
            "",
        )

    transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(
        original_audio, user_audio
    )
    return (
        f"**Original Transcription:** {transcription_original}",
        f"**User Transcription:** {transcription_user}",
        f"**Levenshtein Similarity Score:** {similarity_score:.2f}"
    )

# Gradio Interface
# Two audio inputs, a trigger button, and three Markdown areas that display the
# strings returned by perform_testing. Components are declared inside the Blocks
# context in the order shown.
with gr.Blocks() as app:
    gr.Markdown("# Audio Transcription and Similarity Checker")

    # type="filepath" makes each audio component pass a file path to the handler,
    # matching the path arguments documented on the transcription functions.
    original_audio_upload = gr.Audio(label="Upload Original Audio", type="filepath")
    user_audio_upload = gr.Audio(label="Upload User Audio", type="filepath")
    upload_button = gr.Button("Perform Testing")
    # Empty Markdown placeholders, filled in by the click handler below.
    output_original_transcription = gr.Markdown()
    output_user_transcription = gr.Markdown()
    output_similarity_score = gr.Markdown()

    # Inputs map positionally to perform_testing's parameters; its three returned
    # strings map positionally to the three outputs.
    upload_button.click(
        perform_testing,
        inputs=[original_audio_upload, user_audio_upload],
        outputs=[output_original_transcription, output_user_transcription, output_similarity_score]
    )

# Start the Gradio app; this runs whenever the module is executed or imported.
app.launch()