File size: 3,652 Bytes
8835c0c
 
869504a
8835c0c
869504a
 
8835c0c
869504a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8835c0c
 
 
869504a
8835c0c
 
 
869504a
8835c0c
869504a
 
8835c0c
 
 
 
 
 
 
 
 
 
 
869504a
8835c0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
869504a
8835c0c
869504a
 
 
 
 
 
 
8835c0c
869504a
 
 
 
 
8835c0c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
import requests
import os

# Endpoint of the Hugging Face Inference API model used for transcription.
API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"

# The API token is read from the HF_API_TOKEN environment variable
# (on Hugging Face Spaces this is populated from the Space's secrets).
hf_api_token = os.getenv("HF_API_TOKEN")

# Only send an Authorization header when a token is actually configured;
# otherwise the header would carry the literal string "Bearer None".
headers = {"Authorization": f"Bearer {hf_api_token}"} if hf_api_token else {}

def query(filename, timeout=60):
    """
    Queries the Hugging Face API to transcribe audio from a file.

    The raw bytes of the file are POSTed to ``API_URL`` using the module-level
    ``headers``, and the JSON-decoded response body is returned.

    Args:
        filename (str): Path to the audio file.
        timeout (float or tuple): Seconds to wait for the API before giving up,
            passed straight to ``requests.post``. Defaults to 60.
    Returns:
        The JSON-decoded response body. The caller in this file reads the
        transcription from its ``text`` key.
    Raises:
        OSError: If the audio file cannot be opened or read.
        requests.exceptions.RequestException: On connection failures or when
            the timeout elapses.
        ValueError: If the response body is not valid JSON.
    """
    with open(filename, "rb") as f:
        data = f.read()
    # requests waits indefinitely unless a timeout is given, which would leave
    # the calling handler hanging if the API stalls.
    response = requests.post(API_URL, headers=headers, data=data, timeout=timeout)
    return response.json()

def transcribe_audio_hf(audio_path):
    """
    Transcribes the audio using the Hugging Face Inference API.

    Args:
        audio_path (str): Path to the audio file.
    Returns:
        str: The transcription from the API with surrounding whitespace
        removed, or an empty string when the response carries no text.
    Raises:
        RuntimeError: If the API response is not a JSON object or reports an
            error (an ``error`` key in the payload), so that a failed request
            is not mistaken for an empty transcription.
    """
    result = query(audio_path)
    if not isinstance(result, dict):
        raise RuntimeError(f"Unexpected response from the transcription API: {result!r}")
    if "error" in result:
        raise RuntimeError(f"Transcription API error: {result['error']}")
    # `text` may be absent or null; both are treated as an empty transcription.
    transcription = (result.get('text') or '').strip()
    return transcription

def _edit_distance(first, second):
    """Return the Levenshtein distance between two strings (pure-Python fallback)."""
    if len(first) < len(second):
        # Keep the rolling row as short as possible.
        first, second = second, first
    previous_row = list(range(len(second) + 1))
    for row_index, char_first in enumerate(first, start=1):
        current_row = [row_index]
        for col_index, char_second in enumerate(second, start=1):
            current_row.append(min(
                previous_row[col_index] + 1,                               # deletion
                current_row[col_index - 1] + 1,                            # insertion
                previous_row[col_index - 1] + (char_first != char_second),  # substitution
            ))
        previous_row = current_row
    return previous_row[-1]


def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    The edit distance is divided by the length of the longer transcription and
    subtracted from 1. Two empty transcriptions are considered identical.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.
    Returns:
        float: A normalized similarity score between 0 and 1, where 1 indicates identical transcriptions.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Both inputs are empty: nothing differs, and dividing by max_len
        # would raise ZeroDivisionError.
        return 1.0
    try:
        import Levenshtein
    except ImportError:
        # Optional dependency missing; fall back to the pure-Python version.
        distance = _edit_distance(transcription1, transcription2)
    else:
        distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score

def evaluate_audio_similarity(original_audio, user_audio):
    """
    Transcribe a reference recording and a user's recording, then score how
    closely the two transcriptions match.

    Args:
        original_audio (str): Path to the original audio file.
        user_audio (str): Path to the user's audio file.
    Returns:
        tuple: (original transcription, user transcription, Levenshtein
        similarity score), in that order.
    """
    # Transcribe the original first, then the user's recording.
    reference_text, attempt_text = [
        transcribe_audio_hf(path) for path in (original_audio, user_audio)
    ]
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score

def perform_testing(original_audio, user_audio):
    """
    Gradio click handler: transcribe both recordings and format the results.

    Args:
        original_audio (str or None): Path to the original audio file, or
            None when nothing was provided.
        user_audio (str or None): Path to the user's audio file, or None when
            nothing was provided.
    Returns:
        tuple: Three Markdown strings (original transcription, user
        transcription, similarity score). When either input is missing, the
        first string asks for both files and the other two are empty.
    """
    if original_audio is None or user_audio is None:
        # Always hand back three values: the click handler is wired to three
        # output components, and an implicit None would not fill them.
        return (
            "**Please provide both the original audio and the user audio.**",
            "",
            "",
        )

    transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio, user_audio)
    return (
        f"**Original Transcription:** {transcription_original}",
        f"**User Transcription:** {transcription_user}",
        f"**Levenshtein Similarity Score:** {similarity_score:.2f}"
    )

# Gradio Interface
with gr.Blocks() as app:
    gr.Markdown("# Audio Transcription and Similarity Checker using Hugging Face Inference API")

    with gr.Tab("Upload"):
        # Inputs: both audio widgets hand the handler a file path.
        original_audio_upload = gr.Audio(type="filepath", label="Upload Original Audio")
        user_audio_upload = gr.Audio(type="filepath", label="Upload User Audio")
        upload_button = gr.Button("Perform Testing")

        # One Markdown panel per value returned by perform_testing, in order.
        output_original_transcription = gr.Markdown()
        output_user_transcription = gr.Markdown()
        output_similarity_score = gr.Markdown()

        upload_button.click(
            fn=perform_testing,
            inputs=[original_audio_upload, user_audio_upload],
            outputs=[
                output_original_transcription,
                output_user_transcription,
                output_similarity_score,
            ],
        )

# Start the web server for the interface.
app.launch()