muzammil-eds committed on
Commit
2f36e0f
1 Parent(s): 5850693

Files added

Browse files
Files changed (6) hide show
  1. README.md +0 -11
  2. app.py +80 -78
  3. requirements.txt +1 -2
  4. static/script.js +0 -103
  5. static/style.css +0 -173
  6. templates/index.html +0 -52
README.md DELETED
@@ -1,11 +0,0 @@
1
- ---
2
- title: Arabic ASR Flask
3
- emoji: 🔥
4
- colorFrom: green
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,83 +1,85 @@
1
- import os
2
- from flask import Flask, request, jsonify, render_template
3
- from transformers import pipeline
4
- from flask_cors import CORS
5
- from pydub import AudioSegment
6
- from io import BytesIO
7
  import Levenshtein
8
-
9
- # Set the FFmpeg paths explicitly
10
- AudioSegment.converter = "/usr/bin/ffmpeg"
11
- AudioSegment.ffprobe = "/usr/bin/ffprobe"
12
-
13
- # Set Hugging Face cache directory to avoid permission issues
14
- os.environ['HF_HOME'] = '/tmp/.cache'
15
-
16
- app = Flask(__name__)
17
- CORS(app)
18
-
19
- # Use Hugging Face ASR pipeline for automatic speech recognition
20
- asr_pipeline = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
21
-
22
-
23
- def convert_to_wav(audio_bytes):
24
- """Convert audio bytes to wav format using pydub"""
25
- try:
26
- audio = AudioSegment.from_file(BytesIO(audio_bytes)) # Auto-detect format
27
- wav_io = BytesIO()
28
- audio.export(wav_io, format="wav")
29
- wav_io.seek(0)
30
- return wav_io
31
- except Exception as e:
32
- print(f"Error converting audio: {e}")
33
- return None
34
-
35
-
36
- def transcribe_audio(audio_bytes):
37
- """Transcribes the audio using the Hugging Face ASR pipeline."""
38
- wav_io = convert_to_wav(audio_bytes)
39
- if wav_io is None:
40
- raise Exception("Could not convert audio to WAV format")
41
-
42
- # Read the audio file into bytes for the ASR pipeline
43
- wav_io.seek(0)
44
- transcription = asr_pipeline(wav_io)["text"]
45
- return transcription.strip()
46
-
47
 
48
  def levenshtein_similarity(transcription1, transcription2):
 
 
 
 
 
 
 
 
49
  distance = Levenshtein.distance(transcription1, transcription2)
50
  max_len = max(len(transcription1), len(transcription2))
51
- return 1 - distance / max_len
52
-
53
-
54
- @app.route('/')
55
- def index():
56
- return render_template('index.html')
57
-
58
-
59
- @app.route('/transcribe', methods=['POST'])
60
- def transcribe():
61
- try:
62
- original_audio = request.files['original_audio']
63
- user_audio = request.files['user_audio']
64
-
65
- original_audio_bytes = original_audio.read()
66
- user_audio_bytes = user_audio.read()
67
-
68
- transcription_original = transcribe_audio(original_audio_bytes)
69
- transcription_user = transcribe_audio(user_audio_bytes)
70
- except Exception as e:
71
- return jsonify({"error": str(e)}), 500
72
-
73
- similarity_score = levenshtein_similarity(transcription_original, transcription_user)
74
-
75
- return jsonify({
76
- "transcription_original": transcription_original,
77
- "transcription_user": transcription_user,
78
- "similarity_score": similarity_score
79
- })
80
-
81
-
82
- if __name__ == '__main__':
83
- app.run(debug=False, port=7860, host='0.0.0.0')
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
 
 
 
 
3
  import Levenshtein
4
+ import librosa
5
+ import torch
6
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
7
+
8
+ def load_model():
9
+ MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
10
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
11
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
12
+ return processor, model
13
+
14
+ processor, model = load_model()
15
+
16
+ def transcribe_audio_hf(audio_path):
17
+ """
18
+ Transcribes speech from an audio file using a pretrained Wav2Vec2 model.
19
+ Args:
20
+ audio_path (str): Path to the audio file.
21
+ Returns:
22
+ str: The transcription of the speech in the audio file.
23
+ """
24
+ speech_array, sampling_rate = librosa.load(audio_path, sr=16000)
25
+ input_values = processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True).input_values
26
+ with torch.no_grad():
27
+ logits = model(input_values).logits
28
+ predicted_ids = torch.argmax(logits, dim=-1)
29
+ transcription = processor.batch_decode(predicted_ids)[0].strip()
30
+ return transcription
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.

    Returns:
        float: A normalized similarity score between 0 and 1, where 1
        indicates identical transcriptions.
    """
    max_len = max(len(transcription1), len(transcription2))
    # Two empty strings are identical; this also guards the division
    # below against ZeroDivisionError when both inputs are empty.
    if max_len == 0:
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score
44
+
45
def evaluate_audio_similarity(original_audio, user_audio):
    """Transcribe two audio files and score how similar they are.

    Args:
        original_audio (str): Path to the reference audio file.
        user_audio (str): Path to the user's recorded audio file.

    Returns:
        tuple: (reference transcription, user transcription,
        Levenshtein similarity score in [0, 1]).
    """
    reference_text = transcribe_audio_hf(original_audio)
    attempt_text = transcribe_audio_hf(user_audio)
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score
58
+
59
def perform_testing(original_audio, user_audio):
    """Gradio click handler: transcribe both uploads and report similarity.

    Args:
        original_audio (str | None): Filepath of the reference upload.
        user_audio (str | None): Filepath of the user's upload.

    Returns:
        tuple[str, str, str]: Markdown strings for the three output panels.
    """
    # Bug fix: the original fell through and returned None when either
    # file was missing, leaving the Gradio outputs blank with no hint.
    if original_audio is None or user_audio is None:
        message = "**Please upload both audio files before testing.**"
        return message, "", ""
    transcription_original, transcription_user, similarity_score = (
        evaluate_audio_similarity(original_audio, user_audio)
    )
    return (
        f"**Original Transcription:** {transcription_original}",
        f"**User Transcription:** {transcription_user}",
        f"**Levenshtein Similarity Score:** {similarity_score:.2f}"
    )
67
+
68
# Gradio Interface: two file uploads, a trigger button, and three
# Markdown panels that perform_testing fills in on click.
with gr.Blocks() as app:
    gr.Markdown("# Audio Transcription and Similarity Checker")

    # Inputs: reference clip and the user's attempt (passed as filepaths).
    original_audio_upload = gr.Audio(label="Upload Original Audio", type="filepath")
    user_audio_upload = gr.Audio(label="Upload User Audio", type="filepath")
    upload_button = gr.Button("Perform Testing")
    # Outputs, in the same order as perform_testing's return tuple.
    output_original_transcription = gr.Markdown()
    output_user_transcription = gr.Markdown()
    output_similarity_score = gr.Markdown()

    upload_button.click(
        perform_testing,
        inputs=[original_audio_upload, user_audio_upload],
        outputs=[output_original_transcription, output_user_transcription, output_similarity_score]
    )

# Start the web server (binds Gradio's default host/port).
app.launch()
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
- Flask
2
- Flask-Cors
3
  librosa
4
  torch
5
  transformers
@@ -7,4 +5,5 @@ pydub
7
  Levenshtein
8
  Werkzeug
9
  gunicorn
 
10
 
 
 
 
1
  librosa
2
  torch
3
  transformers
 
5
  Levenshtein
6
  Werkzeug
7
  gunicorn
8
+ gradio
9
 
static/script.js DELETED
@@ -1,103 +0,0 @@
1
- let mediaRecorder;
2
- let audioChunks = [];
3
- let originalAudioBlob = null;
4
- let userAudioBlob = null;
5
-
6
- document.getElementById('originalAudio').addEventListener('change', function (e) {
7
- const file = e.target.files[0];
8
- const audioPlayer = document.getElementById('originalAudioPlayer');
9
- const fileURL = URL.createObjectURL(file);
10
- audioPlayer.src = fileURL;
11
- audioPlayer.play();
12
- originalAudioBlob = file;
13
- });
14
-
15
- document.getElementById('userAudio').addEventListener('change', function (e) {
16
- const file = e.target.files[0];
17
- const audioPlayer = document.getElementById('userAudioPlayer');
18
- const fileURL = URL.createObjectURL(file);
19
- audioPlayer.src = fileURL;
20
- audioPlayer.play();
21
- userAudioBlob = file;
22
- });
23
-
24
- function startRecording(type) {
25
- audioChunks = [];
26
-
27
- navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
28
- mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' }); // Default format is webm
29
- mediaRecorder.start();
30
-
31
- mediaRecorder.addEventListener("dataavailable", event => {
32
- audioChunks.push(event.data);
33
- });
34
-
35
- mediaRecorder.addEventListener("stop", () => {
36
- const audioBlob = new Blob(audioChunks, { type: 'audio/wav' }); // Save as .wav
37
- const audioURL = URL.createObjectURL(audioBlob);
38
-
39
- if (type === 'original') {
40
- document.getElementById('originalAudioPlayer').src = audioURL;
41
- originalAudioBlob = audioBlob;
42
- } else if (type === 'user') {
43
- document.getElementById('userAudioPlayer').src = audioURL;
44
- userAudioBlob = audioBlob;
45
- }
46
- });
47
- });
48
-
49
- // Add recording animation and disable the start button
50
- if (type === 'original') {
51
- document.getElementById('recordOriginalAudio').classList.add('recording-active');
52
- document.getElementById('recordOriginalAudio').disabled = true;
53
- document.getElementById('stopOriginalAudio').disabled = false;
54
- } else {
55
- document.getElementById('recordUserAudio').classList.add('recording-active');
56
- document.getElementById('recordUserAudio').disabled = true;
57
- document.getElementById('stopUserAudio').disabled = false;
58
- }
59
- }
60
-
61
- function stopRecording(type) {
62
- mediaRecorder.stop();
63
-
64
- // Remove recording animation and enable the start button
65
- if (type === 'original') {
66
- document.getElementById('recordOriginalAudio').classList.remove('recording-active');
67
- document.getElementById('recordOriginalAudio').disabled = false;
68
- document.getElementById('stopOriginalAudio').disabled = true;
69
- } else {
70
- document.getElementById('recordUserAudio').classList.remove('recording-active');
71
- document.getElementById('recordUserAudio').disabled = false;
72
- document.getElementById('stopUserAudio').disabled = true;
73
- }
74
- }
75
-
76
- document.getElementById('performTesting').addEventListener('click', function () {
77
- if (originalAudioBlob && userAudioBlob) {
78
- const formData = new FormData();
79
- formData.append('original_audio', originalAudioBlob, 'original_audio.wav');
80
- formData.append('user_audio', userAudioBlob, 'user_audio.wav');
81
-
82
- // Show loader
83
- document.getElementById('loader').style.display = 'block';
84
- document.getElementById('results').style.display = 'none';
85
-
86
- fetch('/transcribe', {
87
- method: 'POST',
88
- body: formData
89
- })
90
- .then(response => response.json())
91
- .then(data => {
92
- // Hide loader and show results
93
- document.getElementById('loader').style.display = 'none';
94
- document.getElementById('results').style.display = 'block';
95
-
96
- document.getElementById('transcriptionOriginal').innerText = `Original Transcription: ${data.transcription_original}`;
97
- document.getElementById('transcriptionUser').innerText = `User Transcription: ${data.transcription_user}`;
98
- document.getElementById('similarityScore').innerText = `Similarity Score: ${data.similarity_score.toFixed(2)}`;
99
- });
100
- } else {
101
- alert('Please provide both original and user audio files.');
102
- }
103
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
static/style.css DELETED
@@ -1,173 +0,0 @@
1
- /* Reset some default browser styles */
2
- * {
3
- margin: 0;
4
- padding: 0;
5
- box-sizing: border-box;
6
- }
7
-
8
- /* Make the body take up the full viewport height */
9
- body {
10
- font-family: 'Arial', sans-serif;
11
- background-color: #2c2f33;
12
- color: white;
13
- height: 100vh;
14
- display: flex;
15
- flex-direction: column;
16
- justify-content: center;
17
- align-items: center;
18
- }
19
-
20
- /* Center container and add padding for mobile devices */
21
- .container {
22
- width: 90%;
23
- max-width: 1200px;
24
- margin: auto;
25
- text-align: center;
26
- padding: 20px;
27
- background-color: #1c1e22;
28
- border-radius: 12px;
29
- box-shadow: 0px 4px 20px rgba(0, 0, 0, 0.4);
30
- transition: all 0.3s ease;
31
- }
32
-
33
- /* Add hover effect for container */
34
- .container:hover {
35
- box-shadow: 0px 6px 24px rgba(0, 0, 0, 0.6);
36
- }
37
-
38
- /* Style the headings */
39
- h1 {
40
- margin: 20px 0;
41
- font-size: 2.5rem;
42
- color: #7289da;
43
- }
44
-
45
- /* Make the audio-panel responsive using flexbox */
46
- .audio-panel {
47
- display: flex;
48
- flex-wrap: wrap;
49
- justify-content: space-around;
50
- margin: 20px 0;
51
- }
52
-
53
- .audio-upload {
54
- width: 45%;
55
- min-width: 300px;
56
- padding: 10px;
57
- margin-bottom: 20px;
58
- background-color: #40444b;
59
- border-radius: 10px;
60
- transition: transform 0.2s;
61
- }
62
-
63
- .audio-upload:hover {
64
- transform: translateY(-5px);
65
- }
66
-
67
- h2 {
68
- font-size: 1.25rem;
69
- margin-bottom: 10px;
70
- color: #99aab5;
71
- }
72
-
73
- /* Style for file input */
74
- input[type="file"] {
75
- display: block;
76
- margin: 10px 0;
77
- background-color: #7289da;
78
- color: white;
79
- padding: 10px;
80
- border-radius: 5px;
81
- cursor: pointer;
82
- transition: background-color 0.3s;
83
- }
84
-
85
- input[type="file"]:hover {
86
- background-color: #5b6bb0;
87
- }
88
-
89
- /* Style the audio players */
90
- audio {
91
- width: 100%;
92
- margin: 10px 0;
93
- }
94
-
95
- /* Style buttons with consistent design */
96
- button {
97
- padding: 12px 25px;
98
- font-size: 1rem;
99
- background-color: #7289da;
100
- color: white;
101
- border: none;
102
- border-radius: 5px;
103
- cursor: pointer;
104
- transition: background-color 0.3s, transform 0.2s;
105
- margin: 10px 5px;
106
- }
107
-
108
- button:hover {
109
- background-color: #5b6bb0;
110
- transform: translateY(-3px);
111
- }
112
-
113
- /* Loader and result display */
114
- #loader {
115
- font-size: 1.25rem;
116
- color: #7289da;
117
- margin: 20px 0;
118
- }
119
-
120
- .results {
121
- margin-top: 20px;
122
- background-color: #40444b;
123
- padding: 20px;
124
- border-radius: 10px;
125
- color: #99aab5;
126
- text-align: left;
127
- }
128
-
129
- .results h3 {
130
- margin-bottom: 10px;
131
- color: #7289da;
132
- }
133
-
134
- #results p {
135
- font-size: 1.1rem;
136
- margin: 5px 0;
137
- }
138
-
139
- /* Media query to ensure responsiveness on smaller screens */
140
- @media (max-width: 768px) {
141
- .audio-upload {
142
- width: 100%;
143
- margin-bottom: 20px;
144
- }
145
-
146
- h1 {
147
- font-size: 2rem;
148
- }
149
-
150
- button {
151
- width: 100%;
152
- padding: 15px;
153
- }
154
- }
155
-
156
- /* Add recording animation style */
157
- .recording-active {
158
- animation: pulse 1s infinite;
159
- background-color: red;
160
- color: white;
161
- }
162
-
163
- @keyframes pulse {
164
- 0% {
165
- box-shadow: 0 0 0 0 rgba(255, 0, 0, 0.7);
166
- }
167
- 70% {
168
- box-shadow: 0 0 0 10px rgba(255, 0, 0, 0);
169
- }
170
- 100% {
171
- box-shadow: 0 0 0 0 rgba(255, 0, 0, 0);
172
- }
173
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
templates/index.html DELETED
@@ -1,52 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Audio Transcription and Similarity Checker</title>
7
- <link rel="stylesheet" href="/static/style.css">
8
- </head>
9
- <body>
10
- <div class="container">
11
- <h1>Audio Transcription and Similarity Checker</h1>
12
-
13
- <!-- Audio upload/record panel -->
14
- <div class="audio-panel">
15
- <div class="audio-upload">
16
- <h2>Upload or Record Original Audio</h2>
17
- <input type="file" id="originalAudio" accept="audio/*">
18
- <audio id="originalAudioPlayer" controls></audio>
19
- <br>
20
- <button id="recordOriginalAudio" onclick="startRecording('original')">Start Recording</button>
21
- <button id="stopOriginalAudio" onclick="stopRecording('original')" disabled>Stop Recording</button>
22
- </div>
23
- <div class="audio-upload">
24
- <h2>Upload or Record User Audio</h2>
25
- <input type="file" id="userAudio" accept="audio/*">
26
- <audio id="userAudioPlayer" controls></audio>
27
- <br>
28
- <button id="recordUserAudio" onclick="startRecording('user')">Start Recording</button>
29
- <button id="stopUserAudio" onclick="stopRecording('user')" disabled>Stop Recording</button>
30
- </div>
31
- </div>
32
-
33
- <!-- Button to perform similarity check -->
34
- <button id="performTesting">Perform Testing</button>
35
-
36
- <!-- Loader while processing -->
37
- <div id="loader" style="display: none;">
38
- <p>Processing... Please wait</p>
39
- </div>
40
-
41
- <!-- Results section -->
42
- <div id="results" class="results" style="display: none;">
43
- <h3>Results</h3>
44
- <p id="transcriptionOriginal"></p>
45
- <p id="transcriptionUser"></p>
46
- <p id="similarityScore"></p>
47
- </div>
48
- </div>
49
-
50
- <script src="/static/script.js"></script>
51
- </body>
52
- </html>