Spaces:
Building
Building
muzammil-eds
committed on
Commit
•
af71291
1
Parent(s):
473a050
Files added
Browse files- Dockerfile +18 -0
- app.py +81 -0
- requirements.txt +7 -0
- static/script.js +103 -0
- static/style.css +173 -0
- templates/index.html +52 -0
Dockerfile
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use an official Python runtime as the base image
FROM python:3.9-slim
LABEL authors="muzammil"

# System libraries needed at runtime:
#  - ffmpeg: pydub shells out to it to decode non-WAV input (e.g. the WebM
#    produced by the browser's MediaRecorder)
#  - libsndfile1: native library used by librosa's audio loading backend
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory inside the container
WORKDIR /app

# Install Python dependencies first so this layer is reused when only the
# application code changes
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application into the container at /app
COPY . /app

# Expose the port Flask will run on
EXPOSE 7860

# Serve through the Flask CLI so the server listens on all interfaces on the
# exposed port; running "python app.py" would start app.run() with its
# defaults (127.0.0.1:5000), which is unreachable from outside the container.
ENV FLASK_APP=app.py
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
app.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, request, jsonify, render_template
|
2 |
+
import librosa
|
3 |
+
import torch
|
4 |
+
import Levenshtein
|
5 |
+
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
6 |
+
from io import BytesIO
|
7 |
+
from flask_cors import CORS
|
8 |
+
from pydub import AudioSegment # NEW
|
9 |
+
|
10 |
+
# Flask application object; CORS(app) is called with no options, so the
# flask_cors defaults apply to every route.
app = Flask(__name__)
CORS(app)

# Hugging Face model id of the Wav2Vec2 checkpoint. The processor and model
# are loaded once, at import time, and reused by transcribe_audio_hf().
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
|
16 |
+
|
17 |
+
|
18 |
+
def convert_to_wav(audio_bytes):
    """Re-encode raw audio bytes as WAV using pydub.

    Returns a ``BytesIO`` positioned at offset 0 that holds the WAV data, or
    ``None`` when reading or exporting the audio fails (the error is printed).
    """
    try:
        # No format argument is given, so pydub detects it from the content.
        segment = AudioSegment.from_file(BytesIO(audio_bytes))
        wav_buffer = BytesIO()
        segment.export(wav_buffer, format="wav")
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None

    # Rewind so the caller reads the WAV data from the beginning.
    wav_buffer.seek(0)
    return wav_buffer
|
29 |
+
|
30 |
+
|
31 |
+
def transcribe_audio_hf(audio_bytes):
    """Transcribe audio bytes with the module-level Wav2Vec2 model.

    The bytes are converted to WAV, loaded with a 16000 Hz target sample
    rate, run through the model, and decoded by taking the highest-scoring
    token id at each position.

    Raises:
        Exception: if the audio could not be converted to WAV.
    """
    wav_io = convert_to_wav(audio_bytes)
    if wav_io is None:
        raise Exception("Could not convert audio to WAV format")

    waveform, sample_rate = librosa.load(wav_io, sr=16000)
    features = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(features.input_values).logits
        token_ids = logits.argmax(dim=-1)

    decoded = processor.batch_decode(token_ids)
    return decoded[0].strip()
|
44 |
+
|
45 |
+
|
46 |
+
def levenshtein_similarity(transcription1, transcription2):
    """Return a normalized Levenshtein similarity between two strings.

    The score is ``1 - distance / max(len(a), len(b))``: 1.0 for identical
    strings, lower as more edits are needed.

    Identical inputs (including two empty strings) return 1.0 directly.
    Without this guard, two empty transcriptions would divide by zero.
    """
    if transcription1 == transcription2:
        return 1.0

    # The strings differ, so at least one is non-empty and max_len >= 1.
    distance = Levenshtein.distance(transcription1, transcription2)
    max_len = max(len(transcription1), len(transcription2))
    return 1 - distance / max_len
|
50 |
+
|
51 |
+
|
52 |
+
@app.route('/')
def index():
    """Serve the single-page UI rendered from templates/index.html."""
    return render_template('index.html')
|
55 |
+
|
56 |
+
|
57 |
+
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe two uploaded audio files and score their similarity.

    Expects multipart form fields ``original_audio`` and ``user_audio``.

    Returns JSON with ``transcription_original``, ``transcription_user`` and
    ``similarity_score`` on success. On failure returns JSON ``{"error": ...}``
    with status 400 (a required file is missing) or 500 (processing failed),
    so the client can always parse the response body as JSON.
    """
    original_audio = request.files.get('original_audio')
    user_audio = request.files.get('user_audio')
    # Compare against None explicitly: an uploaded file object may be falsy
    # (e.g. when it has no filename) even though it was sent.
    if original_audio is None or user_audio is None:
        return jsonify({
            "error": "Both 'original_audio' and 'user_audio' files are required."
        }), 400

    original_audio_bytes = original_audio.read()
    user_audio_bytes = user_audio.read()

    # Request boundary: any processing failure, including the similarity
    # computation, is reported as a JSON error rather than an HTML 500 page.
    try:
        transcription_original = transcribe_audio_hf(original_audio_bytes)
        transcription_user = transcribe_audio_hf(user_audio_bytes)
        similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    return jsonify({
        "transcription_original": transcription_original,
        "transcription_user": transcription_user,
        "similarity_score": similarity_score
    })
|
78 |
+
|
79 |
+
|
80 |
+
if __name__ == '__main__':
    import os

    # Listen on all interfaces on the port the Dockerfile exposes (7860);
    # app.run()'s defaults (127.0.0.1:5000) are unreachable from outside the
    # container. The interactive debugger is off unless FLASK_DEBUG=1 is set,
    # because it allows arbitrary code execution from the browser.
    app.run(
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 7860)),
        debug=os.environ.get("FLASK_DEBUG") == "1",
    )
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Flask==2.1.1
|
2 |
+
Flask-Cors==3.0.10
|
3 |
+
librosa==0.8.1
|
4 |
+
torch==1.9.0
|
5 |
+
transformers==4.5.1
|
6 |
+
pydub==0.25.1
|
7 |
+
Levenshtein==0.12.0
|
static/script.js
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Active MediaRecorder; assigned in startRecording() and stopped in stopRecording().
let mediaRecorder;
// Data chunks collected from the recorder's "dataavailable" events.
let audioChunks = [];
// Audio chosen (file input) or recorded for each side; both must be set
// before the "Perform Testing" handler posts them to /transcribe.
let originalAudioBlob = null;
let userAudioBlob = null;
|
5 |
+
|
6 |
+
/**
 * Wire a file <input> to its preview <audio> element.
 *
 * When a file is chosen it is loaded into the player, played, and handed to
 * `storeBlob` so it can be uploaded later.
 *
 * @param {string} inputId   id of the <input type="file"> element
 * @param {string} playerId  id of the <audio> element used for preview
 * @param {(file: File) => void} storeBlob  receives the selected file
 */
function bindAudioFileInput(inputId, playerId, storeBlob) {
    document.getElementById(inputId).addEventListener('change', function (e) {
        const file = e.target.files[0];
        // "change" can fire with an empty FileList (e.g. the selection was
        // cleared); createObjectURL(undefined) would throw.
        if (!file) {
            return;
        }

        const audioPlayer = document.getElementById(playerId);
        // Release the object URL of the previous preview, if any.
        if (audioPlayer.src.startsWith('blob:')) {
            URL.revokeObjectURL(audioPlayer.src);
        }
        audioPlayer.src = URL.createObjectURL(file);
        // play() returns a promise that rejects when playback cannot start.
        audioPlayer.play().catch(err => console.warn('Preview playback failed:', err));

        storeBlob(file);
    });
}

bindAudioFileInput('originalAudio', 'originalAudioPlayer', file => { originalAudioBlob = file; });
bindAudioFileInput('userAudio', 'userAudioPlayer', file => { userAudioBlob = file; });
|
23 |
+
|
24 |
+
/**
 * Start recording from the microphone for the given side.
 *
 * The buttons switch to the "recording" state only after microphone access
 * has been granted and the recorder has actually started. If access is denied
 * or the recorder cannot be created, the user is alerted and the UI is left
 * unchanged.
 *
 * @param {string} type  'original' or 'user'
 */
function startRecording(type) {
    audioChunks = [];

    const isOriginal = type === 'original';
    const recordButton = document.getElementById(isOriginal ? 'recordOriginalAudio' : 'recordUserAudio');
    const stopButton = document.getElementById(isOriginal ? 'stopOriginalAudio' : 'stopUserAudio');

    navigator.mediaDevices.getUserMedia({ audio: true })
        .then(stream => {
            let recorder;
            try {
                // Prefer WebM, but fall back to the browser default container
                // where WebM is unsupported (the constructor would throw).
                const preferredType = 'audio/webm';
                const options = MediaRecorder.isTypeSupported(preferredType)
                    ? { mimeType: preferredType }
                    : undefined;
                recorder = new MediaRecorder(stream, options);
            } catch (error) {
                // Do not leave the microphone open if the recorder cannot be created.
                stream.getTracks().forEach(track => track.stop());
                throw error;
            }
            mediaRecorder = recorder;

            recorder.addEventListener("dataavailable", event => {
                audioChunks.push(event.data);
            });

            recorder.addEventListener("stop", () => {
                // Release the microphone once recording has finished.
                stream.getTracks().forEach(track => track.stop());

                // Label the blob with the container the recorder really produced;
                // the server detects the format from the content.
                const audioBlob = new Blob(audioChunks, { type: recorder.mimeType });
                const audioURL = URL.createObjectURL(audioBlob);

                if (type === 'original') {
                    document.getElementById('originalAudioPlayer').src = audioURL;
                    originalAudioBlob = audioBlob;
                } else if (type === 'user') {
                    document.getElementById('userAudioPlayer').src = audioURL;
                    userAudioBlob = audioBlob;
                }
            });

            recorder.start();

            // Add recording animation and disable the start button
            recordButton.classList.add('recording-active');
            recordButton.disabled = true;
            stopButton.disabled = false;
        })
        .catch(error => {
            console.error('Could not start recording:', error);
            alert(`Could not start recording: ${error.message}`);
        });
}
|
60 |
+
|
61 |
+
/**
 * Stop the active recording (if any) and restore the buttons for the given side.
 *
 * @param {string} type  'original' or 'user'
 */
function stopRecording(type) {
    // The recorder may never have been created (e.g. microphone access was
    // denied) or may already be inactive; stop() would throw in both cases
    // and leave the buttons stuck in the "recording" state.
    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
        mediaRecorder.stop();
    }

    // Remove recording animation and enable the start button
    const isOriginal = type === 'original';
    const recordButton = document.getElementById(isOriginal ? 'recordOriginalAudio' : 'recordUserAudio');
    const stopButton = document.getElementById(isOriginal ? 'stopOriginalAudio' : 'stopUserAudio');

    recordButton.classList.remove('recording-active');
    recordButton.disabled = false;
    stopButton.disabled = true;
}
|
75 |
+
|
76 |
+
// Upload both recordings to /transcribe and display the transcriptions and
// similarity score. Server-side and network failures are reported to the
// user, and the loader is always hidden when the request settles.
document.getElementById('performTesting').addEventListener('click', function () {
    if (!originalAudioBlob || !userAudioBlob) {
        alert('Please provide both original and user audio files.');
        return;
    }

    const formData = new FormData();
    formData.append('original_audio', originalAudioBlob, 'original_audio.wav');
    formData.append('user_audio', userAudioBlob, 'user_audio.wav');

    const loader = document.getElementById('loader');
    const results = document.getElementById('results');

    // Show loader
    loader.style.display = 'block';
    results.style.display = 'none';

    fetch('/transcribe', {
        method: 'POST',
        body: formData
    })
        .then(response => response.json()
            // An error response may not carry a JSON body; treat it as empty.
            .catch(() => ({}))
            .then(data => {
                if (!response.ok || data.error) {
                    throw new Error(data.error || `Request failed with status ${response.status}`);
                }
                return data;
            }))
        .then(data => {
            results.style.display = 'block';

            document.getElementById('transcriptionOriginal').innerText = `Original Transcription: ${data.transcription_original}`;
            document.getElementById('transcriptionUser').innerText = `User Transcription: ${data.transcription_user}`;
            document.getElementById('similarityScore').innerText = `Similarity Score: ${data.similarity_score.toFixed(2)}`;
        })
        .catch(error => {
            console.error('Transcription request failed:', error);
            alert(`Transcription failed: ${error.message}`);
        })
        .finally(() => {
            // Hide loader whether the request succeeded or failed
            loader.style.display = 'none';
        });
});
|
static/style.css
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Reset some default browser styles */
|
2 |
+
* {
|
3 |
+
margin: 0;
|
4 |
+
padding: 0;
|
5 |
+
box-sizing: border-box;
|
6 |
+
}
|
7 |
+
|
8 |
+
/* Make the body take up the full viewport height */
|
9 |
+
body {
|
10 |
+
font-family: 'Arial', sans-serif;
|
11 |
+
background-color: #2c2f33;
|
12 |
+
color: white;
|
13 |
+
height: 100vh;
|
14 |
+
display: flex;
|
15 |
+
flex-direction: column;
|
16 |
+
justify-content: center;
|
17 |
+
align-items: center;
|
18 |
+
}
|
19 |
+
|
20 |
+
/* Center container and add padding for mobile devices */
|
21 |
+
.container {
|
22 |
+
width: 90%;
|
23 |
+
max-width: 1200px;
|
24 |
+
margin: auto;
|
25 |
+
text-align: center;
|
26 |
+
padding: 20px;
|
27 |
+
background-color: #1c1e22;
|
28 |
+
border-radius: 12px;
|
29 |
+
box-shadow: 0px 4px 20px rgba(0, 0, 0, 0.4);
|
30 |
+
transition: all 0.3s ease;
|
31 |
+
}
|
32 |
+
|
33 |
+
/* Add hover effect for container */
|
34 |
+
.container:hover {
|
35 |
+
box-shadow: 0px 6px 24px rgba(0, 0, 0, 0.6);
|
36 |
+
}
|
37 |
+
|
38 |
+
/* Style the headings */
|
39 |
+
h1 {
|
40 |
+
margin: 20px 0;
|
41 |
+
font-size: 2.5rem;
|
42 |
+
color: #7289da;
|
43 |
+
}
|
44 |
+
|
45 |
+
/* Make the audio-panel responsive using flexbox */
|
46 |
+
.audio-panel {
|
47 |
+
display: flex;
|
48 |
+
flex-wrap: wrap;
|
49 |
+
justify-content: space-around;
|
50 |
+
margin: 20px 0;
|
51 |
+
}
|
52 |
+
|
53 |
+
.audio-upload {
|
54 |
+
width: 45%;
|
55 |
+
min-width: 300px;
|
56 |
+
padding: 10px;
|
57 |
+
margin-bottom: 20px;
|
58 |
+
background-color: #40444b;
|
59 |
+
border-radius: 10px;
|
60 |
+
transition: transform 0.2s;
|
61 |
+
}
|
62 |
+
|
63 |
+
.audio-upload:hover {
|
64 |
+
transform: translateY(-5px);
|
65 |
+
}
|
66 |
+
|
67 |
+
h2 {
|
68 |
+
font-size: 1.25rem;
|
69 |
+
margin-bottom: 10px;
|
70 |
+
color: #99aab5;
|
71 |
+
}
|
72 |
+
|
73 |
+
/* Style for file input */
|
74 |
+
input[type="file"] {
|
75 |
+
display: block;
|
76 |
+
margin: 10px 0;
|
77 |
+
background-color: #7289da;
|
78 |
+
color: white;
|
79 |
+
padding: 10px;
|
80 |
+
border-radius: 5px;
|
81 |
+
cursor: pointer;
|
82 |
+
transition: background-color 0.3s;
|
83 |
+
}
|
84 |
+
|
85 |
+
input[type="file"]:hover {
|
86 |
+
background-color: #5b6bb0;
|
87 |
+
}
|
88 |
+
|
89 |
+
/* Style the audio players */
|
90 |
+
audio {
|
91 |
+
width: 100%;
|
92 |
+
margin: 10px 0;
|
93 |
+
}
|
94 |
+
|
95 |
+
/* Style buttons with consistent design */
|
96 |
+
button {
|
97 |
+
padding: 12px 25px;
|
98 |
+
font-size: 1rem;
|
99 |
+
background-color: #7289da;
|
100 |
+
color: white;
|
101 |
+
border: none;
|
102 |
+
border-radius: 5px;
|
103 |
+
cursor: pointer;
|
104 |
+
transition: background-color 0.3s, transform 0.2s;
|
105 |
+
margin: 10px 5px;
|
106 |
+
}
|
107 |
+
|
108 |
+
button:hover {
|
109 |
+
background-color: #5b6bb0;
|
110 |
+
transform: translateY(-3px);
|
111 |
+
}
|
112 |
+
|
113 |
+
/* Loader and result display */
|
114 |
+
#loader {
|
115 |
+
font-size: 1.25rem;
|
116 |
+
color: #7289da;
|
117 |
+
margin: 20px 0;
|
118 |
+
}
|
119 |
+
|
120 |
+
.results {
|
121 |
+
margin-top: 20px;
|
122 |
+
background-color: #40444b;
|
123 |
+
padding: 20px;
|
124 |
+
border-radius: 10px;
|
125 |
+
color: #99aab5;
|
126 |
+
text-align: left;
|
127 |
+
}
|
128 |
+
|
129 |
+
.results h3 {
|
130 |
+
margin-bottom: 10px;
|
131 |
+
color: #7289da;
|
132 |
+
}
|
133 |
+
|
134 |
+
#results p {
|
135 |
+
font-size: 1.1rem;
|
136 |
+
margin: 5px 0;
|
137 |
+
}
|
138 |
+
|
139 |
+
/* Media query to ensure responsiveness on smaller screens */
|
140 |
+
@media (max-width: 768px) {
|
141 |
+
.audio-upload {
|
142 |
+
width: 100%;
|
143 |
+
margin-bottom: 20px;
|
144 |
+
}
|
145 |
+
|
146 |
+
h1 {
|
147 |
+
font-size: 2rem;
|
148 |
+
}
|
149 |
+
|
150 |
+
button {
|
151 |
+
width: 100%;
|
152 |
+
padding: 15px;
|
153 |
+
}
|
154 |
+
}
|
155 |
+
|
156 |
+
/* Add recording animation style */
|
157 |
+
.recording-active {
|
158 |
+
animation: pulse 1s infinite;
|
159 |
+
background-color: red;
|
160 |
+
color: white;
|
161 |
+
}
|
162 |
+
|
163 |
+
@keyframes pulse {
|
164 |
+
0% {
|
165 |
+
box-shadow: 0 0 0 0 rgba(255, 0, 0, 0.7);
|
166 |
+
}
|
167 |
+
70% {
|
168 |
+
box-shadow: 0 0 0 10px rgba(255, 0, 0, 0);
|
169 |
+
}
|
170 |
+
100% {
|
171 |
+
box-shadow: 0 0 0 0 rgba(255, 0, 0, 0);
|
172 |
+
}
|
173 |
+
}
|
templates/index.html
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Audio Transcription and Similarity Checker</title>
|
7 |
+
<link rel="stylesheet" href="/static/style.css">
|
8 |
+
</head>
|
9 |
+
<body>
|
10 |
+
<div class="container">
|
11 |
+
<h1>Audio Transcription and Similarity Checker</h1>
|
12 |
+
|
13 |
+
<!-- Audio upload/record panel -->
|
14 |
+
<div class="audio-panel">
|
15 |
+
<div class="audio-upload">
|
16 |
+
<h2>Upload or Record Original Audio</h2>
|
17 |
+
<input type="file" id="originalAudio" accept="audio/*">
|
18 |
+
<audio id="originalAudioPlayer" controls></audio>
|
19 |
+
<br>
|
20 |
+
<button id="recordOriginalAudio" onclick="startRecording('original')">Start Recording</button>
|
21 |
+
<button id="stopOriginalAudio" onclick="stopRecording('original')" disabled>Stop Recording</button>
|
22 |
+
</div>
|
23 |
+
<div class="audio-upload">
|
24 |
+
<h2>Upload or Record User Audio</h2>
|
25 |
+
<input type="file" id="userAudio" accept="audio/*">
|
26 |
+
<audio id="userAudioPlayer" controls></audio>
|
27 |
+
<br>
|
28 |
+
<button id="recordUserAudio" onclick="startRecording('user')">Start Recording</button>
|
29 |
+
<button id="stopUserAudio" onclick="stopRecording('user')" disabled>Stop Recording</button>
|
30 |
+
</div>
|
31 |
+
</div>
|
32 |
+
|
33 |
+
<!-- Button to perform similarity check -->
|
34 |
+
<button id="performTesting">Perform Testing</button>
|
35 |
+
|
36 |
+
<!-- Loader while processing -->
|
37 |
+
<div id="loader" style="display: none;">
|
38 |
+
<p>Processing... Please wait</p>
|
39 |
+
</div>
|
40 |
+
|
41 |
+
<!-- Results section -->
|
42 |
+
<div id="results" class="results" style="display: none;">
|
43 |
+
<h3>Results</h3>
|
44 |
+
<p id="transcriptionOriginal"></p>
|
45 |
+
<p id="transcriptionUser"></p>
|
46 |
+
<p id="similarityScore"></p>
|
47 |
+
</div>
|
48 |
+
</div>
|
49 |
+
|
50 |
+
<script src="/static/script.js"></script>
|
51 |
+
</body>
|
52 |
+
</html>
|