muzammil-eds committed on
Commit
bc94d2b
1 Parent(s): 09b7ae0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -106
app.py CHANGED
@@ -1,29 +1,38 @@
1
- import gradio as gr
2
  import requests
3
  import Levenshtein
4
- import numpy as np
5
- from transformers import pipeline
 
 
 
6
 
7
- # Function to securely load the Hugging Face API token
8
- def load_hf_token():
9
- # Replace this with your actual Hugging Face API token
10
- return "your_huggingface_api_token"
 
 
11
 
12
- # Function to query the Hugging Face Inference API
13
- def transcribe_audio_hf(audio):
 
14
  """
15
- Transcribes speech from an audio file using the Hugging Face Inference API.
16
  Args:
17
- audio (numpy.array): Audio data as a numpy array.
18
  Returns:
19
  str: The transcription of the speech in the audio file.
20
  """
21
- API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
22
- headers = {"Authorization": f"Bearer {load_hf_token()}"}
23
- response = requests.post(API_URL, headers=headers, data=audio.tobytes())
24
- return response.json().get("text", "").strip()
 
 
 
 
25
 
26
- # Function to calculate Levenshtein similarity
27
  def levenshtein_similarity(transcription1, transcription2):
28
  """
29
  Calculate the Levenshtein similarity between two transcriptions.
@@ -37,99 +46,99 @@ def levenshtein_similarity(transcription1, transcription2):
37
  max_len = max(len(transcription1), len(transcription2))
38
  return 1 - distance / max_len # Normalize to get similarity score
39
 
40
- # Function to evaluate audio similarity
41
- def evaluate_audio_similarity(original_audio, user_audio):
42
  """
43
  Compares the similarity between the transcription of an original audio file and a user's audio file.
44
  Args:
45
- original_audio (numpy.array): Original audio data.
46
- user_audio (numpy.array): User's audio data.
47
  Returns:
48
  tuple: Transcriptions and Levenshtein similarity score.
49
  """
50
- transcription_original = transcribe_audio_hf(original_audio)
51
- transcription_user = transcribe_audio_hf(user_audio)
52
- similarity_score = levenshtein_similarity(transcription_original, transcription_user)
53
- return transcription_original, transcription_user, similarity_score
54
-
55
- # Set up the Whisper ASR model for full-context and streaming ASR
56
- whisper_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
57
-
58
- # Full-context ASR function
59
- def full_context_asr(audio):
60
- sr, y = audio
61
- y = y.astype(np.float32)
62
- y /= np.max(np.abs(y))
63
- return whisper_transcriber({"sampling_rate": sr, "raw": y})["text"]
64
-
65
- # Streaming ASR function
66
- def streaming_asr(stream, new_chunk):
67
- sr, y = new_chunk
68
- y = y.astype(np.float32)
69
- y /= np.max(np.abs(y))
70
-
71
- if stream is not None:
72
- stream = np.concatenate([stream, y])
73
- else:
74
- stream = y
75
-
76
- return stream, whisper_transcriber({"sampling_rate": sr, "raw": stream})["text"]
77
-
78
- # Define Gradio interface for full-context ASR
79
- def gradio_full_context_interface(audio):
80
- if audio is not None:
81
- transcription = full_context_asr(audio)
82
- return transcription
83
- else:
84
- return "Please provide an audio file."
85
-
86
- # Define Gradio interface for audio similarity checking
87
- def gradio_similarity_interface(original_audio, user_audio):
88
- if original_audio is not None and user_audio is not None:
89
- transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio, user_audio)
90
-
91
- result = {
92
- "Original Transcription": transcription_original,
93
- "User Transcription": transcription_user,
94
- "Levenshtein Similarity Score": similarity_score,
95
- }
96
-
97
- if similarity_score > 0.8: # Adjust the threshold as needed
98
- result["Feedback"] = "The pronunciation is likely correct based on transcription similarity."
99
- else:
100
- result["Feedback"] = "The pronunciation may be incorrect based on transcription similarity."
101
-
102
- return result
103
- else:
104
- return "Please provide both original and user audio for comparison."
105
-
106
- # Create Gradio app for full-context ASR
107
- full_context_demo = gr.Interface(
108
- fn=gradio_full_context_interface,
109
- inputs=gr.Audio(source="microphone", type="numpy"),
110
- outputs="text",
111
- title="Full-Context ASR Demo"
112
- )
113
-
114
- # Create Gradio app for streaming ASR
115
- streaming_demo = gr.Interface(
116
- fn=streaming_asr,
117
- inputs=["state", gr.Audio(source="microphone", type="numpy", streaming=True)],
118
- outputs=["state", "text"],
119
- live=True,
120
- title="Streaming ASR Demo"
121
- )
122
-
123
- # Create Gradio app for audio similarity checking
124
- similarity_demo = gr.Interface(
125
- fn=gradio_similarity_interface,
126
- inputs=[
127
- gr.Audio(source="upload", type="numpy", label="Original Audio"),
128
- gr.Audio(source="upload", type="numpy", label="User Audio")
129
- ],
130
- outputs="json",
131
- title="Audio Transcription and Similarity Checker"
132
- )
133
-
134
- # Launch all three demos
135
- gr.TabbedInterface([full_context_demo, streaming_demo, similarity_demo], ["Full-Context ASR", "Streaming ASR", "Similarity Checker"]).launch()
 
 
1
+ import streamlit as st
2
  import requests
3
  import Levenshtein
4
+ from io import BytesIO
5
+ import librosa
6
+ import torch
7
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
8
+ from audio_recorder_streamlit import audio_recorder
9
 
10
@st.cache_resource
def load_model():
    """Load the Wav2Vec2 processor and CTC model used for transcription.

    The function is wrapped in ``st.cache_resource`` so that the loaded
    objects can be reused instead of being rebuilt on every call.

    Returns:
        tuple: ``(processor, model)`` for the pretrained Arabic checkpoint.
    """
    checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )


# Module-level handles shared by the transcription function below.
processor, model = load_model()
18
+
19
def transcribe_audio_hf(audio_bytes):
    """Transcribe speech from encoded audio bytes with the Wav2Vec2 model.

    Args:
        audio_bytes (bytes): Audio data in bytes.

    Returns:
        str: The transcription of the speech in the audio, with surrounding
        whitespace stripped.
    """
    target_rate = 16000
    # Decode from an in-memory buffer; librosa is asked for 16 kHz output.
    waveform, rate = librosa.load(BytesIO(audio_bytes), sr=target_rate)
    features = processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(features.input_values).logits
    # Greedy decoding: take the highest-scoring token at every frame.
    token_ids = logits.argmax(dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0].strip()
34
+
35
 
 
36
  def levenshtein_similarity(transcription1, transcription2):
37
  """
38
  Calculate the Levenshtein similarity between two transcriptions.
 
46
  max_len = max(len(transcription1), len(transcription2))
47
  return 1 - distance / max_len # Normalize to get similarity score
48
 
49
def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
    """Transcribe two recordings and score how similar the transcripts are.

    Args:
        original_audio_bytes (bytes): Bytes of the original audio file.
        user_audio_bytes (bytes): Bytes of the user's audio file.

    Returns:
        tuple: ``(original transcription, user transcription, Levenshtein
        similarity score)``.
    """
    # The original clip is transcribed first, then the user's clip.
    reference_text, attempt_text = [
        transcribe_audio_hf(clip)
        for clip in (original_audio_bytes, user_audio_bytes)
    ]
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score
62
+
63
# Similarity strictly above this value is reported as a likely-correct
# pronunciation. Adjust the threshold as needed.
SIMILARITY_THRESHOLD = 0.8
UPLOAD_TYPES = ["wav", "mp3"]
# MIME type used when the real one is unknown (e.g. microphone recordings).
DEFAULT_AUDIO_MIME = "audio/wav"


def _read_upload(label):
    """Render a file uploader and return ``(audio_bytes, mime_type)``.

    Returns ``(None, DEFAULT_AUDIO_MIME)`` when nothing has been uploaded.
    """
    uploaded = st.file_uploader(label, type=UPLOAD_TYPES)
    if not uploaded:
        return None, DEFAULT_AUDIO_MIME
    # getvalue() returns the full buffer regardless of the stream position,
    # unlike read(), which yields nothing once the stream has been consumed.
    return uploaded.getvalue(), uploaded.type or DEFAULT_AUDIO_MIME


def _record_or_upload(heading, recorder_key, upload_label, spinner_text):
    """Render a recorder with an upload fallback; return the audio bytes."""
    st.write(heading)
    audio_bytes = audio_recorder(key=recorder_key)
    mime_type = DEFAULT_AUDIO_MIME

    # The uploader is only offered while there is no recording.
    if not audio_bytes:
        audio_bytes, mime_type = _read_upload(upload_label)

    if audio_bytes:
        with st.spinner(spinner_text):
            st.audio(audio_bytes, format=mime_type)
    return audio_bytes


def _show_results(transcription_original, transcription_user, similarity_score):
    """Render both transcriptions, the score, and the pass/fail feedback."""
    st.markdown("---")
    st.subheader("Transcriptions and Similarity Score")
    st.write(f"**Original Transcription:** {transcription_original}")
    st.write(f"**User Transcription:** {transcription_user}")
    st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")

    if similarity_score > SIMILARITY_THRESHOLD:
        st.success("The pronunciation is likely correct based on transcription similarity.")
    else:
        st.error("The pronunciation may be incorrect based on transcription similarity.")


def _compare_on_demand(original_bytes, user_bytes):
    """Show the test button once both clips exist and run the comparison."""
    if not (original_bytes and user_bytes):
        return
    if not st.button("Perform Testing"):
        return
    with st.spinner("Performing transcription and similarity testing..."):
        results = evaluate_audio_similarity(original_bytes, user_bytes)
    _show_results(*results)


st.title("Audio Transcription and Similarity Checker")

# Choose between upload or record
st.sidebar.header("Input Method")
input_method = st.sidebar.selectbox("Choose Input Method", ["Upload", "Record"])

original_audio_bytes = None
user_audio_bytes = None

if input_method == "Upload":
    # Both uploaders are rendered before either player, keeping the page
    # layout: uploader, uploader, player, player, button.
    original_audio_bytes, original_mime = _read_upload("Upload Original Audio")
    user_audio_bytes, user_mime = _read_upload("Upload User Audio")

    if original_audio_bytes:
        st.audio(original_audio_bytes, format=original_mime)
    if user_audio_bytes:
        st.audio(user_audio_bytes, format=user_mime)

    _compare_on_demand(original_audio_bytes, user_audio_bytes)

elif input_method == "Record":
    original_audio_bytes = _record_or_upload(
        "Record or Upload Original Audio",
        "original_audio_recorder",
        "Or Upload Original Audio",
        "Processing original audio...",
    )
    user_audio_bytes = _record_or_upload(
        "Record or Upload User Audio",
        "user_audio_recorder",
        "Or Upload User Audio",
        "Processing user audio...",
    )

    _compare_on_demand(original_audio_bytes, user_audio_bytes)