muzammil-eds committed on
Commit
18922c4
1 Parent(s): 6d75aa3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -29
app.py CHANGED
@@ -18,13 +18,6 @@ def load_model():
18
  processor, model = load_model()
19
 
20
  def transcribe_audio_hf(audio_bytes):
21
- """
22
- Transcribes speech from an audio file using a pretrained Wav2Vec2 model.
23
- Args:
24
- audio_bytes (bytes): Audio data in bytes.
25
- Returns:
26
- str: The transcription of the speech in the audio file.
27
- """
28
  speech_array, sampling_rate = librosa.load(BytesIO(audio_bytes), sr=16000)
29
  input_values = processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True).input_values
30
  with torch.no_grad():
@@ -35,27 +28,11 @@ def transcribe_audio_hf(audio_bytes):
35
 
36
 
37
  def levenshtein_similarity(transcription1, transcription2):
38
- """
39
- Calculate the Levenshtein similarity between two transcriptions.
40
- Args:
41
- transcription1 (str): The first transcription.
42
- transcription2 (str): The second transcription.
43
- Returns:
44
- float: A normalized similarity score between 0 and 1, where 1 indicates identical transcriptions.
45
- """
46
  distance = Levenshtein.distance(transcription1, transcription2)
47
  max_len = max(len(transcription1), len(transcription2))
48
  return 1 - distance / max_len # Normalize to get similarity score
49
 
50
  def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
51
- """
52
- Compares the similarity between the transcription of an original audio file and a user's audio file.
53
- Args:
54
- original_audio_bytes (bytes): Bytes of the original audio file.
55
- user_audio_bytes (bytes): Bytes of the user's audio file.
56
- Returns:
57
- tuple: Transcriptions and Levenshtein similarity score.
58
- """
59
  transcription_original = transcribe_audio_hf(original_audio_bytes)
60
  transcription_user = transcribe_audio_hf(user_audio_bytes)
61
  similarity_score_levenshtein = levenshtein_similarity(transcription_original, transcription_user)
@@ -70,13 +47,20 @@ input_method = st.sidebar.selectbox("Choose Input Method", ["Record"])
70
  original_audio_bytes = None
71
  user_audio_bytes = None
72
 
73
-
74
  if input_method == "Record":
75
  st.write("Record or Upload Original Audio")
76
- test_bytes = audio_recorder(key="tester", pause_threshold=0.2, auto_start=True)
77
-
78
- time.sleep(5)
79
- original_audio_bytes = audio_recorder(key="original_audio_recorder", pause_threshold=30, icon_size='4x')
 
 
 
 
 
 
 
80
 
81
  if not original_audio_bytes:
82
  original_audio = st.file_uploader("Or Upload Original Audio", type=["wav", "mp3"])
@@ -112,7 +96,7 @@ if input_method == "Record":
112
  st.write(f"**User Transcription:** {transcription_user}")
113
  st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")
114
 
115
- if similarity_score > 0.8: # Adjust the threshold as needed
116
  st.success("The pronunciation is likely correct based on transcription similarity.")
117
  else:
118
  st.error("The pronunciation may be incorrect based on transcription similarity.")
 
18
  processor, model = load_model()
19
 
20
  def transcribe_audio_hf(audio_bytes):
 
 
 
 
 
 
 
21
  speech_array, sampling_rate = librosa.load(BytesIO(audio_bytes), sr=16000)
22
  input_values = processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True).input_values
23
  with torch.no_grad():
 
28
 
29
 
30
  def levenshtein_similarity(transcription1, transcription2):
 
 
 
 
 
 
 
 
31
  distance = Levenshtein.distance(transcription1, transcription2)
32
  max_len = max(len(transcription1), len(transcription2))
33
  return 1 - distance / max_len # Normalize to get similarity score
34
 
35
  def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
 
 
 
 
 
 
 
 
36
  transcription_original = transcribe_audio_hf(original_audio_bytes)
37
  transcription_user = transcribe_audio_hf(user_audio_bytes)
38
  similarity_score_levenshtein = levenshtein_similarity(transcription_original, transcription_user)
 
47
  original_audio_bytes = None
48
  user_audio_bytes = None
49
 
50
+ # Delay for initial setup to avoid first-click auto-stop issue
51
  if input_method == "Record":
52
  st.write("Record or Upload Original Audio")
53
+
54
+ # Introducing a delay for initial recording setup to avoid immediate stop issue
55
+ if 'initialized' not in st.session_state:
56
+ st.session_state['initialized'] = False
57
+
58
+ if not st.session_state['initialized']:
59
+ st.session_state['initialized'] = True
60
+ st.warning("Initializing recorder... Please wait a moment.")
61
+ time.sleep(2) # Add small delay before first-time recording
62
+ else:
63
+ original_audio_bytes = audio_recorder(key="original_audio_recorder", pause_threshold=30, icon_size='4x')
64
 
65
  if not original_audio_bytes:
66
  original_audio = st.file_uploader("Or Upload Original Audio", type=["wav", "mp3"])
 
96
  st.write(f"**User Transcription:** {transcription_user}")
97
  st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")
98
 
99
+ if similarity_score > 0.8:
100
  st.success("The pronunciation is likely correct based on transcription similarity.")
101
  else:
102
  st.error("The pronunciation may be incorrect based on transcription similarity.")