vakodiya committed
Commit 6bee80f
1 Parent(s): a205074

Update audio_to_text.py

Files changed (1)
  1. audio_to_text.py +34 -15
audio_to_text.py CHANGED
@@ -1,26 +1,45 @@
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torchaudio
 import torch
+import os
+from pydub import AudioSegment
+
+
+# Get the directory of the current file
+current_dir = os.path.dirname(os.path.abspath(__file__))
+# Construct the absolute path to the 'ffmpeg/bin' directory
+ffmpeg_bin_path = os.path.join(current_dir, 'ffmpeg', 'bin')
+# Add this path to the PATH environment variable
+os.environ["PATH"] += os.pathsep + ffmpeg_bin_path
+# Point pydub directly at the bundled ffmpeg binary
+AudioSegment.converter = os.path.join(ffmpeg_bin_path, 'ffmpeg.exe')
 
 
 # load model and processor
-processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
 model.config.forced_decoder_ids = None
 
 
-def audio_to_text(audio_data, sample_rate):
-    # Convert raw audio frame (numpy array) to tensor and resample it to 16 kHz
-    waveform = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-    # Check if the sample rate is 16 kHz; if not, resample it
-    if sample_rate != 16000:
+def audio_to_text(webm_file_path):
+    wav_file = "recorded_audio.wav"
+    absolute_path = os.path.abspath(webm_file_path)
+
+    # Load and convert the audio
+    # Check if the file exists
+    if os.path.exists(webm_file_path):
+        wav_audio = AudioSegment.from_file(absolute_path, format="webm")
+        wav_audio.export(wav_file, format="wav")
+        # Load the converted audio and resample it to 16 kHz
+        waveform, sample_rate = torchaudio.load(wav_file)
         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
-    waveform = waveform.squeeze().numpy()
-    input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
-
-    # generate token ids
-    predicted_ids = model.generate(input_features)
-    # decode token ids to text
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    return transcription
+        waveform = waveform.squeeze().numpy()
+        input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
+        # generate token ids
+        predicted_ids = model.generate(input_features)
+        # decode token ids to text
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        return transcription
+    else:
+        return None
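
A quick end-to-end check of the updated function (a minimal sketch, not part of the commit: it assumes audio_to_text.py is importable from the working directory, and "sample.webm" is a hypothetical recording filename):

from audio_to_text import audio_to_text

# Transcribe a WebM recording; the function returns None when the file is missing
transcription = audio_to_text("sample.webm")
if transcription is None:
    print("audio file not found")
else:
    # processor.batch_decode returns a list of strings, one per batch item
    print(transcription[0])

Since the function returns the list produced by processor.batch_decode, callers take the first element for a single recording. Note also that the hard-coded 'ffmpeg.exe' suffix makes the pydub converter path Windows-specific; on Linux or macOS the binary is plain 'ffmpeg', so that line would need adjusting there.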