oceansweep committed on
Commit
c08817b
1 Parent(s): 66a2900

Upload Audio_Transcription_Lib.py

Browse files
App_Function_Libraries/Audio_Transcription_Lib.py CHANGED
@@ -1,202 +1,329 @@
1
- # Audio_Transcription_Lib.py
2
- #########################################
3
- # Transcription Library
4
- # This library is used to perform transcription of audio files.
5
- # Currently, uses faster_whisper for transcription.
6
- #
7
- ####################
8
- # Function List
9
- #
10
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
- #
13
- ####################
14
- #
15
- # Import necessary libraries to run solo for testing
16
- import gc
17
- import json
18
- import logging
19
- import os
20
- import queue
21
- import sys
22
- import subprocess
23
- import tempfile
24
- import threading
25
- import time
26
- import configparser
27
- # DEBUG Imports
28
- #from memory_profiler import profile
29
- #import pyaudio
30
-
31
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
32
-
33
- # Import Local
34
- #
35
- #######################################################################################################################
36
- # Function Definitions
37
- #
38
-
39
- # Convert video .m4a into .wav using ffmpeg
40
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
- # https://www.gyan.dev/ffmpeg/builds/
42
- #
43
-
44
-
45
- whisper_model_instance = None
46
- config = load_comprehensive_config()
47
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
-
49
-
50
- # FIXME: This is a temporary solution.
51
- # This doesn't clear older models, which means potentially a lot of memory is being used...
52
- def get_whisper_model(model_name, device):
53
- global whisper_model_instance
54
- if whisper_model_instance is None:
55
- from faster_whisper import WhisperModel
56
- logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
57
- whisper_model_instance = WhisperModel(model_name, device=device)
58
- return whisper_model_instance
59
-
60
-
61
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
62
- #DEBUG
63
- #@profile
64
- def convert_to_wav(video_file_path, offset=0, overwrite=False):
65
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
66
-
67
- if os.path.exists(out_path) and not overwrite:
68
- print(f"File '{out_path}' already exists. Skipping conversion.")
69
- logging.info(f"Skipping conversion as file already exists: {out_path}")
70
- return out_path
71
- print("Starting conversion process of .m4a to .WAV")
72
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
73
-
74
- try:
75
- if os.name == "nt":
76
- logging.debug("ffmpeg being ran on windows")
77
-
78
- if sys.platform.startswith('win'):
79
- ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
80
- logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
81
- else:
82
- ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
83
-
84
- command = [
85
- ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
86
- "-ss", "00:00:00", # Start at the beginning of the video
87
- "-i", video_file_path,
88
- "-ar", "16000", # Audio sample rate
89
- "-ac", "1", # Number of audio channels
90
- "-c:a", "pcm_s16le", # Audio codec
91
- out_path
92
- ]
93
- try:
94
- # Redirect stdin from null device to prevent ffmpeg from waiting for input
95
- with open(os.devnull, 'rb') as null_file:
96
- result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
97
- if result.returncode == 0:
98
- logging.info("FFmpeg executed successfully")
99
- logging.debug("FFmpeg output: %s", result.stdout)
100
- else:
101
- logging.error("Error in running FFmpeg")
102
- logging.error("FFmpeg stderr: %s", result.stderr)
103
- raise RuntimeError(f"FFmpeg error: {result.stderr}")
104
- except Exception as e:
105
- logging.error("Error occurred - ffmpeg doesn't like windows")
106
- raise RuntimeError("ffmpeg failed")
107
- elif os.name == "posix":
108
- os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
109
- else:
110
- raise RuntimeError("Unsupported operating system")
111
- logging.info("Conversion to WAV completed: %s", out_path)
112
- except subprocess.CalledProcessError as e:
113
- logging.error("Error executing FFmpeg command: %s", str(e))
114
- raise RuntimeError("Error converting video file to WAV")
115
- except Exception as e:
116
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
117
- return {"error": str(e)}
118
- gc.collect()
119
- return out_path
120
-
121
-
122
- # Transcribe .wav into .segments.json
123
- #DEBUG
124
- #@profile
125
- def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
126
- global whisper_model_instance, processing_choice
127
- logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
128
-
129
- time_start = time.time()
130
- if audio_file_path is None:
131
- raise ValueError("speech-to-text: No audio file provided")
132
- logging.info("speech-to-text: Audio file path: %s", audio_file_path)
133
-
134
- try:
135
- _, file_ending = os.path.splitext(audio_file_path)
136
- out_file = audio_file_path.replace(file_ending, ".segments.json")
137
- prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
138
- if os.path.exists(out_file):
139
- logging.info("speech-to-text: Segments file already exists: %s", out_file)
140
- with open(out_file) as f:
141
- global segments
142
- segments = json.load(f)
143
- return segments
144
-
145
- logging.info('speech-to-text: Starting transcription...')
146
- options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
147
- transcribe_options = dict(task="transcribe", **options)
148
- # use function and config at top of file
149
- whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
150
- segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
151
-
152
- segments = []
153
- for segment_chunk in segments_raw:
154
- chunk = {
155
- "Time_Start": segment_chunk.start,
156
- "Time_End": segment_chunk.end,
157
- "Text": segment_chunk.text
158
- }
159
- logging.debug("Segment: %s", chunk)
160
- segments.append(chunk)
161
- # Print to verify its working
162
- print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
163
-
164
- # Log it as well.
165
- logging.debug(
166
- f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
167
-
168
- if segments:
169
- segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
170
-
171
- if not segments:
172
- raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
173
- logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
174
-
175
- # Save the segments to a JSON file - prettified and non-prettified
176
- # FIXME so this is an optional flag to save either the prettified json file or the normal one
177
- save_json = True
178
- if save_json:
179
- logging.info("speech-to-text: Saving segments to JSON file")
180
- output_data = {'segments': segments}
181
-
182
- logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
183
- with open(prettified_out_file, 'w') as f:
184
- json.dump(output_data, f, indent=2)
185
-
186
- logging.info("speech-to-text: Saving JSON to %s", out_file)
187
- with open(out_file, 'w') as f:
188
- json.dump(output_data, f)
189
-
190
- logging.debug(f"speech-to-text: returning {segments[:500]}")
191
- gc.collect()
192
- return segments
193
-
194
- except Exception as e:
195
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
196
- raise RuntimeError("speech-to-text: Error transcribing audio")
197
-
198
-
199
-
200
- #
201
- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  #######################################################################################################################
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import os
20
+ import queue
21
+ import sys
22
+ import subprocess
23
+ import tempfile
24
+ import threading
25
+ import time
26
+ # DEBUG Imports
27
+ #from memory_profiler import profile
28
+ import pyaudio
29
+ from faster_whisper import WhisperModel as OriginalWhisperModel
30
+ from typing import Optional, Union, List, Dict, Any
31
+ #
32
+ # Import Local
33
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
34
+ #
35
+ #######################################################################################################################
36
+ # Function Definitions
37
+ #
38
+
39
+ # Convert video .m4a into .wav using ffmpeg
40
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
+ # https://www.gyan.dev/ffmpeg/builds/
42
+ #
43
+
44
+
45
+ whisper_model_instance = None
46
+ config = load_comprehensive_config()
47
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
+
49
+
50
+
51
+ class WhisperModel(OriginalWhisperModel):
52
+ tldw_dir = os.path.dirname(os.path.dirname(__file__))
53
+ default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')
54
+
55
+ valid_model_sizes = [
56
+ "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
57
+ "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
58
+ "distil-small.en", "distil-large-v3"
59
+ ]
60
+
61
+ def __init__(
62
+ self,
63
+ model_size_or_path: str,
64
+ device: str = "auto",
65
+ device_index: Union[int, List[int]] = 0,
66
+ compute_type: str = "default",
67
+ cpu_threads: int = 16,
68
+ num_workers: int = 1,
69
+ download_root: Optional[str] = None,
70
+ local_files_only: bool = False,
71
+ files: Optional[Dict[str, Any]] = None,
72
+ **model_kwargs: Any
73
+ ):
74
+ if download_root is None:
75
+ download_root = self.default_download_root
76
+
77
+ os.makedirs(download_root, exist_ok=True)
78
+
79
+ # FIXME - validate....
80
+ # Also write an integration test...
81
+ # Check if model_size_or_path is a valid model size
82
+ if model_size_or_path in self.valid_model_sizes:
83
+ # It's a model size, so we'll use the download_root
84
+ model_path = os.path.join(download_root, model_size_or_path)
85
+ if not os.path.isdir(model_path):
86
+ # If it doesn't exist, we'll let the parent class download it
87
+ model_size_or_path = model_size_or_path # Keep the original model size
88
+ else:
89
+ # If it exists, use the full path
90
+ model_size_or_path = model_path
91
+ else:
92
+ # It's not a valid model size, so assume it's a path
93
+ model_size_or_path = os.path.abspath(model_size_or_path)
94
+
95
+ super().__init__(
96
+ model_size_or_path,
97
+ device=device,
98
+ device_index=device_index,
99
+ compute_type=compute_type,
100
+ cpu_threads=cpu_threads,
101
+ num_workers=num_workers,
102
+ download_root=download_root,
103
+ local_files_only=local_files_only,
104
+ # Maybe? idk, FIXME
105
+ # files=files,
106
+ # **model_kwargs
107
+ )
108
+
109
+ def get_whisper_model(model_name, device):
110
+ global whisper_model_instance
111
+ if whisper_model_instance is None:
112
+ logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
113
+ whisper_model_instance = WhisperModel(model_name, device=device)
114
+ return whisper_model_instance
115
+
116
+ # # FIXME: This is a temporary solution.
117
+ # # This doesn't clear older models, which means potentially a lot of memory is being used...
118
+ # def get_whisper_model(model_name, device):
119
+ # global whisper_model_instance
120
+ # if whisper_model_instance is None:
121
+ # from faster_whisper import WhisperModel
122
+ # logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
123
+ #
124
+ # # FIXME - add logic to detect if the model is already downloaded
125
+ # # want to first check if the model is already downloaded
126
+ # # if not, download it using the existing logic in 'WhisperModel'
127
+ # # https://github.com/SYSTRAN/faster-whisper/blob/d57c5b40b06e59ec44240d93485a95799548af50/faster_whisper/transcribe.py#L584
128
+ # # Designated path should be `tldw/App_Function_Libraries/models/Whisper/`
129
+ # WhisperModel.download_root = os.path.join(os.path.dirname(__file__), 'models', 'Whisper')
130
+ # os.makedirs(WhisperModel.download_root, exist_ok=True)
131
+ # whisper_model_instance = WhisperModel(model_name, device=device)
132
+ # return whisper_model_instance
133
+
134
+
135
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
136
+ #DEBUG
137
+ #@profile
138
+ def convert_to_wav(video_file_path, offset=0, overwrite=False):
139
+ out_path = os.path.splitext(video_file_path)[0] + ".wav"
140
+
141
+ if os.path.exists(out_path) and not overwrite:
142
+ print(f"File '{out_path}' already exists. Skipping conversion.")
143
+ logging.info(f"Skipping conversion as file already exists: {out_path}")
144
+ return out_path
145
+ print("Starting conversion process of .m4a to .WAV")
146
+ out_path = os.path.splitext(video_file_path)[0] + ".wav"
147
+
148
+ try:
149
+ if os.name == "nt":
150
+ logging.debug("ffmpeg being ran on windows")
151
+
152
+ if sys.platform.startswith('win'):
153
+ ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
154
+ logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
155
+ else:
156
+ ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
157
+
158
+ command = [
159
+ ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
160
+ "-ss", "00:00:00", # Start at the beginning of the video
161
+ "-i", video_file_path,
162
+ "-ar", "16000", # Audio sample rate
163
+ "-ac", "1", # Number of audio channels
164
+ "-c:a", "pcm_s16le", # Audio codec
165
+ out_path
166
+ ]
167
+ try:
168
+ # Redirect stdin from null device to prevent ffmpeg from waiting for input
169
+ with open(os.devnull, 'rb') as null_file:
170
+ result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
171
+ if result.returncode == 0:
172
+ logging.info("FFmpeg executed successfully")
173
+ logging.debug("FFmpeg output: %s", result.stdout)
174
+ else:
175
+ logging.error("Error in running FFmpeg")
176
+ logging.error("FFmpeg stderr: %s", result.stderr)
177
+ raise RuntimeError(f"FFmpeg error: {result.stderr}")
178
+ except Exception as e:
179
+ logging.error("Error occurred - ffmpeg doesn't like windows")
180
+ raise RuntimeError("ffmpeg failed")
181
+ elif os.name == "posix":
182
+ os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
183
+ else:
184
+ raise RuntimeError("Unsupported operating system")
185
+ logging.info("Conversion to WAV completed: %s", out_path)
186
+ except subprocess.CalledProcessError as e:
187
+ logging.error("Error executing FFmpeg command: %s", str(e))
188
+ raise RuntimeError("Error converting video file to WAV")
189
+ except Exception as e:
190
+ logging.error("speech-to-text: Error transcribing audio: %s", str(e))
191
+ return {"error": str(e)}
192
+ gc.collect()
193
+ return out_path
194
+
195
+
196
+ # Transcribe .wav into .segments.json
197
+ #DEBUG
198
+ #@profile
199
+ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
200
+ global whisper_model_instance, processing_choice
201
+ logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
202
+
203
+ time_start = time.time()
204
+ if audio_file_path is None:
205
+ raise ValueError("speech-to-text: No audio file provided")
206
+ logging.info("speech-to-text: Audio file path: %s", audio_file_path)
207
+
208
+ try:
209
+ _, file_ending = os.path.splitext(audio_file_path)
210
+ out_file = audio_file_path.replace(file_ending, ".segments.json")
211
+ prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
212
+ if os.path.exists(out_file):
213
+ logging.info("speech-to-text: Segments file already exists: %s", out_file)
214
+ with open(out_file) as f:
215
+ global segments
216
+ segments = json.load(f)
217
+ return segments
218
+
219
+ logging.info('speech-to-text: Starting transcription...')
220
+ options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
221
+ transcribe_options = dict(task="transcribe", **options)
222
+ # use function and config at top of file
223
+ logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
224
+ whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
225
+ segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
226
+
227
+ segments = []
228
+ for segment_chunk in segments_raw:
229
+ chunk = {
230
+ "Time_Start": segment_chunk.start,
231
+ "Time_End": segment_chunk.end,
232
+ "Text": segment_chunk.text
233
+ }
234
+ logging.debug("Segment: %s", chunk)
235
+ segments.append(chunk)
236
+ # Print to verify its working
237
+ print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
238
+
239
+ # Log it as well.
240
+ logging.debug(
241
+ f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
242
+
243
+ if segments:
244
+ segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
245
+
246
+ if not segments:
247
+ raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
248
+ logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
249
+
250
+ # Save the segments to a JSON file - prettified and non-prettified
251
+ # FIXME so this is an optional flag to save either the prettified json file or the normal one
252
+ save_json = True
253
+ if save_json:
254
+ logging.info("speech-to-text: Saving segments to JSON file")
255
+ output_data = {'segments': segments}
256
+
257
+ logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
258
+ with open(prettified_out_file, 'w') as f:
259
+ json.dump(output_data, f, indent=2)
260
+
261
+ logging.info("speech-to-text: Saving JSON to %s", out_file)
262
+ with open(out_file, 'w') as f:
263
+ json.dump(output_data, f)
264
+
265
+ logging.debug(f"speech-to-text: returning {segments[:500]}")
266
+ gc.collect()
267
+ return segments
268
+
269
+ except Exception as e:
270
+ logging.error("speech-to-text: Error transcribing audio: %s", str(e))
271
+ raise RuntimeError("speech-to-text: Error transcribing audio")
272
+
273
+
274
+ def record_audio(duration, sample_rate=16000, chunk_size=1024):
275
+ p = pyaudio.PyAudio()
276
+ stream = p.open(format=pyaudio.paInt16,
277
+ channels=1,
278
+ rate=sample_rate,
279
+ input=True,
280
+ frames_per_buffer=chunk_size)
281
+
282
+ print("Recording...")
283
+ frames = []
284
+ stop_recording = threading.Event()
285
+ audio_queue = queue.Queue()
286
+
287
+ def audio_callback():
288
+ for _ in range(0, int(sample_rate / chunk_size * duration)):
289
+ if stop_recording.is_set():
290
+ break
291
+ data = stream.read(chunk_size)
292
+ audio_queue.put(data)
293
+
294
+ audio_thread = threading.Thread(target=audio_callback)
295
+ audio_thread.start()
296
+
297
+ return p, stream, audio_queue, stop_recording, audio_thread
298
+
299
+
300
+ def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
301
+ stop_recording_event.set()
302
+ audio_thread.join()
303
+
304
+ frames = []
305
+ while not audio_queue.empty():
306
+ frames.append(audio_queue.get())
307
+
308
+ print("Recording finished.")
309
+
310
+ stream.stop_stream()
311
+ stream.close()
312
+ p.terminate()
313
+
314
+ return b''.join(frames)
315
+
316
+ def save_audio_temp(audio_data, sample_rate=16000):
317
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
318
+ import wave
319
+ wf = wave.open(temp_file.name, 'wb')
320
+ wf.setnchannels(1)
321
+ wf.setsampwidth(2)
322
+ wf.setframerate(sample_rate)
323
+ wf.writeframes(audio_data)
324
+ wf.close()
325
+ return temp_file.name
326
+
327
+ #
328
+ #
329
  #######################################################################################################################