S3TVR-Demo / stream_VAD2.py
yalsaffar's picture
init
aa7cb02
raw
history blame
9.63 kB
import collections
import contextlib
import wave
import webrtcvad
import pyaudio
import os
import librosa
import numpy as np
from models.nllb import nllb_translate
from models.TTS_utils import append_text_order
from models.parakeet import parakeet_ctc_process
from models.es_fastconformer import stt_es_process
from concurrent.futures import ThreadPoolExecutor
import time
from models.noise_red import noise_reduction
class Frame:
    """
    One chunk of audio together with its position in the stream.

    Args:
        bytes (bytes): The audio data.
        timestamp (float): The timestamp of the frame.
        duration (float): The duration of the frame.
    """

    def __init__(self, bytes, timestamp, duration):
        # The first parameter keeps the name `bytes` (shadowing the builtin
        # inside this method) so keyword callers keep working.
        self.duration = duration
        self.timestamp = timestamp
        self.bytes = bytes
def read_audio(stream, frame_duration_ms, rate):
    """
    Endlessly pull fixed-size chunks from an audio input stream.

    Args:
        stream (pyaudio.Stream): The audio stream; only its ``read(n)`` method is used.
        frame_duration_ms (int): Duration of each chunk in milliseconds.
        rate (int): The sample rate of the audio.

    Yields:
        The value returned by ``stream.read`` for each chunk.
    """
    # Number of sample frames that make up one chunk of the requested duration.
    chunk_size = int(rate * frame_duration_ms / 1000)
    # This generator never finishes on its own; the consumer decides when to stop.
    while True:
        chunk = stream.read(chunk_size)
        yield chunk
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Group a stream of frames into voiced segments.

    A sliding window of the most recent frames decides when a segment
    starts (more than 90% of the window is speech) and when it ends
    (more than 90% of the window is non-speech). The frames that were in
    the window when the segment started are included in the segment.

    Args:
        sample_rate (int): The sample rate of the audio.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        padding_duration_ms (int): Duration of the sliding window in milliseconds.
        vad (webrtcvad.Vad): The VAD object; ``vad.is_speech(data, sample_rate)`` is called per frame.
        frames (iterable): Frame objects exposing a ``bytes`` attribute.

    Yields:
        bytes: The concatenated audio data of one voiced segment.
    """
    window_size = int(padding_duration_ms / frame_duration_ms)
    window = collections.deque(maxlen=window_size)
    in_speech = False
    segment = []

    for frame in frames:
        speech_flag = vad.is_speech(frame.bytes, sample_rate)

        if in_speech:
            # Inside a segment: keep every frame and watch for sustained silence.
            segment.append(frame)
            window.append((frame, speech_flag))
            silent_count = sum(1 for _, flag in window if not flag)
            if silent_count > 0.9 * window.maxlen:
                yield b''.join(f.bytes for f in segment)
                window.clear()
                segment = []
                in_speech = False
            continue

        # Outside a segment: wait until the window is almost entirely speech.
        window.append((frame, speech_flag))
        voiced_count = sum(1 for _, flag in window if flag)
        if voiced_count > 0.9 * window.maxlen:
            in_speech = True
            # Carry the buffered frames over so the start of the segment is not lost.
            segment.extend(f for f, _ in window)
            window.clear()

    # Flush a segment that was still open when the input ended.
    if segment:
        yield b''.join(f.bytes for f in segment)
def is_segment_empty(file_path):
    """
    Decide whether an audio file is quiet enough to be treated as empty.

    The file is loaded with librosa, its mean RMS value is printed, and
    the segment counts as empty when that mean is below 0.015.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        bool: True if the segment is empty, False otherwise.
    """
    samples, _sample_rate = librosa.load(file_path)
    mean_rms = np.mean(librosa.feature.rms(y=samples))
    print(mean_rms)
    # bool() keeps the return value a plain Python bool rather than a numpy one.
    return bool(mean_rms < 0.015)
def process_segment(asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, order, json_path_temp, json_path_record):
    """
    Run one audio segment through the pipeline and record the outcome.

    Steps: skip (and delete) the file if it is judged empty, apply noise
    reduction, transcribe, translate, then append the result to both JSON
    files. No text-to-speech is performed in this function.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        path_segments (str): Path to the audio segment.
        path_results (str): Path to save the results.
        target_lang (str): Target language for translation.
        order: Order identifier of the segment, forwarded unchanged to ``append_text_order``.
        json_path_temp (str): Path to the temporary JSON file.
        json_path_record (str): Path to the record JSON file.
    """
    print("Processing segment...")

    if is_segment_empty(path_segments):
        print("No speech detected.")
        # An empty segment is of no further use, so its file is removed.
        os.remove(path_segments)
        return

    # Noise reduction: the same path is passed as both arguments
    # (presumably input and output, i.e. the file is overwritten — confirm).
    denoise_started = time.time()
    noise_reduction(path_segments, path_segments)
    print("Noise removed. Time:", time.time() - denoise_started)

    # Transcription
    transcription = transcribe(asr_model, path_segments, target_lang)

    # Translation
    print("Translating...")
    translation = translate(model_nllb, tokenizer_nllb, transcription, target_lang)

    # Anything other than "spanish" is recorded with the "en" code.
    lang_code = "es" if target_lang == "spanish" else "en"
    for json_path in (json_path_temp, json_path_record):
        append_text_order(json_path, translation, order, path_segments, path_results, lang_code, transcription)
def transcribe(asr_model, path_segments, target_lang):
    """
    Transcribe an audio segment with the ASR backend selected by ``target_lang``.

    Args:
        asr_model: The ASR model for transcription.
        path_segments (str): Path to the audio segment.
        target_lang (str): Either "spanish" or "english"; selects the backend.

    Returns:
        str: The transcription of the audio segment (the first element of
        the backend's result).

    Raises:
        KeyError: If ``target_lang`` is neither "spanish" nor "english".
    """
    started = time.time()
    # NOTE(review): the table is keyed by the translation *target*, so each
    # entry presumably names the recogniser for the other (source) language — confirm.
    asr_backends = {
        "spanish": parakeet_ctc_process,
        "english": stt_es_process,
    }
    run_asr = asr_backends[target_lang]
    result = run_asr(asr_model, path_segments)
    text = result[0]
    print("Transcription:", text)
    print("Transcription time:", time.time() - started)
    return text
def translate(model_nllb, tokenizer_nllb, text, target_lang):
    """
    Translate ``text`` with the NLLB model, printing the result and the time taken.

    Args:
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        text (str): The text to translate.
        target_lang (str): Target language for translation.

    Returns:
        The value returned by ``nllb_translate`` (the translated text).
    """
    print("Processing translation...")
    began = time.time()
    translated = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translated)
    elapsed = time.time() - began
    print("Translation time:", elapsed)
    return translated
import os
import time
import contextlib
import wave
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from concurrent.futures import ThreadPoolExecutor
# NewAudioHandler below relies on process_segment (defined above) and on
# objects supplied by the caller:
# - asr_model
# - model_nllb
# - tokinizer_nllb
class NewAudioHandler(FileSystemEventHandler):
    """
    Watchdog event handler that queues every newly created ``.wav`` file
    for processing on a small thread pool.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokinizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language of the audio (stored, not used by this class).
        target_lang (str): Target language forwarded to ``process_segment``.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        result_dir (str): Directory in which result paths are built.
    """

    def __init__(self, asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir):
        super().__init__()
        self.asr_model = asr_model
        self.model_nllb = model_nllb
        self.tokinizer_nllb = tokinizer_nllb
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.json_file_temp = json_file_temp
        self.json_file_record = json_file_record
        self.result_dir = result_dir
        # At most two segments are processed concurrently.
        self.executor = ThreadPoolExecutor(max_workers=2)

    def on_created(self, event):
        """Queue the created path for processing if it is a ``.wav`` file."""
        if not event.is_directory and event.src_path.endswith(".wav"):
            self.process_new_audio(event.src_path)

    def process_new_audio(self, audio_path):
        """
        Submit ``audio_path`` to the thread pool for processing.

        The result path is ``result_<file name>`` inside ``result_dir``, and
        the file's base name is passed as the ``order`` argument of
        ``process_segment``.

        Args:
            audio_path (str): Path to the new audio file.
        """
        file_name = os.path.basename(audio_path)
        result_path = os.path.join(self.result_dir, f"result_{file_name}")
        print(f"Processing {audio_path}...")
        future = self.executor.submit(process_segment, self.asr_model, self.model_nllb, self.tokinizer_nllb, audio_path, result_path, self.target_lang, file_name, self.json_file_temp, self.json_file_record)
        # Without this callback an exception raised in the worker thread stays
        # locked inside the discarded Future and the failure goes unnoticed.
        future.add_done_callback(lambda done, path=audio_path: self._report_failure(path, done))

    @staticmethod
    def _report_failure(audio_path, future):
        """Print the exception, if any, that ended the processing of ``audio_path``."""
        if future.cancelled():
            # exception() would raise CancelledError on a cancelled future.
            return
        error = future.exception()
        if error is not None:
            print(f"Processing {audio_path} failed: {error!r}")
def watch_folder(asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, watch_dir="audio_segments", result_dir="results"):
    """
    Watch a folder for new audio files and process them.

    Blocks until interrupted with Ctrl+C (KeyboardInterrupt), then stops
    the observer and returns.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokinizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language of the audio.
        target_lang (str): Target language for translation.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        watch_dir (str, optional): Directory to watch for new audio files. Default is "audio_segments".
        result_dir (str, optional): Directory to save the results. Default is "results".

    Raises:
        FileExistsError: If ``watch_dir`` or ``result_dir`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race and rejects a path that
    # exists but is not a directory.
    os.makedirs(watch_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)

    event_handler = NewAudioHandler(asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir)
    observer = Observer()
    # Only the top level of watch_dir is monitored.
    observer.schedule(event_handler, watch_dir, recursive=False)
    observer.start()
    print(f"Watching directory: {watch_dir}")
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the intended way to end the watch loop; fall through to cleanup.
        pass
    finally:
        # Always shut the observer thread down, whatever ended the loop.
        observer.stop()
        observer.join()
# Example usage (target_lang must be "spanish" or "english", the only keys transcribe() accepts):
# watch_folder(asr_model, model_nllb, tokinizer_nllb, "english", "spanish", "temp.json", "record.json")