import collections
import contextlib
import os
import time
import wave
from concurrent.futures import ThreadPoolExecutor

import librosa
import numpy as np
import pyaudio
import webrtcvad

from models.es_fastconformer import stt_es_process
from models.nllb import nllb_translate
from models.noise_red import noise_reduction
from models.parakeet import parakeet_ctc_process
from models.TTS_utils import append_text_order


class Frame(object):
    """
    Represents a "frame" of audio data.

    Args:
        bytes (bytes): The audio data.
        timestamp (float): The timestamp of the frame.
        duration (float): The duration of the frame.
    """
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def read_audio(stream, frame_duration_ms, rate):
    """
    Generates audio frames from the input stream.

    Args:
        stream (pyaudio.Stream): The audio stream.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        rate (int): The sample rate of the audio.

    Yields:
        bytes: The audio frames.
    """
    frames_per_buffer = int(rate * frame_duration_ms / 1000)
    while True:
        yield stream.read(frames_per_buffer)
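

# Quick sanity check of the frame math above (illustrative only): at 16 kHz, a
# 30 ms frame is 480 samples, i.e. 960 bytes of 16-bit mono PCM, and 10/20/30 ms
# are the only frame durations webrtcvad accepts.
#
#     >>> int(16000 * 30 / 1000)   # samples per frame
#     480
#     >>> 480 * 2                  # bytes per frame at paInt16
#     960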


def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Filters out non-voiced audio frames.

    Args:
        sample_rate (int): The sample rate of the audio.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        padding_duration_ms (int): Duration of padding in milliseconds.
        vad (webrtcvad.Vad): The VAD object.
        frames (generator): A generator yielding audio frames.

    Yields:
        bytes: Voiced audio frames.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # Trigger once more than 90% of the buffered frames are voiced.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                voiced_frames.extend(f for f, speech in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # De-trigger once more than 90% of the buffered frames are unvoiced.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
                triggered = False
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])
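

# Minimal offline sketch of the collector above: it replays a 16 kHz mono
# 16-bit WAV file through vad_collector instead of a live microphone stream.
# The WAV path is a caller-supplied placeholder; everything else reuses
# definitions from this module.
def example_vad_from_wav(wav_path, aggressiveness=1):
    """Yield voiced byte chunks from a 16 kHz mono 16-bit WAV file."""
    vad = webrtcvad.Vad(aggressiveness)
    with contextlib.closing(wave.open(wav_path, 'rb')) as wf:
        rate = wf.getframerate()  # expected: 16000
        pcm = wf.readframes(wf.getnframes())
    frame_bytes = int(rate * 30 / 1000) * 2  # 30 ms of 16-bit samples
    frames = (Frame(pcm[i:i + frame_bytes], None, None)
              for i in range(0, len(pcm) - frame_bytes + 1, frame_bytes))
    yield from vad_collector(rate, 30, 300, vad, frames)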


def is_segment_empty(file_path):
    """
    Check if the audio segment is empty.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        bool: True if the segment is empty, False otherwise.
    """
    audio, _ = librosa.load(file_path)
    rms = librosa.feature.rms(y=audio)  # frame-wise root-mean-square energy
    rms_mean = np.mean(rms)
    print("RMS mean:", rms_mean)
    # Treat segments whose mean RMS falls below this empirical threshold as silence.
    return rms_mean < 0.015
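

# Illustrative check of the RMS gate above on synthetic audio (assumes the
# 0.015 threshold): a buffer of zeros falls below it, while a quiet
# 0.1-amplitude tone sits well above it (RMS ~= 0.07).
def _check_rms_gate():
    sr = 22050  # librosa's default sampling rate
    silence = np.zeros(sr)
    tone = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)
    for name, audio in (("silence", silence), ("tone", tone)):
        rms_mean = np.mean(librosa.feature.rms(y=audio))
        print(name, rms_mean, "empty" if rms_mean < 0.015 else "voiced")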


def process_segment(asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, order, json_path_temp, json_path_record):
    """
    Process an audio segment: noise reduction, transcription, translation, and append results.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        path_segments (str): Path to the audio segment.
        path_results (str): Path to save the results.
        target_lang (str): Target language for translation.
        order (int): Order index of the segment.
        json_path_temp (str): Path to the temporary JSON file.
        json_path_record (str): Path to the record JSON file.
    """
    print("Processing segment...")
    if is_segment_empty(path_segments):
        print("No speech detected.")
        # Remove the empty segment.
        os.remove(path_segments)
        return

    # Noise reduction (in place).
    start_time = time.time()
    noise_reduction(path_segments, path_segments)
    print("Noise removed. Time:", time.time() - start_time)

    # Transcription
    transcription = transcribe(asr_model, path_segments, target_lang)
    # if not transcription.strip():
    #     print("No speech detected.")
    #     return

    # Translation
    print("Translating...")
    translation = translate(model_nllb, tokenizer_nllb, transcription, target_lang)

    # Text-to-speech (currently disabled).
    # process_tts(tts_model, translation, path_segments, target_lang, path_results)

    # Record the result under the target language's ISO code.
    lang_code = "es" if target_lang == "spanish" else "en"
    append_text_order(json_path_temp, translation, order, path_segments, path_results, lang_code, transcription)
    append_text_order(json_path_record, translation, order, path_segments, path_results, lang_code, transcription)


def transcribe(asr_model, path_segments, target_lang):
    """
    Transcribe an audio segment using the specified ASR model.

    Args:
        asr_model: The ASR model for transcription.
        path_segments (str): Path to the audio segment.
        target_lang (str): Target translation language; determines which ASR model is used.

    Returns:
        str: The transcription of the audio segment.
    """
    start_time = time.time()
    # The ASR model is selected by the *target* language: a Spanish target
    # implies English source audio (Parakeet CTC), and an English target
    # implies Spanish source audio (FastConformer).
    transcription_func = {
        "spanish": parakeet_ctc_process,
        "english": stt_es_process
    }[target_lang]
    transcription = transcription_func(asr_model, path_segments)
    print("Transcription:", transcription[0])
    print("Transcription time:", time.time() - start_time)
    return transcription[0]


def translate(model_nllb, tokenizer_nllb, text, target_lang):
    """
    Translate text using the specified NLLB model and tokenizer.

    Args:
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        text (str): The text to translate.
        target_lang (str): Target language for translation.

    Returns:
        str: The translated text.
    """
    print("Processing translation...")
    start_time = time.time()
    translation = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translation)
    print("Translation time:", time.time() - start_time)
    return translation
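

# `nllb_translate` is this repo's own wrapper and its internals are not shown
# in this file. The sketch below is a minimal stand-alone equivalent using the
# Hugging Face transformers API, assuming the distilled 600M NLLB checkpoint;
# the import is kept local so the module itself does not require transformers.
def _nllb_sketch(text, target_lang):
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
    name = "facebook/nllb-200-distilled-600M"
    src, tgt = (("eng_Latn", "spa_Latn") if target_lang == "spanish"
                else ("spa_Latn", "eng_Latn"))
    tokenizer = AutoTokenizer.from_pretrained(name, src_lang=src)
    model = AutoModelForSeq2SeqLM.from_pretrained(name)
    inputs = tokenizer(text, return_tensors="pt")
    # Force the decoder to start in the target language.
    out = model.generate(**inputs,
                         forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt),
                         max_length=256)
    return tokenizer.batch_decode(out, skip_special_tokens=True)[0]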


def stream(asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir="results", segments_dir="audio_segments"):
    """
    Stream audio input, process segments, and save the results.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language of the audio (currently unused).
        target_lang (str): Target language for translation.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        result_dir (str, optional): Directory to save the results. Default is "results".
        segments_dir (str, optional): Directory to save the audio segments. Default is "audio_segments".
    """
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    CHUNK_DURATION_MS = 30  # webrtcvad supports 10, 20, and 30 ms frames
    PADDING_DURATION_MS = 300

    vad = webrtcvad.Vad(1)
    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                        frames_per_buffer=int(RATE * CHUNK_DURATION_MS / 1000))  # one VAD frame per buffer
    frames = read_audio(stream, CHUNK_DURATION_MS, RATE)
    frames = (Frame(f, None, None) for f in frames)

    os.makedirs(segments_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)

    executor = ThreadPoolExecutor(max_workers=2)  # Adjust the number of workers as needed.
    try:
        for i, segment in enumerate(vad_collector(RATE, CHUNK_DURATION_MS, PADDING_DURATION_MS, vad, frames)):
            path_segments = os.path.join(segments_dir, f"segment_{i}.wav")
            path_results = os.path.join(result_dir, f"result_{i}.wav")
            print(f"Writing {path_segments}...")
            with contextlib.closing(wave.open(path_segments, 'wb')) as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(audio.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(segment)
            executor.submit(process_segment, asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, i, json_file_temp, json_file_record)
    finally:
        # read_audio loops forever, so this cleanup runs on interruption or error.
        executor.shutdown(wait=True)
        stream.stop_stream()
        stream.close()
        audio.terminate()
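

# Hedged usage sketch (not part of the pipeline): the checkpoints below are
# real public models, but whether these exact objects are what
# parakeet_ctc_process / nllb_translate expect is an assumption; swap in this
# repo's actual loaders as needed.
if __name__ == "__main__":
    import nemo.collections.asr as nemo_asr
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-ctc-1.1b")
    tokenizer_nllb = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    model_nllb = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

    stream(asr_model, model_nllb, tokenizer_nllb,
           source_lang="english", target_lang="spanish",
           json_file_temp="temp.json", json_file_record="record.json")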