# Spaces: yalsaffar/S3TVR-Demo (Sleeping)
# File size: 9,630 Bytes
# aa7cb02

import collections
import contextlib
import wave
import webrtcvad
import pyaudio
import os
import librosa
import numpy as np
from models.nllb import nllb_translate
from models.TTS_utils import append_text_order
from models.parakeet import parakeet_ctc_process
from models.es_fastconformer import stt_es_process
from concurrent.futures import ThreadPoolExecutor
import time
from models.noise_red import noise_reduction
class Frame:
    """
    A single chunk of audio together with its timing information.

    Args:
        bytes (bytes): Raw audio data carried by the frame.
        timestamp (float): Timestamp of the frame.
        duration (float): Duration of the frame.
    """

    def __init__(self, bytes, timestamp, duration):
        # NOTE: ``bytes`` shadows the builtin, but it is part of the public
        # signature (callers may pass it by keyword), so the name is kept.
        self.bytes, self.timestamp, self.duration = bytes, timestamp, duration

def read_audio(stream, frame_duration_ms, rate):
    """
    Endlessly read fixed-size chunks from an audio stream.

    Args:
        stream (pyaudio.Stream): The audio stream; only its ``read`` method is used.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        rate (int): The sample rate of the audio.

    Yields:
        Whatever ``stream.read`` returns for one frame's worth of samples.
        The generator never terminates on its own.
    """
    # Number of samples covering one frame of the requested duration.
    samples_per_frame = int(rate * frame_duration_ms / 1000)
    read_chunk = stream.read
    while True:
        yield read_chunk(samples_per_frame)

def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Group incoming frames into voiced segments using a sliding window.

    A window of ``padding_duration_ms / frame_duration_ms`` frames is kept.
    Collection starts once more than 90% of a full window is voiced and ends
    once more than 90% of a full window is unvoiced.

    Args:
        sample_rate (int): The sample rate of the audio.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        padding_duration_ms (int): Duration of padding in milliseconds.
        vad (webrtcvad.Vad): The VAD object; ``vad.is_speech`` is called per frame.
        frames (iterable): Frames exposing a ``bytes`` attribute.

    Yields:
        bytes: The concatenated bytes of each collected segment. A segment
        still being collected when ``frames`` runs out is yielded as well.
    """
    window = collections.deque(maxlen=int(padding_duration_ms / frame_duration_ms))
    # More than 90% of a full window must agree before the state flips.
    flip_threshold = 0.9 * window.maxlen
    in_speech = False
    segment = []

    for frame in frames:
        voiced = vad.is_speech(frame.bytes, sample_rate)

        if in_speech:
            # While collecting, every frame belongs to the segment; the
            # window only tracks how much recent audio was silent.
            segment.append(frame)
            window.append((frame, voiced))
            silent_count = sum(1 for _, flag in window if not flag)
            if silent_count > flip_threshold:
                yield b''.join(item.bytes for item in segment)
                window.clear()
                segment = []
                in_speech = False
            continue

        # Not collecting yet: wait for the window to fill with voiced frames.
        window.append((frame, voiced))
        voiced_count = sum(1 for _, flag in window if flag)
        if voiced_count > flip_threshold:
            in_speech = True
            # The buffered frames form the start of the new segment.
            segment.extend(item for item, _ in window)
            window.clear()

    # Flush whatever was being collected when the input ended.
    if segment:
        yield b''.join(item.bytes for item in segment)


def is_segment_empty(file_path, threshold=0.015):
    """
    Check if the audio segment is empty (too quiet to be worth processing).

    The file is loaded with ``librosa.load``, its RMS energy is computed with
    ``librosa.feature.rms`` and the mean of that result is compared against
    ``threshold``. The mean is also printed.

    Args:
        file_path (str): Path to the audio file.
        threshold (float, optional): Mean RMS below which the segment is
            treated as empty. Default is 0.015.

    Returns:
        bool: True if the mean RMS is below ``threshold``, False otherwise.
    """
    audio, _ = librosa.load(file_path)  # the returned sample rate is not needed
    rms_mean = np.mean(librosa.feature.rms(y=audio))
    print(rms_mean)

    # bool() keeps the return value a plain Python bool rather than numpy.bool_.
    return bool(rms_mean < threshold)


def process_segment(asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, order, json_path_temp, json_path_record):
    """
    Run one audio segment through the pipeline and record the outcome.

    Steps: skip (and delete) the segment if it is empty, apply noise
    reduction, transcribe, translate, then append the result to both JSON files.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        path_segments (str): Path to the audio segment.
        path_results (str): Path to save the results.
        target_lang (str): Target language for translation.
        order: Order index of the segment, passed through to ``append_text_order``.
        json_path_temp (str): Path to the temporary JSON file.
        json_path_record (str): Path to the record JSON file.

    Returns:
        None.
    """
    print("Processing segment...")
    if is_segment_empty(path_segments):
        print("No speech detected.")
        os.remove(path_segments)  # drop the empty segment from disk
        return

    # The same path is passed as both arguments -- presumably input and
    # output, i.e. the file is denoised in place; confirm in models.noise_red.
    denoise_started = time.time()
    noise_reduction(path_segments, path_segments)
    print("Noise removed. Time:", time.time() - denoise_started)

    transcription = transcribe(asr_model, path_segments, target_lang)

    print("Translating...")
    translation = translate(model_nllb, tokenizer_nllb, transcription, target_lang)

    # Anything other than "spanish" is recorded with the "en" code.
    lang_code = "es" if target_lang == "spanish" else "en"
    for json_path in (json_path_temp, json_path_record):
        append_text_order(json_path, translation, order, path_segments, path_results, lang_code, transcription)
def transcribe(asr_model, path_segments, target_lang):
    """
    Transcribe an audio segment with the ASR routine matching ``target_lang``.

    Args:
        asr_model: The ASR model for transcription.
        path_segments (str): Path to the audio segment.
        target_lang (str): ``"spanish"`` selects ``parakeet_ctc_process``,
            ``"english"`` selects ``stt_es_process``.

    Returns:
        The first element of the selected routine's result (the transcription).

    Raises:
        KeyError: If ``target_lang`` is neither ``"spanish"`` nor ``"english"``.
    """
    started = time.time()
    # Dispatch table keyed by target language.
    backends = {
        "spanish": parakeet_ctc_process,
        "english": stt_es_process,
    }
    run_asr = backends[target_lang]
    text = run_asr(asr_model, path_segments)[0]
    print("Transcription:", text)
    print("Transcription time:", time.time() - started)
    return text

def translate(model_nllb, tokenizer_nllb, text, target_lang):
    """
    Translate ``text`` via ``nllb_translate`` and report how long it took.

    Args:
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        text (str): The text to translate.
        target_lang (str): Target language for translation.

    Returns:
        The value returned by ``nllb_translate`` (the translated text).
    """
    print("Processing translation...")
    began = time.time()
    translated = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translated)
    elapsed = time.time() - began
    print("Translation time:", elapsed)
    return translated







import os
import time
import contextlib
import wave
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from concurrent.futures import ThreadPoolExecutor

# The watcher below relies on process_segment (defined above in this file)
# and on objects supplied by the caller:
# - asr_model
# - model_nllb
# - tokinizer_nllb

class NewAudioHandler(FileSystemEventHandler):
    """
    Watchdog handler that queues every newly created ``.wav`` file for processing.

    Each file is handed to ``process_segment`` on a two-worker thread pool so
    the observer callback returns without waiting for the pipeline.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokinizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language; stored but not used by this class.
        target_lang (str): Target language forwarded to ``process_segment``.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        result_dir (str): Directory in which result paths are built.
    """

    def __init__(self, asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir):
        super().__init__()
        self.asr_model = asr_model
        self.model_nllb = model_nllb
        self.tokinizer_nllb = tokinizer_nllb
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.json_file_temp = json_file_temp
        self.json_file_record = json_file_record
        self.result_dir = result_dir
        self.executor = ThreadPoolExecutor(max_workers=2)

    def on_created(self, event):
        """Queue the created path for processing if it is a ``.wav`` file."""
        # NOTE(review): this fires on creation; whether the writer has
        # finished writing the file by then is not visible here -- confirm.
        if event.is_directory or not event.src_path.endswith(".wav"):
            return
        self.process_new_audio(event.src_path)

    def process_new_audio(self, audio_path):
        """
        Submit ``audio_path`` to the thread pool for processing.

        The file's base name is used both to build the result path
        (``result_<name>`` inside ``result_dir``) and as the ``order`` argument
        of ``process_segment``.

        Args:
            audio_path (str): Path of the new audio file.
        """
        file_name = os.path.basename(audio_path)
        result_path = os.path.join(self.result_dir, f"result_{file_name}")
        print(f"Processing {audio_path}...")
        future = self.executor.submit(
            process_segment,
            self.asr_model,
            self.model_nllb,
            self.tokinizer_nllb,
            audio_path,
            result_path,
            self.target_lang,
            file_name,
            self.json_file_temp,
            self.json_file_record,
        )
        # Without this callback an exception raised inside the worker would
        # stay hidden in the discarded Future and never be reported.
        future.add_done_callback(lambda done: self._report_failure(audio_path, done))

    @staticmethod
    def _report_failure(audio_path, future):
        """Print the exception of a finished task, if it raised one."""
        if future.cancelled():
            return
        error = future.exception()
        if error is not None:
            print(f"Failed to process {audio_path}: {error!r}")

def watch_folder(asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, watch_dir="audio_segments", result_dir="results"):
    """
    Watch a folder for new audio files and process them.

    Blocks until interrupted with Ctrl+C (KeyboardInterrupt), then stops the
    observer and returns. The observer is also stopped if any other exception
    escapes the wait loop.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokinizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language of the audio.
        target_lang (str): Target language for translation.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        watch_dir (str, optional): Directory to watch for new audio files. Default is "audio_segments".
        result_dir (str, optional): Directory to save the results. Default is "results".
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(watch_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)

    event_handler = NewAudioHandler(asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir)
    observer = Observer()
    observer.schedule(event_handler, watch_dir, recursive=False)
    observer.start()
    print(f"Watching directory: {watch_dir}")

    try:
        # The observer works on its own thread; this loop only keeps the
        # caller alive until it is interrupted.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass  # Ctrl+C is the expected way to stop watching
    finally:
        # Always stop and join so the observer thread does not outlive this call.
        observer.stop()
        observer.join()

# Example usage (target_lang must be "spanish" or "english", the only keys
# transcribe() accepts):
# watch_folder(asr_model, model_nllb, tokinizer_nllb, "english", "spanish", "temp.json", "record.json")