import collections
import contextlib
import wave
import webrtcvad
import pyaudio
import os
import librosa
import numpy as np
from models.nllb import nllb_translate
from models.TTS_utils import append_text_order
from models.parakeet import parakeet_ctc_process
from models.es_fastconformer import stt_es_process
import time
from models.noise_red import noise_reduction

class Frame:
    """
    Represents a "frame" of audio data.
    
    Args:
        bytes (bytes): The audio data.
        timestamp (float): The timestamp of the frame.
        duration (float): The duration of the frame.
    """
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

def read_audio(stream, frame_duration_ms, rate):
    """
    Generates audio frames from the input stream.

    Args:
        stream (pyaudio.Stream): The audio stream.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        rate (int): The sample rate of the audio.

    Yields:
        bytes: The audio frames.
    """
    frames_per_buffer = int(rate * frame_duration_ms / 1000)
    while True:
        yield stream.read(frames_per_buffer)

def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Filters out non-voiced audio frames.

    Args:
        sample_rate (int): The sample rate of the audio.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        padding_duration_ms (int): Duration of padding in milliseconds.
        vad (webrtcvad.Vad): The VAD object.
        frames (generator): A generator yielding audio frames.

    Yields:
        bytes: Voiced audio frames.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                voiced_frames.extend(f for f, speech in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
                triggered = False
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])
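
# A minimal usage sketch (illustrative, not part of the original pipeline)
# showing how the pieces above fit together: open a 16 kHz mono PyAudio
# stream, wrap each 30 ms buffer in a Frame so vad_collector can read
# frame.bytes, and write every voiced segment out as a WAV file. The
# function name and defaults are assumptions for demonstration only.
def capture_segments_example(segment_dir="audio_segments", max_segments=3):
    rate, frame_ms = 16000, 30   # webrtcvad accepts 8/16/32/48 kHz and 10/20/30 ms frames
    vad = webrtcvad.Vad(3)       # mode 3 = most aggressive speech filtering
    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paInt16, channels=1, rate=rate,
                     input=True, frames_per_buffer=int(rate * frame_ms / 1000))

    def frames():
        # read_audio yields raw bytes, but vad_collector expects Frame objects.
        timestamp, duration = 0.0, frame_ms / 1000.0
        for buf in read_audio(stream, frame_ms, rate):
            yield Frame(buf, timestamp, duration)
            timestamp += duration

    os.makedirs(segment_dir, exist_ok=True)
    for order, segment in enumerate(vad_collector(rate, frame_ms, 300, vad, frames())):
        path = os.path.join(segment_dir, f"segment_{order}.wav")
        with contextlib.closing(wave.open(path, "wb")) as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)   # 16-bit PCM
            wf.setframerate(rate)
            wf.writeframes(segment)
        if order + 1 >= max_segments:
            break

    stream.stop_stream()
    stream.close()
    pa.terminate()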


def is_segment_empty(file_path):
    """
    Check if the audio segment is empty.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        bool: True if the segment is empty, False otherwise.
    """
    audio, _ = librosa.load(file_path)
    rms = librosa.feature.rms(y=audio)
    rms_mean = np.mean(rms)
    print("Segment RMS:", rms_mean)

    # Treat a segment whose mean RMS falls below this empirical threshold as silence.
    return rms_mean < 0.015


def process_segment(asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, order, json_path_temp, json_path_record):
    """
    Process an audio segment: noise reduction, transcription, translation, and append results.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        path_segments (str): Path to the audio segment.
        path_results (str): Path to save the results.
        target_lang (str): Target language for translation.
        order (int): Order index of the segment.
        json_path_temp (str): Path to the temporary JSON file.
        json_path_record (str): Path to the record JSON file.
    """
    print("Processing segment...")
    if is_segment_empty(path_segments):
        print("No speech detected.")
        # remove the empty segment
        os.remove(path_segments)
        return
    # Noise Reduction
    start_time = time.time()
    noise_reduction(path_segments, path_segments)
    print("Noise removed. Time:", time.time() - start_time)
    
    
    # Transcription
    transcription = transcribe(asr_model, path_segments, target_lang)
    #if not transcription.strip():
    #    print("No speech detected.")
    #    return
    
    # Translation
    print("Translating...")
    translation = translate(model_nllb, tokenizer_nllb, transcription, target_lang)
    
    # Text-to-Speech
    # process_tts(tts_model, translation, path_segments, target_lang, path_results)
    append_text_order(json_path_temp,translation, order, path_segments, path_results, "es" if target_lang == "spanish" else "en", transcription)
    append_text_order(json_path_record,translation, order, path_segments, path_results, "es" if target_lang == "spanish" else "en", transcription)
def transcribe(asr_model, path_segments, target_lang):
    """
    Transcribe an audio segment using the specified ASR model.

    Args:
        asr_model: The ASR model for transcription.
        path_segments (str): Path to the audio segment.
        target_lang (str): Target language for transcription.

    Returns:
        str: The transcription of the audio segment.
    """
    start_time = time.time()
    # target_lang names the translation target, so the ASR model matches the
    # source audio: English speech (Parakeet CTC) when translating to Spanish,
    # Spanish speech (FastConformer) when translating to English.
    transcription_func = {
        "spanish": parakeet_ctc_process,
        "english": stt_es_process,
    }[target_lang]
    transcription = transcription_func(asr_model, path_segments)
    print("Transcription:", transcription[0])
    print("Transcription time:", time.time() - start_time)
    return transcription[0]

def translate(model_nllb, tokenizer_nllb, text, target_lang):
    """
    Translate text using the specified NLLB model and tokenizer.

    Args:
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        text (str): The text to translate.
        target_lang (str): Target language for translation.

    Returns:
        str: The translated text.
    """
    print("Processing translation...")
    start_time = time.time()
    translation = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translation)
    print("Translation time:", time.time() - start_time)
    return translation
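
# A minimal end-to-end sketch of the segment pipeline above. Illustrative
# only: the model loaders are project-specific and not shown in this module,
# so the `...` placeholders must be replaced with real initialisation code.
def run_segment_pipeline_example():
    asr_model = ...        # ASR model, e.g. Parakeet CTC or FastConformer (placeholder)
    model_nllb = ...       # NLLB translation model (placeholder)
    tokenizer_nllb = ...   # tokenizer matching model_nllb (placeholder)
    process_segment(
        asr_model, model_nllb, tokenizer_nllb,
        path_segments="audio_segments/segment_0.wav",
        path_results="results/result_segment_0.wav",
        target_lang="spanish",   # English audio in, Spanish text out
        order=0,
        json_path_temp="temp.json",
        json_path_record="record.json",
    )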







import os
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from concurrent.futures import ThreadPoolExecutor

# Assumes process_segment (defined above) and initialised asr_model,
# model_nllb, and tokenizer_nllb objects are available to the caller.

class NewAudioHandler(FileSystemEventHandler):
    def __init__(self, asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir):
        self.asr_model = asr_model
        self.model_nllb = model_nllb
        self.tokenizer_nllb = tokenizer_nllb
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.json_file_temp = json_file_temp
        self.json_file_record = json_file_record
        self.result_dir = result_dir
        self.executor = ThreadPoolExecutor(max_workers=2)

    def on_created(self, event):
        if not event.is_directory and event.src_path.endswith(".wav"):
            self.process_new_audio(event.src_path)

    def process_new_audio(self, audio_path):
        file_name = os.path.basename(audio_path)
        result_path = os.path.join(self.result_dir, f"result_{file_name}")
        print(f"Processing {audio_path}...")
        # The file name doubles as the ordering key passed to process_segment.
        self.executor.submit(
            process_segment, self.asr_model, self.model_nllb, self.tokenizer_nllb,
            audio_path, result_path, self.target_lang, file_name,
            self.json_file_temp, self.json_file_record,
        )

def watch_folder(asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, watch_dir="audio_segments", result_dir="results"):
    """
    Watch a folder for new audio files and process them.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language of the audio.
        target_lang (str): Target language for translation.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        watch_dir (str, optional): Directory to watch for new audio files. Default is "audio_segments".
        result_dir (str, optional): Directory to save the results. Default is "results".
    """
    os.makedirs(watch_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)

    event_handler = NewAudioHandler(asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir)
    observer = Observer()
    observer.schedule(event_handler, watch_dir, recursive=False)
    observer.start()
    print(f"Watching directory: {watch_dir}")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

# Example usage (the pipeline's transcribe() only supports "english"/"spanish"):
# watch_folder(asr_model, model_nllb, tokenizer_nllb, "english", "spanish", "temp.json", "record.json")