File size: 15,276 Bytes
2d522b6
366edf8
 
 
 
 
 
 
 
 
 
 
 
2d522b6
366edf8
 
2d522b6
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d522b6
366edf8
 
 
2d522b6
 
366edf8
 
 
 
 
 
 
 
 
 
 
 
8e82d74
366edf8
 
 
 
 
2d522b6
366edf8
 
 
2d522b6
366edf8
 
 
2d522b6
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
2d522b6
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1e0588
366edf8
 
 
 
 
 
2d522b6
366edf8
8e82d74
366edf8
 
 
8e82d74
 
366edf8
 
 
 
 
 
 
 
 
 
 
8e82d74
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e82d74
2d522b6
c1e0588
 
 
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d522b6
366edf8
 
 
 
 
 
 
8e82d74
366edf8
 
 
 
 
 
 
8e82d74
366edf8
 
 
8e82d74
366edf8
 
 
 
 
2d522b6
366edf8
 
2d522b6
366edf8
 
c1e0588
2d522b6
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d522b6
366edf8
 
 
 
 
 
 
2d522b6
 
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1e0588
366edf8
 
2d522b6
c1e0588
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
fd69a21
366edf8
 
c1e0588
366edf8
 
 
c1e0588
 
 
 
 
366edf8
c1e0588
366edf8
c1e0588
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1e0588
366edf8
 
 
 
 
 
 
c1e0588
366edf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d522b6
366edf8
c1e0588
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426

from typing import Any, List, Tuple, Union, Optional
import numpy as np
import soundfile
import io
import asyncio
from simuleval.agents.pipeline import TreeAgentPipeline
from simuleval.agents.states import AgentStates
from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
import threading
import math
import logging
import sys
from pathlib import Path
import time
from g2p_en import G2p
import torch
import traceback
import time
import random
import colorlog


# Sample rate (Hz) used as the conversion target for incoming audio and as the
# divisor when measuring speech length in seconds.
MODEL_SAMPLE_RATE = 16_000

# Colorized log line layout: [time][level][module]: message
formatter = colorlog.ColoredFormatter(
    "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
    reset=True,
    log_colors={
        "DEBUG": "cyan",
        "INFO": "green",
        "WARNING": "yellow",
        "ERROR": "red",
        "CRITICAL": "red,bg_white",
    },
)

# All records from this module are written to stdout through one handler.
handler = colorlog.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Module logger: emits everything from DEBUG up and does not propagate to the
# root logger, so records are only printed once (by the handler above).
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False
logger.addHandler(handler)


class SpeechAndTextOutput:
    """Plain container for one chunk of output: text and/or speech samples.

    Attributes:
        text: output text, or None when there is no text output.
        speech_samples: list of audio samples, or None when there is no
            speech output.
        speech_sample_rate: sample rate of ``speech_samples``, or None.
        final: flag set by the producer of this output (defaults to False).
    """

    def __init__(
        self,
        text: Optional[str] = None,
        speech_samples: Optional[list] = None,
        speech_sample_rate: Optional[float] = None,
        final: bool = False,
    ):
        self.text = text
        self.speech_samples = speech_samples
        self.speech_sample_rate = speech_sample_rate
        self.final = final

    def __repr__(self) -> str:
        # Report only the number of samples: the raw sample list can be very
        # long and would make log lines unreadable.
        num_samples = (
            None if self.speech_samples is None else len(self.speech_samples)
        )
        return (
            f"{self.__class__.__name__}(text={self.text!r}, "
            f"num_speech_samples={num_samples}, "
            f"speech_sample_rate={self.speech_sample_rate!r}, "
            f"final={self.final!r})"
        )

class OutputSegments:
    """Wrapper around the segment(s) returned by one pipeline step.

    A single ``Segment`` is normalized to a one-element list so the rest of
    the class can always iterate over ``self.segments``.
    """

    def __init__(self, segments: Union[List[Segment], Segment]):
        if isinstance(segments, Segment):
            segments = [segments]
        # Copy so later changes to the caller's list do not affect us.
        self.segments: List[Segment] = list(segments)

    @property
    def is_empty(self):
        """True when every wrapped segment reports ``is_empty``."""
        return all(segment.is_empty for segment in self.segments)

    @property
    def finished(self):
        """True when every wrapped segment reports ``finished``."""
        return all(segment.finished for segment in self.segments)

    def compute_length(self, g2p):
        """Return the largest per-segment length.

        Text segments are measured as the number of non-space items returned
        by ``g2p(segment.content)``; speech segments as
        ``len(content) / MODEL_SAMPLE_RATE``. ``EmptySegment`` instances are
        skipped and any other data type is logged and ignored.

        Args:
            g2p: callable applied to the text content of text segments.

        Returns:
            The maximum length over all measurable segments, or 0 when no
            segment could be measured.
        """
        lengths = []
        for segment in self.segments:
            if segment.data_type == "text":
                lengths.append(len([x for x in g2p(segment.content) if x != " "]))
            elif segment.data_type == "speech":
                lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
            elif isinstance(segment, EmptySegment):
                continue
            else:
                logger.warning(
                    f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
                )
        # default=0 avoids a ValueError from max() when nothing was measurable.
        return max(lengths, default=0)

    @classmethod
    def join_output_buffer(
        cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
    ):
        """Merge buffered segments column by column into ``output``.

        ``buffer`` is a list of rows; position ``i`` of every row is treated
        as the same output stream. For each position the segments whose
        ``data_type`` is not None are collected: text contents are joined
        with a single space into ``output.text``; speech contents are
        concatenated into ``output.speech_samples`` and the sample rate of
        the last collected segment is stored in ``output.speech_sample_rate``.
        Positions with mixed or unknown data types are logged and skipped.

        Args:
            buffer: rows of segments; the number of positions is taken from
                the first row.
            output: object to fill in; it is mutated and returned.

        Returns:
            The same ``output`` object.
        """
        if not buffer:
            # Nothing buffered: leave ``output`` untouched instead of failing
            # on buffer[0].
            return output

        num_segments = len(buffer[0])
        for i in range(num_segments):
            segment_list = [row[i] for row in buffer if row[i].data_type is not None]
            if len(segment_list) == 0:
                continue
            data_types = set(segment.data_type for segment in segment_list)
            if len(data_types) != 1:
                logger.warning(f"Data type mismatch at {i}: {data_types}")
                continue
            data_type = segment_list[0].data_type
            if data_type == "text":
                if output.text is not None:
                    logger.warning("Multiple text outputs, overwriting!")
                output.text = " ".join([segment.content for segment in segment_list])
            elif data_type == "speech":
                if output.speech_samples is not None:
                    logger.warning("Multiple speech outputs, overwriting!")
                speech_out = []
                for segment in segment_list:
                    speech_out += segment.content
                output.speech_samples = speech_out
                # Explicitly use the last segment rather than relying on the
                # loop variable leaking out of the for-loop.
                output.speech_sample_rate = segment_list[-1].sample_rate
            elif isinstance(segment_list[0], EmptySegment):
                continue
            else:
                logger.warning(
                    f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text'"
                )

        return output

    def __repr__(self) -> str:
        repr_str = str(self.segments)
        return f"{self.__class__.__name__}(\n\t{repr_str}\n)"


def convert_waveform(
    waveform: Union[np.ndarray, torch.Tensor],
    sample_rate: int,
    normalize_volume: bool = False,
    to_mono: bool = False,
    to_sample_rate: Optional[int] = None,
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
    """Convert a waveform with sox effects:
    - to a target sample rate
    - from multi-channel to mono channel
    - volume normalization

    When none of the requested conversions is actually needed, the input is
    returned unchanged and torchaudio is not imported at all.

    Args:
        waveform (numpy.ndarray or torch.Tensor): 2D original waveform
            (channels x length)
        sample_rate (int): original sample rate
        normalize_volume (bool): perform volume normalization
        to_mono (bool): convert to mono channel if having multiple channels
        to_sample_rate (Optional[int]): target sample rate
    Returns:
        waveform (numpy.ndarray or torch.Tensor): converted 2D waveform
            (channels x length), same container type as the input
        sample_rate (int): sample rate of the returned waveform
    Raises:
        ImportError: if a conversion is required and torchaudio is missing.
    """
    effects = []
    if normalize_volume:
        effects.append(["gain", "-n"])
    if to_sample_rate is not None and to_sample_rate != sample_rate:
        effects.append(["rate", f"{to_sample_rate}"])
    if to_mono and waveform.shape[0] > 1:
        effects.append(["channels", "1"])

    if not effects:
        # Nothing to apply: do not require torchaudio for a no-op.
        return waveform, sample_rate

    # Imported lazily so torchaudio is only needed when effects are applied.
    try:
        import torchaudio.sox_effects as ta_sox
    except ImportError as err:
        raise ImportError(
            "Please install torchaudio: pip install torchaudio"
        ) from err

    # sox effects operate on tensors; remember the input type so numpy input
    # gets numpy output.
    is_np_input = isinstance(waveform, np.ndarray)
    _waveform = torch.from_numpy(waveform) if is_np_input else waveform
    converted, converted_sample_rate = ta_sox.apply_effects_tensor(
        _waveform, sample_rate, effects
    )
    if is_np_input:
        converted = converted.numpy()
    return converted, converted_sample_rate

class SimulevalTranscoder:
    """Feed streamed audio through a SimulEval agent and buffer its output.

    Flow, as implemented below:
      1. ``process_incoming_bytes`` converts raw PCM bytes into a
         ``SpeechSegment`` and puts it on ``input_queue``.
      2. ``process_pipeline_once`` / ``process_pipeline_loop`` pop segments,
         run ``agent.pushpop`` and put non-empty results on ``output_queue``.
      3. ``get_buffered_output`` drains ``output_queue`` into
         ``output_buffer`` and flushes it as a ``SpeechAndTextOutput`` when
         an output is finished, the buffer has been idle for
         ``output_buffer_idle_ms`` or its size reaches
         ``output_buffer_size_limit``.

    The worker loop is stopped by setting ``self.close = True``.
    """

    def __init__(self, agent, sample_rate, debug, buffer_limit):
        """Build agent states, queues and (in debug mode) debug wav files.

        Args:
            agent: pipeline object providing ``build_states`` and ``pushpop``.
            sample_rate: sample rate of the incoming raw audio.
            debug: when truthy, enables debug logging and debug wav dumps.
            buffer_limit: output buffer size that triggers a flush
                (phonemes for text, seconds for speech).
        """
        # agent is stateless
        self.agent = agent
        # NOTE(review): asyncio.Queue is documented as not thread-safe, yet
        # start() consumes these queues from a worker thread. Only the
        # *_nowait methods are used here; consider queue.Queue — confirm with
        # callers before changing the attribute types.
        self.input_queue = asyncio.Queue()
        self.output_queue = asyncio.Queue()
        self.states = self.agent.build_states()
        if debug:
            self.get_states_root().debug = True
        self.incoming_sample_rate = sample_rate
        self.close = False
        self.g2p = G2p()

        # buffer all outgoing translations within this amount of time
        self.output_buffer_idle_ms = 5000
        self.output_buffer_size_limit = (
            buffer_limit  # phonemes for text, seconds for speech
        )
        self.output_buffer_cur_size = 0
        self.output_buffer: List[List[Segment]] = []
        self.speech_output_sample_rate = None

        self.last_output_ts = time.time() * 1000
        self.timeout_ms = (
            30000  # close the transcoder thread after this amount of silence
        )
        self.first_input_ts = None
        self.first_output_ts = None
        self.debug = debug
        self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
        if self.debug:
            debug_folder = Path(__file__).resolve().parent.parent / "debug"
            # Make sure the dump directory exists; opening the wav files
            # below fails otherwise.
            debug_folder.mkdir(parents=True, exist_ok=True)
            # NOTE(review): these SoundFile handles are never closed in this
            # class.
            self.test_incoming_wav = soundfile.SoundFile(
                debug_folder / f"{self.debug_ts}_test_incoming.wav",
                mode="w+",
                format="WAV",
                subtype="PCM_16",
                samplerate=self.incoming_sample_rate,
                channels=1,
            )
            self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
                debug_folder / f"{self.debug_ts}_test_input_segments.wav",
                mode="w+",
                format="WAV",
                samplerate=MODEL_SAMPLE_RATE,
                channels=1,
            )

    def get_states_root(self) -> AgentStates:
        """Return the states object of the first (source) module."""
        if isinstance(self.agent, TreeAgentPipeline):
            # self.states is a dict
            return self.states[self.agent.source_module]
        else:
            # self.states is a list
            return self.states[0]

    def reset_states(self):
        """Call ``reset()`` on every state object of the agent."""
        if isinstance(self.agent, TreeAgentPipeline):
            states_iter = self.states.values()
        else:
            states_iter = self.states
        for state in states_iter:
            state.reset()

    def debug_log(self, *args):
        """Forward ``args`` to ``logger.info`` only when debug is enabled."""
        if self.debug:
            logger.info(*args)

    def process_incoming_bytes(self, incoming_bytes, target_language, sample_rate):
        """Convert raw audio bytes to a ``SpeechSegment`` and enqueue it.

        Args:
            incoming_bytes: raw audio, read as 16-bit signed PCM.
            target_language: value passed as ``tgt_lang`` of the segment.
            sample_rate: sample rate of ``incoming_bytes``; it replaces
                ``self.incoming_sample_rate``.
        """
        # TODO: currently just taking sample rate here, refactor sample rate
        # bytes is 16bit signed int
        self.incoming_sample_rate = sample_rate
        segment, sr = self._preprocess_wav(incoming_bytes)

        segment = SpeechSegment(
            content=segment, sample_rate=sr, tgt_lang=target_language
        )
        # # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
        self.input_queue.put_nowait(segment)
        print("process_incoming: put input_queue")

    def get_input_segment(self):
        """Pop the next queued input segment, or return None if none is queued."""
        if self.input_queue.empty():
            return None
        chunk = self.input_queue.get_nowait()
        self.input_queue.task_done()
        return chunk

    def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
        """Decode raw PCM bytes and convert them to the model sample rate.

        ``data`` is read as mono 16-bit PCM at ``self.incoming_sample_rate``
        into a float32 array, optionally appended to the debug wav, then
        converted to ``MODEL_SAMPLE_RATE`` via ``convert_waveform``.

        Returns:
            (samples, sample_rate): the waveform with its channel axis
            squeezed away, and the resulting sample rate.
        """
        segment, sample_rate = soundfile.read(
            io.BytesIO(data),
            dtype="float32",
            always_2d=True,
            frames=-1,
            start=0,
            format="RAW",
            subtype="PCM_16",
            samplerate=self.incoming_sample_rate,
            channels=1,
        )
        if self.debug:
            self.test_incoming_wav.seek(0, soundfile.SEEK_END)
            self.test_incoming_wav.write(segment)

        # soundfile returns (frames x channels); convert_waveform expects
        # (channels x length).
        segment = segment.T
        segment, new_sample_rate = convert_waveform(
            segment,
            sample_rate,
            normalize_volume=False,
            to_mono=True,
            to_sample_rate=MODEL_SAMPLE_RATE,
        )

        assert MODEL_SAMPLE_RATE == new_sample_rate
        segment = segment.squeeze(axis=0)
        return segment, new_sample_rate

    def process_pipeline_impl(self, input_segment):
        """Run one input segment through the agent and enqueue its output.

        Non-empty results are put on ``output_queue``. When the result is
        finished, all agent states are reset. Any exception is logged and
        swallowed so the caller's loop keeps running.

        Returns:
            The ``input_segment`` that was passed in.
        """
        try:
            with torch.no_grad():
                output_segment = OutputSegments(
                    self.agent.pushpop(input_segment, self.states)
                )
            if (
                self.get_states_root().first_input_ts is not None
                and self.first_input_ts is None
            ):
                # TODO: this is hacky
                self.first_input_ts = self.get_states_root().first_input_ts

            if not output_segment.is_empty:
                print("PUT IN OUTPUT QUEUE")
                self.output_queue.put_nowait(output_segment)

            if output_segment.finished:
                print("OUTPUT SEGMENT IS FINISHED. Resetting states.")

                self.reset_states()

                if self.debug:
                    # when we rebuild states, this value is reset to whatever
                    # is in the system dir config, which defaults debug=False.
                    self.get_states_root().debug = True
        except Exception as e:
            logger.error(f"Got exception while processing pipeline: {e}")
            traceback.print_exc()
        return input_segment

    def process_pipeline_loop(self):
        """Process queued input segments until ``self.close`` becomes true.

        Polls the input queue, sleeping 0.3 s between polls while the root
        state is fresh and 0.03 s otherwise.
        """
        if self.close:
            print("transcoder closed")
            return  # closes the thread

        print("processing_pipeline")
        while not self.close:
            input_segment = self.get_input_segment()
            if input_segment is None:
                if self.get_states_root().is_fresh_state:  # TODO: this is hacky
                    time.sleep(0.3)
                    print("loop: input_queue empty")
                else:
                    time.sleep(0.03)
                continue
            print("loop: got input_segment")
            self.process_pipeline_impl(input_segment)
        print("finished processing_pipeline")

    def process_pipeline_once(self):
        """Process at most one queued input segment (no-op when closed)."""
        if self.close:
            return

        self.debug_log("processing pipeline once")
        input_segment = self.get_input_segment()
        if input_segment is None:
            return
        self.process_pipeline_impl(input_segment)
        self.debug_log("finished processing_pipeline_once")

    def get_output_segment(self):
        """Pop the next queued output, or return None if none is queued."""
        if self.output_queue.empty():
            return None

        output_chunk = self.output_queue.get_nowait()
        self.output_queue.task_done()
        return output_chunk

    def start(self):
        """Run ``process_pipeline_loop`` in a new thread."""
        print("starting transcoder in a thread")
        threading.Thread(target=self.process_pipeline_loop).start()

    def first_translation_time(self):
        """Return seconds between ``first_input_ts`` and ``first_output_ts``.

        Both timestamps are treated as milliseconds; the result is rounded to
        two decimals. Raises ``TypeError`` while either timestamp is still
        None.
        """
        return round((self.first_output_ts - self.first_input_ts) / 1000, 2)

    def get_buffered_output(self) -> Optional[SpeechAndTextOutput]:
        """Drain the output queue into the buffer and flush it when due.

        A flush happens when
          * a drained output is finished (returned with ``final=True``), or
          * the buffer is non-empty and either ``output_buffer_idle_ms`` has
            passed since ``last_output_ts`` or the accumulated size reached
            ``output_buffer_size_limit`` (returned with ``final=False``).

        Returns:
            The gathered ``SpeechAndTextOutput``, or None when nothing is
            flushed on this call.
        """
        now = time.time() * 1000
        print(f"get_buffered_output queue size: {self.output_queue.qsize()}")
        while not self.output_queue.empty():
            tmp_out = self.get_output_segment()
            # Measure once and reuse: compute_length runs g2p on text.
            tmp_len = tmp_out.compute_length(self.g2p) if tmp_out else 0
            if tmp_len > 0:
                if len(self.output_buffer) == 0:
                    # The idle timer starts with the first buffered output.
                    self.last_output_ts = now
                self._populate_output_buffer(tmp_out)
                self._increment_output_buffer_size(tmp_out, tmp_len)

                if tmp_out.finished:
                    self.debug_log("tmp_out.finished")
                    return self._flush_output_buffer(now, final=True)
            else:
                self.debug_log("tmp_out.compute_length is not > 0")

        if len(self.output_buffer) > 0 and (
            now - self.last_output_ts >= self.output_buffer_idle_ms
            or self.output_buffer_cur_size >= self.output_buffer_size_limit
        ):
            self.debug_log(
                "[get_buffered_output] output_buffer is not empty. getting res to return."
            )
            return self._flush_output_buffer(now, final=False)
        else:
            self.debug_log("[get_buffered_output] output_buffer is empty...")
            return None

    def _flush_output_buffer(self, now, final):
        """Gather the buffered output, then reset the buffer and its size."""
        res = self._gather_output_buffer_data(final=final)
        self.debug_log(f"gathered output data: {res}")
        self.output_buffer = []
        # Reset the counter that the size-limit check actually reads.
        self.output_buffer_cur_size = 0
        self.last_output_ts = now
        # NOTE(review): despite its name this is overwritten on every flush,
        # not only the first one — confirm the intended meaning.
        self.first_output_ts = now
        return res

    def _gather_output_buffer_data(self, final):
        """Join ``output_buffer`` into a new ``SpeechAndTextOutput``."""
        output = SpeechAndTextOutput()
        output.final = final
        output = OutputSegments.join_output_buffer(self.output_buffer, output)
        return output

    def _increment_output_buffer_size(self, segment: OutputSegments, length=None):
        """Add the length of ``segment`` to ``output_buffer_cur_size``.

        ``length`` may carry an already computed ``compute_length`` result so
        g2p is not run a second time; when omitted it is computed here.
        """
        if length is None:
            length = segment.compute_length(self.g2p)
        self.output_buffer_cur_size += length

    def _populate_output_buffer(self, segment: OutputSegments):
        """Append the segments of ``segment`` as one row of the buffer."""
        self.output_buffer.append(segment.segments)

    def _compute_phoneme_count(self, string: str) -> int:
        """Count the non-space items returned by ``self.g2p(string)``."""
        return len([x for x in self.g2p(string) if x != " "])