File size: 4,636 Bytes
edce499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from collections import deque
import os
import threading
import time
import av
import numpy as np
import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer
import pydub
# import av
# import cv2
from sample_utils.turn import get_ice_servers
import json
from typing import List

from vosk import SetLogLevel, Model, KaldiRecognizer
SetLogLevel(-1) # mutes vosk verbosity

from dotenv import load_dotenv
load_dotenv()  # load environment variables (e.g. TURN credentials) from a local .env file

# Pipeline configuration shared by the recognizer and the resampling loop.
# NOTE(review): "audio_bit_rate" is used everywhere below as a SAMPLE RATE
# in Hz (KaldiRecognizer and pydub set_frame_rate both take Hz), not a bit
# rate — the key is misnamed but cannot be changed without touching all
# readers.
system_one = {
    "audio_bit_rate": 16000,
    # "audio_bit_rate": 32000,
    # "audio_bit_rate": 48000,
}


# UI toggle that drives the WebRTC component's desired_playing_state below.
playing = st.checkbox("Playing", value=True)

def load_vosk(model='small'):
    """Build a Vosk recognizer backed by a bundled model directory.

    The model files are expected at ``<dir of this file>/models/vosk/<model>``.
    The recognizer is created with the pipeline sample rate taken from
    ``system_one['audio_bit_rate']``.

    Args:
        model: Name of the model subdirectory to load (default 'small').

    Returns:
        A ready-to-use ``KaldiRecognizer`` instance.
    """
    # Resolve the model directory relative to this source file, not the CWD.
    here = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(here, 'models', 'vosk', model)
    voice_model = Model(model_dir)
    return KaldiRecognizer(voice_model, system_one['audio_bit_rate'])

vask = load_vosk()

def handle_audio_frame(frame):
    """Unused placeholder for per-frame audio handling.

    Nothing in this file calls it; the commented line below suggests the
    original intent was to feed frame data into the Vosk recognizer.
    TODO(review): implement or remove.
    """
    # if self.vosk.AcceptWaveform(data):
    pass


def do_work(data: bytearray) -> tuple[str, bool]:
    """Feed a chunk of raw PCM audio to the module-level Vosk recognizer.

    Args:
        data: Raw 16-bit mono PCM bytes at the sample rate the recognizer
            was created with (``system_one['audio_bit_rate']``).

    Returns:
        A ``(text, speaker_finished)`` pair. When Vosk decides the
        utterance is complete, ``text`` is the final transcription and
        ``speaker_finished`` is True; otherwise ``text`` is the running
        partial hypothesis and ``speaker_finished`` is False.
    """
    # AcceptWaveform's truth value IS the "utterance finished" signal, so
    # use it directly instead of a separately tracked flag.
    speaker_finished = bool(vask.AcceptWaveform(data))
    if speaker_finished:
        # Final result for a completed utterance.
        result_json = json.loads(vask.Result())
        text = result_json.get('text', '')
    else:
        # Interim hypothesis while the speaker is still talking.
        result_json = json.loads(vask.PartialResult())
        text = result_json.get('partial', '')
    # .get(..., '') guards against a result payload missing the expected key.
    return text, speaker_finished


# Handoff buffer between the WebRTC callback (producer, worker thread) and
# the main loop (consumer). All access is guarded by frames_deque_lock.
frames_deque_lock = threading.Lock()
frames_deque: deque = deque([])

async def queued_audio_frames_callback(
            frames: List[av.AudioFrame],
        ) -> List[av.AudioFrame]:
    """WebRTC audio callback: queue incoming frames, return silence.

    Incoming frames are appended to the shared ``frames_deque`` (under
    ``frames_deque_lock``) for the main loop to transcribe. For each input
    frame an all-zero frame with matching shape, dtype, format, layout and
    sample rate is returned, so the remote peer hears silence.

    Fixes vs. original: the return annotation said ``av.AudioFrame`` but a
    list is returned; ``from_ndarray`` was implicitly defaulting to the
    's16' format instead of matching the source frame's format.
    """
    with frames_deque_lock:
        frames_deque.extend(frames)

    # Build one silent frame per input frame, mirroring its parameters.
    new_frames = []
    for frame in frames:
        input_array = frame.to_ndarray()
        new_frame = av.AudioFrame.from_ndarray(
            np.zeros(input_array.shape, dtype=input_array.dtype),
            format=frame.format.name,  # match source format (was default 's16')
            layout=frame.layout.name,
        )
        new_frame.sample_rate = frame.sample_rate
        new_frames.append(new_frame)

    # TODO: replace with the audio we want to send to the other side.

    return new_frames

# Start the Streamlit WebRTC component in send/receive mode. Incoming audio
# is delivered to queued_audio_frames_callback on a worker thread;
# async_processing=True keeps that work off the Streamlit script thread.
webrtc_ctx = webrtc_streamer(
    key="charles",
    desired_playing_state=playing,
    # audio_receiver_size=4096,
    # audio_frame_callback=process_audio,
    queued_audio_frames_callback=queued_audio_frames_callback,
    mode=WebRtcMode.SENDRECV,
    rtc_configuration={"iceServers": get_ice_servers()},
    async_processing=True,
)

# Placeholder that the loop below rewrites with the pipeline's status.
system_one_audio_status = st.empty()

# End this script run until the stream actually starts playing.
# BUG FIX: the original line was a bare `exit` — an expression that merely
# references the builtin and does nothing. st.stop() is the Streamlit way
# to terminate the current run.
if not webrtc_ctx.state.playing:
    st.stop()

system_one_audio_status.write("Initializing...")
# Placeholders for the live transcription and the rolling history table.
system_one_audio_output = st.empty()
system_one_audio_history = []
system_one_audio_history_output = st.empty()

# Main transcription loop: drain frames queued by the WebRTC callback,
# normalize them to mono at the recognizer's sample rate, and stream the
# accumulated PCM into Vosk via do_work().
sound_chunk = pydub.AudioSegment.empty()
while True:
    if webrtc_ctx.state.playing:
        # Drain everything the callback thread queued since the last pass.
        audio_frames = []
        with frames_deque_lock:
            while len(frames_deque) > 0:
                frame = frames_deque.popleft()
                audio_frames.append(frame)

        if len(audio_frames) == 0:
            # Nothing arrived yet — back off briefly and poll again.
            time.sleep(0.1)
            system_one_audio_status.write("No frame arrived.")
            continue

        system_one_audio_status.write("Running. Say something!")

        # Wrap each av.AudioFrame's raw samples in a pydub segment, then
        # downmix to mono and resample to the recognizer's rate before
        # appending to the running chunk.
        for audio_frame in audio_frames:
            sound = pydub.AudioSegment(
                data=audio_frame.to_ndarray().tobytes(),
                sample_width=audio_frame.format.bytes,
                frame_rate=audio_frame.sample_rate,
                channels=len(audio_frame.layout.channels),
            )
            sound = sound.set_channels(1)
            sound = sound.set_frame_rate(system_one['audio_bit_rate'])
            sound_chunk += sound

        if len(sound_chunk) > 0:
            # Feed the accumulated PCM to Vosk and render the hypothesis.
            buffer = np.array(sound_chunk.get_array_of_samples())
            text, speaker_finished = do_work(buffer.tobytes())            
            system_one_audio_output.markdown(f"**System 1 Audio:** {text}")
            if speaker_finished and len(text) > 0:
                # Keep only the 10 most recent utterances, shown newest-first.
                system_one_audio_history.append(text)
                if len(system_one_audio_history) > 10:
                    system_one_audio_history = system_one_audio_history[-10:]
                table_content = "| System 1 Audio History |\n| --- |\n"
                table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
                system_one_audio_history_output.markdown(table_content)
            # Reset the chunk after every recognizer call (partial or final).
            sound_chunk = pydub.AudioSegment.empty()

    else:
        system_one_audio_status.write("Stopped.")
        break