"""Streamlit + streamlit-webrtc demo: stream microphone audio from the browser
and transcribe it live with a local Vosk model.

Run with: streamlit run <this_file>.py
"""

from collections import deque
import json
import os
import threading
import time
from typing import List

import av
import numpy as np
import pydub
import streamlit as st
from dotenv import load_dotenv
from streamlit_webrtc import WebRtcMode, webrtc_streamer
from vosk import KaldiRecognizer, Model, SetLogLevel

from sample_utils.turn import get_ice_servers

SetLogLevel(-1)  # mute Vosk's verbose logging
load_dotenv()

system_one = {
    # Sample rate in Hz (despite the key name, this is not a bit rate);
    # the Vosk small models expect 16 kHz mono PCM.
    "audio_bit_rate": 16000,
    # "audio_bit_rate": 32000,
    # "audio_bit_rate": 48000,
}

playing = st.checkbox("Playing", value=True)


def load_vosk(model: str = "small") -> KaldiRecognizer:
    """Load a Vosk model from ./models/vosk/<model> next to this file."""
    current_directory = os.path.dirname(os.path.abspath(__file__))
    model_path = os.path.join(current_directory, "models", "vosk", model)
    model_voice = Model(model_path)
    return KaldiRecognizer(model_voice, system_one["audio_bit_rate"])


vosk_recognizer = load_vosk()


def do_work(data: bytes) -> tuple[str, bool]:
    """Feed a PCM chunk to the recognizer. Returns (text, True) when Vosk
    detects the end of an utterance, or (partial text, False) while the
    speaker is still talking."""
    if vosk_recognizer.AcceptWaveform(data):
        result_json = json.loads(vosk_recognizer.Result())
        return result_json["text"], True
    result_json = json.loads(vosk_recognizer.PartialResult())
    return result_json["partial"], False


frames_deque_lock = threading.Lock()
frames_deque: deque = deque([])


async def queued_audio_frames_callback(
    frames: List[av.AudioFrame],
) -> List[av.AudioFrame]:
    # Stash the incoming frames for the main loop to consume.
    with frames_deque_lock:
        frames_deque.extend(frames)

    # Return silent frames of the same shape so the peer connection
    # keeps receiving audio.
    new_frames = []
    for frame in frames:
        input_array = frame.to_ndarray()
        new_frame = av.AudioFrame.from_ndarray(
            np.zeros(input_array.shape, dtype=input_array.dtype),
            layout=frame.layout.name,
        )
        new_frame.sample_rate = frame.sample_rate
        new_frames.append(new_frame)

    # TODO: replace with the audio we want to send to the other side.
    return new_frames


webrtc_ctx = webrtc_streamer(
    key="charles",
    desired_playing_state=playing,
    # audio_receiver_size=4096,
    queued_audio_frames_callback=queued_audio_frames_callback,
    mode=WebRtcMode.SENDRECV,
    rtc_configuration={"iceServers": get_ice_servers()},
    async_processing=True,
)

system_one_audio_status = st.empty()

if not webrtc_ctx.state.playing:
    st.stop()  # halt this script run until the stream starts

system_one_audio_status.write("Initializing...")
system_one_audio_output = st.empty()
system_one_audio_history = []
system_one_audio_history_output = st.empty()

sound_chunk = pydub.AudioSegment.empty()

while True:
    if webrtc_ctx.state.playing:
        # Drain whatever the callback has queued since the last pass.
        audio_frames = []
        with frames_deque_lock:
            while len(frames_deque) > 0:
                audio_frames.append(frames_deque.popleft())

        if len(audio_frames) == 0:
            time.sleep(0.1)
            system_one_audio_status.write("No frame arrived.")
            continue

        system_one_audio_status.write("Running. Say something!")

        # Convert each frame to mono 16 kHz PCM to match the recognizer.
        for audio_frame in audio_frames:
            sound = pydub.AudioSegment(
                data=audio_frame.to_ndarray().tobytes(),
                sample_width=audio_frame.format.bytes,
                frame_rate=audio_frame.sample_rate,
                channels=len(audio_frame.layout.channels),
            )
            sound = sound.set_channels(1)
            sound = sound.set_frame_rate(system_one["audio_bit_rate"])
            sound_chunk += sound

        if len(sound_chunk) > 0:
            buffer = np.array(sound_chunk.get_array_of_samples())
            text, speaker_finished = do_work(buffer.tobytes())
            # KaldiRecognizer keeps its own streaming state, so each chunk
            # must be fed exactly once; clear the buffer here to avoid
            # re-sending audio (and unbounded growth) between passes.
            sound_chunk = pydub.AudioSegment.empty()
            system_one_audio_output.markdown(f"**System 1 Audio:** {text}")

            if speaker_finished and len(text) > 0:
                # Keep the ten most recent utterances, newest first.
                system_one_audio_history.append(text)
                system_one_audio_history = system_one_audio_history[-10:]
                table_content = "| System 1 Audio History |\n| --- |\n"
                table_content += "\n".join(
                    f"| {item} |" for item in reversed(system_one_audio_history)
                )
                system_one_audio_history_output.markdown(table_content)
    else:
        system_one_audio_status.write("Stopped.")
        break
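

# --- Optional offline sanity check --------------------------------------
# A minimal sketch (not part of the app) for exercising do_work() without a
# browser: stream a WAV file through the recognizer in 4000-frame chunks,
# mirroring how the loop above feeds live audio. The filename below is a
# hypothetical placeholder for a 16 kHz mono PCM recording; the block is
# skipped unless such a file exists, and is only reached after the stream
# stops.
import wave

_test_wav = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "test_16k_mono.wav"
)
if os.path.exists(_test_wav):
    with wave.open(_test_wav, "rb") as wf:
        while True:
            pcm = wf.readframes(4000)
            if not pcm:
                break
            text, finished = do_work(pcm)
            if finished and text:
                st.write(text)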