# project_charles/d_app.py
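"""Streamlit app: streams audio/video from the browser over WebRTC, runs the
audio through Vosk speech-to-text, and enqueues finished utterances into a
chat pipeline. Video frames are drained but not yet rendered (prep work)."""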
import asyncio
import traceback

import numpy as np
import pydub
import streamlit as st
from dotenv import load_dotenv
from streamlit_webrtc import WebRtcMode, webrtc_streamer
from vosk import SetLogLevel

from sample_utils.turn import get_ice_servers

SetLogLevel(-1)  # mute Vosk's verbose logging
load_dotenv()
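# Note: get_ice_servers() comes from streamlit-webrtc's sample utilities; in
# those samples it typically reads Twilio TURN credentials from environment
# variables (hence load_dotenv() above) and falls back to a public STUN server.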
# Module-level handle so the cleanup code in __main__ can reset the streamer.
webrtc_ctx = None


async def main():
    global webrtc_ctx  # assigned below; lets the __main__ cleanup see it
system_one_audio_status = st.empty()
playing = st.checkbox("Playing", value=True)
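    # The checkbox value feeds webrtc_streamer's desired_playing_state below,
    # giving the user a pause/resume toggle for the stream.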
system_one_audio_status.write("Initializing streaming")
system_one_audio_output = st.empty()
system_one_video_output = st.empty()
system_one_audio_history = []
system_one_audio_history_output = st.empty()
    # Lazily create shared resources and cache them in st.session_state so
    # they survive Streamlit's script reruns.
if "streamlit_av_queue" not in st.session_state:
print("001")
from streamlit_av_queue import StreamlitAVQueue
st.session_state.streamlit_av_queue = StreamlitAVQueue()
if "speech_to_text_vosk" not in st.session_state:
print("002")
from speech_to_text_vosk import SpeechToTextVosk
st.session_state.speech_to_text_vosk = SpeechToTextVosk()
    if "chat_pipeline" not in st.session_state:
        from chat_pipeline import ChatPipeline
        st.session_state.chat_pipeline = ChatPipeline()
        await st.session_state.chat_pipeline.start()
if "debug_queue" not in st.session_state:
st.session_state.debug_queue = [
# "hello, how are you today?",
# "hmm, interesting, tell me more about that.",
]
system_one_audio_status.write("resources referecned")
print("010")
system_one_audio_status.write("Initializing webrtc_streamer")
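    # SENDRECV both receives the browser's mic/camera and can send media back.
    # The queued_*_frames_callback hooks run on streamlit-webrtc's worker
    # thread with batches of av frames; StreamlitAVQueue buffers them for the
    # polling loop below.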
webrtc_ctx = webrtc_streamer(
key="charles",
desired_playing_state=playing,
queued_audio_frames_callback=st.session_state.streamlit_av_queue.queued_audio_frames_callback,
queued_video_frames_callback=st.session_state.streamlit_av_queue.queued_video_frames_callback,
mode=WebRtcMode.SENDRECV,
rtc_configuration={"iceServers": get_ice_servers()},
async_processing=True,
)
    if not webrtc_ctx.state.playing:
        return  # nothing to do until the browser starts streaming
system_one_audio_status.write("Initializing speech")
try:
while True:
if not webrtc_ctx.state.playing:
system_one_audio_status.write("Stopped.")
await asyncio.sleep(0.1)
continue
system_one_audio_status.write("Streaming.")
if len(st.session_state.debug_queue) > 0:
prompt = st.session_state.debug_queue.pop(0)
await st.session_state.chat_pipeline.enqueue(prompt)
sound_chunk = pydub.AudioSegment.empty()
audio_frames = st.session_state.streamlit_av_queue.get_audio_frames()
if len(audio_frames) > 0:
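                # Convert each av.AudioFrame to a pydub segment, downmix to
                # mono, and resample to the recognizer's expected rate before
                # handing raw PCM bytes to Vosk.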
for audio_frame in audio_frames:
sound = pydub.AudioSegment(
data=audio_frame.to_ndarray().tobytes(),
sample_width=audio_frame.format.bytes,
frame_rate=audio_frame.sample_rate,
channels=len(audio_frame.layout.channels),
)
sound = sound.set_channels(1)
sound = sound.set_frame_rate(st.session_state.speech_to_text_vosk.get_audio_bit_rate())
sound_chunk += sound
buffer = np.array(sound_chunk.get_array_of_samples())
st.session_state.speech_to_text_vosk.add_speech_bytes(buffer.tobytes())
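            # get_text() is assumed to return the transcript accumulated so
            # far plus a flag that flips to True once Vosk reports a final
            # (end-of-utterance) result.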
prompt, speaker_finished = st.session_state.speech_to_text_vosk.get_text()
if speaker_finished and len(prompt) > 0:
print(f"Prompt: {prompt}")
system_one_audio_history.append(prompt)
if len(system_one_audio_history) > 10:
system_one_audio_history = system_one_audio_history[-10:]
table_content = "| System 1 Audio History |\n| --- |\n"
table_content += "\n".join([f"| {item} |" for item in reversed(system_one_audio_history)])
system_one_audio_history_output.markdown(table_content)
await st.session_state.chat_pipeline.enqueue(prompt)
video_frames = st.session_state.streamlit_av_queue.get_video_frames()
if len(video_frames) > 0:
                # Video handling is still a stub; frames are drained so the
                # queue does not grow unbounded, e.g.:
                # for video_frame in video_frames:
                #     system_one_video_output.image(video_frame.to_ndarray())
                pass
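            # Pace the loop at roughly 10 Hz: cheap polling while keeping the
            # asyncio event loop responsive.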
await asyncio.sleep(0.1)
    except Exception as e:
        print(f"An error occurred: {e}")
        traceback.print_exc()
        raise

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except Exception:
        # Drop cached resources so the next Streamlit rerun starts clean.
        webrtc_ctx = None
        for key in ("streamlit_av_queue", "speech_to_text_vosk", "chat_pipeline"):
            if key in st.session_state:
                del st.session_state[key]