start of moving to ray
- d_app.py +9 -13
- input_av_queue_actor.py +37 -0
- streamlit_av_queue.py +40 -20
d_app.py
CHANGED
@@ -24,6 +24,12 @@ load_dotenv()
 
 webrtc_ctx = None
 
+# Initialize Ray
+import ray
+if not ray.is_initialized():
+    ray.init()
+
+
 
 async def main():
 
@@ -98,21 +104,11 @@ async def main():
         if len(st.session_state.debug_queue) > 0:
             prompt = st.session_state.debug_queue.pop(0)
             await st.session_state.chat_pipeline.enqueue(prompt)
-        sound_chunk = pydub.AudioSegment.empty()
         audio_frames = st.session_state.streamlit_av_queue.get_audio_frames()
         if len(audio_frames) > 0:
-            for audio_frame in audio_frames:
-                sound = pydub.AudioSegment(
-                    data=audio_frame.to_ndarray().tobytes(),
-                    sample_width=audio_frame.format.bytes,
-                    frame_rate=audio_frame.sample_rate,
-                    channels=len(audio_frame.layout.channels),
-                )
-                sound = sound.set_channels(1)
-                sound = sound.set_frame_rate(st.session_state.speech_to_text_vosk.get_audio_bit_rate())
-                sound_chunk += sound
-            buffer = np.array(sound_chunk.get_array_of_samples())
-            st.session_state.speech_to_text_vosk.add_speech_bytes(buffer.tobytes())
+            # Concatenate all audio frames into a single buffer
+            audio_buffer = b"".join([buffer.tobytes() for buffer in audio_frames])
+            st.session_state.speech_to_text_vosk.add_speech_bytes(audio_buffer)
         prompt, speaker_finished = st.session_state.speech_to_text_vosk.get_text()
         if speaker_finished and len(prompt) > 0:
             print(f"Prompt: {prompt}")
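In the new version, get_audio_frames() is assumed to hand back numpy sample buffers pulled from the Ray actor rather than av.AudioFrame objects, so the pydub decoding drops out of the app loop entirely. A minimal sketch of the new consumer contract, using synthetic int16 buffers in place of real captures:

    import numpy as np

    # Stand-ins for what StreamlitAVQueue.get_audio_frames() now returns:
    # mono int16 arrays already resampled for the recognizer.
    frames = [np.zeros(160, dtype=np.int16), np.ones(160, dtype=np.int16)]

    # The app-loop step from the diff: concatenate raw bytes for Vosk.
    audio_buffer = b"".join(buf.tobytes() for buf in frames)
    assert len(audio_buffer) == 2 * 160 * 2  # 2 frames x 160 samples x 2 bytes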
input_av_queue_actor.py
ADDED
@@ -0,0 +1,37 @@
+import ray
+from ray.util.queue import Queue
+from ray.actor import ActorHandle
+import torch
+import numpy as np
+
+
+@ray.remote
+class InputAVQueueActor:
+    def __init__(self):
+        self.audio_queue = Queue(maxsize=100)  # Adjust the size as needed
+        self.video_queue = Queue(maxsize=100)  # Adjust the size as needed
+
+    def enqueue_video_frame(self, shared_tensor_ref):
+        self.video_queue.put(shared_tensor_ref)
+
+    def enqueue_audio_frame(self, shared_buffer_ref):
+        self.audio_queue.put(shared_buffer_ref)
+
+
+    def get_audio_frames(self):
+        audio_frames = []
+        if self.audio_queue.empty():
+            return audio_frames
+        while not self.audio_queue.empty():
+            shared_tensor_ref = self.audio_queue.get()
+            audio_frames.append(shared_tensor_ref)
+        return audio_frames
+
+    def get_video_frames(self):
+        video_frames = []
+        if self.video_queue.empty():
+            return video_frames
+        while not self.video_queue.empty():
+            shared_tensor_ref = self.video_queue.get()
+            video_frames.append(shared_tensor_ref)
+        return video_frames
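The actor is just a pair of bounded ray.util.queue.Queue instances: producers enqueue object refs, consumers drain whatever has accumulated. A minimal sketch of driving it from one process (assumes a local Ray runtime; the read sees the write because actor tasks from a single caller execute in submission order):

    import numpy as np
    import ray
    from input_av_queue_actor import InputAVQueueActor

    ray.init(ignore_reinit_error=True)
    actor = InputAVQueueActor.options(name="InputAVQueueActor").remote()

    # Producer side: place the buffer in the object store, enqueue the ref.
    buffer_ref = ray.put(np.zeros(320, dtype=np.int16))
    actor.enqueue_audio_frame.remote(buffer_ref)

    # Consumer side: drain everything queued so far.
    frames = ray.get(actor.get_audio_frames.remote())
    print(len(frames))  # 1

Note that Ray resolves an ObjectRef passed as a task argument, so the actor's queue ends up holding the numpy array itself; the explicit ray.put keeps the transfer going through the shared object store rather than pickling into the actor call.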
streamlit_av_queue.py
CHANGED
@@ -5,29 +5,55 @@ from collections import deque
 import threading
 
 import numpy as np
+import ray
+from input_av_queue_actor import InputAVQueueActor
+import pydub
+import torch
 
 class StreamlitAVQueue:
-    def __init__(self):
-        self.audio_frames_deque_lock = threading.Lock()
-        self.audio_frames_deque: deque = deque([])
-        self.video_frames_deque_lock = threading.Lock()
-        self.video_frames_deque: deque = deque([])
-
+    def __init__(self, audio_bit_rate=16000):
+        self._audio_bit_rate = audio_bit_rate
+        try:
+            self.queue_actor = ray.get_actor("InputAVQueueActor")
+        except ValueError as e:
+            self.queue_actor = InputAVQueueActor.options(name="InputAVQueueActor").remote()
 
     async def queued_video_frames_callback(
         self,
         frames: List[av.AudioFrame],
     ) -> av.AudioFrame:
-        with self.video_frames_deque_lock:
-            self.video_frames_deque.extend(frames)
+        try:
+            for frame in frames:
+                shared_tensor = torch.from_numpy(frame.to_ndarray())
+                shared_tensor_ref = ray.put(shared_tensor)
+                self.queue_actor.enqueue_video_frame.remote(shared_tensor_ref)
+        except Exception as e:
+            print(e)
         return frames
 
     async def queued_audio_frames_callback(
         self,
         frames: List[av.AudioFrame],
     ) -> av.AudioFrame:
-        with self.audio_frames_deque_lock:
-            self.audio_frames_deque.extend(frames)
+        try:
+            sound_chunk = pydub.AudioSegment.empty()
+            if len(frames) > 0:
+                for frame in frames:
+                    sound = pydub.AudioSegment(
+                        data=frame.to_ndarray().tobytes(),
+                        sample_width=frame.format.bytes,
+                        frame_rate=frame.sample_rate,
+                        channels=len(frame.layout.channels),
+                    )
+                    sound = sound.set_channels(1)
+                    sound = sound.set_frame_rate(self._audio_bit_rate)
+                    sound_chunk += sound
+                shared_buffer = np.array(sound_chunk.get_array_of_samples())
+                shared_buffer_ref = ray.put(shared_buffer)
+                self.queue_actor.enqueue_audio_frame.remote(shared_buffer_ref)
+        except Exception as e:
+            print(e)
+
         # return empty frames to avoid echo
         new_frames = []
         for frame in frames:
@@ -41,15 +67,9 @@ class StreamlitAVQueue:
         return new_frames
 
     def get_audio_frames(self) -> List[av.AudioFrame]:
-        audio_frames = []
-        with self.audio_frames_deque_lock:
-            audio_frames = list(self.audio_frames_deque)
-            self.audio_frames_deque.clear()
-        return audio_frames
+        shared_buffers = ray.get(self.queue_actor.get_audio_frames.remote())
+        return shared_buffers
 
     def get_video_frames(self) -> List[av.AudioFrame]:
-        video_frames = []
-        with self.video_frames_deque_lock:
-            video_frames = list(self.video_frames_deque)
-            self.video_frames_deque.clear()
-        return video_frames
+        shared_tensors = ray.get(self.queue_actor.get_video_frames.remote())
+        return shared_tensors
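The try/except around ray.get_actor("InputAVQueueActor") is a get-or-create idiom: the first session constructs the named actor, and later sessions attach to the same one. The audio callback then performs the downmix/resample that previously lived in d_app.py. A sketch of just that pydub conversion on a synthetic frame (the 48 kHz stereo input shape is an assumption, not taken from the app):

    import numpy as np
    import pydub

    # Interleaved stereo int16 at 48 kHz: 480 frames -> 960 samples.
    raw = np.zeros(960, dtype=np.int16)
    sound = pydub.AudioSegment(
        data=raw.tobytes(),
        sample_width=2,   # bytes per sample (int16)
        frame_rate=48000,
        channels=2,
    )
    sound = sound.set_channels(1).set_frame_rate(16000)

    # 480 frames at 48 kHz become 160 mono samples at 16 kHz,
    # mirroring what queued_audio_frames_callback ships to the actor.
    shared_buffer = np.array(sound.get_array_of_samples())
    print(len(shared_buffer))  # 160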