omni-mini-webrtc

Sleeping

App Files Community

freddyaboulton HF staff committed on Sep 23

Commit

eb02780

•

1 Parent(s): 63b59c5

Add code

Browse files

Files changed (1) hide show

app.py +39 -19

app.py CHANGED Viewed

@@ -6,11 +6,12 @@ import base64
 import numpy as np
 import requests
 import traceback
-from dataclasses import dataclass
 import io
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
 from server import serve
@@ -91,6 +92,7 @@ class AppState:
     pause_detected: bool = False
     started_talking: bool =  False
     stopped: bool = False
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
@@ -111,21 +113,7 @@ def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> b
     return (duration - dur_vad) > 1
-def speaking(audio: np.ndarray, sampling_rate: int):
-    audio_buffer = io.BytesIO()
-    segment = AudioSegment(
-        audio.tobytes(),
-        frame_rate=sampling_rate,
-        sample_width=audio.dtype.itemsize,
-        channels=(1 if len(audio.shape) == 1 else audio.shape[1]),
-    )
-    segment.export(audio_buffer, format="wav")
-    with open("input_audio.wav", "wb") as f:
-        f.write(audio_buffer.getvalue())
-    audio_bytes = audio_buffer.getvalue()
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
@@ -174,11 +162,39 @@ def process_audio(audio: tuple, state: AppState):
 def response(state: AppState):
     if not state.pause_detected and not state.started_talking:
         return None, AppState()
-    for mp3_bytes in speaking(state.stream, state.sampling_rate):
         yield mp3_bytes, state
-    yield None, AppState()
 def start_recording_user(state: AppState):
@@ -192,6 +208,7 @@ with gr.Blocks() as demo:
                 label="Input Audio", sources="microphone", type="numpy"
             )
         with gr.Column():
             output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
     state = gr.State(value=AppState())
@@ -207,13 +224,16 @@ with gr.Blocks() as demo:
         [state],
         [output_audio, state]
     )
     restart = output_audio.stop(
         start_recording_user,
         [state],
         [input_audio]
     )
     cancel = gr.Button("Stop Conversation", variant="stop")
-    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None, [state, input_audio], cancels=[respond, restart])
 demo.launch()

 import numpy as np
 import requests
 import traceback
+from dataclasses import dataclass, field
 import io
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
+import tempfile
 from server import serve
     pause_detected: bool = False
     started_talking: bool =  False
     stopped: bool = False
+    conversation: list = field(default_factory=list)
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     return (duration - dur_vad) > 1
+def speaking(audio_bytes: str):
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
 def response(state: AppState):
     if not state.pause_detected and not state.started_talking:
         return None, AppState()
+    audio_buffer = io.BytesIO()
+    segment = AudioSegment(
+        state.stream.tobytes(),
+        frame_rate=state.sampling_rate,
+        sample_width=state.stream.dtype.itemsize,
+        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
+    )
+    segment.export(audio_buffer, format="wav")
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(audio_buffer.getvalue())
+    state.conversation.append({"role": "user",
+                                "content": {"path": f.name,
+                                "mime_type": "audio/wav"}})
+    output_buffer = b""
+    for mp3_bytes in speaking(audio_buffer.getvalue()):
+        output_buffer += mp3_bytes
         yield mp3_bytes, state
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+        f.write(output_buffer)
+    state.conversation.append({"role": "assistant",
+                    "content": {"path": f.name,
+                                "mime_type": "audio/mp3"}})
+    yield None, AppState(conversation=state.conversation)
 def start_recording_user(state: AppState):
                 label="Input Audio", sources="microphone", type="numpy"
             )
         with gr.Column():
+            chatbot = gr.Chatbot(label="Conversation", type="messages")
             output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
     state = gr.State(value=AppState())
         [state],
         [output_audio, state]
     )
+    respond.then(lambda s: s.conversation, [state], [chatbot])
     restart = output_audio.stop(
         start_recording_user,
         [state],
         [input_audio]
     )
     cancel = gr.Button("Stop Conversation", variant="stop")
+    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
+                [state, input_audio], cancels=[respond, restart])
 demo.launch()