freddyaboulton (HF staff) committed
Commit 4e9b286
1 Parent(s): 5e3f570
Files changed (2):
  1. app.py +125 -109
  2. requirements.txt +1 -4
app.py CHANGED
@@ -1,18 +1,26 @@
-import gradio as gr
-from huggingface_hub import snapshot_download
-from threading import Thread
-import time
 import base64
-import numpy as np
-import requests
+import io
+import tempfile
+import time
 import traceback
 from dataclasses import dataclass, field
-import io
+from queue import Queue
+from threading import Thread, Event
+
+import gradio as gr
+import librosa
+import numpy as np
+import requests
+from gradio_webrtc import StreamHandler, WebRTC
+from huggingface_hub import snapshot_download
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
 import tempfile

+# from server import serve
+from utils.vad import VadOptions, collect_chunks, get_speech_timestamps
+

 from server import serve

@@ -22,11 +30,15 @@ snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
 IP = "0.0.0.0"
 PORT = 60808

-serve(port=7860)
+thread = Thread(target=serve, daemon=True)
+thread.start()


 API_URL = "http://0.0.0.0:60808/chat"

+
+#API_URL = "https://freddyaboulton-omni-backend.hf.space/chat"
+
 # recording parameters
 IN_CHANNELS = 1
 IN_RATE = 24000
@@ -38,12 +50,7 @@ VAD_STRIDE = 0.5
 OUT_CHANNELS = 1
 OUT_RATE = 24000
 OUT_SAMPLE_WIDTH = 2
-OUT_CHUNK = 5760
-
-
 OUT_CHUNK = 20 * 4096
-OUT_RATE = 24000
-OUT_CHANNELS = 1


 def run_vad(ori_audio, sr):
@@ -77,94 +84,109 @@ def run_vad(ori_audio, sr):


 def warm_up():
-    frames = b"\x00\x00" * 1024 * 2  # 1024 frames of 2 bytes each
-    dur, frames, tcost = run_vad(frames, 16000)
+    frames = np.zeros((1, 1600))  # 1024 frames of 2 bytes each
+    _, frames, tcost = run_vad(frames, 16000)
     print(f"warm up done, time_cost: {tcost:.3f} s")


 warm_up()

-
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
     sampling_rate: int = 0
     pause_detected: bool = False
-    started_talking: bool = False
+    started_talking: bool = False
+    responding: bool = False
     stopped: bool = False
-    conversation: list = field(default_factory=list)
+    buffer: np.ndarray | None = None
+


 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     """Take in the stream, determine if a pause happened"""
-
-    temp_audio = audio
-
-    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
     duration = len(audio) / sampling_rate
+
+    dur_vad, _, _ = run_vad(audio, sampling_rate)
+
+    if duration >= 0.60:
+        if dur_vad > 0.2 and not state.started_talking:
+            print("started talking")
+            state.started_talking = True
+        if state.started_talking:
+            if state.stream is None:
+                state.stream = audio
+            else:
+                state.stream = np.concatenate((state.stream, audio))
+            state.buffer = None
+        if dur_vad < 0.1 and state.started_talking:
+            segment = AudioSegment(
+                state.stream.tobytes(),
+                frame_rate=sampling_rate,
+                sample_width=audio.dtype.itemsize,
+                channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
+            )

-    if dur_vad > 0.5 and not state.started_talking:
-        print("started talking")
-        state.started_talking = True
-        return False
-
-    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-
-    return (duration - dur_vad) > 1
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                segment.export(f.name, format="wav")
+            print("input file written", f.name)
+            return True
+    return False


 def speaking(audio_bytes: str):

     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
+    byte_buffer = b""
     with requests.post(API_URL, json=files, stream=True) as response:
         try:
             for chunk in response.iter_content(chunk_size=OUT_CHUNK):
                 if chunk:
                     # Create an audio segment from the numpy array
+                    byte_buffer += chunk
                     audio_segment = AudioSegment(
-                        chunk,
+                        chunk + b"\x00" if len(chunk) % 2 != 0 else chunk,
                         frame_rate=OUT_RATE,
                         sample_width=OUT_SAMPLE_WIDTH,
                         channels=OUT_CHANNELS,
                     )
-
-                    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
-                    mp3_io = io.BytesIO()
-                    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
-
-                    # Get the MP3 bytes
-                    mp3_bytes = mp3_io.getvalue()
-                    mp3_io.close()
-                    yield mp3_bytes
-
+                    # Export the audio segment to a numpy array
+                    audio_np = np.array(audio_segment.get_array_of_samples())
+                    yield audio_np.reshape(1, -1)
+            all_output_audio = AudioSegment(
+                byte_buffer,
+                frame_rate=OUT_RATE,
+                sample_width=OUT_SAMPLE_WIDTH,
+                channels=1,
+            )
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                all_output_audio.export(f.name, format="wav")
+            print("output file written", f.name)
         except Exception as e:
             raise gr.Error(f"Error during audio streaming: {e}")



-
-def process_audio(audio: tuple, state: AppState):
-    if state.stream is None:
-        state.stream = audio[1]
-        state.sampling_rate = audio[0]
+def process_audio(audio: tuple, state: AppState) -> None:
+    frame_rate, array = audio
+    array = np.squeeze(array)
+    if not state.sampling_rate:
+        state.sampling_rate = frame_rate
+    if state.buffer is None:
+        state.buffer = array
     else:
-        state.stream = np.concatenate((state.stream, audio[1]))
+        state.buffer = np.concatenate((state.buffer, array))

-    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
+    pause_detected = determine_pause(state.buffer, state.sampling_rate, state)
     state.pause_detected = pause_detected

-    if state.pause_detected and state.started_talking:
-        return gr.Audio(recording=False), state
-    return None, state
-

 def response(state: AppState):
     if not state.pause_detected and not state.started_talking:
-        return None, AppState()
+        return None

     audio_buffer = io.BytesIO()
-
     segment = AudioSegment(
         state.stream.tobytes(),
         frame_rate=state.sampling_rate,
@@ -172,68 +194,62 @@ def response(state: AppState):
         channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
     )
     segment.export(audio_buffer, format="wav")
-
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-        f.write(audio_buffer.getvalue())

-    state.conversation.append({"role": "user",
-                               "content": {"path": f.name,
-                                           "mime_type": "audio/wav"}})
+    for numpy_array in speaking(audio_buffer.getvalue()):
+        yield (OUT_RATE, numpy_array, "mono")
+
+
+class OmniHandler(StreamHandler):
+    def __init__(self) -> None:
+        super().__init__(expected_layout="mono", output_sample_rate=OUT_RATE, output_frame_size=480)
+        self.chunk_queue = Queue()
+        self.state = AppState()
+        self.generator = None
+        self.duration = 0
+
+    def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        if self.state.responding:
+            return
+        process_audio(frame, self.state)
+        if self.state.pause_detected:
+            self.chunk_queue.put(True)

-    output_buffer = b""
-
-    for mp3_bytes in speaking(audio_buffer.getvalue()):
-        output_buffer += mp3_bytes
-        yield mp3_bytes, state
-
-    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
-        f.write(output_buffer)
-
-    state.conversation.append({"role": "assistant",
-                               "content": {"path": f.name,
-                                           "mime_type": "audio/mp3"}})
-    yield None, AppState(conversation=state.conversation)
-
-
-
+    def reset(self):
+        self.generator = None
+        self.state = AppState()
+        self.duration = 0
+
+    def emit(self):
+        if not self.generator:
+            self.chunk_queue.get()
+            self.state.responding = True
+            self.generator = response(self.state)
+        try:
+            return next(self.generator)
+        except StopIteration:
+            self.reset()
+

-def start_recording_user(state: AppState):
-    if not state.stopped:
-        return gr.Audio(recording=True)

 with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column():
-            input_audio = gr.Audio(
-                label="Input Audio", sources="microphone", type="numpy"
-            )
-        with gr.Column():
-            chatbot = gr.Chatbot(label="Conversation", type="messages")
-            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
-    state = gr.State(value=AppState())
-
-    stream = input_audio.stream(
-        process_audio,
-        [input_audio, state],
-        [input_audio, state],
-        stream_every=0.50,
-        time_limit=30,
-    )
-    respond = input_audio.stop_recording(
-        response,
-        [state],
-        [output_audio, state]
+    gr.HTML(
+        """
+    <h1 style='text-align: center'>
+    Omni Chat (Powered by WebRTC ⚡️)
+    </h1>
+    """
     )
-    respond.then(lambda s: s.conversation, [state], [chatbot])
+    with gr.Column():
+        with gr.Group():
+            audio = WebRTC(
+                label="Stream",
+                rtc_configuration=None,
+                mode="send-receive",
+                modality="audio",
+            )
+        audio.stream(fn=OmniHandler(), inputs=[audio], outputs=[audio], time_limit=300)
+

-    restart = output_audio.stop(
-        start_recording_user,
-        [state],
-        [input_audio]
-    )
-    cancel = gr.Button("Stop Conversation", variant="stop")
-    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
-                 [state, input_audio], cancels=[respond, restart])


 demo.launch()
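Note on the new `WebRTC` component: the diff creates it with `rtc_configuration=None`, which is fine for local testing, but deployments behind restrictive NATs (for example on Spaces) usually need STUN/TURN servers passed through `rtc_configuration`. A minimal sketch of fetching such a configuration from Twilio's token API; the helper name and environment-variable names are illustrative assumptions, not part of this commit:

```python
import os

from twilio.rest import Client  # assumes the optional `twilio` package is installed


def get_rtc_configuration() -> dict:
    # Fetch ephemeral STUN/TURN servers from Twilio; credentials come from env vars.
    client = Client(os.environ["TWILIO_ACCOUNT_SID"], os.environ["TWILIO_AUTH_TOKEN"])
    token = client.tokens.create()
    return {"iceServers": token.ice_servers}


# Hypothetical usage:
# audio = WebRTC(label="Stream", rtc_configuration=get_rtc_configuration(),
#                mode="send-receive", modality="audio")
```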
requirements.txt CHANGED
@@ -6,13 +6,10 @@ snac==1.2.0
 soundfile==0.12.1
 openai-whisper
 tokenizers==0.19.1
-streamlit==1.37.1
-# PyAudio==0.2.14
 pydub==0.25.1
 onnxruntime==1.19.0
-# numpy==1.26.3
-https://gradio-builds.s3.amazonaws.com/cffe9a7ab7f71e76d7214dc57c6278ffaf5bcdf9/gradio-5.0.0b1-py3-none-any.whl
 fastapi==0.112.4
 librosa==0.10.2.post1
 flask==3.0.3
 fire
+https://gradio-builds.s3.us-east-1.amazonaws.com/webrtc/08/gradio_webrtc-0.0.5-py3-none-any.whl