omni-mini-webrtc

Sleeping

App Files Files Community

freddyaboulton HF staff committed on 10 days ago

Commit

01a49c3

•

1 Parent(s): f20d058

lint

Browse files

Files changed (2) hide show

app.py +33 -26
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import base64
 import io
 import tempfile
 import time
 import traceback
-from dataclasses import dataclass, field
 from queue import Queue
-from threading import Thread, Event
 import gradio as gr
 import librosa
@@ -14,16 +15,13 @@ import requests
 from gradio_webrtc import StreamHandler, WebRTC
 from huggingface_hub import snapshot_download
 from pydub import AudioSegment
-import librosa
-from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
-import tempfile
 # from server import serve
 from utils.vad import VadOptions, collect_chunks, get_speech_timestamps
-from server import serve
 repo_id = "gpt-omni/mini-omni"
 snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
@@ -36,8 +34,20 @@ thread.start()
 API_URL = "http://0.0.0.0:60808/chat"
-#API_URL = "https://freddyaboulton-omni-backend.hf.space/chat"
 # recording parameters
 IN_CHANNELS = 1
@@ -89,7 +99,8 @@ def warm_up():
     print(f"warm up done, time_cost: {tcost:.3f} s")
-warm_up()
 @dataclass
 class AppState:
@@ -97,27 +108,26 @@ class AppState:
     sampling_rate: int = 0
     pause_detected: bool = False
     started_talking: bool = False
-    responding: bool =  False
     stopped: bool = False
     buffer: np.ndarray | None = None
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     """Take in the stream, determine if a pause happened"""
     duration = len(audio) / sampling_rate
     dur_vad, _, _ = run_vad(audio, sampling_rate)
     if duration >= 0.60:
         if dur_vad > 0.2 and not state.started_talking:
             print("started talking")
-            state.started_talking = True
         if state.started_talking:
             if state.stream is None:
                 state.stream = audio
             else:
-                state.stream =  np.concatenate((state.stream, audio))
         state.buffer = None
         if dur_vad < 0.1 and state.started_talking:
             segment = AudioSegment(
@@ -135,7 +145,6 @@ def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> b
 def speaking(audio_bytes: str):
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
     byte_buffer = b""
@@ -167,7 +176,6 @@ def speaking(audio_bytes: str):
             raise gr.Error(f"Error during audio streaming: {e}")
 def process_audio(audio: tuple, state: AppState) -> None:
     frame_rate, array = audio
     array = np.squeeze(array)
@@ -185,7 +193,7 @@ def process_audio(audio: tuple, state: AppState) -> None:
 def response(state: AppState):
     if not state.pause_detected and not state.started_talking:
         return None
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
         state.stream.tobytes(),
@@ -194,14 +202,16 @@ def response(state: AppState):
         channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
     )
     segment.export(audio_buffer, format="wav")
     for numpy_array in speaking(audio_buffer.getvalue()):
-        yield (OUT_RATE, numpy_array, "mono")
 class OmniHandler(StreamHandler):
     def __init__(self) -> None:
-        super().__init__(expected_layout="mono", output_sample_rate=OUT_RATE, output_frame_size=480)
         self.chunk_queue = Queue()
         self.state = AppState()
         self.generator = None
@@ -213,7 +223,7 @@ class OmniHandler(StreamHandler):
         process_audio(frame, self.state)
         if self.state.pause_detected:
             self.chunk_queue.put(True)
     def reset(self):
         self.generator = None
         self.state = AppState()
@@ -225,10 +235,9 @@ class OmniHandler(StreamHandler):
             self.state.responding = True
             self.generator = response(self.state)
         try:
-           return next(self.generator)
         except StopIteration:
             self.reset()
 with gr.Blocks() as demo:
@@ -250,6 +259,4 @@ with gr.Blocks() as demo:
         audio.stream(fn=OmniHandler(), inputs=[audio], outputs=[audio], time_limit=300)
 demo.launch()

 import base64
 import io
+import os
 import tempfile
 import time
 import traceback
+from dataclasses import dataclass
 from queue import Queue
+from threading import Thread
 import gradio as gr
 import librosa
 from gradio_webrtc import StreamHandler, WebRTC
 from huggingface_hub import snapshot_download
 from pydub import AudioSegment
+from twilio.rest import Client
+from server import serve
 # from server import serve
 from utils.vad import VadOptions, collect_chunks, get_speech_timestamps
 repo_id = "gpt-omni/mini-omni"
 snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
 API_URL = "http://0.0.0.0:60808/chat"
+account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
+auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
+if account_sid and auth_token:
+    client = Client(account_sid, auth_token)
+    token = client.tokens.create()
+    rtc_configuration = {
+        "iceServers": token.ice_servers,
+        "iceTransportPolicy": "relay",
+    }
+else:
+    rtc_configuration = None
 # recording parameters
 IN_CHANNELS = 1
     print(f"warm up done, time_cost: {tcost:.3f} s")
+# warm_up()
 @dataclass
 class AppState:
     sampling_rate: int = 0
     pause_detected: bool = False
     started_talking: bool = False
+    responding: bool = False
     stopped: bool = False
     buffer: np.ndarray | None = None
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     """Take in the stream, determine if a pause happened"""
     duration = len(audio) / sampling_rate
     dur_vad, _, _ = run_vad(audio, sampling_rate)
     if duration >= 0.60:
         if dur_vad > 0.2 and not state.started_talking:
             print("started talking")
+            state.started_talking = True
         if state.started_talking:
             if state.stream is None:
                 state.stream = audio
             else:
+                state.stream = np.concatenate((state.stream, audio))
         state.buffer = None
         if dur_vad < 0.1 and state.started_talking:
             segment = AudioSegment(
 def speaking(audio_bytes: str):
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
     byte_buffer = b""
             raise gr.Error(f"Error during audio streaming: {e}")
 def process_audio(audio: tuple, state: AppState) -> None:
     frame_rate, array = audio
     array = np.squeeze(array)
 def response(state: AppState):
     if not state.pause_detected and not state.started_talking:
         return None
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
         state.stream.tobytes(),
         channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
     )
     segment.export(audio_buffer, format="wav")
     for numpy_array in speaking(audio_buffer.getvalue()):
+        yield (OUT_RATE, numpy_array, "mono")
 class OmniHandler(StreamHandler):
     def __init__(self) -> None:
+        super().__init__(
+            expected_layout="mono", output_sample_rate=OUT_RATE, output_frame_size=480
+        )
         self.chunk_queue = Queue()
         self.state = AppState()
         self.generator = None
         process_audio(frame, self.state)
         if self.state.pause_detected:
             self.chunk_queue.put(True)
     def reset(self):
         self.generator = None
         self.state = AppState()
             self.state.responding = True
             self.generator = response(self.state)
         try:
+            return next(self.generator)
         except StopIteration:
             self.reset()
 with gr.Blocks() as demo:
         audio.stream(fn=OmniHandler(), inputs=[audio], outputs=[audio], time_limit=300)
 demo.launch()

requirements.txt CHANGED Viewed

@@ -12,4 +12,5 @@ fastapi==0.112.4
 librosa==0.10.2.post1
 flask==3.0.3
 fire
-https://gradio-builds.s3.us-east-1.amazonaws.com/webrtc/08/gradio_webrtc-0.0.5-py3-none-any.whl

 librosa==0.10.2.post1
 flask==3.0.3
 fire
+https://gradio-builds.s3.us-east-1.amazonaws.com/webrtc/08/gradio_webrtc-0.0.5-py3-none-any.whl
+twilio