Commit bc98115
Parent: f82efe7

Update app.py

app.py (CHANGED)

@@ -30,7 +30,7 @@ import gradio as gr
 import librosa
 import numpy as np
 import requests
-from gradio_webrtc import StreamHandler, WebRTC
+from gradio_webrtc import ReplyOnPause, WebRTC
 from huggingface_hub import snapshot_download
 from pydub import AudioSegment
 from twilio.rest import Client
@@ -67,102 +67,13 @@ if account_sid and auth_token:
 else:
     rtc_configuration = None
 
-# recording parameters
-IN_CHANNELS = 1
-IN_RATE = 24000
-IN_CHUNK = 1024
-IN_SAMPLE_WIDTH = 2
-VAD_STRIDE = 0.5
-
-# playing parameters
 OUT_CHANNELS = 1
 OUT_RATE = 24000
 OUT_SAMPLE_WIDTH = 2
 OUT_CHUNK = 20 * 4096
 
 
-def run_vad(ori_audio, sr):
-    _st = time.time()
-    try:
-        audio = ori_audio
-        audio = audio.astype(np.float32) / 32768.0
-        sampling_rate = 16000
-        if sr != sampling_rate:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-
-        vad_parameters = {}
-        vad_parameters = VadOptions(**vad_parameters)
-        speech_chunks = get_speech_timestamps(audio, vad_parameters)
-        audio = collect_chunks(audio, speech_chunks)
-        duration_after_vad = audio.shape[0] / sampling_rate
-
-        if sr != sampling_rate:
-            # resample to original sampling rate
-            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
-        else:
-            vad_audio = audio
-        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
-        vad_audio_bytes = vad_audio.tobytes()
-
-        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
-    except Exception as e:
-        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
-        print(msg)
-        return -1, ori_audio, round(time.time() - _st, 4)
-
-
-def warm_up():
-    frames = np.zeros((1, 1600))  # 1024 frames of 2 bytes each
-    _, frames, tcost = run_vad(frames, 16000)
-    print(f"warm up done, time_cost: {tcost:.3f} s")
-
-
-# warm_up()
-
-
-@dataclass
-class AppState:
-    stream: np.ndarray | None = None
-    sampling_rate: int = 0
-    pause_detected: bool = False
-    started_talking: bool = False
-    responding: bool = False
-    stopped: bool = False
-    buffer: np.ndarray | None = None
-
-
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-    """Take in the stream, determine if a pause happened"""
-    duration = len(audio) / sampling_rate
-
-    dur_vad, _, _ = run_vad(audio, sampling_rate)
-
-    if duration >= 0.60:
-        if dur_vad > 0.2 and not state.started_talking:
-            print("started talking")
-            state.started_talking = True
-        if state.started_talking:
-            if state.stream is None:
-                state.stream = audio
-            else:
-                state.stream = np.concatenate((state.stream, audio))
-            state.buffer = None
-        if dur_vad < 0.1 and state.started_talking:
-            segment = AudioSegment(
-                state.stream.tobytes(),
-                frame_rate=sampling_rate,
-                sample_width=audio.dtype.itemsize,
-                channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-            )
-
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                segment.export(f.name, format="wav")
-                print("input file written", f.name)
-            return True
-    return False
-
-
-def speaking(audio_bytes: str):
+def speaking(audio_bytes: bytes):
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
     byte_buffer = b""
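
Note: this hunk deletes the hand-rolled voice-activity-detection pipeline (run_vad, warm_up, AppState, determine_pause); turn-taking is delegated to gradio_webrtc's ReplyOnPause, wired up in the final hunk. The surviving speaking() helper posts base64-encoded WAV bytes to the model endpoint. A minimal standalone sketch of that payload shape, using a placeholder byte string since the real WAV bytes and endpoint are not shown in this diff:

    import base64

    audio_bytes = b"RIFF....WAVEfmt "  # placeholder; in app.py this is a full WAV file
    base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
    files = {"audio": base64_encoded}

    # The encoding round-trips losslessly, so the server can recover the WAV exactly.
    assert base64.b64decode(files["audio"]) == audio_bytes
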
@@ -194,73 +105,24 @@ def speaking(audio_bytes: str):
         raise gr.Error(f"Error during audio streaming: {e}")
 
 
-def process_audio(audio: tuple, state: AppState) -> None:
-    frame_rate, array = audio
-    array = np.squeeze(array)
-    if not state.sampling_rate:
-        state.sampling_rate = frame_rate
-    if state.buffer is None:
-        state.buffer = array
-    else:
-        state.buffer = np.concatenate((state.buffer, array))
-
-    pause_detected = determine_pause(state.buffer, state.sampling_rate, state)
-    state.pause_detected = pause_detected
-
 
-def response(
-
-
+def response(audio: tuple[int, np.ndarray]):
+    sampling_rate, audio_np = audio
+    audio_np = audio_np.squeeze()
 
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
-
-        frame_rate=
-        sample_width=
-        channels=
-
+        audio_np.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=audio_np.dtype.itemsize,
+        channels=1)
+
     segment.export(audio_buffer, format="wav")
 
     for numpy_array in speaking(audio_buffer.getvalue()):
         yield (OUT_RATE, numpy_array, "mono")
 
 
-class OmniHandler(StreamHandler):
-    def __init__(self) -> None:
-        super().__init__(
-            expected_layout="mono", output_sample_rate=OUT_RATE, output_frame_size=480
-        )
-        self.event = Event()
-        self.state = AppState()
-        self.generator = None
-        self.duration = 0
-
-    def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        if self.state.responding:
-            return
-        process_audio(frame, self.state)
-        if self.state.pause_detected:
-            self.event.set()
-
-    def reset(self):
-        self.generator = None
-        self.event.clear()
-        self.state = AppState()
-        self.duration = 0
-
-    def emit(self):
-        if not self.event.is_set():
-            return None
-        else:
-            if not self.generator:
-                self.generator = response(self.state)
-                self.state.responding = True
-            try:
-                return next(self.generator)
-            except StopIteration:
-                self.reset()
-
-
 with gr.Blocks() as demo:
     gr.HTML(
         """
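
The new response() is a plain generator: it takes a single (sampling_rate, np.ndarray) tuple, wraps the samples into an in-memory WAV with pydub, and streams the reply chunks that speaking() yields. A self-contained sketch of the AudioSegment construction it relies on, using a synthetic one-second int16 tone in place of microphone audio:

    import io

    import numpy as np
    from pydub import AudioSegment

    sampling_rate = 24000
    t = np.arange(sampling_rate) / sampling_rate                      # one second of samples
    audio_np = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)  # 440 Hz tone

    segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_np.dtype.itemsize,  # 2 bytes per int16 sample
        channels=1,
    )
    buffer = io.BytesIO()
    segment.export(buffer, format="wav")
    print(f"{len(buffer.getvalue())} bytes of WAV")
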
@@ -277,7 +139,7 @@ with gr.Blocks() as demo:
         mode="send-receive",
         modality="audio",
     )
-    audio.stream(fn=
+    audio.stream(fn=ReplyOnPause(response), inputs=[audio], outputs=[audio], time_limit=60)
 
 
 demo.launch(ssr_mode=False)
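
Taken together, the commit swaps the custom StreamHandler subclass and its VAD state machine for gradio_webrtc's built-in pause detection. A minimal sketch of the resulting wiring, with the WebRTC component's other arguments (label, rtc_configuration, the HTML header) omitted and a trivial echo standing in for the real model call:

    import gradio as gr
    import numpy as np
    from gradio_webrtc import ReplyOnPause, WebRTC

    def response(audio: tuple[int, np.ndarray]):
        sampling_rate, audio_np = audio
        # A real handler would call the model here; this one echoes the input
        # back in a few chunks, using the same 3-tuple yield shape as the diff.
        for chunk in np.array_split(audio_np.squeeze(), 4):
            yield (sampling_rate, chunk, "mono")

    with gr.Blocks() as demo:
        audio = WebRTC(mode="send-receive", modality="audio")
        audio.stream(fn=ReplyOnPause(response), inputs=[audio], outputs=[audio], time_limit=60)

    demo.launch(ssr_mode=False)

ReplyOnPause buffers incoming frames, runs voice-activity detection internally, and only invokes response once the speaker pauses, which is what lets this commit delete run_vad, determine_pause, and the OmniHandler receive/emit machinery wholesale.
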