omni-mini-webrtc

Sleeping

App Files Files Community

freddyaboulton HF staff committed on Sep 18

Commit

c4d6bf6

•

1 Parent(s): 687343f

Fixing

Browse files

Files changed (1) hide show

app.py +37 -20

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from pathlib import Path
 import io
 import wave
 import tempfile
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
@@ -20,8 +21,8 @@ from server import serve
 repo_id = "gpt-omni/mini-omni"
 snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
-IP='0.0.0.0'
-PORT=60808
 thread = Thread(target=serve, daemon=True)
 thread.start()
@@ -42,11 +43,11 @@ OUT_SAMPLE_WIDTH = 2
 OUT_CHUNK = 5760
-OUT_CHUNK = 4096
 OUT_RATE = 24000
 OUT_CHANNELS = 1
 def run_vad(ori_audio, sr):
     _st = time.time()
     try:
@@ -82,6 +83,7 @@ def warm_up():
     dur, frames, tcost = run_vad(frames, 16000)
     print(f"warm up done, time_cost: {tcost:.3f} s")
 warm_up()
@@ -107,7 +109,6 @@ def determine_pause(stream: bytes, start_talking: bool) -> tuple[bool, bool]:
 def speaking(total_frames: bytes):
     audio_buffer = io.BytesIO()
     wf = wave.open(audio_buffer, "wb")
     wf.setnchannels(IN_CHANNELS)
@@ -131,16 +132,26 @@ def speaking(total_frames: bytes):
         try:
             for chunk in response.iter_content(chunk_size=OUT_CHUNK):
                 if chunk:
-                    yield chunk
-                    # # Convert chunk to numpy array
-                    # output_audio_bytes += chunk
-                    # audio_data = np.frombuffer(chunk, dtype=np.int8)
-                    # # Play audio
-                    # stream.write(audio_data)
         except Exception as e:
             raise gr.Error(f"Error during audio streaming: {e}")
     wf.close()
@@ -151,20 +162,19 @@ class AppState:
     pause_detected: bool = False
 def process_audio(audio: str, state: AppState):
     """Accumulate recorded audio in ``state`` and stream back the reply.

     This is a generator: every value it yields is an ``(audio_out, state)``
     pair, matching the ``[output_audio, state]`` outputs it is registered with.

     Args:
         audio: Path of a file whose bytes are appended to ``state.stream``.
         state: ``AppState`` carrying the accumulated stream and the
             ``pause_detected`` / ``start_talking`` flags.

     Yields:
         ``(None, state)`` when there is no audio to emit, or
         ``(out_bytes, state)`` for each chunk produced by ``speaking``.
     """
     # Append the newly recorded file's raw bytes to everything captured so far.
     state.stream += Path(audio).read_bytes()
     # NOTE(review): the second argument is the previous pause flag, while
     # determine_pause's signature names that parameter ``start_talking`` -- confirm.
     pause_detected, start_talking = determine_pause(state.stream, state.pause_detected)
     state.pause_detected = pause_detected
     state.start_talking = start_talking
     if not state.pause_detected:
         # NOTE(review): no ``return`` follows this yield, so execution continues
         # into the speaking() loop even when no pause was detected -- confirm
         # this fall-through is intended.
         yield None, state
     # Forward every chunk that speaking() produces for the accumulated stream.
     for out_bytes in speaking(state.stream):
         yield out_bytes, state
     # Rebind the local name to a fresh AppState and yield it; presumably this
     # resets the stored state for the next utterance -- verify against the UI wiring.
     state = AppState()
     yield None, state
@@ -172,13 +182,20 @@ def process_audio(audio: str, state: AppState):
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            input_audio = gr.Audio(label="Input Audio", sources="microphone", type="filepath")
         with gr.Column():
-            output_audio = gr.Audio(label="Output Audio", streaming=True)
     state = gr.State(value=AppState())
-    input_audio.stream(process_audio, [input_audio, state], [output_audio, state],
-                       stream_every=0.5, time_limit=30)
 demo.launch()

 import io
 import wave
 import tempfile
+from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
 repo_id = "gpt-omni/mini-omni"
 snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
+IP = "0.0.0.0"
+PORT = 60808
 thread = Thread(target=serve, daemon=True)
 thread.start()
 OUT_CHUNK = 5760
+OUT_CHUNK = 20 * 4096
 OUT_RATE = 24000
 OUT_CHANNELS = 1
 def run_vad(ori_audio, sr):
     _st = time.time()
     try:
     dur, frames, tcost = run_vad(frames, 16000)
     print(f"warm up done, time_cost: {tcost:.3f} s")
 warm_up()
 def speaking(total_frames: bytes):
     audio_buffer = io.BytesIO()
     wf = wave.open(audio_buffer, "wb")
     wf.setnchannels(IN_CHANNELS)
         try:
             for chunk in response.iter_content(chunk_size=OUT_CHUNK):
                 if chunk:
+                    # Create an audio segment from the numpy array
+                    audio_segment = AudioSegment(
+                        chunk,
+                        frame_rate=OUT_RATE,
+                        sample_width=OUT_SAMPLE_WIDTH,
+                        channels=OUT_CHANNELS,
+                    )
+                    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
+                    mp3_io = io.BytesIO()
+                    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+                    # Get the MP3 bytes
+                    mp3_bytes = mp3_io.getvalue()
+                    mp3_io.close()
+                    yield mp3_bytes
         except Exception as e:
             raise gr.Error(f"Error during audio streaming: {e}")
     wf.close()
     pause_detected: bool = False
 def process_audio(audio: str, state: AppState):
     """Accumulate recorded audio in ``state`` and stream back the reply.

     This is a generator: every value it yields is an ``(audio_out, state)``
     pair, matching the ``[output_audio, state]`` outputs it is registered with.

     Args:
         audio: Path of a file whose bytes are appended to ``state.stream``.
         state: ``AppState`` carrying the accumulated stream and the
             ``pause_detected`` / ``start_talking`` flags.

     Yields:
         ``(None, state)`` when there is no audio to emit, or
         ``(out_bytes, state)`` for each chunk produced by ``speaking``.
     """
     # Append the newly recorded file's raw bytes to everything captured so far.
     state.stream += Path(audio).read_bytes()
     # NOTE(review): the second argument is the previous pause flag, while
     # determine_pause's signature names that parameter ``start_talking`` -- confirm.
     pause_detected, start_talking = determine_pause(state.stream, state.pause_detected)
     state.pause_detected = pause_detected
     state.start_talking = start_talking
     if not state.pause_detected:
         # NOTE(review): no ``return`` follows this yield, so execution continues
         # into the speaking() loop even when no pause was detected -- confirm
         # this fall-through is intended.
         yield None, state
     # Forward every chunk that speaking() produces for the accumulated stream.
     for out_bytes in speaking(state.stream):
         yield out_bytes, state
     # Rebind the local name to a fresh AppState and yield it; presumably this
     # resets the stored state for the next utterance -- verify against the UI wiring.
     state = AppState()
     yield None, state
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            input_audio = gr.Audio(
+                label="Input Audio", sources="microphone", type="filepath"
+            )
         with gr.Column():
+            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
     state = gr.State(value=AppState())
+    input_audio.stop_recording(
+        process_audio,
+        [input_audio, state],
+        [output_audio, state],
+        stream_every=0.5,
+        time_limit=30,
+    )
 demo.launch()