freddyaboulton HF staff committed on
Commit
c4d6bf6
1 Parent(s): 687343f
Files changed (1) hide show
  1. app.py +37 -20
app.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
11
  import io
12
  import wave
13
  import tempfile
 
14
  import librosa
15
  from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
16
 
@@ -20,8 +21,8 @@ from server import serve
20
  repo_id = "gpt-omni/mini-omni"
21
  snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
22
 
23
- IP='0.0.0.0'
24
- PORT=60808
25
 
26
  thread = Thread(target=serve, daemon=True)
27
  thread.start()
@@ -42,11 +43,11 @@ OUT_SAMPLE_WIDTH = 2
42
  OUT_CHUNK = 5760
43
 
44
 
45
-
46
- OUT_CHUNK = 4096
47
  OUT_RATE = 24000
48
  OUT_CHANNELS = 1
49
 
 
50
  def run_vad(ori_audio, sr):
51
  _st = time.time()
52
  try:
@@ -82,6 +83,7 @@ def warm_up():
82
  dur, frames, tcost = run_vad(frames, 16000)
83
  print(f"warm up done, time_cost: {tcost:.3f} s")
84
 
 
85
  warm_up()
86
 
87
 
@@ -107,7 +109,6 @@ def determine_pause(stream: bytes, start_talking: bool) -> tuple[bool, bool]:
107
 
108
 
109
  def speaking(total_frames: bytes):
110
-
111
  audio_buffer = io.BytesIO()
112
  wf = wave.open(audio_buffer, "wb")
113
  wf.setnchannels(IN_CHANNELS)
@@ -131,16 +132,26 @@ def speaking(total_frames: bytes):
131
  try:
132
  for chunk in response.iter_content(chunk_size=OUT_CHUNK):
133
  if chunk:
134
- yield chunk
135
- # # Convert chunk to numpy array
136
- # output_audio_bytes += chunk
137
- # audio_data = np.frombuffer(chunk, dtype=np.int8)
138
- # # Play audio
139
- # stream.write(audio_data)
 
 
 
 
 
 
 
 
 
 
 
140
  except Exception as e:
141
  raise gr.Error(f"Error during audio streaming: {e}")
142
 
143
-
144
  wf.close()
145
 
146
 
@@ -151,20 +162,19 @@ class AppState:
151
  pause_detected: bool = False
152
 
153
 
154
-
155
  def process_audio(audio: str, state: AppState):
156
  state.stream += Path(audio).read_bytes()
157
-
158
  pause_detected, start_talking = determine_pause(state.stream, state.pause_detected)
159
  state.pause_detected = pause_detected
160
  state.start_talking = start_talking
161
 
162
  if not state.pause_detected:
163
  yield None, state
164
-
165
  for out_bytes in speaking(state.stream):
166
  yield out_bytes, state
167
-
168
  state = AppState()
169
  yield None, state
170
 
@@ -172,13 +182,20 @@ def process_audio(audio: str, state: AppState):
172
  with gr.Blocks() as demo:
173
  with gr.Row():
174
  with gr.Column():
175
- input_audio = gr.Audio(label="Input Audio", sources="microphone", type="filepath")
 
 
176
  with gr.Column():
177
- output_audio = gr.Audio(label="Output Audio", streaming=True)
178
  state = gr.State(value=AppState())
179
 
180
- input_audio.stream(process_audio, [input_audio, state], [output_audio, state],
181
- stream_every=0.5, time_limit=30)
 
 
 
 
 
182
 
183
 
184
  demo.launch()
 
11
  import io
12
  import wave
13
  import tempfile
14
+ from pydub import AudioSegment
15
  import librosa
16
  from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
17
 
 
21
  repo_id = "gpt-omni/mini-omni"
22
  snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
23
 
24
+ IP = "0.0.0.0"
25
+ PORT = 60808
26
 
27
  thread = Thread(target=serve, daemon=True)
28
  thread.start()
 
43
  OUT_CHUNK = 5760
44
 
45
 
46
+ OUT_CHUNK = 20 * 4096
 
47
  OUT_RATE = 24000
48
  OUT_CHANNELS = 1
49
 
50
+
51
  def run_vad(ori_audio, sr):
52
  _st = time.time()
53
  try:
 
83
  dur, frames, tcost = run_vad(frames, 16000)
84
  print(f"warm up done, time_cost: {tcost:.3f} s")
85
 
86
+
87
  warm_up()
88
 
89
 
 
109
 
110
 
111
  def speaking(total_frames: bytes):
 
112
  audio_buffer = io.BytesIO()
113
  wf = wave.open(audio_buffer, "wb")
114
  wf.setnchannels(IN_CHANNELS)
 
132
  try:
133
  for chunk in response.iter_content(chunk_size=OUT_CHUNK):
134
  if chunk:
135
+ # Create an audio segment from the numpy array
136
+ audio_segment = AudioSegment(
137
+ chunk,
138
+ frame_rate=OUT_RATE,
139
+ sample_width=OUT_SAMPLE_WIDTH,
140
+ channels=OUT_CHANNELS,
141
+ )
142
+
143
+ # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
144
+ mp3_io = io.BytesIO()
145
+ audio_segment.export(mp3_io, format="mp3", bitrate="320k")
146
+
147
+ # Get the MP3 bytes
148
+ mp3_bytes = mp3_io.getvalue()
149
+ mp3_io.close()
150
+ yield mp3_bytes
151
+
152
  except Exception as e:
153
  raise gr.Error(f"Error during audio streaming: {e}")
154
 
 
155
  wf.close()
156
 
157
 
 
162
  pause_detected: bool = False
163
 
164
 
 
165
  def process_audio(audio: str, state: AppState):
166
  state.stream += Path(audio).read_bytes()
167
+
168
  pause_detected, start_talking = determine_pause(state.stream, state.pause_detected)
169
  state.pause_detected = pause_detected
170
  state.start_talking = start_talking
171
 
172
  if not state.pause_detected:
173
  yield None, state
174
+
175
  for out_bytes in speaking(state.stream):
176
  yield out_bytes, state
177
+
178
  state = AppState()
179
  yield None, state
180
 
 
182
  with gr.Blocks() as demo:
183
  with gr.Row():
184
  with gr.Column():
185
+ input_audio = gr.Audio(
186
+ label="Input Audio", sources="microphone", type="filepath"
187
+ )
188
  with gr.Column():
189
+ output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
190
  state = gr.State(value=AppState())
191
 
192
+ input_audio.stop_recording(
193
+ process_audio,
194
+ [input_audio, state],
195
+ [output_audio, state],
196
+ stream_every=0.5,
197
+ time_limit=30,
198
+ )
199
 
200
 
201
  demo.launch()