freddyaboulton committed eb02780 (1 parent: 63b59c5)
Files changed (1): app.py (+39, -19)
app.py CHANGED
@@ -6,11 +6,12 @@ import base64
 import numpy as np
 import requests
 import traceback
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import io
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
+import tempfile
 
 
 from server import serve
@@ -91,6 +92,7 @@ class AppState:
     pause_detected: bool = False
     started_talking: bool = False
     stopped: bool = False
+    conversation: list = field(default_factory=list)
 
 
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
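
Note on the conversation field added above: dataclasses reject bare mutable defaults (conversation: list = [] raises "ValueError: mutable default <class 'list'> ... is not allowed" at class-definition time), which is why this commit also imports field and uses default_factory. A minimal illustration, using a hypothetical Demo class that is not part of this repo:

from dataclasses import dataclass, field

@dataclass
class Demo:
    # items: list = []  # would raise ValueError: mutable default not allowed
    items: list = field(default_factory=list)  # fresh list per instance

a, b = Demo(), Demo()
a.items.append("hi")
assert b.items == []  # instances do not share one list

Without default_factory, a single shared list would accumulate turns across every AppState, i.e. across user sessions.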
@@ -111,21 +113,7 @@ def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     return (duration - dur_vad) > 1
 
 
-def speaking(audio: np.ndarray, sampling_rate: int):
-    audio_buffer = io.BytesIO()
-
-    segment = AudioSegment(
-        audio.tobytes(),
-        frame_rate=sampling_rate,
-        sample_width=audio.dtype.itemsize,
-        channels=(1 if len(audio.shape) == 1 else audio.shape[1]),
-    )
-    segment.export(audio_buffer, format="wav")
-
-    with open("input_audio.wav", "wb") as f:
-        f.write(audio_buffer.getvalue())
-
-    audio_bytes = audio_buffer.getvalue()
+def speaking(audio_bytes: str):
 
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
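
With the WAV encoding hoisted out (see the response() hunk below), speaking() now receives the already-encoded bytes directly; note the new annotation reads audio_bytes: str even though the caller passes bytes from BytesIO.getvalue(). The rest of the function is outside this diff; purely as a hedged sketch of the request pattern the visible lines imply (API_URL and a line-delimited base64-mp3 streaming response are assumptions, not confirmed by this commit):

def speaking(audio_bytes: bytes):
    # Base64-encode the WAV payload for a JSON request body.
    base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
    files = {"audio": base64_encoded}
    # Assumed: the server streams base64-encoded mp3 chunks, one per line.
    with requests.post(API_URL, json=files, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if line:
                yield base64.b64decode(line)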
@@ -174,11 +162,39 @@ def process_audio(audio: tuple, state: AppState):
 def response(state: AppState):
     if not state.pause_detected and not state.started_talking:
         return None, AppState()
+
+    audio_buffer = io.BytesIO()
+
+    segment = AudioSegment(
+        state.stream.tobytes(),
+        frame_rate=state.sampling_rate,
+        sample_width=state.stream.dtype.itemsize,
+        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
+    )
+    segment.export(audio_buffer, format="wav")
 
-    for mp3_bytes in speaking(state.stream, state.sampling_rate):
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(audio_buffer.getvalue())
+
+    state.conversation.append({"role": "user",
+                               "content": {"path": f.name,
+                                           "mime_type": "audio/wav"}})
+
+    output_buffer = b""
+
+    for mp3_bytes in speaking(audio_buffer.getvalue()):
+        output_buffer += mp3_bytes
         yield mp3_bytes, state
+
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+        f.write(output_buffer)
 
-    yield None, AppState()
+    state.conversation.append({"role": "assistant",
+                               "content": {"path": f.name,
+                                           "mime_type": "audio/mp3"}})
+    yield None, AppState(conversation=state.conversation)
+
+
 
 
 def start_recording_user(state: AppState):
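
The appends above build the exact dict shape that gr.Chatbot(type="messages") renders: role/content messages where a dict content carrying path and mime_type is shown as playable media. One user turn in state.conversation therefore looks like this (the path is illustrative):

{
    "role": "user",
    "content": {"path": "/tmp/tmpab12cd.wav", "mime_type": "audio/wav"},
}

NamedTemporaryFile(delete=False) is deliberate, so the .wav/.mp3 files outlive their with blocks and stay readable when the chatbot serves them; the trade-off is that nothing in this commit ever deletes them.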
@@ -192,6 +208,7 @@ with gr.Blocks() as demo:
             label="Input Audio", sources="microphone", type="numpy"
         )
         with gr.Column():
+            chatbot = gr.Chatbot(label="Conversation", type="messages")
             output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
     state = gr.State(value=AppState())
 
@@ -207,13 +224,16 @@
         [state],
         [output_audio, state]
     )
+    respond.then(lambda s: s.conversation, [state], [chatbot])
+
     restart = output_audio.stop(
         start_recording_user,
         [state],
         [input_audio]
     )
     cancel = gr.Button("Stop Conversation", variant="stop")
-    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None, [state, input_audio], cancels=[respond, restart])
+    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
+                 [state, input_audio], cancels=[respond, restart])
 
 
 demo.launch()
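
Chaining respond.then(...) refreshes the transcript only after the streaming event has finished yielding, rather than mid-reply. The respond event itself is defined just above the visible context; judging from the [state] / [output_audio, state] argument lists, it is presumably wired roughly like this sketch (the stop_recording trigger is an assumption, not shown in the diff):

respond = input_audio.stop_recording(
    response,             # the generator above: streams mp3 chunks to output_audio
    [state],
    [output_audio, state],
)
respond.then(lambda s: s.conversation, [state], [chatbot])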
 