freddyaboulton committed
Commit bc98115
1 Parent(s): f82efe7

Update app.py

Files changed (1):
  1. app.py +11 -149
app.py CHANGED
@@ -30,7 +30,7 @@ import gradio as gr
 import librosa
 import numpy as np
 import requests
-from gradio_webrtc import StreamHandler, WebRTC
+from gradio_webrtc import ReplyOnPause, WebRTC
 from huggingface_hub import snapshot_download
 from pydub import AudioSegment
 from twilio.rest import Client
@@ -67,102 +67,13 @@ if account_sid and auth_token:
 else:
     rtc_configuration = None
 
-# recording parameters
-IN_CHANNELS = 1
-IN_RATE = 24000
-IN_CHUNK = 1024
-IN_SAMPLE_WIDTH = 2
-VAD_STRIDE = 0.5
-
-# playing parameters
 OUT_CHANNELS = 1
 OUT_RATE = 24000
 OUT_SAMPLE_WIDTH = 2
 OUT_CHUNK = 20 * 4096
 
 
-def run_vad(ori_audio, sr):
-    _st = time.time()
-    try:
-        audio = ori_audio
-        audio = audio.astype(np.float32) / 32768.0
-        sampling_rate = 16000
-        if sr != sampling_rate:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-
-        vad_parameters = {}
-        vad_parameters = VadOptions(**vad_parameters)
-        speech_chunks = get_speech_timestamps(audio, vad_parameters)
-        audio = collect_chunks(audio, speech_chunks)
-        duration_after_vad = audio.shape[0] / sampling_rate
-
-        if sr != sampling_rate:
-            # resample to original sampling rate
-            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
-        else:
-            vad_audio = audio
-        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
-        vad_audio_bytes = vad_audio.tobytes()
-
-        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
-    except Exception as e:
-        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
-        print(msg)
-        return -1, ori_audio, round(time.time() - _st, 4)
-
-
-def warm_up():
-    frames = np.zeros((1, 1600))  # 1024 frames of 2 bytes each
-    _, frames, tcost = run_vad(frames, 16000)
-    print(f"warm up done, time_cost: {tcost:.3f} s")
-
-
-# warm_up()
-
-
-@dataclass
-class AppState:
-    stream: np.ndarray | None = None
-    sampling_rate: int = 0
-    pause_detected: bool = False
-    started_talking: bool = False
-    responding: bool = False
-    stopped: bool = False
-    buffer: np.ndarray | None = None
-
-
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-    """Take in the stream, determine if a pause happened"""
-    duration = len(audio) / sampling_rate
-
-    dur_vad, _, _ = run_vad(audio, sampling_rate)
-
-    if duration >= 0.60:
-        if dur_vad > 0.2 and not state.started_talking:
-            print("started talking")
-            state.started_talking = True
-        if state.started_talking:
-            if state.stream is None:
-                state.stream = audio
-            else:
-                state.stream = np.concatenate((state.stream, audio))
-            state.buffer = None
-        if dur_vad < 0.1 and state.started_talking:
-            segment = AudioSegment(
-                state.stream.tobytes(),
-                frame_rate=sampling_rate,
-                sample_width=audio.dtype.itemsize,
-                channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-            )
-
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                segment.export(f.name, format="wav")
-            print("input file written", f.name)
-            return True
-    return False
-
-
-def speaking(audio_bytes: str):
+def speaking(audio_bytes: bytes):
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
     byte_buffer = b""
@@ -194,73 +105,24 @@ def speaking(audio_bytes: str):
         raise gr.Error(f"Error during audio streaming: {e}")
 
 
-def process_audio(audio: tuple, state: AppState) -> None:
-    frame_rate, array = audio
-    array = np.squeeze(array)
-    if not state.sampling_rate:
-        state.sampling_rate = frame_rate
-    if state.buffer is None:
-        state.buffer = array
-    else:
-        state.buffer = np.concatenate((state.buffer, array))
-
-    pause_detected = determine_pause(state.buffer, state.sampling_rate, state)
-    state.pause_detected = pause_detected
-
-
-def response(state: AppState):
-    if not state.pause_detected and not state.started_talking:
-        return None
+def response(audio: tuple[int, np.ndarray]):
+    sampling_rate, audio_np = audio
+    audio_np = audio_np.squeeze()
 
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
-        state.stream.tobytes(),
-        frame_rate=state.sampling_rate,
-        sample_width=state.stream.dtype.itemsize,
-        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-    )
+        audio_np.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=audio_np.dtype.itemsize,
+        channels=1)
+
     segment.export(audio_buffer, format="wav")
 
     for numpy_array in speaking(audio_buffer.getvalue()):
         yield (OUT_RATE, numpy_array, "mono")
 
 
-class OmniHandler(StreamHandler):
-    def __init__(self) -> None:
-        super().__init__(
-            expected_layout="mono", output_sample_rate=OUT_RATE, output_frame_size=480
-        )
-        self.event = Event()
-        self.state = AppState()
-        self.generator = None
-        self.duration = 0
-
-    def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        if self.state.responding:
-            return
-        process_audio(frame, self.state)
-        if self.state.pause_detected:
-            self.event.set()
-
-    def reset(self):
-        self.generator = None
-        self.event.clear()
-        self.state = AppState()
-        self.duration = 0
-
-    def emit(self):
-        if not self.event.is_set():
-            return None
-        else:
-            if not self.generator:
-                self.generator = response(self.state)
-                self.state.responding = True
-            try:
-                return next(self.generator)
-            except StopIteration:
-                self.reset()
-
-
 with gr.Blocks() as demo:
     gr.HTML(
         """
@@ -277,7 +139,7 @@ with gr.Blocks() as demo:
         mode="send-receive",
         modality="audio",
     )
-    audio.stream(fn=OmniHandler(), inputs=[audio], outputs=[audio], time_limit=60)
+    audio.stream(fn=ReplyOnPause(response), inputs=[audio], outputs=[audio], time_limit=60)
 
 
 demo.launch(ssr_mode=False)
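
For reference, a minimal sketch of the pattern this commit adopts: ReplyOnPause buffers the incoming microphone stream, runs pause detection internally, and calls a plain generator with the completed utterance, which is why the hand-rolled VAD, AppState, and StreamHandler subclass above could be deleted. The echo handler is a hypothetical stand-in for the model call in app.py, and rtc_configuration is omitted here (the app builds one from Twilio credentials for deployment behind restrictive NATs).

import gradio as gr
import numpy as np
from gradio_webrtc import ReplyOnPause, WebRTC


def echo(audio: tuple[int, np.ndarray]):
    # Called once per detected pause with the full utterance as
    # (sample_rate, ndarray); yield (rate, array, layout) chunks to
    # stream audio back, same shape as the yields in app.py.
    sampling_rate, audio_np = audio
    yield (sampling_rate, audio_np, "mono")


with gr.Blocks() as demo:
    audio = WebRTC(mode="send-receive", modality="audio")
    audio.stream(fn=ReplyOnPause(echo), inputs=[audio], outputs=[audio], time_limit=60)

demo.launch(ssr_mode=False)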