freddyaboulton committed d531709 (1 parent: 556b4ae)

first response

Files changed (1): app.py (+93 -45)
app.py CHANGED
@@ -1,13 +1,19 @@
 import gradio as gr
 from huggingface_hub import snapshot_download
 from threading import Thread
-import os
 import time
-import gradio as gr
 import base64
 import numpy as np
 import requests
 import traceback
+from dataclasses import dataclass
+from pathlib import Path
+import io
+import wave
+import tempfile
+import librosa
+from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
+
 
 from server import serve
 
@@ -78,58 +84,100 @@ def warm_up():
 
 warm_up()
 
-def determine_pause(stream: bytes, start_talking: bool) -> tuple[bytes, bool]:
+
+def determine_pause(stream: bytes, start_talking: bool) -> tuple[bool, bool]:
     """Take in the stream, determine if a pause happened"""
 
     temp_audio = stream
 
     if len(temp_audio) > IN_SAMPLE_WIDTH * IN_RATE * IN_CHANNELS * VAD_STRIDE:
-        dur_vad, vad_audio_bytes, time_vad = run_vad(temp_audio, IN_RATE)
+        dur_vad, _, time_vad = run_vad(temp_audio, IN_RATE)
 
         print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
 
         if dur_vad > 0.2 and not start_talking:
-            if last_temp_audio is not None:
-                st.session_state.frames.append(last_temp_audio)
             start_talking = True
-        if start_talking:
-            st.session_state.frames.append(temp_audio)
+            pause = False
+            return pause, start_talking
         if dur_vad < 0.1 and start_talking:
-            st.session_state.recording = False
-            print(f"speech end detected. excit")
-        last_temp_audio = temp_audio
-        temp_audio = b""
-
-
-def process_audio(audio):
-    filepath = audio
-    print(f"filepath: {filepath}")
-    if filepath is None:
-        return
-
-    cnt = 0
-    with open(filepath, "rb") as f:
-        data = f.read()
-    base64_encoded = str(base64.b64encode(data), encoding="utf-8")
-    files = {"audio": base64_encoded}
-    tik = time.time()
-    with requests.post(API_URL, json=files, stream=True) as response:
-        try:
-            for chunk in response.iter_content(chunk_size=OUT_CHUNK):
-                if chunk:
-                    # Convert chunk to numpy array
-                    if cnt == 0:
-                        print(f"first chunk time cost: {time.time() - tik:.3f}")
-                        cnt += 1
-                    audio_data = np.frombuffer(chunk, dtype=np.int16)
-                    audio_data = audio_data.reshape(-1, OUT_CHANNELS)
-                    yield OUT_RATE, audio_data.astype(np.int16)
-
-        except Exception as e:
-            print(f"error: {e}")
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+            print("pause detected")
+            return True, start_talking
+        return False, start_talking
+    return False, start_talking
+
+
+def speaking(total_frames: bytes):
+    # Package the accumulated int16 PCM frames as an in-memory WAV file
+    audio_buffer = io.BytesIO()
+    wf = wave.open(audio_buffer, "wb")
+    wf.setnchannels(IN_CHANNELS)
+    wf.setsampwidth(IN_SAMPLE_WIDTH)
+    wf.setframerate(IN_RATE)
+
+    dur = len(total_frames) / (IN_RATE * IN_CHANNELS * IN_SAMPLE_WIDTH)
+    print(f"Speaking... recorded audio duration: {dur:.3f} s")
+
+    wf.writeframes(total_frames)
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+        with open(tmpfile.name, "wb") as f:
+            f.write(audio_buffer.getvalue())
+
+    audio_bytes = audio_buffer.getvalue()
+
+    # POST the base64-encoded WAV and relay the streamed PCM response
+    base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
+    files = {"audio": base64_encoded}
+    with requests.post(API_URL, json=files, stream=True) as response:
+        try:
+            for chunk in response.iter_content(chunk_size=OUT_CHUNK):
+                if chunk:
+                    yield chunk
+        except Exception as e:
+            raise gr.Error(f"Error during audio streaming: {e}")
+
+    wf.close()
+
+
+@dataclass
+class AppState:
+    start_talking: bool = False
+    stream: bytes = b""
+    pause_detected: bool = False
+
+
+def process_audio(audio: str, state: AppState):
+    state.stream += Path(audio).read_bytes()
+
+    pause_detected, start_talking = determine_pause(state.stream, state.start_talking)
+    state.pause_detected = pause_detected
+    state.start_talking = start_talking
+
+    if not state.pause_detected:
+        yield None, state
+        return
+
+    for out_bytes in speaking(state.stream):
+        yield out_bytes, state
+
+    # Reset for the next utterance once the response has been streamed
+    state = AppState()
+    yield None, state
+
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        input_audio = gr.Audio(label="Input Audio")
+    with gr.Row():
+        output_audio = gr.Audio(label="Output Audio")
+    state = gr.State(value=AppState())
+
+    input_audio.stream(process_audio, [input_audio, state], [output_audio, state],
+                       stream_every=0.5, time_limit=30)
+
  demo.launch()
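
Note: determine_pause relies on a run_vad helper that this commit does not touch. Below is a minimal sketch of what it plausibly looks like, assuming the get_speech_timestamps / collect_chunks / VadOptions API imported from utils.vad above (a silero-style VAD operating on 16 kHz float32 audio); the Space's actual helper may differ:

import time
import numpy as np
import librosa
from utils.vad import get_speech_timestamps, collect_chunks, VadOptions

def run_vad(audio_bytes: bytes, sample_rate: int):
    """Return (speech duration after VAD in s, voiced audio bytes, elapsed time in s)."""
    tik = time.time()
    # int16 PCM -> float32 in [-1, 1]
    audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
    # Assumption: the VAD model expects 16 kHz input; resample if needed
    if sample_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
    speech_timestamps = get_speech_timestamps(audio, VadOptions())
    voiced = collect_chunks(audio, speech_timestamps)
    dur_vad = len(voiced) / 16000
    voiced_bytes = (voiced * 32768.0).astype(np.int16).tobytes()
    return dur_vad, voiced_bytes, time.time() - tik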
 
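The client code in speaking() implies a simple HTTP contract: POST JSON {"audio": <base64-encoded WAV>} to API_URL and read raw int16 PCM chunks from a streaming response. The real endpoint is started by serve() in server.py, which is not part of this diff; the sketch below is a hypothetical stand-in that only illustrates the request/response shape:

import base64
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()

class AudioRequest(BaseModel):
    audio: str  # base64-encoded WAV bytes

@app.post("/chat")  # hypothetical route; API_URL in app.py points at the real one
def chat(req: AudioRequest):
    wav_bytes = base64.b64decode(req.audio)

    def pcm_chunks():
        # A real server would run the speech model on wav_bytes and yield
        # int16 PCM as it is generated; this placeholder streams silence.
        for _ in range(10):
            yield b"\x00\x00" * 2048

    return StreamingResponse(pcm_chunks(), media_type="application/octet-stream")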
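The Blocks wiring at the bottom uses gr.Audio's streaming event: with stream_every=0.5 Gradio invokes the handler roughly every half second with the newly recorded chunk, and gr.State carries accumulated data between calls. A self-contained echo example of the same pattern (hypothetical, not part of this Space; assumes the default numpy audio type):

import gradio as gr
import numpy as np

def echo(audio: tuple[int, np.ndarray], accumulated: np.ndarray | None):
    # Each call receives the latest ~0.5 s chunk; state persists across calls
    sr, chunk = audio
    accumulated = chunk if accumulated is None else np.concatenate([accumulated, chunk])
    # Replay everything heard so far, and pass the buffer forward via state
    return (sr, accumulated), accumulated

with gr.Blocks() as echo_demo:
    mic = gr.Audio(sources=["microphone"], streaming=True)
    out = gr.Audio(label="Echo")
    acc = gr.State(None)
    mic.stream(echo, [mic, acc], [out, acc], stream_every=0.5, time_limit=30)

echo_demo.launch()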