Upload folder using huggingface_hub

- STT/whisper_stt_handler.py +3 -0
- VAD/vad_handler.py +4 -0
- audio_streaming_client.py +4 -4
- handler.py +8 -0
STT/whisper_stt_handler.py
CHANGED
@@ -111,6 +111,7 @@ class WhisperSTTHandler(BaseHandler):
 
     def process(self, spoken_prompt):
         logger.debug("infering whisper...")
+        console.print("infering whisper...")
 
         global pipeline_start
         pipeline_start = perf_counter()
@@ -121,6 +122,7 @@ class WhisperSTTHandler(BaseHandler):
 
         if language_code not in SUPPORTED_LANGUAGES:  # reprocess with the last language
             logger.warning("Whisper detected unsupported language:", language_code)
+            console.print("Whisper detected unsupported language:", language_code)
             gen_kwargs = copy(self.gen_kwargs)
             gen_kwargs['language'] = self.last_language
             language_code = self.last_language
@@ -135,6 +137,7 @@ class WhisperSTTHandler(BaseHandler):
 
         logger.debug("finished whisper inference")
         console.print(f"[yellow]USER: {pred_text}")
+        console.print(f"Language Code Whisper: {language_code}")
         logger.debug(f"Language Code Whisper: {language_code}")
 
         yield (pred_text, language_code)
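For context, the fallback these hunks instrument: when Whisper detects a language outside SUPPORTED_LANGUAGES, the handler re-runs generation pinned to the last accepted language. A minimal sketch of that pattern, with an illustrative language set and function name (not from the repo):

from copy import copy

SUPPORTED_LANGUAGES = {"en", "fr", "es", "zh", "ja", "ko"}  # illustrative set

def fall_back_language(language_code, gen_kwargs, last_language):
    # Same shape as the context lines above: copy the shared gen_kwargs so
    # the fallback does not mutate the handler's defaults, then pin the
    # last language that was accepted.
    if language_code not in SUPPORTED_LANGUAGES:
        gen_kwargs = copy(gen_kwargs)
        gen_kwargs["language"] = last_language
        language_code = last_language
    return language_code, gen_kwargs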
VAD/vad_handler.py
CHANGED
@@ -53,10 +53,14 @@ class VADHandler(BaseHandler):
         audio_float32 = int2float(audio_int16)
         vad_output = self.iterator(torch.from_numpy(audio_float32))
         if vad_output is not None and len(vad_output) != 0:
+            console.print("VAD: end of speech detected")
             logger.debug("VAD: end of speech detected")
             array = torch.cat(vad_output).cpu().numpy()
             duration_ms = len(array) / self.sample_rate * 1000
             if duration_ms < self.min_speech_ms or duration_ms > self.max_speech_ms:
+                console.print(
+                    f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
+                )
                 logger.debug(
                     f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
                 )
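The duration gate in this hunk converts a sample count to milliseconds before deciding whether to skip the utterance. A minimal standalone sketch, assuming 16 kHz audio and hypothetical min/max thresholds:

import numpy as np

SAMPLE_RATE = 16000
MIN_SPEECH_MS, MAX_SPEECH_MS = 500, 30_000  # hypothetical thresholds

def keep_utterance(array: np.ndarray) -> bool:
    # Samples at SAMPLE_RATE Hz -> milliseconds, matching
    # duration_ms = len(array) / self.sample_rate * 1000 above.
    duration_ms = len(array) / SAMPLE_RATE * 1000
    return MIN_SPEECH_MS <= duration_ms <= MAX_SPEECH_MS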
audio_streaming_client.py
CHANGED
@@ -60,10 +60,10 @@ class AudioStreamingClient:
             if len(buffer) >= self.args.chunk_size * 2:  # * 2 because of int16
                 self.send_request(buffer)
                 buffer = b''
-                time.sleep(
+                time.sleep(16*self.args.chunk_size/self.args.sample_rate)
             else:
                 self.send_request()
-                time.sleep(
+                time.sleep(16*self.args.chunk_size/self.args.sample_rate)
 
     def send_request(self, audio_data=None):
         payload = {"input_type": "speech",
@@ -106,8 +106,8 @@ class AudioStreamingClient:
             self.session_id = None
             while not self.recv_queue.empty():
                 time.sleep(0.01)  # wait for the queue to empty
-
-
+            with self.send_queue.mutex:
+                self.send_queue.queue.clear()  # Clear the queue
 
         except Exception as e:
             print(f"Error sending request: {e}")
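Two notes on these hunks. The new sleep paces the client at sixteen chunk-durations per request: with a hypothetical chunk_size of 512 samples at a sample_rate of 16000 Hz, 16 * 512 / 16000 = 0.512 s between sends. And since queue.Queue has no clear() method, the second hunk locks the queue's internal mutex and empties the underlying deque directly; a minimal sketch of that flush:

import queue

send_queue = queue.Queue()

# queue.Queue keeps its items in a deque exposed as .queue, guarded by
# .mutex; holding the lock while clearing keeps producers and consumers
# out mid-flush, as in the hunk above.
with send_queue.mutex:
    send_queue.queue.clear()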
handler.py
CHANGED
@@ -64,6 +64,7 @@ class EndpointHandler:
         self.sample_rate = 16000  # Set the expected sample rate
 
     def _process_audio_chunk(self, audio_data: bytes, session_id: str):
+        print('processing audio chunk')
         audio_array = np.frombuffer(audio_data, dtype=np.int16)
 
         # Ensure the audio is in chunks of the correct size
@@ -113,6 +114,8 @@ class EndpointHandler:
 
         input_type = data.get("input_type", "text")
         input_data = data.get("inputs", "")
+        console.print(f"input_type: {input_type}")
+        console.print(f"input_data: {input_data}")
 
         if input_type == "speech":
             audio_bytes = base64.b64decode(input_data)
@@ -129,6 +132,8 @@ class EndpointHandler:
 
     def _handle_continue_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
         session_id = data.get("session_id")
+        print(f"session_id: {session_id}")
+        print('continue request')
         if not session_id or session_id not in self.sessions:
             raise ValueError("Invalid or missing session_id")
 
@@ -136,8 +141,10 @@ class EndpointHandler:
 
         if not self.queues_and_events['should_listen'].is_set():
             session['status'] = 'processing'
+            print('should_listen is not set, processing')
         elif "inputs" in data:  # Handle additional input if provided
             input_data = data["inputs"]
+            print(f"input_data: {input_data}")
             audio_bytes = base64.b64decode(input_data)
             self._process_audio_chunk(audio_bytes, session_id)
 
@@ -145,6 +152,7 @@ class EndpointHandler:
         session['last_sent_index'] = len(session['chunks'])
 
         if chunks_to_send:
+            print('chunks_to_send')
            combined_audio = b''.join(chunks_to_send)
            base64_audio = base64.b64encode(combined_audio).decode('utf-8')
            return {
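Taken together with audio_streaming_client.py, the speech path is a base64 round-trip over raw int16 PCM: the client packs sample bytes into the "inputs" field, and the handler decodes them before np.frombuffer reinterprets the bytes as samples. A self-contained sketch of that round-trip (payload keys taken from the diffs above):

import base64
import numpy as np

samples = np.zeros(16000, dtype=np.int16)  # one second of silence at 16 kHz
payload = {"input_type": "speech",
           "inputs": base64.b64encode(samples.tobytes()).decode("utf-8")}

# Server side, as in _handle_continue_request -> _process_audio_chunk:
audio_bytes = base64.b64decode(payload["inputs"])
audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
assert np.array_equal(audio_array, samples)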