Upload folder using huggingface_hub

- STT/whisper_stt_handler.py +3 -0
- VAD/vad_handler.py +4 -0
- audio_streaming_client.py +4 -4
- handler.py +8 -0
STT/whisper_stt_handler.py
CHANGED
@@ -111,6 +111,7 @@ class WhisperSTTHandler(BaseHandler):
 
     def process(self, spoken_prompt):
         logger.debug("infering whisper...")
+        console.print("infering whisper...")
 
         global pipeline_start
         pipeline_start = perf_counter()
@@ -121,6 +122,7 @@ class WhisperSTTHandler(BaseHandler):
 
         if language_code not in SUPPORTED_LANGUAGES:  # reprocess with the last language
             logger.warning("Whisper detected unsupported language:", language_code)
+            console.print("Whisper detected unsupported language:", language_code)
             gen_kwargs = copy(self.gen_kwargs)
             gen_kwargs['language'] = self.last_language
             language_code = self.last_language
@@ -135,6 +137,7 @@ class WhisperSTTHandler(BaseHandler):
 
         logger.debug("finished whisper inference")
         console.print(f"[yellow]USER: {pred_text}")
+        console.print(f"Language Code Whisper: {language_code}")
         logger.debug(f"Language Code Whisper: {language_code}")
 
         yield (pred_text, language_code)
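For context, the fallback these hunks instrument: when Whisper detects a language outside SUPPORTED_LANGUAGES, the handler re-runs generation pinned to the last accepted language. A minimal sketch of that pattern, with an illustrative language set and function name (not from the repo):

from copy import copy

SUPPORTED_LANGUAGES = {"en", "fr", "es", "zh", "ja", "ko"}  # illustrative set

def fall_back_language(language_code, gen_kwargs, last_language):
    # Same shape as the context lines above: copy the shared gen_kwargs so
    # the fallback does not mutate the handler's defaults, then pin the
    # last language that was accepted.
    if language_code not in SUPPORTED_LANGUAGES:
        gen_kwargs = copy(gen_kwargs)
        gen_kwargs["language"] = last_language
        language_code = last_language
    return language_code, gen_kwargs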
VAD/vad_handler.py
CHANGED
@@ -53,10 +53,14 @@ class VADHandler(BaseHandler):
         audio_float32 = int2float(audio_int16)
         vad_output = self.iterator(torch.from_numpy(audio_float32))
         if vad_output is not None and len(vad_output) != 0:
+            console.print("VAD: end of speech detected")
             logger.debug("VAD: end of speech detected")
             array = torch.cat(vad_output).cpu().numpy()
             duration_ms = len(array) / self.sample_rate * 1000
             if duration_ms < self.min_speech_ms or duration_ms > self.max_speech_ms:
+                console.print(
+                    f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
+                )
                 logger.debug(
                     f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
                 )
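The duration gate in this hunk converts a sample count to milliseconds before deciding whether to skip the utterance. A minimal standalone sketch, assuming 16 kHz audio and hypothetical min/max thresholds:

import numpy as np

SAMPLE_RATE = 16000
MIN_SPEECH_MS, MAX_SPEECH_MS = 500, 30_000  # hypothetical thresholds

def keep_utterance(array: np.ndarray) -> bool:
    # Samples at SAMPLE_RATE Hz -> milliseconds, matching
    # duration_ms = len(array) / self.sample_rate * 1000 above.
    duration_ms = len(array) / SAMPLE_RATE * 1000
    return MIN_SPEECH_MS <= duration_ms <= MAX_SPEECH_MS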
audio_streaming_client.py
CHANGED
@@ -60,10 +60,10 @@ class AudioStreamingClient:
             if len(buffer) >= self.args.chunk_size * 2:  # * 2 because of int16
                 self.send_request(buffer)
                 buffer = b''
-                time.sleep(
+                time.sleep(16*self.args.chunk_size/self.args.sample_rate)
             else:
                 self.send_request()
-                time.sleep(
+                time.sleep(16*self.args.chunk_size/self.args.sample_rate)
 
     def send_request(self, audio_data=None):
         payload = {"input_type": "speech",
@@ -106,8 +106,8 @@ class AudioStreamingClient:
             self.session_id = None
             while not self.recv_queue.empty():
                 time.sleep(0.01)  # wait for the queue to empty
-
-
+            with self.send_queue.mutex:
+                self.send_queue.queue.clear()  # Clear the queue
 
         except Exception as e:
             print(f"Error sending request: {e}")
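Two notes on these hunks. The new sleep paces the client at sixteen chunk-durations per request: with a hypothetical chunk_size of 512 samples at a sample_rate of 16000 Hz, 16 * 512 / 16000 = 0.512 s between sends. And since queue.Queue has no clear() method, the second hunk locks the queue's internal mutex and empties the underlying deque directly; a minimal sketch of that flush:

import queue

send_queue = queue.Queue()

# queue.Queue keeps its items in a deque exposed as .queue, guarded by
# .mutex; holding the lock while clearing keeps producers and consumers
# out mid-flush, as in the hunk above.
with send_queue.mutex:
    send_queue.queue.clear()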
handler.py
CHANGED
@@ -64,6 +64,7 @@ class EndpointHandler:
         self.sample_rate = 16000  # Set the expected sample rate
 
     def _process_audio_chunk(self, audio_data: bytes, session_id: str):
+        print('processing audio chunk')
         audio_array = np.frombuffer(audio_data, dtype=np.int16)
 
         # Ensure the audio is in chunks of the correct size
@@ -113,6 +114,8 @@ class EndpointHandler:
 
         input_type = data.get("input_type", "text")
         input_data = data.get("inputs", "")
+        console.print(f"input_type: {input_type}")
+        console.print(f"input_data: {input_data}")
 
         if input_type == "speech":
             audio_bytes = base64.b64decode(input_data)
@@ -129,6 +132,8 @@ class EndpointHandler:
 
     def _handle_continue_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
         session_id = data.get("session_id")
+        print(f"session_id: {session_id}")
+        print('continue request')
         if not session_id or session_id not in self.sessions:
             raise ValueError("Invalid or missing session_id")
 
@@ -136,8 +141,10 @@ class EndpointHandler:
 
         if not self.queues_and_events['should_listen'].is_set():
             session['status'] = 'processing'
+            print('should_listen is not set, processing')
         elif "inputs" in data:  # Handle additional input if provided
             input_data = data["inputs"]
+            print(f"input_data: {input_data}")
             audio_bytes = base64.b64decode(input_data)
             self._process_audio_chunk(audio_bytes, session_id)
 
@@ -145,6 +152,7 @@ class EndpointHandler:
         session['last_sent_index'] = len(session['chunks'])
 
         if chunks_to_send:
+            print('chunks_to_send')
            combined_audio = b''.join(chunks_to_send)
            base64_audio = base64.b64encode(combined_audio).decode('utf-8')
            return {
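Taken together with audio_streaming_client.py, the speech path is a base64 round-trip over raw int16 PCM: the client packs sample bytes into the "inputs" field, and the handler decodes them before np.frombuffer reinterprets the bytes as samples. A self-contained sketch of that round-trip (payload keys taken from the diffs above):

import base64
import numpy as np

samples = np.zeros(16000, dtype=np.int16)  # one second of silence at 16 kHz
payload = {"input_type": "speech",
           "inputs": base64.b64encode(samples.tobytes()).decode("utf-8")}

# Server side, as in _handle_continue_request -> _process_audio_chunk:
audio_bytes = base64.b64decode(payload["inputs"])
audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
assert np.array_equal(audio_array, samples)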