# project_charles / speech_to_text_vosk_actor.py
# Author: sohojoe
# Last change: refactor - move ffmpeg_converter_actor to within respond_to_prompt_actor (commit b6ba8eb)
import json
import os
from vosk import SetLogLevel, Model, KaldiRecognizer
import ray
# Must run after the vosk import and before any Model/KaldiRecognizer is
# created so Kaldi's stderr chatter is suppressed process-wide.
SetLogLevel(-1) # mutes vosk verbosity
@ray.remote
class SpeechToTextVoskActor:
    """Ray actor that streams audio chunks through a Vosk/Kaldi recognizer.

    Chunks are fed either synchronously via process_speech() or buffered via
    add_speech_bytes() and later drained with get_text().
    """

    def __init__(self, model='small', audio_bit_rate=16000) -> None:
        """Load the Vosk model and create the recognizer.

        Args:
            model: name of the model subdirectory under ./models/vosk/.
            audio_bit_rate: sample rate (Hz) the recognizer is configured for.
        """
        self.model = model
        self.audio_bit_rate = audio_bit_rate
        # Resolve the model directory relative to this source file so the
        # actor works regardless of the process's working directory.
        current_file_path = os.path.abspath(__file__)
        current_directory = os.path.dirname(current_file_path)
        _path = os.path.join(current_directory, 'models', 'vosk', self.model)
        self.model_voice = Model(_path)
        self.vosk = KaldiRecognizer(self.model_voice, self.audio_bit_rate)
        # Buffers filled by add_speech_bytes() and drained by get_text().
        self.text_queue = []
        self.finished_queue = []

    def process_speech(self, data: bytearray) -> tuple[str, bool, dict]:
        """Feed one audio chunk to the recognizer.

        Returns:
            (text, speaker_finished, result_json) where text is the final
            utterance text and speaker_finished is True when Vosk accepted a
            complete waveform, otherwise the running partial text and False.
            result_json is the decoded Vosk result payload.
        """
        # Fix: the original annotation claimed tuple[str, bool] but the
        # method has always returned this 3-tuple.
        if self.vosk.AcceptWaveform(data):
            result_json = json.loads(self.vosk.Result())
            return result_json['text'], True, result_json
        result_json = json.loads(self.vosk.PartialResult())
        return result_json['partial'], False, result_json

    def add_speech_bytes(self, data: bytearray):
        """Process a chunk and queue its text for a later get_text() call."""
        text, speaker_finished = self._process_speech(data)
        self.text_queue.append(text)
        if speaker_finished:
            self.finished_queue.append(speaker_finished)

    def _process_speech(self, data: bytearray) -> tuple[str, bool]:
        # Same as process_speech() minus the raw result payload; delegating
        # removes the previously copy-pasted recognizer logic.
        text, speaker_finished, _ = self.process_speech(data)
        return text, speaker_finished

    def get_text(self) -> tuple[str, bool]:
        """Drain queued text up to (and including) the first finished marker.

        Returns:
            (text, speaker_finished) — concatenation of queued results and
            whether an utterance-finished marker was consumed.

        NOTE(review): Vosk partial results are cumulative, so concatenating
        successive queue entries can repeat words — confirm callers expect
        this before changing the queueing strategy.
        """
        text = ''
        speaker_finished = False
        while self.text_queue:
            text += self.text_queue.pop(0)
            if self.finished_queue:
                speaker_finished = self.finished_queue.pop(0)
                break
        return text, speaker_finished

    def get_audio_bit_rate(self):
        """Return the sample rate (Hz) the recognizer was configured with."""
        return self.audio_bit_rate