Spaces:
Running
Running
from llama_cpp import Llama | |
import whisper | |
from TTS.api import TTS | |
import numpy as np | |
import gradio as gr | |
from gradio_unifiedaudio import UnifiedAudio | |
from pathlib import Path | |
import torch | |
from scipy.io import wavfile | |
from collections import deque | |
# Speech-to-text model used to transcribe the recorded user audio.
whisper_model = whisper.load_model("base")

# Local LLM that produces the assistant's reply text.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False,
)

# Text-to-speech model used to voice the reply.
tts = TTS(
    model_name="tts_models/en/ljspeech/tacotron2-DDC",
    progress_bar=False,
)

# Directory containing this script.
dir_ = Path(__file__).parent

# Module-level placeholder; the handlers below receive their own `instream`
# argument rather than reading this global.
instream = None
def detect_pause(instream, energy_threshold=800, pause_duration=2.0, sample_rate=16000):
    """Report whether the stream contains a sustained stretch of quiet samples.

    Args:
        instream: Sequence whose element ``[1]`` holds the audio samples
            (element ``[0]`` is not read here).
        energy_threshold: A sample counts as quiet when its absolute value is
            strictly below this level.
        pause_duration: Required length of the quiet stretch, in seconds.
        sample_rate: Samples per second, used to convert ``pause_duration``
            into a sample count.

    Returns:
        True if at least ``int(pause_duration * sample_rate)`` consecutive
        samples are quiet anywhere in the stream, otherwise False. An empty
        sample array always yields False.
    """
    pause_samples = int(pause_duration * sample_rate)

    # Widen before abs(): np.abs on int16 wraps -32768 back to -32768, which
    # would make the loudest possible negative sample look quiet.
    energy = np.abs(np.asarray(instream[1], dtype=np.float64))
    if energy.size == 0:
        return False
    if energy.ndim > 1:
        # NOTE(review): assumes frames run along axis 0 (samples x channels);
        # a frame is treated as quiet only when every channel is quiet.
        energy = energy.reshape(energy.shape[0], -1).max(axis=1)

    # Indices of samples that are NOT quiet. Written as a negated "<" so the
    # comparison matches the per-sample test `e < energy_threshold` exactly.
    loud_positions = np.flatnonzero(~(energy < energy_threshold))

    # Sentinel "loud" positions just before the start and just past the end
    # turn every quiet run into the gap between two neighbouring edges, so the
    # longest run is found in one vectorized pass instead of re-checking a
    # full window for every sample.
    edges = np.concatenate(([-1], loud_positions, [energy.shape[0]]))
    longest_quiet_run = int(np.diff(edges).max()) - 1
    return longest_quiet_run >= pause_samples
def add_to_stream(audio, instream, pause_detected):
    """Append the newest microphone chunk to the accumulated stream.

    Args:
        audio: The newest chunk; element ``[0]`` is carried over into the
            accumulated stream unchanged and element ``[1]`` is the sample
            array that gets concatenated.
        instream: The stream accumulated so far, in the same layout as
            ``audio``, or None when nothing has been accumulated yet.
        pause_detected: Current pause flag, returned unchanged unless a pause
            is found.

    Returns:
        A tuple ``(audio, accumulated_stream, pause_detected)``.
    """
    if instream is None:
        # First chunk: there is no earlier audio to scan for a pause, and
        # indexing None inside detect_pause would raise TypeError.
        return audio, audio, pause_detected

    ret = (audio[0], np.concatenate((instream[1], audio[1])))

    # The pause scan covers the previously accumulated samples only; the new
    # chunk is included from the next call onwards.
    if detect_pause(instream):
        pause_detected = True
        # NOTE(review): the value returned by stop_recording is not used here,
        # so the spoken reply it builds is not forwarded to the UI by this
        # handler -- confirm whether that is intended.
        stop_recording(ret)
    return audio, ret, pause_detected
def stop_recording(audio):
    """Transcribe the recorded audio, generate a reply and voice it.

    Args:
        audio: Recording to process; element ``[0]`` is passed to
            ``wavfile.write`` as the sample rate and element ``[1]`` as the
            sample data.

    Returns:
        A non-streaming ``UnifiedAudio`` whose value is the path of the
        synthesized reply audio.
    """
    # Whisper is given a file path, so the recording is written to disk first.
    wavfile.write("user_output.wav", audio[0], audio[1])
    text = whisper_model.transcribe("user_output.wav")['text']
    print(f"You said: {text}")

    # Transcripts usually carry surrounding whitespace and end-of-sentence
    # punctuation (e.g. " Stop."), so compare a normalized copy; the raw text
    # would never equal one of the bare keywords.
    command = text.strip().strip(".,!?").lower()
    if command in ("exit", "quit", "stop"):
        # As written this only announces the shutdown; the reply below is
        # still generated and returned.
        print("Voice Assistant is shutting down.")

    response = generate_response(text)
    print(f"Assistant: {response}")
    return UnifiedAudio(value=speak_text(response), streaming=False)
def stop_playing():
    """Reset the UI once playback has ended.

    Returns:
        A tuple of a fresh streaming ``UnifiedAudio`` with no value, ``None``
        for the accumulated stream, and ``False`` for the pause flag.
    """
    fresh_mic = UnifiedAudio(value=None, streaming=True)
    return fresh_mic, None, False
def transcribe_audio(audio_data):
    """Return the English transcription of the saved user recording.

    NOTE(review): ``audio_data`` is accepted but never read; the text always
    comes from the file "user_output.wav".
    """
    result = whisper_model.transcribe("user_output.wav", language='en')
    return result['text']
def generate_response(prompt):
    """Run the prompt through the LLM and return the first completion's text.

    Args:
        prompt: Text passed to the model as-is.

    Returns:
        The ``text`` field of the first entry in ``choices``, with leading and
        trailing whitespace removed.
    """
    completion = llm(prompt=prompt)
    first_choice = completion['choices'][0]
    return first_choice['text'].strip()
def speak_text(text):
    """Synthesize speech for the text and return the output file path.

    Args:
        text: Reply to voice; surrounding whitespace is stripped first.

    Returns:
        The path "bot_output.wav", which is overwritten on every call.
    """
    output_path = "bot_output.wav"
    tts.tts_to_file(text=text.strip(), file_path=output_path)
    return output_path
with gr.Blocks() as demo:
    # Components and per-session state.
    mic = UnifiedAudio(sources=["microphone"], streaming=True)
    stream = gr.State()
    pause_detected = gr.State(False)

    # Recording stopped: process the accumulated stream and update the mic.
    mic.stop_recording(fn=stop_recording, inputs=stream, outputs=mic)

    # `end` event: reset the mic, the stream and the pause flag.
    mic.end(
        fn=stop_playing,
        inputs=None,
        outputs=[mic, stream, pause_detected],
    )

    # Each streamed chunk is appended to the accumulated stream.
    mic.stream(
        fn=add_to_stream,
        inputs=[mic, stream, pause_detected],
        outputs=[mic, stream, pause_detected],
    )

    # Disabled experiment, kept for reference:
    # @gr.render(inputs=[mic, stream, pause_detected])
    # def recording_paused(microphone, stream, pause_detected):
    #     if pause_detected:
    #         stop_recording(stream)

if __name__ == '__main__':
    demo.launch()