import spaces
import tempfile
import gradio as gr
from streaming_stt_nemo import Model
from huggingface_hub import InferenceClient
import edge_tts

# Initialize default language and STT model
default_lang = "en"
engines = {default_lang: Model(default_lang)}
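# Additional languages could be supported by adding more Model instances keyed by language code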

# Function to transcribe audio to text
def transcribe(audio):
    lang = "en"
    model = engines[lang]
    text = model.stt_file(audio)[0]
    return text

# Initialize Huggingface InferenceClient
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

# System instructions for the CrucialCoach
system_instructions = "[SYSTEM] You are CrucialCoach, an AI-powered conversational coach. Guide the user through challenging workplace situations using the principles from 'Crucial Conversations'. Ask one question at a time and provide step-by-step guidance.\n\n[USER]"

# Decorator for using GPU with a duration of 120 seconds
@spaces.GPU(duration=120)
def model(text):
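    # Sampling parameters for the streaming text-generation request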
    generate_kwargs = dict(
        temperature=0.7,
        max_new_tokens=512,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=42,
    )
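    # Append the coach tag so the model continues the reply as CrucialCoach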
    formatted_prompt = system_instructions + text + "[CrucialCoach]"
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""
    for response in stream:
        if not response.token.text == "</s>":
            output += response.token.text
    return output

# Asynchronous function to handle audio input and provide response
async def respond(audio):
    user = transcribe(audio)
    reply = model(user)
    communicate = edge_tts.Communicate(reply)
    # edge-tts writes MP3 audio by default, so give the temp file a matching suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path

# Gradio theme
theme = gr.themes.Base()

# Gradio interface for voice chat
with gr.Blocks(theme=theme, css="footer {visibility: hidden} textarea {resize: none}", title="CrucialCoach DEMO") as demo:
    with gr.Tab("🗣️ Crucial Coach Chat"):
        input_audio = gr.Audio(sources=["microphone"], type="filepath", label="Voice Chat")
        output_audio = gr.Audio(type="filepath", label="CrucialCoach", interactive=False, autoplay=True, elem_classes="audio")
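        # gr.Interface rendered inside the Blocks layout runs respond() live on each recording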
        gr.Interface(
            fn=respond,
            inputs=input_audio,
            outputs=output_audio,
            live=True
        )

# Queue setup and launch
demo.queue(max_size=200)
demo.launch()