from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import gradio as gr
import torch
import numpy as np
from datasets import load_dataset, Audio
from openai import OpenAI

# Load the Glaswegian ASR (speech-to-text) model
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")


# Load TTS components
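# (the processor tokenizes text, the fine-tuned SpeechT5 model predicts a
# spectrogram, and the HiFi-GAN vocoder turns the spectrogram into audio)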
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load dataset for speaker embedding
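# (resampled to 16 kHz, the rate the x-vector encoder and SpeechT5 expect)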
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))['train']

def transcribe(audio):
    """Transcribe the recorded audio file with the Glaswegian ASR pipeline."""
    return asr_pipe(audio)["text"]

def generate_response(text, api_key):
    """Send the transcript to the OpenAI chat completions API and return the reply."""
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "user", "content": text}],
    )
    return response.choices[0].message.content

def synthesize_speech(text):
    """Synthesize `text` in the Glaswegian voice; return (sample_rate, int16 samples)."""
    inputs = processor(text=text, return_tensors="pt")
    # Condition the model on a speaker embedding from the first dataset clip
    # so the output keeps the fine-tuned Glaswegian voice
    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
    with torch.no_grad():
        spectrogram = tts_model.generate_speech(
            inputs["input_ids"], torch.tensor(speaker_embeddings).unsqueeze(0)
        )
        speech = vocoder(spectrogram)
    # Convert the [-1, 1] float waveform to 16-bit PCM for Gradio, clipping
    # first so out-of-range samples don't wrap around on the int16 cast
    speech = (np.clip(speech.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
    return (16000, speech)

def create_speaker_embedding(waveform):
    """Compute a normalized x-vector speaker embedding for a 16 kHz waveform."""
    import os
    from speechbrain.inference.speaker import EncoderClassifier

    # SpeechBrain x-vector speaker encoder (cached under /tmp after first download)
    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    speaker_model = EncoderClassifier.from_hparams(
        source=spk_model_name,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", spk_model_name),
    )
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        # L2-normalize along the embedding dimension, then drop the batch axes
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

def voice_assistant(audio, api_key):
    """Full pipeline: speech in -> transcript -> GPT reply -> Glaswegian speech out."""
    transcribed_text = transcribe(audio)
    response_text = generate_response(transcribed_text, api_key)
    return synthesize_speech(response_text)

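# Gradio app: record or upload audio and paste an OpenAI API key; the reply
# comes back as synthesized Glaswegian speech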
iface = gr.Interface(
    fn=voice_assistant,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="OpenAI API Key", type="password"),
    ],
    outputs=gr.Audio(label="Response Speech", type="numpy"),
    title="Your Glaswegian Assistant",
)

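# Serves locally by default; pass share=True to launch() for a public link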
iface.launch()