from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import gradio as gr
import torch
import numpy as np
from datasets import load_dataset, Audio
from openai import OpenAI
# Load the ASR (speech-to-text) pipeline fine-tuned on Glaswegian speech
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")
# Load TTS components
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
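# SpeechT5 predicts mel spectrograms; the HiFi-GAN vocoder renders them as waveforms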
# Load the Glaswegian audio dataset; one clip serves as the reference voice
# for the speaker embedding. SpeechT5 expects 16 kHz audio, so resample on load.
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))["train"]
def transcribe(audio):
    """Transcribe the recorded audio file with the Glaswegian ASR pipeline."""
    return asr_pipe(audio)["text"]
def generate_response(text, api_key):
    """Send the transcribed text to OpenAI and return the assistant's reply."""
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": text}],
    )
    return response.choices[0].message.content
def synthesize_speech(text):
    """Convert the response text to speech in the cloned Glaswegian voice."""
    inputs = processor(text=text, return_tensors="pt")
    # Condition the TTS model on an x-vector embedding of the reference clip
    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
    spectrogram = tts_model.generate_speech(
        inputs["input_ids"], torch.tensor([speaker_embeddings])
    )
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Scale the float waveform in [-1, 1] to 16-bit PCM for Gradio playback
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
def create_speaker_embedding(waveform):
    """Compute a normalized 512-dim x-vector speaker embedding from a waveform."""
    import os
    from speechbrain.inference.speaker import EncoderClassifier
    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # from_hparams caches the download under savedir, so repeat calls skip the fetch
    speaker_model = EncoderClassifier.from_hparams(
        source=spk_model_name,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", spk_model_name),
    )
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
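# Performance note: the x-vector encoder is re-instantiated on every request.
# If latency matters, one option would be to cache the EncoderClassifier (and
# the embedding for the fixed reference clip) once at module load.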
def voice_assistant(audio, api_key):
    """Full pipeline: user speech -> text -> LLM response -> Glaswegian speech."""
    transcribed_text = transcribe(audio)
    response_text = generate_response(transcribed_text, api_key)
    return synthesize_speech(response_text)
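# Gradio UI: recorded audio plus an OpenAI API key in, synthesized speech out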
iface = gr.Interface(
fn=voice_assistant,
inputs=[
gr.Audio(type="filepath"),
gr.Textbox(label="OpenAI API Key", type="password")
],
outputs=gr.Audio(label="Response Speech", type="numpy"),
title="Your Glaswegian Assistant"
)
iface.launch()
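# Note: on Hugging Face Spaces, launch() with no arguments is sufficient; when
# running locally you could pass share=True to get a temporary public link.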