"""A Glaswegian voice assistant: ASR -> GPT-2 reply -> SpeechT5 TTS."""
import os

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset, Audio
from speechbrain.inference.speaker import EncoderClassifier
from transformers import (
    pipeline,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)

# Load ASR model
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")
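# The ASR pipeline accepts a filepath (or raw array) and returns a dict whose
# "text" key holds the transcript.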
# Load GPT-2 model for generating responses
model_name = "gpt2"
gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt_model = GPT2LMHeadModel.from_pretrained(model_name)
# Load TTS components
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
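# SpeechT5 emits a log-mel spectrogram; the HiFi-GAN vocoder renders it to a waveform.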
# Load dataset for speaker embedding
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))['train']
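# Resampling to 16 kHz up front matters: both the x-vector speaker encoder and
# SpeechT5 were trained on 16 kHz audio.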

def transcribe(audio):
    """Transcribe the recorded audio with the Glaswegian ASR pipeline."""
    return asr_pipe(audio)["text"]

def generate_response(text):
    """Generate a reply with GPT-2, keeping only the newly generated tokens."""
    input_ids = gpt_tokenizer.encode(text, return_tensors="pt")
    response_ids = gpt_model.generate(
        input_ids, max_length=100, pad_token_id=gpt_tokenizer.eos_token_id
    )
    # generate() echoes the prompt, so decode only the continuation.
    return gpt_tokenizer.decode(response_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
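
# Worth noting: base GPT-2 is not dialogue-tuned, so replies are free-form
# continuations of the transcript rather than conversational answers.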

def synthesize_speech(text):
    """Convert text to 16 kHz audio in the Glaswegian SpeechT5 voice."""
    inputs = processor(text=text, return_tensors="pt")
    # Condition the TTS model on a speaker embedding from the dataset's first clip.
    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
    spectrogram = tts_model.generate_speech(
        inputs["input_ids"], torch.tensor(speaker_embeddings).unsqueeze(0)
    )
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Scale the float waveform in [-1, 1] to 16-bit PCM.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
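
# Gradio's "numpy" audio format is a (sample_rate, samples) tuple; int16 PCM
# avoids any ambiguity about float scaling on playback.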

# Load the x-vector speaker encoder once at import time rather than on every
# request, so the checkpoint is not re-instantiated per call.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

def create_speaker_embedding(waveform):
    """Return a normalised x-vector speaker embedding for a 16 kHz waveform."""
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
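
# The encoder yields a 512-dimensional embedding, which matches the speaker
# embedding size SpeechT5 expects.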

def voice_assistant(audio):
    """Full pipeline: recorded speech -> transcript -> GPT-2 reply -> spoken reply."""
    transcribed_text = transcribe(audio)
    response_text = generate_response(transcribed_text)
    return synthesize_speech(response_text)
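
# A quick smoke test (assumption: "sample.wav" is a local 16 kHz recording):
#   sr, audio = voice_assistant("sample.wav")
#   print(sr, audio.dtype, audio.shape)  # 16000 int16 (n_samples,)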

iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(label="Response Speech", type="numpy"),
    title="Your Glaswegian Assistant",
)

if __name__ == "__main__":
    iface.launch()
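
# When running locally, launch(share=True) exposes a temporary public URL;
# expect a slow first startup while the model checkpoints download.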