from transformers import (
    pipeline,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)
import gradio as gr
import torch
import numpy as np
from datasets import load_dataset, Audio
from speechbrain.pretrained import EncoderClassifier
# Load ASR model
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")
# Load GPT-2 model for generating responses
model_name = "gpt2"
gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt_model = GPT2LMHeadModel.from_pretrained(model_name)
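# Note: this is the generic pretrained GPT-2, not a model fine-tuned for
# dialogue, so replies are free-form text continuations of the transcript.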
# Load TTS components
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# `speaker_model` is used in create_speaker_embedding below but was never
# defined in the original script; this assumes the SpeechBrain x-vector
# encoder conventionally paired with SpeechT5 speaker embeddings.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": "cpu"},
)
# Load the Glaswegian audio dataset; its first clip supplies the reference voice
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))["train"]
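# Pipeline stages: transcribe speech, generate a reply with GPT-2, then
# synthesize the reply in the Glaswegian voice.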
def transcribe(audio):
text = asr_pipe(audio)["text"]
return text
def generate_response(text):
    input_ids = gpt_tokenizer.encode(text, return_tensors="pt")
    # pad_token_id is set explicitly because GPT-2 has no pad token by default
    response_ids = gpt_model.generate(
        input_ids, max_length=100, num_return_sequences=1,
        pad_token_id=gpt_tokenizer.eos_token_id,
    )
    return gpt_tokenizer.decode(response_ids[0], skip_special_tokens=True)
def synthesize_speech(text):
    inputs = processor(text=text, return_tensors="pt")
    # Embed the first dataset clip as the reference Glaswegian voice
    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
    spectrogram = tts_model.generate_speech(
        inputs["input_ids"], torch.tensor(speaker_embeddings).unsqueeze(0)
    )
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Scale the float waveform in [-1, 1] to 16-bit PCM for Gradio playback
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
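# Derive a normalized speaker embedding (x-vector) from a raw 16 kHz waveform.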
def create_speaker_embedding(waveform):
with torch.no_grad():
speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
return speaker_embeddings
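# End-to-end assistant: speech in -> transcript -> GPT-2 reply -> speech out.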
def voice_assistant(audio):
transcribed_text = transcribe(audio)
response_text = generate_response(transcribed_text)
speech_audio = synthesize_speech(response_text)
return response_text, speech_audio
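# Gradio UI: record or upload audio; show the text reply and play the speech.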
iface = gr.Interface(
fn=voice_assistant,
inputs=gr.Audio(type="filepath"),
outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Speech", type="numpy")],
title="Voice Assistant with LLM",
description="A voice assistant that uses ASR, LLM, and TTS to interact with users.",
)
iface.launch()
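# On Hugging Face Spaces, launch() serves the app directly; when running
# locally, launch(share=True) would also create a temporary public link.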