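"""Gradio voice assistant: Glaswegian ASR -> GPT-2 response -> Glaswegian SpeechT5 TTS."""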
from transformers import (
    pipeline,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)
import gradio as gr
import torch
import numpy as np
from datasets import load_dataset, Audio
# Load ASR model
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")
# Load GPT-2 model for generating responses
model_name = "gpt2"
gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt_model = GPT2LMHeadModel.from_pretrained(model_name)
# Load TTS components
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Load dataset for speaker embedding
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))['train']
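# Speaker encoder used by create_speaker_embedding() below. Assumption: the
# x-vector model conventionally paired with SpeechT5, SpeechBrain's
# spkrec-xvect-voxceleb, which produces the 512-dim embeddings the TTS expects.
# (On SpeechBrain >= 1.0 the import path is speechbrain.inference.)
import os
from speechbrain.pretrained import EncoderClassifier

device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
)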
def transcribe(audio):
    """Transcribe the recorded audio file to text with the Glaswegian ASR model."""
    return asr_pipe(audio)["text"]
def generate_response(text):
    """Generate a text reply to the transcript with GPT-2."""
    input_ids = gpt_tokenizer.encode(text, return_tensors="pt")
    response_ids = gpt_model.generate(
        input_ids,
        max_length=100,
        num_return_sequences=1,
        pad_token_id=gpt_tokenizer.eos_token_id,  # GPT-2 has no pad token
    )
    return gpt_tokenizer.decode(response_ids[0], skip_special_tokens=True)
def synthesize_speech(text):
    """Synthesise the response text into 16 kHz int16 audio for Gradio."""
    inputs = processor(text=text, return_tensors="pt")
    # Use the first clip in the dataset as the reference voice. This embedding
    # could be computed once at startup instead of on every call.
    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
    spectrogram = tts_model.generate_speech(
        inputs["input_ids"], torch.tensor([speaker_embeddings])
    )
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Scale the float waveform in [-1, 1] to int16 PCM.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
def create_speaker_embedding(waveform):
    """Compute a normalised 512-dim x-vector speaker embedding from a waveform."""
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
def voice_assistant(audio):
    """Full pipeline: user audio -> transcript -> GPT-2 reply -> spoken reply."""
    transcribed_text = transcribe(audio)
    response_text = generate_response(transcribed_text)
    speech_audio = synthesize_speech(response_text)
    return response_text, speech_audio
iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Response Text"),
        gr.Audio(label="Response Speech", type="numpy"),
    ],
    title="Voice Assistant with LLM",
    description="A voice assistant that uses ASR, LLM, and TTS to interact with users.",
)

iface.launch()