import gradio as gr
import numpy as np
import torch
from datasets import Audio, load_dataset
from speechbrain.pretrained import EncoderClassifier
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)

# Load ASR model
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")

# Load GPT-2 model for generating responses
model_name = "gpt2"
gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt_model = GPT2LMHeadModel.from_pretrained(model_name)

# Load TTS components
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load speaker encoder for x-vector speaker embeddings
# (assumption: SpeechBrain's spkrec-xvect-voxceleb, the encoder SpeechT5's
# speaker embeddings are derived from; the original snippet used
# `speaker_model` without defining it)
speaker_model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb")

# Load dataset for speaker embedding
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))["train"]


def transcribe(audio):
    """Run the Glaswegian ASR pipeline on the recorded audio and return the transcript."""
    text = asr_pipe(audio)["text"]
    return text


def generate_response(text):
    """Generate a text reply to the transcript with GPT-2."""
    input_ids = gpt_tokenizer.encode(text, return_tensors="pt")
    response_ids = gpt_model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = gpt_tokenizer.decode(response_ids[0], skip_special_tokens=True)
    return response_text


def synthesize_speech(text):
    """Synthesise the reply with the Glaswegian SpeechT5 model and the HiFi-GAN vocoder."""
    inputs = processor(text=text, return_tensors="pt")
    # Use the first clip in the dataset as the reference voice
    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
    spectrogram = tts_model.generate_speech(
        inputs["input_ids"], torch.tensor([speaker_embeddings])
    )
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Convert the float waveform to 16-bit PCM for Gradio's numpy audio output
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)


def create_speaker_embedding(waveform):
    """Encode a 16 kHz waveform into a normalised x-vector speaker embedding."""
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def voice_assistant(audio):
    """Full pipeline: ASR -> LLM -> TTS."""
    transcribed_text = transcribe(audio)
    response_text = generate_response(transcribed_text)
    speech_audio = synthesize_speech(response_text)
    return response_text, speech_audio


iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Response Text"),
        gr.Audio(label="Response Speech", type="numpy"),
    ],
    title="Voice Assistant with LLM",
    description="A voice assistant that uses ASR, LLM, and TTS to interact with users.",
)

iface.launch()