from transformers import (
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)
import gradio as gr
import torch
import numpy as np
from datasets import load_dataset, Audio
from openai import OpenAI
import os

# Load the Glaswegian ASR model
asr_pipe = pipeline(model="divakaivan/glaswegian-asr")

# Load TTS components: text processor, fine-tuned Glaswegian voice, and vocoder
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the Glaswegian audio dataset (resampled to 16 kHz) for the speaker embedding
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))["train"]


def transcribe(audio):
    """Transcribe recorded audio with the Glaswegian ASR pipeline."""
    return asr_pipe(audio)["text"]


def generate_response(text, api_key):
    """Send the transcript to OpenAI and return the assistant's reply."""
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": text}],
    )
    return response.choices[0].message.content


def synthesize_speech(text):
    """Synthesize the reply in the Glaswegian voice; return (sample_rate, pcm)."""
    inputs = processor(text=text, return_tensors="pt")
    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
    spectrogram = tts_model.generate_speech(
        inputs["input_ids"], torch.tensor([speaker_embeddings])
    )
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Convert the float waveform in [-1, 1] to 16-bit PCM for Gradio
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)


def create_speaker_embedding(waveform):
    """Compute a normalized x-vector speaker embedding from a 16 kHz waveform."""
    # Imported lazily: speechbrain is a heavy dependency only needed here
    from speechbrain.inference.speaker import EncoderClassifier

    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    speaker_model = EncoderClassifier.from_hparams(
        source=spk_model_name,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", spk_model_name),
    )
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def voice_assistant(audio, api_key):
    """Full pipeline: speech in, transcript, LLM reply, Glaswegian speech out."""
    transcribed_text = transcribe(audio)
    response_text = generate_response(transcribed_text, api_key)
    return synthesize_speech(response_text)


iface = gr.Interface(
    fn=voice_assistant,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="OpenAI API Key", type="password"),
    ],
    outputs=gr.Audio(label="Response Speech", type="numpy"),
    title="Your Glaswegian Assistant",
)

iface.launch()
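
# Optional offline fallback: an earlier draft of this script loaded GPT-2
# ("for generating responses") but never called it; the OpenAI API is used
# instead. Below is a minimal sketch of a local replacement, assuming you
# want to run without an OpenAI key. generate_response_local is a
# hypothetical helper, not part of the original app, and its replies will
# be far weaker than gpt-4o-mini's; define it above voice_assistant and
# swap it in there.
from transformers import GPT2LMHeadModel, GPT2Tokenizer


def generate_response_local(text, max_new_tokens=60):
    """Generate a reply locally with GPT-2 (sketch; no API key required)."""
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    input_ids = tokenizer.encode(text, return_tensors="pt")
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
    )
    # Return only the newly generated continuation, not the echoed prompt
    return tokenizer.decode(
        output_ids[0][input_ids.shape[1]:], skip_special_tokens=True
    )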