divakaivan committed
Commit
69f3d39
1 Parent(s): 7fdce89

Create app.py

Files changed (1)
  1. app.py +86 -0
app.py ADDED
@@ -0,0 +1,86 @@
+from transformers import (
+    pipeline,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    SpeechT5Processor,
+    SpeechT5ForTextToSpeech,
+    SpeechT5HifiGan,
+)
+import gradio as gr
+import torch
+import numpy as np
+from datasets import load_dataset, Audio
+from speechbrain.pretrained import EncoderClassifier
+
+# Load ASR model
+asr_pipe = pipeline(model="divakaivan/glaswegian-asr")
+
+# Load GPT-2 model for generating responses
+model_name = "gpt2"
+gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+gpt_model = GPT2LMHeadModel.from_pretrained(model_name)
+
+# Load TTS components
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+tts_model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+# Speaker encoder for the x-vector embedding SpeechT5 expects; the original
+# code used speaker_model without defining it, so the standard x-vector
+# encoder from the SpeechT5 fine-tuning recipe is assumed here
+speaker_model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb")
+
+# Load dataset for speaker embedding
+dataset = load_dataset("divakaivan/glaswegian_audio")
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))["train"]
+
+def transcribe(audio):
+    # Transcribe the recorded audio file with the Glaswegian ASR pipeline
+    text = asr_pipe(audio)["text"]
+    return text
+
+def generate_response(text):
+    input_ids = gpt_tokenizer.encode(text, return_tensors="pt")
+    # GPT-2 has no pad token, so reuse EOS to avoid a warning during generation
+    response_ids = gpt_model.generate(
+        input_ids,
+        max_length=100,
+        num_return_sequences=1,
+        pad_token_id=gpt_tokenizer.eos_token_id,
+    )
+    response_text = gpt_tokenizer.decode(response_ids[0], skip_special_tokens=True)
+    return response_text
+
+def synthesize_speech(text):
+    inputs = processor(text=text, return_tensors="pt")
+    # The speaker embedding is recomputed on every request; it could be cached at startup
+    speaker_embeddings = create_speaker_embedding(dataset[0]["audio"]["array"])
+    spectrogram = tts_model.generate_speech(inputs["input_ids"], torch.tensor([speaker_embeddings]))
+    with torch.no_grad():
+        speech = vocoder(spectrogram)
+    # Convert the float waveform in [-1, 1] to 16-bit PCM for Gradio's numpy audio output
+    speech = (speech.numpy() * 32767).astype(np.int16)
+    return (16000, speech)
+
+def create_speaker_embedding(waveform):
+    with torch.no_grad():
+        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
+        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
+    return speaker_embeddings
+
+def voice_assistant(audio):
+    transcribed_text = transcribe(audio)
+    response_text = generate_response(transcribed_text)
+    speech_audio = synthesize_speech(response_text)
+    return response_text, speech_audio
+
+iface = gr.Interface(
+    fn=voice_assistant,
+    inputs=gr.Audio(type="filepath"),
+    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Speech", type="numpy")],
+    title="Voice Assistant with LLM",
+    description="A voice assistant that uses ASR, LLM, and TTS to interact with users.",
+)
+
+iface.launch()
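
For the Space to build, the imports above imply a requirements file along these lines. This is a minimal sketch: package names are assumed from the imports (sentencepiece and soundfile are assumed for the SpeechT5 tokenizer and for audio decoding in datasets), versions are left unpinned, and gradio itself comes from the Space's SDK setting.

requirements.txt:

transformers
torch
numpy
datasets
speechbrain
sentencepiece
soundfile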