divakaivan commited on
Commit
9602bc7
1 Parent(s): 5233ff4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -8
app.py CHANGED
@@ -5,6 +5,7 @@ import numpy as np
5
  from datasets import load_dataset, Audio
6
  from transformers import pipeline
7
  import librosa
 
8
 
9
  # Load ASR model
10
  asr_pipe = pipeline(model="divakaivan/glaswegian-asr")
@@ -27,11 +28,14 @@ def transcribe(audio):
27
  text = asr_pipe(audio)["text"]
28
  return text
29
 
30
- def generate_response(text):
31
- input_ids = gpt_tokenizer.encode(text, return_tensors='pt')
32
- response_ids = gpt_model.generate(input_ids, max_length=100, num_return_sequences=1)
33
- response_text = gpt_tokenizer.decode(response_ids[0], skip_special_tokens=True)
34
- return response_text
 
 
 
35
 
36
  def synthesize_speech(text):
37
  inputs = processor(text=text, return_tensors="pt")
@@ -61,15 +65,18 @@ def create_speaker_embedding(waveform):
61
  speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
62
  return speaker_embeddings
63
 
64
- def voice_assistant(audio):
65
  transcribed_text = transcribe(audio)
66
- response_text = generate_response(transcribed_text)
67
  speech_audio = synthesize_speech(response_text)
68
  return speech_audio
69
 
70
  iface = gr.Interface(
71
  fn=voice_assistant,
72
- inputs=gr.Audio(type="filepath"),
 
 
 
73
  outputs=gr.Audio(label="Response Speech", type="numpy"),
74
  title="Your Glaswegian Assistant"
75
  )
 
5
  from datasets import load_dataset, Audio
6
  from transformers import pipeline
7
  import librosa
8
+ from openai import OpenAI
9
 
10
  # Load ASR model
11
  asr_pipe = pipeline(model="divakaivan/glaswegian-asr")
 
28
  text = asr_pipe(audio)["text"]
29
  return text
30
 
31
def generate_response(text, api_key, model='gpt-4o-mini'):
    """Send *text* to the OpenAI chat-completions API and return the reply.

    Parameters
    ----------
    text : str
        The user message (here: the ASR transcription) sent as a
        single-turn prompt.
    api_key : str
        OpenAI API key supplied by the user at request time; a fresh
        client is built per call, so no key is kept in module state.
    model : str, optional
        Chat model name. Defaults to 'gpt-4o-mini', matching the
        previous hard-coded value, so existing callers are unaffected.

    Returns
    -------
    str
        The assistant message content of the first (and only) choice.

    Raises
    ------
    Exception
        Any error from the OpenAI client (invalid key, network failure,
        rate limit) propagates unchanged to the caller.
    """
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": text}],
    )
    return response.choices[0].message.content
39
 
40
  def synthesize_speech(text):
41
  inputs = processor(text=text, return_tensors="pt")
 
65
  speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
66
  return speaker_embeddings
67
 
68
def voice_assistant(audio, api_key):
    """Run the full pipeline: speech in -> text -> LLM reply -> speech out.

    ``audio`` is the recorded input from the Gradio audio widget (a
    filepath) and ``api_key`` is the user's OpenAI key, forwarded to the
    chat-completion step. Returns the synthesized reply audio for the
    output widget.
    """
    # Chain the three stages explicitly, one hand-off per line.
    spoken_words = transcribe(audio)
    assistant_reply = generate_response(spoken_words, api_key)
    return synthesize_speech(assistant_reply)
73
 
74
# Wire the pipeline into a Gradio UI: audio plus the user's OpenAI API
# key go in; the synthesized reply audio comes out.
assistant_inputs = [
    gr.Audio(type="filepath"),
    gr.Textbox(label="OpenAI API Key", type="password"),
]
iface = gr.Interface(
    fn=voice_assistant,
    inputs=assistant_inputs,
    outputs=gr.Audio(label="Response Speech", type="numpy"),
    title="Your Glaswegian Assistant",
)