archit11 committed on
Commit 60f64df
1 Parent(s): c591299

Update app.py

Files changed (1)
  1. app.py +18 -40
app.py CHANGED
@@ -5,11 +5,8 @@ import torch
 import spaces
 import numpy as np
 
-# Initialize the conversation history globally
-conversation_history = []
-
-@spaces.GPU(duration=120)
-def transcribe_and_respond(audio_file, chat_history):
+@spaces.GPU(duration=20)
+def transcribe_and_respond(audio_file):
     try:
         pipe = transformers.pipeline(
             model='sarvamai/shuka_v1',
@@ -21,54 +18,35 @@ def transcribe_and_respond(audio_file, chat_history):
         # Load the audio file
         audio, sr = librosa.load(audio_file, sr=16000)
 
-        # Debug: Print audio properties for debugging
+        # Print audio properties for debugging
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
 
-        # Prepare conversation turns
-        turns = chat_history.copy()  # Take the existing chat history and append user input
-        turns.append({'role': 'user', 'content': '<|audio|>'})
+        turns = [
+            {'role': 'system', 'content': 'Respond naturally and informatively.'},
+            {'role': 'user', 'content': '<|audio|>'}
+        ]
 
-        # Debug: Print the updated turns for debugging purposes
-        print(f"Updated turns: {turns}")
+        # Debug: Print the initial turns
+        print(f"Initial turns: {turns}")
 
-        # Call the model with the updated conversation turns and audio
+        # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
 
-        # Append the model's response to the conversation history
-        turns.append({'role': 'system', 'content': output})
-
-        # Debug: Print the model's response
+        # Debug: Print the final output from the model
         print(f"Model output: {output}")
 
-        # Format the chat history for Gradio's Chatbot
-        chat_history_for_display = []
-        for turn in turns:
-            if turn['role'] == 'user':
-                chat_history_for_display.append(("User", "🗣️ (Spoken Audio)"))
-            else:
-                chat_history_for_display.append(("AI", turn['content']))
-
-        return chat_history_for_display, turns  # Return the formatted chat history for display and the updated history
+        return output
 
     except Exception as e:
-        return f"Error: {str(e)}", chat_history  # Ensure history is returned even on error
+        return f"Error: {str(e)}"
 
-# Define the Gradio interface
 iface = gr.Interface(
     fn=transcribe_and_respond,
-    inputs=[
-        gr.Audio(sources="microphone", type="filepath", label="Your Audio (Microphone)"),
-        gr.State([])  # Hidden state to maintain conversation history
-    ],
-    outputs=[
-        gr.Chatbot(label="Conversation History"),  # Display the conversation
-        gr.State([])  # Hidden state to keep track of the updated conversation history
-    ],
-    title="Shuka demo",
-    description="shuka live demo",
-    live=True,  # Enable live mode for real-time interaction
-    allow_flagging="auto",
-    # enable_queue=True
+    inputs=gr.Audio(sources="microphone", type="filepath"),
+    outputs="text",
+    title="Live Transcription and Response",
+    description="Speak into your microphone, and the model will respond naturally and informatively.",
+    live=True
 )
 
 if __name__ == "__main__":
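For reference, a minimal sketch of app.py as it stands after this commit, assembled from the added and context lines above. The import block, the pipeline keyword arguments hidden between the two hunks, and the final iface.launch() call are assumptions (they fall outside the diff), so treat them as placeholders rather than the file's exact contents.

import gradio as gr        # assumed import, not visible in the diff hunks
import librosa             # assumed import, not visible in the diff hunks
import transformers        # assumed import, not visible in the diff hunks
import torch
import spaces
import numpy as np

@spaces.GPU(duration=20)
def transcribe_and_respond(audio_file):
    try:
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,  # assumed; the remaining pipeline arguments are not shown in the diff
        )

        # Load the audio file
        audio, sr = librosa.load(audio_file, sr=16000)

        # Print audio properties for debugging
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        # Single-turn prompt: a fixed system message plus the audio placeholder
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]

        # Debug: Print the initial turns
        print(f"Initial turns: {turns}")

        # Call the model with the audio and prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)

        # Debug: Print the final output from the model
        print(f"Model output: {output}")

        return output

    except Exception as e:
        return f"Error: {str(e)}"

iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True
)

if __name__ == "__main__":
    iface.launch()  # assumed launch call; the line under this guard is not shown in the diff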