archit11 committed
Commit
2ba8923
1 Parent(s): 011a958

Update app.py

Files changed (1)
  1. app.py +22 -51
app.py CHANGED
@@ -1,70 +1,41 @@
  import transformers
-
  import gradio as gr
+ import librosa
  import torch
- import numpy as np
- from typing import Dict, List
  import spaces

- # Constants
- MODEL_NAME = 'sarvamai/shuka_v1'
- SAMPLE_RATE = 16000
- MAX_NEW_TOKENS = 256
-
- # Load the ShukaPipeline
- def load_pipeline():
-     model = transformers.AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
-     pipeline = transformers.pipeline(
-         "shuka-pipeline",
-         model=model,
-         torch_dtype=torch.float16,
-         device=0 if torch.cuda.is_available() else -1,
-     )
-     return pipeline
-
- pipe = load_pipeline()
-
- def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
-     return [
-         {'role': 'system', 'content': 'Respond naturally and informatively.'},
-         {'role': 'user', 'content': prompt}
-     ]
-
  @spaces.GPU(duration=120)
- def transcribe_and_respond(audio: np.ndarray) -> str:
+ def transcribe_and_respond(audio_file):
      try:
-         # Ensure audio is float32
-         if audio.dtype != np.float32:
-             audio = audio.astype(np.float32)
+         pipe = transformers.pipeline(
+             model='sarvamai/shuka_v1',
+             trust_remote_code=True,
+             device=0,
+             torch_dtype=torch.bfloat16
+         )
+
+         audio, sr = librosa.load(audio_file, sr=16000)
+
+         turns = [
+             {'role': 'system', 'content': 'Respond naturally and informatively.'},
+             {'role': 'user', 'content': '<|audio|>'}  # audio placeholder the pipeline fills in
+         ]

+         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)

+         return output

-
-         # Create input for the pipeline
-         turns = create_conversation_turns("<|audio|>")
-         inputs = {
-             'audio': audio,
-             'turns': turns,
-             'sampling_rate': SAMPLE_RATE
-         }
-
-         # Generate response
-         response = pipe(inputs, max_new_tokens=MAX_NEW_TOKENS, temperature=0.7, repetition_penalty=1.1)
-
-         return response
      except Exception as e:
-         return f"Error processing audio: {str(e)}"

- # Create the Gradio interface
+         return f"Error: {str(e)}"

  iface = gr.Interface(
      fn=transcribe_and_respond,
-     inputs=gr.Audio(sources="microphone", type="numpy", sampling_rate=SAMPLE_RATE),
-     outputs="text",
-     title="Live Voice Input for Transcription and Response",
+     inputs=gr.Audio(sources="microphone", type="filepath"),  # Accept audio input from microphone
+     outputs="text",  # Output as text
+     title="Live Transcription and Response",
      description="Speak into your microphone, and the model will respond naturally and informatively.",
-     live=True
+     live=True  # Enable live processing
  )

- # Launch the app
  if __name__ == "__main__":
      iface.launch()
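
For reference, the commit's new inference path can be exercised outside Gradio and outside a Space. A minimal standalone sketch, assuming a CUDA device and an illustrative local file sample.wav (the @spaces.GPU decorator is omitted because it only applies inside a Space, where it allocates a GPU for the duration of the call):

# Standalone sketch of the new inference path; 'sample.wav' is a placeholder path.
import librosa
import torch
import transformers

pipe = transformers.pipeline(
    model='sarvamai/shuka_v1',
    trust_remote_code=True,
    device=0,               # assumes a CUDA device; use device=-1 for CPU
    torch_dtype=torch.bfloat16,
)

# The model expects 16 kHz mono audio; librosa resamples on load.
audio, sr = librosa.load('sample.wav', sr=16000)

turns = [
    {'role': 'system', 'content': 'Respond naturally and informatively.'},
    {'role': 'user', 'content': '<|audio|>'},  # placeholder the pipeline fills with audio features
]

print(pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512))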