archit11 committed on
Commit
c621812
1 Parent(s): ab07d9e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -29
app.py CHANGED
@@ -1,47 +1,66 @@
1
  import transformers
2
- import librosa
3
  import gradio as gr
 
 
 
4
  import spaces
5
 
6
- # Load the model pipeline on GPU:0
7
- pipe = transformers.pipeline(
8
- model='sarvamai/shuka_v1',
9
- trust_remote_code=True,
10
- device=0,
11
- torch_dtype='bfloat16'
12
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  @spaces.GPU(duration=120)
15
- def transcribe_and_respond(audio_file):
16
  try:
17
- # Check if the audio file is valid and exists
18
- if audio_file is None or not isinstance(audio_file, str):
19
- raise ValueError("Invalid audio file input.")
20
-
21
- # Load the audio using librosa
22
- audio, sr = librosa.load(audio_file, sr=16000)
23
 
24
- # Prepare the conversation turns
25
- turns = [
26
- {'role': 'system', 'content': 'Respond naturally and informatively.'},
27
- {'role': 'user', 'content': ''}
28
- ]
 
 
29
 
30
- # Run inference with the pipeline
31
- response = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
32
 
33
  return response
34
  except Exception as e:
35
  return f"Error processing audio: {str(e)}"
36
 
37
- # Create the Gradio interface with microphone input
38
  iface = gr.Interface(
39
  fn=transcribe_and_respond,
40
- inputs=gr.Audio(sources="microphone", type="filepath"), # Use the microphone for audio input
41
- outputs="text", # The output will be a text response
42
- title="Voice Input for Transcription and Response",
43
- description="Record your voice, and the model will respond naturally and informatively."
 
44
  )
45
 
46
- # Launch the Gradio app
47
- iface.launch()
 
 
1
  import transformers
 
2
  import gradio as gr
3
+ import torch
4
+ import numpy as np
5
+ from typing import Dict, List
6
  import spaces
7
 
8
# Constants
MODEL_NAME = 'sarvamai/shuka_v1'  # Hugging Face repo id of the Shuka model
SAMPLE_RATE = 16000               # sampling rate (Hz) the app reports to the pipeline
MAX_NEW_TOKENS = 256              # generation cap per response


def load_pipeline():
    """Build the Shuka pipeline that turns recorded speech into a text reply.

    The repo id is passed straight to ``transformers.pipeline`` together with
    ``trust_remote_code=True`` so that Transformers resolves the pipeline
    implementation shipped with the model repository. Passing an
    already-instantiated model plus a task string such as "shuka-pipeline"
    does not work: that string is not a registered Transformers task, and
    the repository's custom pipeline mapping is only read when the model is
    given by name.

    Returns:
        The pipeline object, placed on GPU 0 when CUDA is available and on
        the CPU (device -1) otherwise.
    """
    use_gpu = torch.cuda.is_available()
    return transformers.pipeline(
        model=MODEL_NAME,
        trust_remote_code=True,
        device=0 if use_gpu else -1,
        # Half precision is only a safe default on the GPU; many CPU kernels
        # lack float16 support, so fall back to float32 there.
        torch_dtype=torch.float16 if use_gpu else torch.float32,
    )


# Loaded once at import time so every request reuses the same pipeline.
pipe = load_pipeline()
25
+
26
def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
    """Return the chat history sent to the model for a single request.

    Args:
        prompt: Text placed in the user turn.

    Returns:
        A two-element list: a fixed system instruction followed by a user
        turn whose content is ``prompt``.
    """
    system_turn = {'role': 'system', 'content': 'Respond naturally and informatively.'}
    user_turn = {'role': 'user', 'content': prompt}
    return [system_turn, user_turn]
31
 
32
def _prepare_audio(audio, target_rate):
    """Convert Gradio audio input to a mono float32 waveform at ``target_rate``.

    Args:
        audio: Either the ``(sample_rate, data)`` tuple that ``gr.Audio``
            produces with ``type="numpy"``, or a bare sample array that is
            assumed to already be at ``target_rate``.
        target_rate: Desired sampling rate in Hz.

    Returns:
        A 1-D ``np.float32`` array.
    """
    if isinstance(audio, tuple):
        source_rate, samples = audio
    else:
        source_rate, samples = target_rate, audio
    samples = np.asarray(samples)

    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM (Gradio delivers int16) must be scaled into [-1, 1);
        # a plain cast would leave values in the +/-32768 range.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32, copy=False)

    if samples.ndim > 1:
        # Multi-channel recordings arrive as (samples, channels); mix to mono.
        samples = samples.mean(axis=1)

    if source_rate != target_rate and samples.size > 1:
        # Linear-interpolation resampling: dependency-free and adequate for
        # speech, though it applies no anti-aliasing filter.
        target_len = max(1, int(round(samples.size * target_rate / source_rate)))
        source_times = np.arange(samples.size) / source_rate
        target_times = np.arange(target_len) / target_rate
        samples = np.interp(target_times, source_times, samples).astype(np.float32)

    return samples


@spaces.GPU(duration=120)
def transcribe_and_respond(audio) -> str:
    """Run the Shuka pipeline on a recording and return its reply.

    Args:
        audio: The value supplied by the ``gr.Audio`` input: a
            ``(sample_rate, data)`` tuple, a bare sample array (treated as
            already being at ``SAMPLE_RATE``), or ``None`` when the input
            has been cleared.

    Returns:
        The pipeline's response, an empty string when there is no audio to
        process, or an "Error processing audio: ..." message if anything
        fails (errors are reported in the UI rather than raised).
    """
    if audio is None:
        # A live interface also fires when the recording is cleared.
        return ""

    try:
        samples = _prepare_audio(audio, SAMPLE_RATE)
        if samples.size == 0:
            return ""

        # Create input for the pipeline
        inputs = {
            'audio': samples,
            'turns': create_conversation_turns("<|audio|>"),
            'sampling_rate': SAMPLE_RATE,
        }

        # Generate response
        response = pipe(
            inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.7,
            repetition_penalty=1.1,
        )
        return response
    except Exception as e:
        return f"Error processing audio: {str(e)}"
53
 
54
# Create the Gradio interface
iface = gr.Interface(
    fn=transcribe_and_respond,
    # gr.Audio takes no `sampling_rate` keyword; with type="numpy" the
    # callback receives a (sample_rate, data) tuple carrying the recording's
    # own rate. `sources` is documented as a list of source names.
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Live Voice Input for Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()