archit11 committed
Commit dc03737
1 Parent(s): 05dddc6

Update app.py

Files changed (1)
  1. app.py +23 -54
app.py CHANGED
@@ -1,72 +1,41 @@
 import transformers
 import gradio as gr
+import librosa
 import torch
-import numpy as np
-from typing import Dict, List, Tuple
 import spaces
-import librosa
-import soundfile as sf

-MODEL_NAME = 'sarvamai/shuka_v1'
-SAMPLE_RATE = 16000
-MAX_NEW_TOKENS = 256
+@spaces.GPU(duration=120)
+def transcribe_and_respond(audio_file):
+    try:
+        pipe = transformers.pipeline(
+            model='sarvamai/shuka_v1',
+            trust_remote_code=True,
+            device=0,
+            torch_dtype=torch.bfloat16
+        )

-def load_pipeline():
-    return transformers.pipeline(
-        model=MODEL_NAME,
-        trust_remote_code=True,
-        device=0,
-        torch_dtype=torch.bfloat16
-    )
+        audio, sr = librosa.load(audio_file, sr=16000)

-pipe = load_pipeline()
+        turns = [
+            {'role': 'system', 'content': 'Respond naturally and informatively.'},
+            {'role': 'user', 'content': ''}
+        ]

-def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
-    return [
-        {'role': 'system', 'content': 'Respond naturally and informatively.'},
-        {'role': 'user', 'content': prompt}
-    ]
+        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
+
+        return output

-@spaces.GPU(duration=120)
-def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
-    try:
-        # Unpack the audio input
-        sample_rate, audio = audio_input
-
-        # Ensure audio is float32
-        if audio.dtype != np.float32:
-            audio = audio.astype(np.float32)
-
-        if sample_rate != SAMPLE_RATE:
-            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
-
-        # Convert the audio to WAV format
-        wav_data = librosa.util.buf_to_float(audio, n_bytes=2)
-        sf.write('temp_audio.wav', wav_data, SAMPLE_RATE)
-
-        # Prepare the inputs for the model
-        turns = create_conversation_turns("")
-        inputs = {
-            'audio': wav_data,
-            'turns': turns,
-            'sampling_rate': SAMPLE_RATE
-        }
-
-        response = pipe(inputs, max_new_tokens=MAX_NEW_TOKENS)
-
-        return response
     except Exception as e:
-        return f"Error processing audio: {str(e)}"
+        return f"Error: {str(e)}"

 iface = gr.Interface(
     fn=transcribe_and_respond,
-    inputs=gr.Audio(sources="microphone", type="numpy"),
-    outputs="text",
-    title="Live Voice Input for Transcription and Response",
+    inputs=gr.Audio(source="microphone", type="filepath"),  # Accept audio input from microphone
+    outputs="text",  # Output as text
+    title="Live Transcription and Response",
     description="Speak into your microphone, and the model will respond naturally and informatively.",
-    live=True
+    live=True  # Enable live processing
 )

-# Launch the app
 if __name__ == "__main__":
     iface.launch()
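
For reference, the pre-commit version built the pipeline once at module level (`pipe = load_pipeline()`) and reused it, while the committed version reconstructs it inside `transcribe_and_respond`, so the model reloads on every call. Below is a minimal sketch of the cached-pipeline pattern applied to the new `audio_file` signature; the `_get_pipe` helper and its module-level cache are illustrative, not part of this commit:

import librosa
import torch
import transformers
import spaces

_pipe = None  # module-level cache (hypothetical helper, not part of this commit)

def _get_pipe():
    # Build the shuka_v1 pipeline on first use, then reuse it across calls.
    global _pipe
    if _pipe is None:
        _pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )
    return _pipe

@spaces.GPU(duration=120)
def transcribe_and_respond(audio_file):
    # Same body as the committed version, minus the per-call pipeline build.
    audio, sr = librosa.load(audio_file, sr=16000)
    turns = [
        {'role': 'system', 'content': 'Respond naturally and informatively.'},
        {'role': 'user', 'content': ''}
    ]
    return _get_pipe()({'audio': audio, 'turns': turns, 'sampling_rate': sr},
                       max_new_tokens=512)

One version note: `gr.Audio(source="microphone", ...)` is the Gradio 3.x signature; under Gradio 4.x the keyword is plural and takes a list, `gr.Audio(sources=["microphone"], type="filepath")`, closer to the `sources=` form the pre-commit code used.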