Spaces:

archit11
/

shuka_demo

Running on Zero

App Files Community

archit11 committed on Aug 14

Commit

c621812

•

1 Parent(s): ab07d9e

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -29

app.py CHANGED Viewed

@@ -1,47 +1,66 @@
 import transformers
-import librosa
 import gradio as gr
 import spaces
-# Load the model pipeline on GPU:0
-pipe = transformers.pipeline(
-    model='sarvamai/shuka_v1',
-    trust_remote_code=True,
-    device=0,
-    torch_dtype='bfloat16'
-)
 @spaces.GPU(duration=120)
-def transcribe_and_respond(audio_file):
     try:
-        # Check if the audio file is valid and exists
-        if audio_file is None or not isinstance(audio_file, str):
-            raise ValueError("Invalid audio file input.")
-        # Load the audio using librosa
-        audio, sr = librosa.load(audio_file, sr=16000)
-        # Prepare the conversation turns
-        turns = [
-            {'role': 'system', 'content': 'Respond naturally and informatively.'},
-            {'role': 'user', 'content': ''}
-        ]
-        # Run inference with the pipeline
-        response = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
         return response
     except Exception as e:
         return f"Error processing audio: {str(e)}"
-# Create the Gradio interface with microphone input
 iface = gr.Interface(
     fn=transcribe_and_respond,
-    inputs=gr.Audio(sources="microphone", type="filepath"),  # Use the microphone for audio input
-    outputs="text",  # The output will be a text response
-    title="Voice Input for Transcription and Response",
-    description="Record your voice, and the model will respond naturally and informatively."
 )
-# Launch the Gradio app
-iface.launch()

 import transformers
 import gradio as gr
+import torch
+import numpy as np
+from typing import Dict, List
 import spaces
+# Constants
+MODEL_NAME = 'sarvamai/shuka_v1'
+SAMPLE_RATE = 16000
+MAX_NEW_TOKENS = 256
+# Load the ShukaPipeline
+def load_pipeline():
+    model = transformers.AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    pipeline = transformers.pipeline(
+        "shuka-pipeline",
+        model=model,
+        torch_dtype=torch.float16,
+        device=0 if torch.cuda.is_available() else -1,
+    )
+    return pipeline
+pipe = load_pipeline()
+def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
+    return [
+        {'role': 'system', 'content': 'Respond naturally and informatively.'},
+        {'role': 'user', 'content': prompt}
+    ]
 @spaces.GPU(duration=120)
+def transcribe_and_respond(audio: np.ndarray) -> str:
     try:
+        # Ensure audio is float32
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+        # Create input for the pipeline
+        turns = create_conversation_turns("<|audio|>")
+        inputs = {
+            'audio': audio,
+            'turns': turns,
+            'sampling_rate': SAMPLE_RATE
+        }
+        # Generate response
+        response = pipe(inputs, max_new_tokens=MAX_NEW_TOKENS, temperature=0.7, repetition_penalty=1.1)
         return response
     except Exception as e:
         return f"Error processing audio: {str(e)}"
+# Create the Gradio interface
 iface = gr.Interface(
     fn=transcribe_and_respond,
+    inputs=gr.Audio(sources="microphone", type="numpy", sampling_rate=SAMPLE_RATE),
+    outputs="text",
+    title="Live Voice Input for Transcription and Response",
+    description="Speak into your microphone, and the model will respond naturally and informatively.",
+    live=True
 )
+# Launch the app
+if __name__ == "__main__":
+    iface.launch()