import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np

@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file):
    try:
        # Gradio may invoke this before any audio is recorded (live mode), so guard against None
        if audio_file is None:
            return ""

        # Build the Shuka v1 pipeline inside the GPU-decorated call so ZeroGPU can attach the device
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )

        # Load the audio file
        audio, sr = librosa.load(audio_file, sr=16000)

        # Print audio properties for debugging
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]

        # Debug: Print the initial turns
        print(f"Initial turns: {turns}")

        # Call the model with the audio and prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)

        # Debug: Print the final output from the model
        print(f"Model output: {output}")

        return output

    except Exception as e:
        return f"Error: {str(e)}"

iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True
)

if __name__ == "__main__":
    iface.launch()