import gradio as gr
from transformers import pipeline
import librosa
import numpy as np
import matplotlib

matplotlib.use("Agg")  # headless backend so plotting works inside a server process
import matplotlib.pyplot as plt

# Load Whisper model using transformers pipeline
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")


def analyze_audio(audio):
    # Convert audio to text using Whisper
    transcription_result = transcriber(audio)
    transcription = transcription_result["text"]

    # Load audio file at its native sampling rate
    y, sr = librosa.load(audio, sr=None)

    # Extract prosodic features; pass sr so yin converts fmin/fmax correctly
    pitch = librosa.yin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo = float(np.atleast_1d(tempo)[0])  # newer librosa versions may return an array

    # Calculate pitch variance across frames
    pitch_variance = float(np.var(pitch))

    # Estimate speaking pace. This is a simplified estimate that uses the
    # word count as a stand-in for the syllable count.
    num_words = len(transcription.split())
    duration = librosa.get_duration(y=y, sr=sr)
    pace = num_words / duration if duration > 0 else 0.0

    # Plot pitch against time (frame times for yin's default 512-sample hop)
    times = librosa.times_like(pitch, sr=sr)
    plt.figure(figsize=(10, 4))
    plt.plot(times, pitch, label='Pitch')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.title('Pitch Over Time')
    plt.legend()
    pitch_plot_path = '/tmp/pitch_contour.png'
    plt.savefig(pitch_plot_path)
    plt.close()

    # Voice stress analysis (simplified example): standard deviation of the
    # pitch track as a crude stress indicator
    stress_level = float(np.std(pitch))

    return transcription, tempo, pace, pitch_variance, stress_level, pitch_plot_path


# Create Gradio interface
input_audio = gr.Audio(label="Input Audio", type="filepath")

iface = gr.Interface(
    fn=analyze_audio,
    inputs=input_audio,
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Number(label="Tempo (BPM)"),
        gr.Number(label="Speaking Pace (words/sec)"),
        gr.Number(label="Pitch Variance"),
        gr.Number(label="Stress Level (pitch std dev)"),
        gr.Image(label="Pitch Contour Plot"),
    ],
    live=True,
)

iface.launch(share=False)
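
# --- Optional: rougher syllable estimate -------------------------------------
# The pace metric above uses the word count as a syllable proxy. Below is a
# minimal sketch of a vowel-group heuristic (an assumption, not an exact
# linguistic count). If used, define it above analyze_audio and swap it in
# for len(transcription.split()).
import re


def estimate_syllables(text: str) -> int:
    """Approximate English syllable count by counting vowel groups per word."""
    count = 0
    for word in re.findall(r"[a-z]+", text.lower()):
        vowel_groups = re.findall(r"[aeiouy]+", word)
        count += max(1, len(vowel_groups))  # every word gets at least one syllable
    return count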