# Hugging Face Space: vitorcalvi/MMESA-ZeroGPU (status: Sleeping)
# File size: 1,947 bytes; commit efabbbd

import gradio as gr
from transformers import pipeline
import librosa
import numpy as np
import matplotlib.pyplot as plt

# Build the speech-recognition pipeline once, at import time, so that every
# call to analyze_audio reuses the same loaded Whisper checkpoint.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)

def analyze_audio(audio):
    """Transcribe an audio file and derive simple prosodic measurements.

    Args:
        audio: Path to the audio file to analyse, or ``None`` when no audio
            has been supplied.

    Returns:
        A 5-tuple ``(transcription, tempo, pace, pitch_variance,
        pitch_plot_path)``:

        * transcription: text produced by the module-level ``transcriber``.
        * tempo: ``librosa.beat.beat_track`` tempo estimate as a plain float.
        * pace: words in the transcription divided by the audio duration in
          seconds (a rough stand-in for syllables per second); ``0.0`` when
          the duration is zero.
        * pitch_variance: variance of the YIN fundamental-frequency track.
        * pitch_plot_path: path of the PNG file holding the pitch plot.

        When ``audio`` is ``None`` the result is
        ``("", None, None, None, None)``.
    """
    # The interface is live, so the callback may be invoked without any
    # audio (e.g. after the input is cleared). Return empty outputs instead
    # of failing inside the transcriber.
    if audio is None:
        return "", None, None, None, None

    # Convert audio to text using Whisper
    transcription = transcriber(audio)["text"]

    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(audio, sr=None)

    # Extract prosodic features. The sampling rate must be forwarded to yin:
    # it otherwise assumes its own default rate, which skews the fmin/fmax
    # search range and the returned frequencies for any other rate.
    pitch = librosa.yin(
        y,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
    )
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # beat_track may hand back the tempo as a NumPy array rather than a
    # scalar; normalise it to a plain float for the numeric output.
    tempo = float(np.atleast_1d(tempo)[0])

    # Calculate pitch variance
    pitch_variance = np.var(pitch)

    # Estimate speaking pace. This is a simplified estimation that counts
    # whitespace-separated words rather than true syllables.
    num_syllables = len(transcription.split())
    duration = librosa.get_duration(y=y, sr=sr)
    pace = num_syllables / duration if duration > 0 else 0.0

    # Plot pitch (x axis is the YIN frame index).
    pitch_plot_path = '/tmp/pitch_contour.png'
    fig = plt.figure(figsize=(10, 4))
    try:
        plt.plot(pitch, label='Pitch')
        plt.xlabel('Time')
        plt.ylabel('Frequency (Hz)')
        plt.title('Pitch Over Time')
        plt.legend()
        plt.savefig(pitch_plot_path)
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)

    return transcription, tempo, pace, pitch_variance, pitch_plot_path

# Gradio UI wiring: a single audio input configured with type="filepath",
# and five output widgets listed in the same order as the values returned
# by analyze_audio.
input_audio = gr.Audio(type="filepath", label="Input Audio")

_output_components = [
    gr.Textbox(label="Transcription"),
    gr.Number(label="Tempo (BPM)"),
    gr.Number(label="Speaking Pace (syllables/sec)"),
    gr.Number(label="Pitch Variance"),
    gr.Image(label="Pitch Contour Plot"),
]

iface = gr.Interface(
    fn=analyze_audio,
    inputs=input_audio,
    outputs=_output_components,
    live=True,
)

# Start the app without requesting a public share link.
iface.launch(share=False)