import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
import pandas as pd
from scipy.stats import kurtosis, skew

warnings.filterwarnings('ignore')


# Model handles, populated by load_models()
processor = None
whisper_model = None
emotion_tokenizer = None
emotion_model = None


def load_models():
    """Initialize and load all required models."""
    global processor, whisper_model, emotion_tokenizer, emotion_model

    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

        whisper_model.to("cpu")
        emotion_model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False


def extract_voice_features(waveform, sr):
    """Extract comprehensive voice features for health analysis."""
    features = {}

    try:
        # Fundamental frequency (F0) contour via probabilistic YIN
        f0, voiced_flag, _ = librosa.pyin(waveform,
                                          fmin=librosa.note_to_hz('C2'),
                                          fmax=librosa.note_to_hz('C7'))
        f0_valid = f0[voiced_flag]
        features['f0_mean'] = np.mean(f0_valid) if len(f0_valid) else 0.0
        features['f0_std'] = np.std(f0_valid) if len(f0_valid) else 0.0
        features['f0_range'] = np.ptp(f0_valid) if len(f0_valid) else 0.0

        # Jitter: mean absolute frame-to-frame change in F0
        # (defaults keep downstream plots and summaries from raising KeyError
        # when the recording has little or no voiced speech)
        if len(f0_valid) > 1 and features['f0_mean'] > 0:
            f0_diff = np.diff(f0_valid)
            features['jitter'] = np.mean(np.abs(f0_diff))
            features['jitter_percent'] = (features['jitter'] / features['f0_mean']) * 100
        else:
            features['jitter'] = 0.0
            features['jitter_percent'] = 0.0

        # Shimmer proxy: variability of the STFT magnitude envelope
        amplitude_envelope = np.abs(librosa.stft(waveform))
        features['shimmer'] = np.mean(np.std(amplitude_envelope, axis=1))

        # Spectral features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
        features['spectral_centroid_std'] = np.std(spectral_centroids)

        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        # MFCCs
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        features['mfcc_means'] = np.mean(mfccs, axis=1)
        features['mfcc_stds'] = np.std(mfccs, axis=1)

        # Rough speech-rate estimate from the beat tracker (tempo in BPM);
        # newer librosa versions return an array, so coerce to a scalar
        tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
        features['speech_rate'] = float(np.atleast_1d(tempo)[0])

        # Energy statistics
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_kurtosis'] = kurtosis(rms)
        features['energy_skewness'] = skew(rms)

        # Pause analysis: librosa.effects.split returns *non-silent* intervals,
        # so pauses are the gaps between consecutive intervals
        non_silent_regions = librosa.effects.split(waveform, top_db=20)
        pause_lengths = [non_silent_regions[i + 1][0] - non_silent_regions[i][1]
                         for i in range(len(non_silent_regions) - 1)]
        features['pause_count'] = len(pause_lengths)
        features['average_pause_duration'] = (np.mean(pause_lengths) / sr) if pause_lengths else 0.0

        return features, True
    except Exception as e:
        print(f"Error extracting voice features: {str(e)}")
        return {}, False


def create_voice_analysis_plots(features):
    """Create comprehensive visualization of voice analysis."""
    try:
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Fundamental Frequency Analysis',
                'Voice Quality Measures',
                'Energy and Rhythm Analysis',
                'MFCC Analysis'
            )
        )

        f0_metrics = {
            'Mean F0': features['f0_mean'],
            'F0 Std Dev': features['f0_std'],
            'F0 Range': features['f0_range'],
            'Jitter %': features['jitter_percent']
        }
        fig.add_trace(
            go.Bar(
                x=list(f0_metrics.keys()),
                y=list(f0_metrics.values()),
                name='F0 Metrics'
            ),
            row=1, col=1
        )

        quality_metrics = {
            'Shimmer': features['shimmer'],
            'Spectral Centroid': features['spectral_centroid_mean'] / 1000,
            'Spectral Rolloff': features['spectral_rolloff_mean'] / 1000
        }
        fig.add_trace(
            go.Bar(
                x=list(quality_metrics.keys()),
                y=list(quality_metrics.values()),
                name='Voice Quality'
            ),
            row=1, col=2
        )

        energy_metrics = {
            'Energy Mean': features['energy_mean'],
            'Energy Std': features['energy_std'],
            'Speech Rate': features['speech_rate'] / 10,
            'Pause Count': features['pause_count']
        }
        fig.add_trace(
            go.Bar(
                x=list(energy_metrics.keys()),
                y=list(energy_metrics.values()),
                name='Energy & Rhythm'
            ),
            row=2, col=1
        )

        fig.add_trace(
            go.Scatter(
                y=features['mfcc_means'],
                mode='lines+markers',
                name='MFCC Coefficients'
            ),
            row=2, col=2
        )

        fig.update_layout(
            height=800,
            showlegend=False,
            title_text="Comprehensive Voice Analysis",
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error creating voice analysis plots: {str(e)}")
        return "Error creating visualizations"


def analyze_audio(audio_input):
    """Main function to analyze audio input."""
    try:
        if audio_input is None:
            print("No audio input provided")
            return "No audio file provided", "Please provide an audio file", ""

        print(f"Received audio input: {audio_input}")

        # With type="filepath" Gradio passes a path string; handle tuple input defensively
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]
        else:
            audio_path = audio_input

        # Load audio at its native sampling rate
        waveform, sr = librosa.load(audio_path, sr=None)

        # Extract acoustic features
        voice_features, success = extract_voice_features(waveform, sr)
        if not success:
            return "Error extracting voice features", "Analysis failed", ""

        # Build the voice biomarker visualization
        voice_analysis_html = create_voice_analysis_plots(voice_features)

        # Transcribe with Whisper (expects 16 kHz input)
        print("Transcribing audio...")
        waveform_16k = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
        inputs = processor(waveform_16k, sampling_rate=16000, return_tensors="pt").input_features

        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Classify emotions in the transcription
        print("Analyzing emotions...")
        inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        with torch.no_grad():
            outputs = emotion_model(**inputs)
            emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Use the model's own label mapping: this checkpoint has seven classes
        # (including 'disgust'), so a hard-coded six-label list would misalign scores
        emotion_labels = [emotion_model.config.id2label[i] for i in range(emotions.shape[-1])]
        emotion_scores = {
            label: float(score)
            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
        }

        # Render the emotion scores
        emotion_viz = create_emotion_plot(emotion_scores)

        # Human-readable summary
        summary = f"""Voice Analysis Summary:

Speech Characteristics:
- Fundamental Frequency (Pitch): {voice_features['f0_mean']:.2f} Hz (average)
- Jitter: {voice_features['jitter_percent']:.2f}% (voice stability)
- Speech Rate: {voice_features['speech_rate']:.2f} BPM (beat-tracker estimate)
- Number of Pauses: {voice_features['pause_count']}
- Average Pause Duration: {voice_features['average_pause_duration']:.2f} seconds

Voice Quality Indicators:
- Shimmer: {voice_features['shimmer']:.4f} (amplitude variation)
- Energy Distribution: {voice_features['energy_skewness']:.2f} (skewness)
- Spectral Centroid: {voice_features['spectral_centroid_mean']:.2f} Hz

Emotional Content:
- Primary Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
- Emotional Variability: {np.std(list(emotion_scores.values())):.2f}

Speech Content:
{transcription}
"""

        return summary, emotion_viz, voice_analysis_html

    except Exception as e:
        error_msg = f"Error analyzing audio: {str(e)}"
        print(error_msg)
        return error_msg, "Error in analysis", ""


print("Initializing application...")
if not load_models():
    raise RuntimeError("Failed to load required models")


demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Analysis Summary", lines=10),
        gr.HTML(label="Emotional Analysis"),
        gr.HTML(label="Voice Biomarker Analysis")
    ],
    title="Comprehensive Vocal Biomarker Analysis",
    description="""
    This application performs a comprehensive analysis of voice recordings to extract potential health-related biomarkers:

    1. Speech Characteristics:
       - Fundamental frequency analysis
       - Voice stability measures (jitter, shimmer)
       - Speech rate and rhythm

    2. Voice Quality Analysis:
       - Spectral features
       - Energy distribution
       - MFCC analysis

    3. Emotional Content:
       - Emotion detection
       - Emotional stability analysis

    4. Speech Content:
       - Text transcription
       - Pause analysis

    Upload an audio file or record directly through your microphone.
    """,
    article="""
    ### About Vocal Biomarkers
    Vocal biomarkers are measurable indicators in the human voice that can potentially signal various health conditions.
    This analysis focuses on several key aspects:

    - **Voice Quality**: Changes in voice quality can indicate respiratory or neurological conditions
    - **Prosody**: Speech rhythm and timing can be indicators of cognitive function
    - **Emotional Content**: Emotional patterns can be relevant to mental health assessment
    - **Acoustic Features**: Specific acoustic patterns may correlate with various health conditions

    Note: This is a demonstration tool and should not be used for medical diagnosis.
    """,
)


if __name__ == "__main__":
    demo.launch(debug=True)