import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from transformers import pipeline

# Speech emotion recognition (wav2vec2 fine-tuned on RAVDESS) and English ASR pipelines.
emotion_model = pipeline(
    "audio-classification",
    model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
)
transcription_model = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

# Map each emotion label to an (arousal, dominance, valence) triple in [-1, 1].
# The RAVDESS-trained model emits "fearful" and "surprised", so those spellings
# are included alongside "fear"/"surprise"; unmapped labels fall back to neutral.
emotion_mapping = {
    "angry": (0.8, 0.8, -0.5),
    "happy": (0.6, 0.6, 0.8),
    "sad": (-0.6, -0.4, -0.6),
    "neutral": (0, 0, 0),
    "fear": (0.3, -0.3, -0.7),
    "fearful": (0.3, -0.3, -0.7),
    "surprise": (0.4, 0.2, 0.2),
    "surprised": (0.4, 0.2, 0.2),
    "disgust": (0.2, 0.5, -0.6),
    "calm": (-0.2, 0.1, 0.3),
    "excited": (0.7, 0.5, 0.7),
    "frustrated": (0.6, 0.5, -0.4),
}


def process_audio(audio_file):
    """Transcribe an audio file, classify its emotion, and render diagnostic plots."""
    # Load at the native sample rate for plotting; the HF pipelines resample internally.
    y, sr = librosa.load(audio_file, sr=None)

    transcription = transcription_model(audio_file)["text"]

    # The classification pipeline returns labels sorted by score; take the top one.
    emotion_result = emotion_model(audio_file)[0]
    emotion, confidence = emotion_result["label"], emotion_result["score"]
    arousal, dominance, valence = emotion_mapping.get(emotion.lower(), (0, 0, 0))

    # Build figures with the object-oriented API so no global pyplot state
    # leaks between requests.
    waveform_fig, wave_ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=wave_ax)
    wave_ax.set_title("Waveform")

    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_fig, mel_ax = plt.subplots(figsize=(10, 4))
    img = librosa.display.specshow(
        librosa.power_to_db(mel_spec, ref=np.max),
        sr=sr, x_axis="time", y_axis="mel", ax=mel_ax,
    )
    mel_fig.colorbar(img, ax=mel_ax, format="%+2.0f dB")
    mel_ax.set_title("Mel Spectrogram")

    # Close the figures so pyplot does not accumulate them across requests;
    # the Figure objects remain renderable by gr.Plot.
    plt.close(waveform_fig)
    plt.close(mel_fig)

    return transcription, emotion, confidence, arousal, dominance, valence, waveform_fig, mel_fig


def create_emotion_recognition_tab():
    """Lay out the emotion-recognition UI; must be called inside a gr.Blocks context."""
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(type="filepath")
            gr.Examples(["./assets/audio/fitness.wav"], inputs=[audio_input])
            transcription_output = gr.Textbox(label="Transcription")
            emotion_output = gr.Textbox(label="Emotion")
        with gr.Column(scale=1):
            outputs = [gr.Number(label=label) for label in ["Confidence", "Arousal", "Dominance", "Valence"]]
        with gr.Column(scale=1):
            plots = [gr.Plot(label=label) for label in ["Waveform", "Mel Spectrogram"]]

    # Output order must match process_audio's return order exactly.
    audio_input.change(
        process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, emotion_output] + outputs + plots,
    )
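

# Usage sketch: create_emotion_recognition_tab() only defines components, so it
# has to run inside a gr.Blocks context. How this module is actually mounted in
# the larger app is not shown in the original; the tab title and launch call
# below are illustrative assumptions, not the author's wiring.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Tab("Emotion Recognition"):
            create_emotion_recognition_tab()
    demo.launch()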