import gradio as gr
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from transformers import pipeline

# Pretrained pipelines: speech emotion classification and English speech-to-text.
emotion_model = pipeline("audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
transcription_model = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

# Approximate (arousal, dominance, valence) coordinates per emotion label.
# Labels absent from this table fall back to neutral (0, 0, 0); "fearful" and
# "surprised" are included as aliases since some emotion models emit those spellings.
emotion_mapping = {
    "angry": (0.8, 0.8, -0.5), "happy": (0.6, 0.6, 0.8), "sad": (-0.6, -0.4, -0.6),
    "neutral": (0, 0, 0), "fear": (0.3, -0.3, -0.7), "fearful": (0.3, -0.3, -0.7),
    "surprise": (0.4, 0.2, 0.2), "surprised": (0.4, 0.2, 0.2),
    "disgust": (0.2, 0.5, -0.6), "calm": (-0.2, 0.1, 0.3), "excited": (0.7, 0.5, 0.7),
    "frustrated": (0.6, 0.5, -0.4)
}

def process_audio(audio_file):
    """Transcribe an audio file, classify its emotion, and plot its waveform and mel spectrogram."""
    # Clearing the audio component fires the change event with None; return empty outputs
    # instead of letting librosa.load crash on a missing path.
    if audio_file is None:
        return "", "", 0.0, 0.0, 0.0, 0.0, None, None

    y, sr = librosa.load(audio_file, sr=None)
    transcription = transcription_model(audio_file)["text"]

    # The classification pipeline returns labels sorted by score, so index 0 is the top prediction.
    emotion_result = emotion_model(audio_file)[0]
    emotion, confidence = emotion_result["label"], emotion_result["score"]
    arousal, dominance, valence = emotion_mapping.get(emotion.lower(), (0, 0, 0))

    # Waveform plot.
    waveform_plot = plt.figure(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr)
    plt.title("Waveform")
    plt.close()

    # Mel spectrogram, converted to decibels for display.
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spec_plot = plt.figure(figsize=(10, 4))
    librosa.display.specshow(librosa.power_to_db(mel_spec, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title("Mel Spectrogram")
    plt.close()

    return transcription, emotion, confidence, arousal, dominance, valence, waveform_plot, mel_spec_plot
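
# Example direct call outside Gradio (path taken from the Examples list below;
# this is illustrative, not part of the original app flow):
# text, emotion, conf, arousal, dominance, valence, wf_fig, mel_fig = process_audio("./assets/audio/fitness.wav")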

def create_emotion_recognition_tab():
    """Build the Gradio layout for the emotion recognition tab; call inside a gr.Blocks() context."""
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(type="filepath")
            gr.Examples(["./assets/audio/fitness.wav"], inputs=[audio_input])
            transcription_output = gr.Textbox(label="Transcription")
            emotion_output = gr.Textbox(label="Emotion")
        with gr.Column(scale=1):
            outputs = [gr.Number(label=label) for label in ["Confidence", "Arousal", "Dominance", "Valence"]]
        with gr.Column(scale=1):
            plots = [gr.Plot(label=label) for label in ["Waveform", "Mel Spectrogram"]]

    # Re-run the full analysis whenever the audio input changes (upload, record, or clear).
    audio_input.change(process_audio, inputs=[audio_input],
                       outputs=[transcription_output, emotion_output] + outputs + plots)
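
# A minimal launcher sketch, assuming this module is run directly. The gr.Blocks()
# wrapper, tab title, and launch() call are assumptions for illustration; in the
# original app, create_emotion_recognition_tab() is presumably composed elsewhere.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Tab("Emotion Recognition"):
            create_emotion_recognition_tab()
    demo.launch()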