from transformers import AutoModelForAudioClassification
import gradio as gr
import librosa
import torch
import numpy as np

# Load the speech emotion recognition model once at startup rather than on every request.
model = AutoModelForAudioClassification.from_pretrained(
    "3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes", trust_remote_code=True
)

def classify_audio(audio_file):
    # Gradio's Audio component delivers a (sample_rate, numpy_array) tuple.
    sr, raw_wav = audio_file
    print(audio_file, audio_file[1].dtype)  # debug: incoming sample rate and dtype

    # Convert the integer PCM samples to float32 and peak-normalize to [-1, 1].
    y = raw_wav.astype(np.float32)
    y /= np.max(np.abs(y))
    #raw_wav, _ = librosa.load(audio_file, sr=16000)

    # Standardize with the mean/std normalization stats stored in the model config.
    norm_wav = (y - model.config.mean) / (model.config.std + 0.000001)

    mask = torch.ones(1, len(norm_wav))
    wavs = torch.tensor(norm_wav).unsqueeze(0)
    pred = model(wavs, mask).detach().numpy()
    print(str(pred))
    return str(pred)
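
# Optional sketch, not wired into the interface below: the raw prediction is a
# (1, num_attributes) array of scores, which is hard to read as a bare string.
# Assuming the model config exposes an id2label mapping for the predicted
# attributes, the scores could be paired with their attribute names like this:
def format_prediction(pred):
    # pred: numpy array of shape (1, num_attributes)
    return {model.config.id2label[i]: float(v) for i, v in enumerate(pred[0])}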

def main():
    iface = gr.Interface(
        fn=classify_audio,
        inputs=gr.Audio(sources=["upload", "microphone"], label="Audio file"),
        outputs=gr.Text(),
        title="Speech Emotion Recognition App",
        description="Upload an audio file and hit the 'Submit' button",
    )
    iface.launch()

if __name__ == '__main__':
    main()