import torch
import gradio as gr
from transformers import Wav2Vec2FeatureExtractor
import librosa

# Load the feature extractor and the fine-tuned model once at startup so they are
# not reloaded on every request. "model.pth" is assumed to be a full pickled model
# saved with torch.save(model, ...) (e.g. a HubertForSequenceClassification
# fine-tuned for emotion recognition), so torch.load() restores the module itself
# and model.config.id2label is available for decoding predictions.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-large-superb-er")
model = torch.load("model.pth", map_location=torch.device("cpu"))
model.eval()

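# Inference handler: receives the two Audio inputs as file paths (either may be
# None) and returns a single string for the "text" output component.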
def get_emotion(microphone, file_upload):
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    file = microphone if microphone is not None else file_upload
    # Decode and resample to 16 kHz mono, the rate the model was trained on.
    speech, _ = librosa.load(file, sr=16000, mono=True)
    inputs = feature_extractor(speech, sampling_rate=16000, padding=True, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    labels = [model.config.id2label[_id] for _id in predicted_ids.tolist()]
    # Prepend the warning (if any) so it is shown alongside the prediction.
    return warn_output + ", ".join(labels)

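# Gradio UI (written against the gradio 3.x-era API; the `layout`/`theme` kwargs
# and `enable_queue` are deprecated or removed in newer releases).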
demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=get_emotion,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Microphone"),
        gr.Audio(source="upload", type="filepath", label="Audio file"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="AER",
    description="Record or upload an audio clip and the model will predict the speaker's emotion.",
    allow_flagging="never",
)

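# Mount the Interface as a single tab inside the Blocks app, then launch with
# request queuing enabled.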
with demo:
    gr.TabbedInterface([mf_transcribe], ["Emotion Recognition"])

demo.launch(enable_queue=True)