import torch
import gradio as gr
import librosa
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor

# SUPERB emotion-recognition checkpoint: HuBERT-Large fine-tuned for ER.
MODEL_ID = "superb/hubert-large-superb-er"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
model = HubertForSequenceClassification.from_pretrained(MODEL_ID)
model.eval()


def get_emotion(microphone, file_upload):
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    file = microphone if microphone is not None else file_upload

    # The checkpoint expects 16 kHz mono audio; librosa resamples on load.
    speech, _ = librosa.load(file, sr=16000, mono=True)

    inputs = feature_extractor(speech, sampling_rate=16000, padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_id = torch.argmax(logits, dim=-1).item()
    label = model.config.id2label[predicted_id]

    return warn_output + label


demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=get_emotion,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="AER",
    description="Record or upload speech and get the predicted emotion label from superb/hubert-large-superb-er.",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe], ["Transcribe"])

demo.launch(enable_queue=True)
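# --- Optional local sanity check (a sketch, not part of the Space UI) ---
# Calling get_emotion() directly bypasses Gradio and is handy when debugging the
# model side. "sample.wav" is a hypothetical path to any speech clip;
# librosa.load() inside get_emotion() resamples it to 16 kHz. Uncomment to try
# (the launch() call above blocks, so run this in a separate script or shell):
#
#     print(get_emotion("sample.wav", None))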