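# Gradio Space: classify the emotion in a spoken audio clip with a
# fine-tuned Wav2Vec2 sequence-classification model.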
import gradio as gr
import spaces
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_name = "Hemg/human-emotion-detection"
# The feature extractor is CPU-side preprocessing and has no .to() method;
# only the model is moved to the GPU.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name).to(device)

def preprocess_audio(audio_file_path):
    # Load the audio file and resample it to the 16 kHz rate the model expects.
    audio_array, sampling_rate = librosa.load(audio_file_path, sr=16000)
    return {"speech": audio_array, "sampling_rate": sampling_rate}

# @spaces.GPU requests a GPU for the duration of this call on ZeroGPU Spaces.
@spaces.GPU
def inference(audio):
    example = preprocess_audio(audio)
    inputs = feature_extractor(
        example["speech"], sampling_rate=16000, return_tensors="pt", padding=True
    )
    inputs = inputs.to(device)  # Move inputs to the same device as the model
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1)
    # Return only the label string: the Gradio output component expects text.
    return model.config.id2label[predicted_id.item()]

iface = gr.Interface(
    fn=inference,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Audio Sentiment Analysis",
    description="Upload an audio file or record one to analyze sentiment.",
)
iface.launch()