import joblib
import torch
import numpy as np
import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, WavLMModel
class HuggingFaceFeatureExtractor:
    """Wraps a Hugging Face feature extractor and model for inference."""

    def __init__(self, model_class, name):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
        self.model = model_class.from_pretrained(name)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio, sr):
        inputs = self.feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state
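# Usage sketch for the wrapper above (`clip` is a hypothetical 16 kHz mono
# numpy array, not part of the app): frame-level WavLM hidden states of shape
# (1, num_frames, 768) come out for the base model.
#
#   extractor = HuggingFaceFeatureExtractor(WavLMModel, "microsoft/wavlm-base")
#   hidden = extractor(clip, 16000)    # torch.Tensor, (1, num_frames, 768)
#   embedding = hidden.mean(dim=1)     # (1, 768) clip-level embedding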
# Lazily constructed extractors: the base WavLM model plus three
# deepfake-finetuned checkpoints.
FEATURE_EXTRACTORS = {
    "wavlm-base": lambda: HuggingFaceFeatureExtractor(WavLMModel, "microsoft/wavlm-base"),
    "wavLM-V1": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V1"),
    "wavLM-V2": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V2"),
    "wavLM-V3": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V3"),
}
# One classifier per extractor, plus the stacking meta-model, all serialized
# with joblib.
model1 = joblib.load('model1.joblib')
model2 = joblib.load('model2.joblib')
model3 = joblib.load('model3.joblib')
model4 = joblib.load('model4.joblib')
final_model = joblib.load('final_model.joblib')
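# The .joblib files are assumed to hold scikit-learn-style classifiers that
# expose predict_proba. A minimal sketch of how a base model and the stacking
# meta-model might have been produced (training arrays are hypothetical):
#
#   from sklearn.linear_model import LogisticRegression
#   base = LogisticRegression().fit(train_embeddings, train_labels)
#   joblib.dump(base, 'model1.joblib')
#   meta = LogisticRegression().fit(base_probs_on_holdout, holdout_labels)
#   joblib.dump(meta, 'final_model.joblib')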
def process_audio(file_audio):
    # Load and resample to 16 kHz (librosa returns a mono signal by default).
    audio, sr = librosa.load(file_audio, sr=16000)
    if audio.ndim > 1:  # guard in case a multi-channel signal slips through
        audio = audio[0]
    # One clip-level probability per (extractor, classifier) pair: mean-pool
    # the frame-level hidden states into a single embedding, then score it.
    base_probs = []
    for key, clf in zip(
        ("wavlm-base", "wavLM-V1", "wavLM-V2", "wavLM-V3"),
        (model1, model2, model3, model4),
    ):
        extractor = FEATURE_EXTRACTORS[key]()
        embedding = torch.mean(extractor(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
        base_probs.append(clf.predict_proba(embedding)[:, 1].reshape(-1, 1))
    # Stack the four base probabilities and let the meta-model decide.
    combined_probs = np.hstack(base_probs)
    final_prob = final_model.predict_proba(combined_probs)[:, 1][0]
    if final_prob < 0.5:
        return f"Fake with a confidence of: {100 - final_prob * 100:.2f}%"
    return f"Real with a confidence of: {final_prob * 100:.2f}%"
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Deepfake Detection",
    description="Upload an audio file to detect whether it is real or fake. "
                "The system ensembles features from the WavLM base model and "
                "its fine-tuned versions. Submitted to ASVSpoof5.",
)
interface.launch(share=True)