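# Audio deepfake detection demo (ASVspoof 5 submission): four WavLM-based feature
# extractors feed per-extractor classifiers whose probabilities are fused by a
# final model, served through a Gradio interface.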
import joblib
import librosa
import numpy as np
import torch
import gradio as gr
from transformers import AutoFeatureExtractor, WavLMModel


class HuggingFaceFeatureExtractor:
    """Wraps a Hugging Face feature extractor and model pair for embedding extraction."""

    def __init__(self, model_class, name):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
        self.model = model_class.from_pretrained(name)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio, sr):
        inputs = self.feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state
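

# Registry of lazy factories: each lambda constructs its extractor only when
# called, so the underlying checkpoints are loaded on demand.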
FEATURE_EXTRACTORS = {
    "wavlm-base": lambda: HuggingFaceFeatureExtractor(WavLMModel, "microsoft/wavlm-base"),
    "wavLM-V1": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V1"),
    "wavLM-V2": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V2"),
    "wavLM-V3": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-Deepfake_V3"),
}
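
# Per-extractor classifiers (joblib-serialized, scikit-learn-style) and a final
# fusion model that stacks their probabilities.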
model1 = joblib.load('model1.joblib')
model2 = joblib.load('model2.joblib')
model3 = joblib.load('model3.joblib')
model4 = joblib.load('model4.joblib')
final_model = joblib.load('final_model.joblib')


def process_audio(file_audio):
    # Load and resample to 16 kHz; collapse to a single channel if needed.
    audio, sr = librosa.load(file_audio, sr=16000)
    if len(audio.shape) > 1:
        audio = audio[0]

    # NOTE: the extractors are instantiated on every call; caching them at module
    # level would avoid reloading the checkpoints for each request.
    extractor_1 = FEATURE_EXTRACTORS['wavlm-base']()
    extractor_2 = FEATURE_EXTRACTORS['wavLM-V1']()
    extractor_3 = FEATURE_EXTRACTORS['wavLM-V2']()
    extractor_4 = FEATURE_EXTRACTORS['wavLM-V3']()

    # Mean-pool each model's last hidden states over time to get one embedding per extractor.
    eval1 = torch.mean(extractor_1(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
    eval2 = torch.mean(extractor_2(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
    eval3 = torch.mean(extractor_3(audio, sr), dim=1).cpu().numpy().reshape(1, -1)
    eval4 = torch.mean(extractor_4(audio, sr), dim=1).cpu().numpy().reshape(1, -1)

    # Per-extractor probability of the positive class (treated as "Real").
    eval_prob1 = model1.predict_proba(eval1)[:, 1].reshape(-1, 1)
    eval_prob2 = model2.predict_proba(eval2)[:, 1].reshape(-1, 1)
    eval_prob3 = model3.predict_proba(eval3)[:, 1].reshape(-1, 1)
    eval_prob4 = model4.predict_proba(eval4)[:, 1].reshape(-1, 1)

    # Fuse the four probabilities with the final (stacking) model.
    eval_combined_probs = np.hstack((eval_prob1, eval_prob2, eval_prob3, eval_prob4))
    final_prob = final_model.predict_proba(eval_combined_probs)[:, 1][0]

    if final_prob < 0.5:
        return f"Fake with a confidence of: {100 - final_prob * 100:.2f}%"
    else:
        return f"Real with a confidence of: {final_prob * 100:.2f}%"
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Deepfake Detection",
    description="Upload an audio file to detect whether it is fake or real. The system uses an ensemble of features from the WavLM base model and fine-tuned versions. Submitted to ASVspoof 5.",
)
interface.launch(share=True)
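
# Note: on Hugging Face Spaces the app is served automatically; share=True is only
# needed to expose a temporary public link when running locally.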