gaunernst committed on
Commit
cac3ec7
1 Parent(s): fc54a80

initial commit

Browse files
Files changed (3) hide show
  1. app.py +50 -0
  2. packages.txt +1 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import shlex
3
+ import subprocess
4
+
5
+ import gradio as gr
6
+ import numpy as np
7
+ import requests
8
+ import timm
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torchaudio.compliance import kaldi
12
+
13
# Hugging Face Hub id of the timm checkpoint served by this app.
TAG = "gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k"
# Loaded once when the module is executed and switched to eval mode for inference.
MODEL = timm.create_model(f"hf_hub:{TAG}", pretrained=True).eval()

# JSON object mapping class id -> human-readable AudioSet label.
LABEL_URL = "https://huggingface.co/datasets/huggingface/label-files/raw/main/audioset-id2label.json"
# Fail fast (instead of hanging, or parsing an HTML error page as JSON) if the
# label file cannot be downloaded.
_label_response = requests.get(LABEL_URL, timeout=30)
_label_response.raise_for_status()
_id2label = json.loads(_label_response.content)
# Order labels by numeric class id so that list position == model output index,
# independent of the key order in the JSON document.
# NOTE(review): assumes every key is an integer string ("0", "1", ...) -- confirm.
AUDIOSET_LABELS = [_id2label[key] for key in sorted(_id2label, key=int)]

# Sample rate (Hz) that input audio is resampled to before feature extraction.
SAMPLING_RATE = 16_000
20
+
21
+
22
def resample(x: np.ndarray, sr: int):
    """Resample raw PCM audio to mono float32 at ``SAMPLING_RATE`` using ffmpeg.

    Args:
        x: Audio samples, shaped ``(samples,)`` or ``(samples, channels)``.
            int16, int32 and floating-point arrays are passed to ffmpeg in their
            matching raw format; any other dtype is cast to int16, which is what
            the original implementation assumed.
        sr: Sampling rate of ``x`` in Hz.

    Returns:
        A writable 1-D ``float32`` array sampled at ``SAMPLING_RATE``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status;
            its stderr is attached to the exception.
    """
    if x.size == 0:
        # Nothing to resample; avoid launching ffmpeg on an empty stream.
        return np.empty(0, dtype=np.float32)

    # Pick the raw sample format that matches the array's dtype so ffmpeg does
    # not misinterpret the bytes. Explicit little-endian dtypes match the
    # "*le" ffmpeg format names.
    if np.issubdtype(x.dtype, np.floating):
        pcm, sample_fmt = x.astype("<f4"), "f32le"
    elif x.dtype == np.int32:
        pcm, sample_fmt = x.astype("<i4"), "s32le"
    else:
        pcm, sample_fmt = x.astype("<i2"), "s16le"

    # A 2-D array serialises as interleaved frames, so ffmpeg must be told the
    # channel count; otherwise stereo data would be read as mono.
    channels = 1 if pcm.ndim == 1 else pcm.shape[1]

    cmd = [
        "ffmpeg",
        "-loglevel", "error",
        # Input: raw PCM on stdin.
        "-f", sample_fmt, "-ar", str(sr), "-ac", str(channels), "-i", "-",
        # Output: mono float32 PCM on stdout at the target rate.
        "-f", "f32le", "-ar", str(SAMPLING_RATE), "-ac", "1", "-",
    ]
    # check=True surfaces ffmpeg failures instead of silently returning an
    # empty array built from an empty stdout.
    proc = subprocess.run(cmd, capture_output=True, input=pcm.tobytes(), check=True)
    # frombuffer yields a read-only view over the bytes object; copy so that
    # callers (e.g. torch.from_numpy) receive a writable array.
    return np.frombuffer(proc.stdout, dtype=np.float32).copy()
26
+
27
+
28
# Log-mel normalisation statistics.
# NOTE(review): these are the AudioSet values used by the AST / AudioMAE
# data pipelines -- confirm against the checkpoint's model card.
_FBANK_MEAN = -4.2677393
_FBANK_STD = 4.5689974
# Spectrogram size fed to the model: 1024 frames x 128 mel bins.
_NUM_FRAMES = 1024
_NUM_MEL_BINS = 128
# kaldi.fbank needs at least one full analysis window; with its defaults
# (25 ms frames at 16 kHz) that is 400 samples.
_MIN_SAMPLES = 400


def preprocess(x: torch.Tensor):
    """Convert a mono waveform into the normalised log-mel input of the model.

    Args:
        x: 1-D waveform tensor (the caller passes audio resampled to 16 kHz).

    Returns:
        A tensor of shape ``(1, 1, 1024, 128)``: the log-mel filterbank of
        ``x``, zero-padded or truncated to 1024 frames and then normalised.
    """
    if x.numel() > 0:
        # Remove the DC offset before feature extraction.
        x = x - x.mean()
    if x.numel() < _MIN_SAMPLES:
        # Pad very short (or empty) clips so that fbank can form one frame.
        x = F.pad(x, (0, _MIN_SAMPLES - x.numel()))

    melspec = kaldi.fbank(
        x.unsqueeze(0),
        htk_compat=True,
        window_type="hanning",
        num_mel_bins=_NUM_MEL_BINS,
    )

    num_frames = melspec.shape[0]
    if num_frames < _NUM_FRAMES:
        # Zero-pad along the time axis only (last dim is mel bins).
        melspec = F.pad(melspec, (0, 0, 0, _NUM_FRAMES - num_frames))
    else:
        melspec = melspec[:_NUM_FRAMES]

    # Normalise after padding, so padded frames are shifted and scaled too.
    melspec = (melspec - _FBANK_MEAN) / (_FBANK_STD * 2)
    return melspec.view(1, 1, _NUM_FRAMES, _NUM_MEL_BINS)
35
+
36
+
37
def predict(audio):
    """Classify an audio clip and return the five highest-scoring labels.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by the Gradio
            audio input, or ``None`` when the user submitted nothing.

    Returns:
        A list of ``[label, score_percent]`` rows, best match first.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Gradio passes None when the form is submitted without a clip;
        # report that to the user instead of failing on tuple unpacking.
        raise gr.Error("Please record or upload an audio clip first.")

    sr, samples = audio
    waveform = torch.from_numpy(resample(samples, sr))

    with torch.inference_mode():
        logits = MODEL(preprocess(waveform)).squeeze(0)
        # NOTE(review): softmax makes the five scores compete for a total of
        # 100%; if the checkpoint was trained as a multi-label classifier,
        # sigmoid scores may be the more meaningful display -- confirm.
        topk_probs, topk_classes = logits.softmax(dim=-1).topk(5)

    # Convert to plain Python numbers before indexing the label list and
    # building the rows handed to the dataframe output.
    return [
        [AUDIOSET_LABELS[class_idx], prob * 100]
        for class_idx, prob in zip(topk_classes.tolist(), topk_probs.tolist())
    ]
47
+
48
+
49
# Gradio UI: an audio input wired to ``predict``, results rendered as a dataframe.
iface = gr.Interface(
    fn=predict,
    inputs="audio",
    outputs="dataframe",
)
# Start serving as soon as this module is executed.
iface.launch()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ requests
2
+ timm
3
+ numpy
4
+ torch
5
+ torchaudio