Spaces:

gaunernst
/

AudioMAE-AudioSet20k

Running

gaunernst committed on Nov 23, 2023

Commit

bcc0935

•

1 Parent(s): cac3ec7

fix preprocessing. add examples

Files changed (3) hide show

LS_female_1462-170138-0008.flac ADDED Viewed

Binary file (122 kB). View file

LS_male_3170-137482-0005.flac ADDED Viewed

Binary file (155 kB). View file

app.py CHANGED Viewed

@@ -17,6 +17,8 @@ LABEL_URL = "https://huggingface.co/datasets/huggingface/label-files/raw/main/au
 AUDIOSET_LABELS = list(json.loads(requests.get(LABEL_URL).content).values())
 SAMPLING_RATE = 16_000
 def resample(x: np.ndarray, sr: int):
@@ -26,25 +28,34 @@ def resample(x: np.ndarray, sr: int):
 def preprocess(x: torch.Tensor):
     melspec = kaldi.fbank(x.unsqueeze(0), htk_compat=True, window_type="hanning", num_mel_bins=128)
     if melspec.shape[0] < 1024:
         melspec = F.pad(melspec, (0, 0, 0, 1024 - melspec.shape[0]))
     else:
         melspec = melspec[:1024]
     return melspec.view(1, 1, 1024, 128)
-def predict(audio):
     sr, x = audio
-    x = resample(x, sr)
     x = torch.from_numpy(x)
     with torch.inference_mode():
         logits = MODEL(preprocess(x)).squeeze(0)
-    topk_probs, topk_classes = logits.softmax(dim=-1).topk(5)
     return [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]
-iface = gr.Interface(fn=predict, inputs="audio", outputs="dataframe")
-iface.launch()

 AUDIOSET_LABELS = list(json.loads(requests.get(LABEL_URL).content).values())
 SAMPLING_RATE = 16_000
+MEAN = -4.2677393
+STD = 4.5689974
 def resample(x: np.ndarray, sr: int):
 def preprocess(x: torch.Tensor):
+    x = x - x.mean()
     melspec = kaldi.fbank(x.unsqueeze(0), htk_compat=True, window_type="hanning", num_mel_bins=128)
     if melspec.shape[0] < 1024:
         melspec = F.pad(melspec, (0, 0, 0, 1024 - melspec.shape[0]))
     else:
         melspec = melspec[:1024]
+    melspec = (melspec - MEAN) / (STD * 2)
     return melspec.view(1, 1, 1024, 128)
+def predict(audio, start):
     sr, x = audio
+    if x.shape[0] < start * sr:
+        raise gr.Error(f"`start` ({start}) must be smaller than audio duration ({x.shape[0] / sr:.0f}s)")
+    x = resample(x[int(start * sr) :], sr)
     x = torch.from_numpy(x)
     with torch.inference_mode():
         logits = MODEL(preprocess(x)).squeeze(0)
+    topk_probs, topk_classes = logits.sigmoid().topk(10)
     return [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]
+gr.Interface(
+    fn=predict,
+    inputs=["audio", "number"],
+    outputs="dataframe",
+    examples=[["LS_female_1462-170138-0008.flac", 0], ["LS_male_3170-137482-0005.flac", 0]],
+).launch()