Spaces:

gaunernst
/

AudioMAE-AudioSet20k

Running

App Files Files Community

gaunernst committed on Dec 1, 2023

Commit

cafc237

•

1 Parent(s): 5b04966

beautify

Browse files

Files changed (1) hide show

app.py +36 -5

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 import gradio as gr
 import requests
 import timm
 import torch
@@ -27,7 +29,7 @@ def preprocess(x: torch.Tensor):
     else:
         melspec = melspec[:1024]
     melspec = (melspec - MEAN) / (STD * 2)
-    return melspec.view(1, 1024, 128)
 def predict(audio, start):
@@ -43,15 +45,44 @@ def predict(audio, start):
     x = preprocess(x)
     with torch.inference_mode():
-        logits = MODEL(x.unsqueeze(0)).squeeze(0)
     topk_probs, topk_classes = logits.sigmoid().topk(10)
-    return [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]
 gr.Interface(
     fn=predict,
     inputs=["audio", "number"],
-    outputs="dataframe",
-    examples=[["LS_female_1462-170138-0008.flac", 0], ["LS_male_3170-137482-0005.flac", 0]],
 ).launch()

 import json
 import gradio as gr
+import matplotlib.pyplot as plt
+import numpy as np
 import requests
 import timm
 import torch
     else:
         melspec = melspec[:1024]
     melspec = (melspec - MEAN) / (STD * 2)
+    return melspec
 def predict(audio, start):
     x = preprocess(x)
     with torch.inference_mode():
+        logits = MODEL(x.view(1, 1, 1024, 128)).squeeze(0)
     topk_probs, topk_classes = logits.sigmoid().topk(10)
+    preds = [[AUDIOSET_LABELS[cls], prob.item() * 100] for cls, prob in zip(topk_classes, topk_probs)]
+    fig = plt.figure()
+    plt.imshow(x.T, origin="lower")
+    plt.title("Log mel-spectrogram")
+    plt.xlabel("Time (s)")
+    plt.xticks(np.arange(11) * 100, np.arange(11))
+    plt.yticks([0, 64, 128])
+    plt.tight_layout()
+    return preds, fig
+DESCRIPTION = """
+Classify audio into AudioSet classes with ViT-B/16 pre-trained using AudioMAE objective.
+- For more information about AudioMAE, visit https://github.com/facebookresearch/AudioMAE.
+- For how to use AudioMAE model in timm, visit https://huggingface.co/gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k.
+Input audio is converted to log Mel-spectrogram and treated as a grayscale image. The model is a vanilla ViT-B/16.
+NOTE: AudioMAE model only accepts 10s audio (10.24 to be exact). Longer audio will be cropped. Shorter audio will be zero-padded.
+"""
 gr.Interface(
+    title="AudioSet classification with AudioMAE (ViT-B/16)",
+    description=DESCRIPTION,
     fn=predict,
     inputs=["audio", "number"],
+    outputs=[
+        gr.Dataframe(headers=["class", "score"], row_count=10, label="prediction"),
+        gr.Plot(label="spectrogram"),
+    ],
+    examples=[
+        ["LS_female_1462-170138-0008.flac", 0],
+        ["LS_male_3170-137482-0005.flac", 0],
+    ],
 ).launch()