Eating Sound Classification using Wav2Vec 2.0
How to use
Requirements
```bash
# requirement packages
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa
```
Prediction
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "m3hrdadfi/wav2vec2-base-100k-eating-sound-collection"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate

# Wav2Vec2ForSpeechClassification is a custom classification head that ships with
# the author's training code, not with transformers; a minimal sketch of such a
# class is given after the example output below.
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)


def speech_file_to_array_fn(path, sampling_rate):
    # Load the clip and resample it to the feature extractor's sampling rate.
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech


def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Label": config.id2label[i], "Score": f"{score * 100:.1f}%"} for i, score in enumerate(scores)]
    return outputs
```
```python
path = "clips_rd/gummies/gummies_6_04.wav"
outputs = predict(path, sampling_rate)
```
```
[
{'Label': 'aloe', 'Score': '0.0%'},
{'Label': 'burger', 'Score': '0.0%'},
{'Label': 'cabbage', 'Score': '0.0%'},
{'Label': 'candied_fruits', 'Score': '0.0%'},
{'Label': 'carrots', 'Score': '0.0%'},
{'Label': 'chips', 'Score': '0.0%'},
{'Label': 'chocolate', 'Score': '0.0%'},
{'Label': 'drinks', 'Score': '0.0%'},
{'Label': 'fries', 'Score': '0.0%'},
{'Label': 'grapes', 'Score': '0.0%'},
{'Label': 'gummies', 'Score': '99.8%'},
{'Label': 'ice-cream', 'Score': '0.0%'},
{'Label': 'jelly', 'Score': '0.1%'},
{'Label': 'noodles', 'Score': '0.0%'},
{'Label': 'pickles', 'Score': '0.0%'},
{'Label': 'pizza', 'Score': '0.0%'},
{'Label': 'ribs', 'Score': '0.0%'},
{'Label': 'salmon', 'Score': '0.0%'},
{'Label': 'soup', 'Score': '0.0%'},
{'Label': 'wings', 'Score': '0.0%'}
]
```
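The prediction code above relies on a `Wav2Vec2ForSpeechClassification` class that is not part of the transformers library; it comes from the author's training code and must be defined (or imported) before `from_pretrained` is called. As an illustration only, here is a minimal sketch of such a head, assuming mean pooling over the Wav2Vec 2.0 hidden states followed by a linear classifier; the implementation actually used to train this checkpoint may differ in its head architecture and weight names.

```python
import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Hypothetical sketch: Wav2Vec 2.0 encoder + mean pooling + linear classifier."""

    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.dropout = nn.Dropout(config.final_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_values, attention_mask=None, **kwargs):
        # Frame-level representations from the pretrained encoder.
        hidden_states = self.wav2vec2(input_values, attention_mask=attention_mask).last_hidden_state
        # Average over the time axis to get one vector per clip, then classify.
        pooled = self.dropout(hidden_states.mean(dim=1))
        logits = self.classifier(pooled)
        return SequenceClassifierOutput(logits=logits)
```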
Evaluation
The following table summarizes the scores obtained by the model overall and per class.
label | precision | recall | f1-score | support |
---|---|---|---|---|
aloe | 0.989 | 0.807 | 0.889 | 109 |
burger | 1.000 | 0.471 | 0.640 | 119 |
cabbage | 0.907 | 0.970 | 0.937 | 100 |
candied_fruits | 0.952 | 0.988 | 0.970 | 161 |
carrots | 0.970 | 0.992 | 0.981 | 132 |
chips | 0.993 | 0.951 | 0.972 | 144 |
chocolate | 0.828 | 0.914 | 0.869 | 58 |
drinks | 0.982 | 0.948 | 0.965 | 58 |
fries | 0.935 | 0.783 | 0.852 | 129 |
grapes | 0.965 | 0.940 | 0.952 | 116 |
gummies | 0.880 | 0.971 | 0.923 | 136 |
ice-cream | 0.953 | 0.972 | 0.962 | 145 |
jelly | 0.906 | 0.875 | 0.890 | 88 |
noodles | 0.817 | 0.817 | 0.817 | 82 |
pickles | 0.933 | 0.960 | 0.946 | 174 |
pizza | 0.704 | 0.934 | 0.803 | 122 |
ribs | 0.796 | 0.755 | 0.775 | 98 |
salmon | 0.647 | 0.970 | 0.776 | 100 |
soup | 0.941 | 0.857 | 0.897 | 56 |
wings | 0.842 | 0.792 | 0.816 | 101 |
accuracy | 0.890 | 0.890 | 0.890 | 2228 |
macro avg | 0.897 | 0.883 | 0.882 | 2228 |
weighted avg | 0.903 | 0.890 | 0.888 | 2228 |
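For reference, a per-class report in this format can be produced with scikit-learn's `classification_report`. The sketch below reuses `model`, `feature_extractor`, `config`, `device`, `sampling_rate`, and `speech_file_to_array_fn` from the prediction section; the test-set file name (`test.csv`) and its `path`/`label` columns are illustrative assumptions, not taken from this card.

```python
# Hypothetical sketch: rebuilding a report like the table above with scikit-learn.
# Assumes a held-out split listed in "test.csv" with "path" and "label" columns.
import torch
import pandas as pd
from sklearn.metrics import classification_report

test_df = pd.read_csv("test.csv")
label_names = [config.id2label[i] for i in range(config.num_labels)]

y_true, y_pred = [], []
for _, row in test_df.iterrows():
    speech = speech_file_to_array_fn(row["path"], sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits = model(**inputs).logits
    y_pred.append(int(logits.argmax(dim=-1)))
    y_true.append(config.label2id[row["label"]])

print(classification_report(y_true, y_pred, target_names=label_names, digits=3))
```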
Questions?
Post a GitHub issue from HERE.