In [2]:
from datasets import load_dataset
from transformers import ClapModel, AutoProcessor
from IPython.display import Audio


 from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the ESC-50 dataset repo from the Hugging Face Hub (reused from the local cache when present).
dataset = load_dataset(path='ashraq/esc50')

Found cached dataset parquet (/root/.cache/huggingface/datasets/ashraq___parquet/ashraq--esc50-1000c3b73cc1500f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████| 1/1 [00:00<00:00, 524.29it/s]


In [5]:
# Pick row 50 first and only then its "audio" field: indexing the column first
# (`dataset["train"]["audio"][50]`) decodes every clip in the split just to keep one.
clip = dataset["train"][50]["audio"]
audio_sample = clip['array']

# Play the clip at the sampling rate stored alongside the waveform instead of a hard-coded 44100.
Audio(audio_sample, rate=clip['sampling_rate'])

In [19]:
# Joint CLAP model; its forward pass below yields `logits_per_audio` and `audio_embeds`.
model = ClapModel.from_pretrained(
    pretrained_model_name_or_path="laion/clap-htsat-unfused",
)

In [20]:
# Processor for the same checkpoint; it is used below for text, audio, and combined inputs.
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path="laion/clap-htsat-unfused",
)

In [21]:
# Candidate text prompts that the audio clip is scored against.
input_text = [
    "Hospital elevator",
    "Water Drop",
    "Sound of water dropping",
]

In [22]:
# Tokenize the prompts and featurize the clip in a single processor call.
# NOTE(review): this call emits the "pass the `sampling_rate` argument" warning. The clip is
# played back at 44100 Hz earlier in the notebook — confirm which rate the processor's feature
# extractor expects, resample if they differ, and then pass `sampling_rate` explicitly.
inputs = processor(
    text=input_text,
    audios=audio_sample,
    padding=True,
    return_tensors='pt',
)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [23]:
# List the fields produced by the combined text + audio processor call.
print(inputs.keys())

dict_keys(['input_ids', 'attention_mask', 'input_features'])


In [24]:
import torch  # local import: the notebook only imports torch in a later cell

# Score the clip against every prompt with the joint model.
# This is inference only, so skip building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Softmax over the last axis turns the per-prompt logits of the clip into probabilities.
logits_per_audio = outputs.logits_per_audio
probs = logits_per_audio.softmax(dim=-1)
print(probs)

tensor([[7.4354e-04, 4.5361e-02, 9.5390e-01]], grad_fn=)


In [10]:
from transformers import ClapAudioModel, ClapAudioModelWithProjection

In [11]:
# Audio-only model class loaded from the same checkpoint.
# NOTE(review): the load warning for this class lists `audio_model.*` checkpoint weights
# (not just `text_model.*`) as unused — verify the encoder weights were actually loaded
# before relying on `audio_model`.
audio_model = ClapAudioModel.from_pretrained(
    pretrained_model_name_or_path="laion/clap-htsat-unfused",
)

Some weights of the model checkpoint at laion/clap-htsat-unfused were not used when initializing ClapAudioModel: ['audio_model.audio_encoder.layers.0.blocks.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_after.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'audio_model.audio_encoder.layers.1.blocks.0.intermediate.dense.weight', 'audio_model.audio_encoder.layers.0.downsample.norm.bias', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.relative_position_index', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.query.weight', 'audio_model.audio_encoder.layers.2.blocks.5.layernorm_after.bias', 'audio_model.audio_encoder.layers.2.blocks.3.layernorm_before.weight', 'text_model.encoder.layer.7.attention.output.dense.bias', 'audio_model.audio_encoder.layers.1.blocks.1.intermediate.dense.bias', 'text_model.encoder.layer.0.intermediate.de

In [12]:
# Featurize the clip on its own (no text) for the audio-only models.
# NOTE(review): this call also emits the "pass the `sampling_rate` argument" warning — confirm
# the rate the feature extractor expects, resample the clip if needed, and pass it explicitly.
audio_inputs = processor(
    audios=audio_sample,
    return_tensors='pt',
)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [13]:
# Fields produced when the processor is given audio only (shown as the cell's result).
audio_inputs.keys()

dict_keys(['input_features', 'is_longer'])

In [14]:
# Audio model with its projection head; its `audio_embeds` are compared with the joint model's below.
audio_prediction_model = ClapAudioModelWithProjection.from_pretrained(
    pretrained_model_name_or_path="laion/clap-htsat-unfused",
)

Some weights of the model checkpoint at laion/clap-htsat-unfused were not used when initializing ClapAudioModelWithProjection: ['text_model.encoder.layer.8.output.dense.bias', 'text_model.encoder.layer.2.attention.self.value.weight', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.7.attention.self.value.weight', 'text_model.encoder.layer.5.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.2.attention.self.value.bias', 'text_model.encoder.layer.6.attention.self.query.weight', 'text_model.encoder.layer.6.intermediate.dense.bias', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.1.attention.output.dense.weight', 'text_model.encoder.layer.2.attention.output.LayerNorm.weight', 'text_model.encoder.layer.1.output.LayerNorm.bias', 'text_model.encoder.layer.7.attention.output.dense.bias', 'text_mode

In [15]:
# NOTE(review): this name is not referenced by any visible cell — confirm it is still needed
# (and still exported by the installed transformers version) before keeping the import.
from transformers import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST

In [16]:
import torch  # local import: the notebook only imports torch in a later cell

# Forward pass through the audio-only projection model.
# This is inference only, so no autograd graph is needed.
with torch.no_grad():
    pred_outputs = audio_prediction_model(**audio_inputs)

In [17]:
# Show which output fields the audio projection model returned.
print(pred_outputs.keys())

odict_keys(['audio_embeds', 'last_hidden_state'])


In [18]:
# Display the projected audio embedding as returned by the model (the notebook L2-normalizes it in a later cell).
pred_outputs.audio_embeds

tensor([[-7.9439e-02, 2.3935e-01, 3.5846e-01, 2.3282e-01, 3.2885e-02,
 -1.3462e-01, -3.8317e-01, -5.1633e-02, -1.8881e-03, 1.4470e-01,
 1.5499e-01, -5.4301e-03, 6.2472e-02, 1.1324e-01, -1.3372e-01,
 -7.4772e-02, 9.4837e-02, 8.4011e-02, 1.6877e-01, 3.9500e-01,
 3.7919e-01, 4.1101e-01, -2.2619e-01, 2.6106e-01, 7.4054e-02,
 6.7051e-02, -4.6973e-02, 6.6229e-02, 7.9341e-02, -6.2507e-02,
 -3.3600e-02, -1.1131e-02, 2.9025e-01, -1.0942e-01, -6.2347e-02,
 -4.0657e-02, 3.6304e-02, 3.1982e-02, -4.5375e-02, -3.1386e-01,
 1.8173e-01, -1.8351e-01, -3.7267e-01, -2.3658e-01, 5.7322e-02,
 -2.3966e-04, -1.6086e-01, -1.8752e-01, 3.9222e-01, -2.7590e-01,
 2.3425e-01, 5.2686e-02, 1.1264e-01, 1.1232e-01, -9.5137e-02,
 1.2332e-01, 3.2688e-01, -3.4500e-02, 3.2825e-01, 1.3025e-01,
 1.6063e-01, -2.2567e-01, -1.5062e-01, -3.4971e-01, 2.3765e-01,
 -1.4173e-01, 4.0352e-02, 3.6305e-02, -1.8367e-01, -4.1525e-02,
 -1.0561e-01, 8.5074e-02, 1.6497e-01, 7.2744e-02, 2.4250e-01,
 7.6457e-02, 3.6339e-02, -2.8053e-02, 2.454

In [19]:
import torch

# Keep the audio-only model's embedding and compare it, element for element, with the
# embedding returned by the joint model.
# NOTE(review): this exact comparison printed False. `audio_embeds` is only L2-normalized in a
# later cell, so a pure scale difference is one possible cause — confirm before concluding
# that the two models disagree.
audio_embeds = pred_outputs.audio_embeds
embeds_match = torch.equal(outputs.audio_embeds, audio_embeds)
print(embeds_match)

False


In [20]:
from transformers import ClapTextModelWithProjection

# Text model with its projection head, loaded from the same checkpoint as the audio models.
text_model = ClapTextModelWithProjection.from_pretrained(
    pretrained_model_name_or_path="laion/clap-htsat-unfused",
)

Some weights of the model checkpoint at laion/clap-htsat-unfused were not used when initializing ClapTextModelWithProjection: ['audio_model.audio_encoder.layers.2.blocks.2.attention.self.key.weight', 'audio_model.audio_encoder.layers.0.blocks.0.attention.self.value.weight', 'audio_model.audio_encoder.layers.2.blocks.2.layernorm_after.weight', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_before.bias', 'audio_model.audio_encoder.layers.2.blocks.0.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.4.attention.self.value.weight', 'audio_model.audio_encoder.layers.0.blocks.1.attention.self.key.weight', 'audio_model.audio_encoder.layers.2.blocks.1.output.dense.weight', 'audio_model.audio_encoder.layers.2.blocks.2.attention.self.value.bias', 'audio_model.audio_encoder.layers.1.blocks.1.attention.self.relative_position_bias_table', 'audio_model.audio_encoder.layers.0.blocks.0.layernorm_after.weight', 'audio_model.audio_encoder.layers.0.blocks.1.layernorm_after.bia

In [21]:
# Tokenize the prompts on their own (no audio), padded to a common length.
text_inputs = processor(
    text=input_text,
    padding=True,
    return_tensors='pt',
)

In [22]:
# Embed the prompts with the text-only projection model.
# This is inference only, so run the forward pass without autograd tracking.
with torch.no_grad():
    text_embeds = text_model(**text_inputs).text_embeds

In [23]:
# L2-normalize each embedding along its last axis so the dot products computed next
# are cosine similarities.
audio_norm = audio_embeds.norm(p=2, dim=-1, keepdim=True)
text_norm = text_embeds.norm(p=2, dim=-1, keepdim=True)
audio_embeds = audio_embeds / audio_norm
text_embeds = text_embeds / text_norm


In [24]:
# Rebuild the similarity logits by hand from the separately computed embeddings, using the
# joint model's exp(logit_scale_*) factors for the text->audio and audio->text directions.
text_scale = model.logit_scale_t.exp()
audio_scale = model.logit_scale_a.exp()
logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * text_scale
logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * audio_scale


In [25]:
# Softmax over the last axis of the hand-built audio->text logits gives per-prompt probabilities.
# The cell previously opened with a bare `logits_per_audio` expression; only a cell's final
# expression is displayed, so that line had no effect and is omitted.
probs = logits_per_audio.softmax(dim=-1)
print(probs)

tensor([[0.0008, 0.1800, 0.8192]], grad_fn=)


In [26]:
# Print the prompts so the probabilities printed above can be matched to them by position.
print(input_text)

['Hospital elevator', 'Water Drop', 'Sound of water dropping']


In [27]:
# Print the audio embedding as it stands after the L2-normalization cell.
print(audio_embeds)

tensor([[-1.8583e-02, 5.5992e-02, 8.3854e-02, 5.4465e-02, 7.6927e-03,
 -3.1491e-02, -8.9635e-02, -1.2078e-02, -4.4169e-04, 3.3850e-02,
 3.6257e-02, -1.2702e-03, 1.4614e-02, 2.6491e-02, -3.1281e-02,
 -1.7491e-02, 2.2185e-02, 1.9653e-02, 3.9481e-02, 9.2402e-02,
 8.8703e-02, 9.6147e-02, -5.2913e-02, 6.1069e-02, 1.7323e-02,
 1.5685e-02, -1.0988e-02, 1.5493e-02, 1.8560e-02, -1.4622e-02,
 -7.8600e-03, -2.6038e-03, 6.7897e-02, -2.5596e-02, -1.4585e-02,
 -9.5110e-03, 8.4926e-03, 7.4816e-03, -1.0615e-02, -7.3421e-02,
 4.2511e-02, -4.2928e-02, -8.7179e-02, -5.5344e-02, 1.3409e-02,
 -5.6063e-05, -3.7629e-02, -4.3867e-02, 9.1751e-02, -6.4542e-02,
 5.4798e-02, 1.2325e-02, 2.6350e-02, 2.6276e-02, -2.2255e-02,
 2.8848e-02, 7.6467e-02, -8.0706e-03, 7.6788e-02, 3.0469e-02,
 3.7577e-02, -5.2791e-02, -3.5234e-02, -8.1808e-02, 5.5592e-02,
 -3.3154e-02, 9.4394e-03, 8.4927e-03, -4.2966e-02, -9.7140e-03,
 -2.4706e-02, 1.9901e-02, 3.8590e-02, 1.7017e-02, 5.6728e-02,
 1.7885e-02, 8.5008e-03, -6.5625e-03, 5.740

In [43]:
# Weights of the audio projection head in the audio-only projection model.
state_dict = audio_prediction_model.audio_projection.state_dict()

In [44]:
# Weights of the audio projection head inside the joint model, for comparison with `state_dict`.
state_dict2 = model.audio_projection.state_dict()

In [45]:
# Check, entry by entry, that the two projection heads hold exactly the same tensors.
for name, weight in state_dict.items():
    print(torch.equal(weight, state_dict2[name]))

True
True
True
True
