{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "9fe51ce7-4c87-4186-9fd3-0fb18ac43e56", "metadata": {}, "outputs": [], "source": [ "from PIL import Image\n", "import requests\n", "from transformers import AutoProcessor, CLIPVisionModel" ] }, { "cell_type": "code", "execution_count": 3, "id": "0f4c21dd-4258-461d-8511-5be089d068a8", "metadata": {}, "outputs": [], "source": [ "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")\n", "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "98b9f906-ffaa-4be4-8671-4ecf65f12c49", "metadata": {}, "outputs": [], "source": [ "# url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", "# image = Image.open(requests.get(url, stream=True).raw)\n", "image = Image.open(\"002579.jpg\")" ] }, { "cell_type": "code", "execution_count": 17, "id": "54b2e4ce-b77b-4314-87f6-ca2a1970fc79", "metadata": {}, "outputs": [], "source": [ "# image" ] }, { "cell_type": "code", "execution_count": 18, "id": "cdd65c58-007f-450b-8deb-f8b4f372a823", "metadata": {}, "outputs": [], "source": [ "# image = None" ] }, { "cell_type": "code", "execution_count": 5, "id": "e9066c2e-c78b-49d1-979b-10d0f4f09441", "metadata": {}, "outputs": [], "source": [ "inputs = processor(images=image, return_tensors=\"pt\", device_map=\"cuda:0\")" ] }, { "cell_type": "code", "execution_count": 20, "id": "e98b211d-29d9-4662-be0b-e011e89b0101", "metadata": {}, "outputs": [], "source": [ "# inputs" ] }, { "cell_type": "code", "execution_count": 6, "id": "b030bd3d-4282-4074-98fe-97e658bd0f50", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 3, 224, 224])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs[\"pixel_values\"].shape" ] }, { "cell_type": "code", "execution_count": 22, "id": "0ce68f11-1c88-4dd7-8b17-0d1de5811fe6", "metadata": {}, "outputs": [], "source": [ "outputs = model(inputs[\"pixel_values\"].to(\"cuda:0\"))\n", "last_hidden_state = outputs.last_hidden_state\n", "pooled_output = outputs.pooler_output # pooled CLS states" ] }, { "cell_type": "code", "execution_count": 23, "id": "30cb0918-a30e-4246-b540-6b8e0d876807", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 768])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pooled_output.shape" ] }, { "cell_type": "code", "execution_count": 24, "id": "6399543a-f23f-426d-8289-3bb52d293ece", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 50, 768])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "last_hidden_state.shape" ] }, { "cell_type": "code", "execution_count": 25, "id": "19a70443-5942-4937-b3ea-6a52d76e2b08", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 768])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs[1].shape" ] }, { "cell_type": "code", "execution_count": 8, "id": "fa13903f-a94a-4839-ae5a-8df4f55c68b6", "metadata": {}, "outputs": [], "source": [ "import torch\n", "from torch import nn\n", "from transformers import CLIPVisionConfig,CLIPPreTrainedModel" ] }, { "cell_type": "code", "execution_count": 9, "id": "b2bd9198-42f0-40c3-80e1-d167c0b038fb", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'Optional' is not defined", "output_type": "error", 
"traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mCLIPVisionModelWithProjection\u001b[39;00m(CLIPPreTrainedModel):\n\u001b[1;32m 2\u001b[0m config_class \u001b[38;5;241m=\u001b[39m CLIPVisionConfig\n\u001b[1;32m 3\u001b[0m main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpixel_values\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", "Cell \u001b[0;32mIn[9], line 20\u001b[0m, in \u001b[0;36mCLIPVisionModelWithProjection\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_input_embeddings\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m nn\u001b[38;5;241m.\u001b[39mModule:\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model\u001b[38;5;241m.\u001b[39membeddings\u001b[38;5;241m.\u001b[39mpatch_embedding\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m---> 20\u001b[0m pixel_values: \u001b[43mOptional\u001b[49m[torch\u001b[38;5;241m.\u001b[39mFloatTensor] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 21\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 22\u001b[0m output_hidden_states: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 23\u001b[0m return_dict: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 24\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[Tuple, CLIPVisionModelOutput]:\n\u001b[1;32m 25\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 27\u001b[0m vision_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model(\n\u001b[1;32m 28\u001b[0m pixel_values\u001b[38;5;241m=\u001b[39mpixel_values,\n\u001b[1;32m 29\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 30\u001b[0m output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m 31\u001b[0m return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m 32\u001b[0m )\n", "\u001b[0;31mNameError\u001b[0m: name 'Optional' is not defined" ] } ], "source": [ "class CLIPVisionModelWithProjection(CLIPPreTrainedModel):\n", " config_class = CLIPVisionConfig\n", " main_input_name = \"pixel_values\"\n", "\n", " def __init__(self, config: CLIPVisionConfig):\n", " super().__init__(config)\n", "\n", " self.vision_model = CLIPVisionTransformer(config)\n", "\n", " self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)\n", "\n", " # Initialize weights and apply final processing\n", " self.post_init()\n", "\n", " def 
get_input_embeddings(self) -> nn.Module:\n", " return self.vision_model.embeddings.patch_embedding\n", "\n", " def forward(\n", " self,\n", " pixel_values: Optional[torch.FloatTensor] = None,\n", " output_attentions: Optional[bool] = None,\n", " output_hidden_states: Optional[bool] = None,\n", " return_dict: Optional[bool] = None,\n", " ) -> Union[Tuple, CLIPVisionModelOutput]:\n", " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", "\n", " vision_outputs = self.vision_model(\n", " pixel_values=pixel_values,\n", " output_attentions=output_attentions,\n", " output_hidden_states=output_hidden_states,\n", " return_dict=return_dict,\n", " )\n", "\n", " pooled_output = vision_outputs[1] # pooled_output\n", "\n", " image_embeds = self.visual_projection(pooled_output)\n", "\n", " if not return_dict:\n", " outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n", " return tuple(output for output in outputs if output is not None)\n", "\n", " return CLIPVisionModelOutput(\n", " image_embeds=image_embeds,\n", " last_hidden_state=vision_outputs.last_hidden_state,\n", " hidden_states=vision_outputs.hidden_states,\n", " attentions=vision_outputs.attentions,\n", " )" ] }, { "cell_type": "code", "execution_count": 27, "id": "68a9ee4a-d977-4725-842d-e64e0dd2f61d", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "loading configuration file config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n", "Model config CLIPConfig {\n", " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", " \"architectures\": [\n", " \"CLIPModel\"\n", " ],\n", " \"initializer_factor\": 1.0,\n", " \"logit_scale_init_value\": 2.6592,\n", " \"model_type\": \"clip\",\n", " \"projection_dim\": 512,\n", " \"text_config\": {\n", " \"bos_token_id\": 0,\n", " \"dropout\": 0.0,\n", " \"eos_token_id\": 2,\n", " \"model_type\": \"clip_text_model\"\n", " },\n", " \"transformers_version\": \"4.36.2\",\n", " \"vision_config\": {\n", " \"dropout\": 0.0,\n", " \"model_type\": \"clip_vision_model\"\n", " }\n", "}\n", "\n", "loading weights file pytorch_model.bin from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n", "All model checkpoint weights were used when initializing CLIPModel.\n", "\n", "All the weights of CLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPModel for predictions without further training.\n", "loading configuration file preprocessor_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", "loading configuration file preprocessor_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", "loading configuration file config.json from cache at 
/home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n", "Model config CLIPConfig {\n", " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", " \"architectures\": [\n", " \"CLIPModel\"\n", " ],\n", " \"initializer_factor\": 1.0,\n", " \"logit_scale_init_value\": 2.6592,\n", " \"model_type\": \"clip\",\n", " \"projection_dim\": 512,\n", " \"text_config\": {\n", " \"bos_token_id\": 0,\n", " \"dropout\": 0.0,\n", " \"eos_token_id\": 2,\n", " \"model_type\": \"clip_text_model\"\n", " },\n", " \"transformers_version\": \"4.36.2\",\n", " \"vision_config\": {\n", " \"dropout\": 0.0,\n", " \"model_type\": \"clip_vision_model\"\n", " }\n", "}\n", "\n", "loading configuration file preprocessor_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n", "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.\n", "Image processor CLIPImageProcessor {\n", " \"crop_size\": {\n", " \"height\": 224,\n", " \"width\": 224\n", " },\n", " \"do_center_crop\": true,\n", " \"do_convert_rgb\": true,\n", " \"do_normalize\": true,\n", " \"do_rescale\": true,\n", " \"do_resize\": true,\n", " \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n", " \"image_mean\": [\n", " 0.48145466,\n", " 0.4578275,\n", " 0.40821073\n", " ],\n", " \"image_processor_type\": \"CLIPImageProcessor\",\n", " \"image_std\": [\n", " 0.26862954,\n", " 0.26130258,\n", " 0.27577711\n", " ],\n", " \"resample\": 3,\n", " \"rescale_factor\": 0.00392156862745098,\n", " \"size\": {\n", " \"shortest_edge\": 224\n", " }\n", "}\n", "\n", "loading file vocab.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n", "loading file merges.txt from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n", "loading file tokenizer.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n", "loading file added_tokens.json from cache at None\n", "loading file special_tokens_map.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n", "loading file tokenizer_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n", "loading configuration file config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", "`text_config` is `None`. 
Initializing the `CLIPTextConfig` with default values.\n", "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n", "Model config CLIPConfig {\n", " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", " \"architectures\": [\n", " \"CLIPModel\"\n", " ],\n", " \"initializer_factor\": 1.0,\n", " \"logit_scale_init_value\": 2.6592,\n", " \"model_type\": \"clip\",\n", " \"projection_dim\": 512,\n", " \"text_config\": {\n", " \"bos_token_id\": 0,\n", " \"dropout\": 0.0,\n", " \"eos_token_id\": 2,\n", " \"model_type\": \"clip_text_model\"\n", " },\n", " \"transformers_version\": \"4.36.2\",\n", " \"vision_config\": {\n", " \"dropout\": 0.0,\n", " \"model_type\": \"clip_vision_model\"\n", " }\n", "}\n", "\n" ] } ], "source": [ "from PIL import Image\n", "import requests\n", "from transformers import AutoProcessor, CLIPModel\n", "\n", "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n", "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n", "\n", "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", "image = Image.open(requests.get(url, stream=True).raw)\n", "\n", "inputs = processor(images=image, return_tensors=\"pt\")\n", "\n", "image_features = model.get_image_features(**inputs)" ] }, { "cell_type": "code", "execution_count": 29, "id": "9ff63766-b706-452b-b735-bf9000fb9c20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 512])" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "image_features.shape" ] }, { "cell_type": "code", "execution_count": 30, "id": "82566e7b-3c91-421a-94c5-f1e2b3e91c8c", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "loading configuration file config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", "Model config CLIPVisionConfig {\n", " \"attention_dropout\": 0.0,\n", " \"dropout\": 0.0,\n", " \"hidden_act\": \"quick_gelu\",\n", " \"hidden_size\": 768,\n", " \"image_size\": 224,\n", " \"initializer_factor\": 1.0,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-05,\n", " \"model_type\": \"clip_vision_model\",\n", " \"num_attention_heads\": 12,\n", " \"num_channels\": 3,\n", " \"num_hidden_layers\": 12,\n", " \"patch_size\": 32,\n", " \"projection_dim\": 512,\n", " \"transformers_version\": \"4.36.2\"\n", "}\n", "\n", "loading weights file pytorch_model.bin from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n", "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 
'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 
'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 
'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n", "- This IS expected if you are initializing CLIPVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing CLIPVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "All the weights of CLIPVisionModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPVisionModel for predictions without further training.\n", "loading configuration file preprocessor_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", "loading configuration file preprocessor_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", "loading configuration file config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n", "Model config CLIPConfig {\n", " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", " \"architectures\": [\n", " \"CLIPModel\"\n", " ],\n", " \"initializer_factor\": 1.0,\n", " \"logit_scale_init_value\": 2.6592,\n", " \"model_type\": \"clip\",\n", " \"projection_dim\": 512,\n", " \"text_config\": {\n", " \"bos_token_id\": 0,\n", " \"dropout\": 0.0,\n", " \"eos_token_id\": 2,\n", " \"model_type\": \"clip_text_model\"\n", " },\n", " \"transformers_version\": \"4.36.2\",\n", " \"vision_config\": {\n", " \"dropout\": 0.0,\n", " \"model_type\": \"clip_vision_model\"\n", " }\n", "}\n", "\n", "loading configuration file preprocessor_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n", "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. 
Converted to {'height': 224, 'width': 224}.\n", "Image processor CLIPImageProcessor {\n", " \"crop_size\": {\n", " \"height\": 224,\n", " \"width\": 224\n", " },\n", " \"do_center_crop\": true,\n", " \"do_convert_rgb\": true,\n", " \"do_normalize\": true,\n", " \"do_rescale\": true,\n", " \"do_resize\": true,\n", " \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n", " \"image_mean\": [\n", " 0.48145466,\n", " 0.4578275,\n", " 0.40821073\n", " ],\n", " \"image_processor_type\": \"CLIPImageProcessor\",\n", " \"image_std\": [\n", " 0.26862954,\n", " 0.26130258,\n", " 0.27577711\n", " ],\n", " \"resample\": 3,\n", " \"rescale_factor\": 0.00392156862745098,\n", " \"size\": {\n", " \"shortest_edge\": 224\n", " }\n", "}\n", "\n", "loading file vocab.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n", "loading file merges.txt from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n", "loading file tokenizer.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n", "loading file added_tokens.json from cache at None\n", "loading file special_tokens_map.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n", "loading file tokenizer_config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n", "loading configuration file config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", "`vision_config` is `None`. 
initializing the `CLIPVisionConfig` with default values.\n", "Model config CLIPConfig {\n", "  \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", "  \"architectures\": [\n", "    \"CLIPModel\"\n", "  ],\n", "  \"initializer_factor\": 1.0,\n", "  \"logit_scale_init_value\": 2.6592,\n", "  \"model_type\": \"clip\",\n", "  \"projection_dim\": 512,\n", "  \"text_config\": {\n", "    \"bos_token_id\": 0,\n", "    \"dropout\": 0.0,\n", "    \"eos_token_id\": 2,\n", "    \"model_type\": \"clip_text_model\"\n", "  },\n", "  \"transformers_version\": \"4.36.2\",\n", "  \"vision_config\": {\n", "    \"dropout\": 0.0,\n", "    \"model_type\": \"clip_vision_model\"\n", "  }\n", "}\n", "\n" ] } ], "source": [ "from PIL import Image\n", "import requests\n", "from transformers import AutoProcessor, CLIPVisionModel\n", "\n", "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n", "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n", "\n", "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", "image = Image.open(requests.get(url, stream=True).raw)\n", "\n", "inputs = processor(images=image, return_tensors=\"pt\")\n", "\n", "outputs = model(**inputs)\n", "last_hidden_state = outputs.last_hidden_state\n", "pooled_output = outputs.pooler_output # pooled CLS states" ] }, { "cell_type": "code", "execution_count": 31, "id": "bcf0a7b3-6cbb-492e-bc2c-42e3edbe6a0c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 768])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pooled_output.shape" ] }, { "cell_type": "code", "execution_count": 10, "id": "67240294-c7a0-4e94-a8c1-86bfe1b21977", "metadata": {}, "outputs": [], "source": [ "from transformers import CLIPPreTrainedModel\n", "from transformers.models.clip.modeling_clip import CLIPVisionModelOutput, CLIPVisionTransformer\n", "from typing import Optional, Union, Tuple" ] }, { "cell_type": "code", "execution_count": 54, "id": "cc9b20db-7f84-44c3-9c78-e84164ccc192", "metadata": {}, "outputs": [], "source": [ "class VisionLanguageConnector(nn.Module):\n", "    def __init__(self, hidden_size, projection_dim):\n", "        super().__init__()\n", "        self.mlp = nn.Sequential(\n", "            nn.Linear(hidden_size, hidden_size, bias=False),\n", "            nn.GELU(),\n", "            nn.Linear(hidden_size, projection_dim, bias=False)\n", "        )\n", "\n", "    def forward(self, x):\n", "        return self.mlp(x)\n", "\n", "class ClipWithProjection(CLIPPreTrainedModel):\n", "    config_class = CLIPVisionConfig\n", "    main_input_name = \"pixel_values\"\n", "\n", "    def __init__(self, config: CLIPVisionConfig):\n", "        super().__init__(config)\n", "\n", "        self.vision_model = CLIPVisionTransformer(config)\n", "        self.vision_language_connector = VisionLanguageConnector(config.hidden_size, config.projection_dim)\n", "\n", "        # Initialize weights and apply final processing\n", "        self.post_init()\n", "\n", "    def forward(\n", "        self,\n", "        pixel_values: Optional[torch.FloatTensor] = None,\n", "        output_attentions: Optional[bool] = None,\n", "        output_hidden_states: Optional[bool] = None,\n", "        return_dict: Optional[bool] = None,\n", "    ) -> Union[Tuple, CLIPVisionModelOutput]:\n", "        return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", "\n", "        vision_outputs = self.vision_model(\n", "            pixel_values=pixel_values,\n", "            output_attentions=output_attentions,\n", "            output_hidden_states=output_hidden_states,\n", "            return_dict=return_dict,\n", "        )\n",
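"\n", "        # pooler_output (index 1) is the CLS embedding after the final LayerNorm;\n", "        # the MLP connector maps it from hidden_size to projection_dim.\n", "        pooled_output = 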
vision_outputs[1] # pooled_output\n", "\n", " image_embeds = self.vision_language_connector(pooled_output)\n", "\n", " if not return_dict:\n", " outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n", " return tuple(output for output in outputs if output is not None)\n", "\n", " return CLIPVisionModelOutput(\n", " image_embeds=image_embeds,\n", " last_hidden_state=vision_outputs.last_hidden_state,\n", " hidden_states=vision_outputs.hidden_states,\n", " attentions=vision_outputs.attentions,\n", " )" ] }, { "cell_type": "code", "execution_count": 55, "id": "a4892ab8-39d2-41c9-ad2a-04711c22b95f", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "loading configuration file config.json from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", "Model config CLIPVisionConfig {\n", " \"attention_dropout\": 0.0,\n", " \"dropout\": 0.0,\n", " \"hidden_act\": \"quick_gelu\",\n", " \"hidden_size\": 768,\n", " \"image_size\": 224,\n", " \"initializer_factor\": 1.0,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-05,\n", " \"model_type\": \"clip_vision_model\",\n", " \"num_attention_heads\": 12,\n", " \"num_channels\": 3,\n", " \"num_hidden_layers\": 12,\n", " \"patch_size\": 32,\n", " \"projection_dim\": 512,\n", " \"transformers_version\": \"4.36.2\"\n", "}\n", "\n", "loading weights file pytorch_model.bin from cache at /home/gunak/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n", "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing ClipWithProjection: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 
'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 
'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 
'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n", "- This IS expected if you are initializing ClipWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing ClipWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of ClipWithProjection were not initialized from the model checkpoint at openai/clip-vit-base-patch32 and are newly initialized: ['vision_language_connector.mlp.2.weight', 'vision_language_connector.mlp.0.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "model = ClipWithProjection.from_pretrained(\"openai/clip-vit-base-patch32\")" ] }, { "cell_type": "code", "execution_count": 56, "id": "588ef914-5be9-49e1-b68d-b899e0e74edd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "768" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config.hidden_size" ] }, { "cell_type": "code", "execution_count": 57, "id": "05d95b9e-9831-4415-860e-94793e29d210", "metadata": {}, "outputs": [], "source": [ "outputs = model(**inputs)" ] }, { "cell_type": "code", "execution_count": 61, "id": "185b1bff-6ffe-4cce-9255-ee7629feba54", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 512])" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs[0].shape" ] }, { "cell_type": "code", "execution_count": null, "id": "04414a35-c7b3-4986-a79e-1d363916caa4", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 1, "id": "485dbbcb-06df-4926-b257-dfd1a4081d44", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'outputs' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[1], line 
1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43moutputs\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n", "\u001b[0;31mNameError\u001b[0m: name 'outputs' is not defined" ] } ], "source": [ "outputs[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "f983313c-8e0f-4805-af14-25bb69afd04c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }