Diangle committed on
Commit
dd198e6
1 Parent(s): 63d5ecf

Upload 2 files

Browse files
Notebooks/GSI_VideoRetrieval_EmbedVideos.ipynb ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# GSI Technology Video Search Demo - Embedding Videos Notebook:\n",
9
+ "\n",
10
+ "This notebook demonstrates the process of video embedding.<br>\n",
11
+ "It specifically focuses on embedding a single video using the [Diangle/clip4clip-webvid](https://huggingface.co/Diangle/clip4clip-webvid) model."
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 1,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "\"Close-up women's hands scratch\"\n",
21
+ "example = './example/34721191.mp4'"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 2,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode\n",
31
+ "from PIL import Image\n",
32
+ "import cv2\n",
33
+ "import numpy as np\n",
34
+ "import torch\n",
35
+ "\n",
36
+ "# Convert one video into a few images: frames are sampled at `frame_rate` frames per second and preprocessed.\n",
37
+ "def video2image(video_path, frame_rate=1.0, size=224):\n",
38
+ " def preprocess(size, n_px):\n",
39
+ " return Compose([\n",
40
+ " Resize(size, interpolation=InterpolationMode.BICUBIC), \n",
41
+ " CenterCrop(size),\n",
42
+ " lambda image: image.convert(\"RGB\"),\n",
43
+ " ToTensor(),\n",
44
+ " Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n",
45
+ " ])(n_px)\n",
46
+ " \n",
47
+ " cap = cv2.VideoCapture(video_path)\n",
48
+ " cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)\n",
49
+ " frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
50
+ " fps = int(cap.get(cv2.CAP_PROP_FPS))\n",
51
+ " if fps < 1:\n",
52
+ " images = np.zeros([3, size, size], dtype=np.float32) \n",
53
+ " print(\"ERROR: problem reading video file: \", video_path)\n",
54
+ " else:\n",
55
+ " total_duration = (frameCount + fps - 1) // fps\n",
56
+ " start_sec, end_sec = 0, total_duration\n",
57
+ " interval = fps / frame_rate\n",
58
+ " frames_idx = np.floor(np.arange(start_sec*fps, end_sec*fps, interval))\n",
59
+ " ret = True \n",
60
+ " images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)\n",
61
+ " \n",
62
+ " for i, idx in enumerate(frames_idx):\n",
63
+ " cap.set(cv2.CAP_PROP_POS_FRAMES , idx)\n",
64
+ " ret, frame = cap.read() \n",
65
+ " if not ret: break\n",
66
+ " frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) \n",
67
+ " last_frame = i\n",
68
+ " images[i,:,:,:] = preprocess(size, Image.fromarray(frame).convert(\"RGB\"))\n",
69
+ " \n",
70
+ " images = images[:last_frame+1]\n",
71
+ " cap.release()\n",
72
+ " video_frames = torch.tensor(images)\n",
73
+ " return video_frames\n",
74
+ " \n",
75
+ "video = video2image(example)"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 3,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stderr",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "Some weights of the model checkpoint at Diangle/clip4clip-webvid were not used when initializing CLIPVisionModelWithProjection: ['text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'logit_scale', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 
'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 
'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 
'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_projection.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 
'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias']\n",
88
+ "- This IS expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
89
+ "- This IS NOT expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
90
+ ]
91
+ },
92
+ {
93
+ "name": "stdout",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "tensor([-2.9570e-02, 6.0339e-03, 1.7294e-02, -1.3951e-02, 4.8329e-02,\n",
97
+ " 2.4099e-02, 3.3340e-02, 3.1769e-02, 2.1997e-03, 4.2602e-03,\n",
98
+ " -1.3887e-02, 8.2744e-03, 2.5123e-03, -2.2163e-02, -4.1139e-02,\n",
99
+ " -1.2101e-02, -6.1914e-02, 6.7091e-03, 4.2834e-02, -2.2604e-02,\n",
100
+ " -2.7443e-02, 1.0600e-02, 2.9430e-03, 3.2580e-02, -1.3577e-02,\n",
101
+ " 7.8084e-03, 1.2397e-02, -5.3404e-03, 1.4736e-02, -2.4564e-02,\n",
102
+ " -5.4057e-02, 3.9507e-02, 1.2754e-02, 4.6864e-04, 7.4087e-03,\n",
103
+ " 3.8710e-03, 7.9482e-03, 1.3444e-02, -1.7326e-02, -1.2486e-01,\n",
104
+ " -8.4992e-02, -3.9097e-02, -2.1903e-02, -7.1480e-03, -2.7220e-03,\n",
105
+ " 4.1397e-03, 1.7315e-02, 4.4724e-02, 9.1722e-04, 3.1429e-02,\n",
106
+ " 3.8212e-02, -2.1133e-02, 2.4437e-03, -1.4371e-03, -2.9859e-03,\n",
107
+ " 7.8939e-04, 2.4093e-02, -2.2199e-02, -3.9110e-02, 1.7673e-02,\n",
108
+ " 1.1360e-01, 3.3466e-03, -1.9643e-02, 1.7798e-03, 1.5112e-02,\n",
109
+ " -6.2003e-03, -2.0564e-02, 6.4936e-02, 6.6286e-02, -2.0585e-02,\n",
110
+ " 2.0740e-02, 1.0476e-02, -5.9948e-03, -2.4672e-02, 2.3725e-02,\n",
111
+ " -4.6442e-03, 1.8887e-02, 3.7517e-02, 3.1605e-02, -3.7756e-03,\n",
112
+ " 2.7584e-02, 5.7234e-03, 3.4368e-02, 1.4564e-02, 2.6392e-02,\n",
113
+ " -1.9975e-02, 1.2648e-01, -5.3093e-03, 7.3013e-02, 4.8827e-03,\n",
114
+ " -2.8492e-02, -4.9734e-02, -6.6967e-01, 1.2463e-02, 2.4013e-02,\n",
115
+ " 1.3702e-02, 2.9382e-02, 1.4373e-02, -2.1994e-02, 3.6824e-03,\n",
116
+ " 2.9366e-02, -2.1474e-03, 1.7371e-02, -6.1958e-02, -4.6649e-02,\n",
117
+ " -4.3063e-03, 1.0081e-01, -3.1598e-02, 9.4211e-03, -9.7909e-03,\n",
118
+ " 4.4678e-02, -4.8716e-03, 1.8896e-02, 9.5822e-03, -2.3881e-02,\n",
119
+ " -9.0785e-03, 5.4653e-03, 3.0017e-02, -3.0415e-02, -1.3150e-03,\n",
120
+ " 2.9047e-02, 3.2315e-02, -1.0728e-02, 4.7503e-02, -4.0033e-02,\n",
121
+ " 3.4482e-02, 6.2684e-02, 3.0337e-02, 5.0680e-02, -8.6022e-03,\n",
122
+ " 1.5261e-02, 3.7766e-02, -2.4730e-02, 8.6131e-02, 4.5388e-02,\n",
123
+ " 5.4677e-02, 3.9401e-02, 4.4164e-02, -5.2270e-02, -8.8473e-03,\n",
124
+ " 8.1178e-03, -1.0574e-02, -7.6409e-05, -8.3209e-03, -8.1179e-04,\n",
125
+ " 3.2574e-02, -1.4150e-02, -4.0937e-02, 1.0180e-02, 1.3868e-03,\n",
126
+ " 3.4978e-02, -1.1991e-02, -2.1560e-02, 2.0833e-02, 3.8494e-02,\n",
127
+ " 1.4916e-02, -1.5102e-02, -1.0009e-02, -9.6670e-03, 3.6516e-03,\n",
128
+ " 2.6473e-02, -9.1190e-03, -1.9326e-02, 3.2072e-02, -2.9562e-02,\n",
129
+ " -4.1949e-02, -9.4430e-03, 2.7654e-02, 3.1868e-02, 2.6336e-03,\n",
130
+ " -1.6622e-02, -3.4676e-02, -3.4540e-02, 8.5971e-03, -9.4823e-03,\n",
131
+ " -3.6754e-02, 4.9925e-02, 9.8040e-04, -6.7678e-02, 5.0645e-03,\n",
132
+ " -7.5227e-03, 1.2880e-02, 5.5055e-02, -5.1705e-02, -6.1548e-02,\n",
133
+ " 1.4440e-03, -6.8204e-03, -1.4279e-02, -2.8179e-02, -2.2386e-02,\n",
134
+ " 5.2374e-02, -3.4718e-02, 5.3560e-03, -6.3553e-02, 8.3361e-02,\n",
135
+ " -2.7192e-02, 4.2078e-02, 3.2605e-03, -5.6035e-02, -8.2745e-03,\n",
136
+ " -2.8813e-02, 4.3161e-02, -5.0922e-02, 3.0529e-02, 2.0102e-02,\n",
137
+ " 2.9533e-02, -7.8186e-03, -3.0819e-02, -2.1356e-02, -2.7967e-02,\n",
138
+ " 2.4877e-02, 2.3300e-02, 2.8305e-02, 2.9761e-02, 1.2363e-02,\n",
139
+ " -1.4158e-02, -1.1000e-02, 2.3479e-02, 4.8863e-02, -1.3325e-02,\n",
140
+ " 1.2415e-02, -1.0494e-02, -5.3160e-04, -1.3253e-02, -2.4968e-03,\n",
141
+ " 2.0370e-02, -5.9943e-03, -9.5419e-03, 5.9531e-03, -8.3129e-03,\n",
142
+ " -4.0607e-03, 6.1272e-03, -2.9724e-02, -1.8579e-02, 1.2740e-02,\n",
143
+ " -2.6391e-02, 4.1079e-03, -4.0331e-03, 3.4990e-02, -3.4697e-04,\n",
144
+ " -9.6936e-03, -2.2701e-02, 3.2625e-02, 1.1973e-02, -3.9408e-02,\n",
145
+ " -6.4848e-02, 4.3097e-02, 2.6910e-02, -3.9942e-02, 3.4112e-02,\n",
146
+ " -7.8409e-03, -4.3240e-02, -1.6996e-02, 3.8101e-02, -3.8530e-02,\n",
147
+ " 2.1452e-04, 3.7173e-02, 2.3474e-02, 1.9435e-03, -2.1596e-02,\n",
148
+ " 1.2855e-02, 4.8854e-03, 2.1395e-02, -2.4349e-02, 7.3487e-03,\n",
149
+ " -2.7641e-02, -1.5773e-02, 1.1367e-02, 8.7802e-03, 2.3783e-02,\n",
150
+ " 3.3420e-02, 3.4498e-02, 2.2979e-02, -1.2473e-02, 3.1100e-02,\n",
151
+ " 6.0752e-02, -2.5795e-02, 1.7830e-02, -1.3168e-02, 8.0613e-04,\n",
152
+ " 1.3292e-02, 8.1109e-03, 2.1875e-03, -1.0863e-02, 3.8718e-02,\n",
153
+ " 4.5967e-02, -1.2454e-01, 2.6564e-02, -4.4082e-04, 1.8394e-02,\n",
154
+ " 2.9872e-02, 6.4751e-03, 5.4129e-03, 2.0823e-02, -4.9624e-02,\n",
155
+ " -2.3234e-02, -5.7144e-02, -1.3117e-02, -5.3304e-02, -1.9084e-02,\n",
156
+ " -1.9121e-02, 2.5556e-04, -3.9970e-02, -3.3640e-02, 1.0532e-02,\n",
157
+ " 5.7862e-02, -4.0414e-02, 6.6390e-03, 1.6265e-03, 1.0555e-02,\n",
158
+ " -5.1818e-03, -3.9941e-02, 8.6119e-02, 2.5038e-02, 1.1136e-02,\n",
159
+ " -8.5421e-03, -2.0004e-02, 3.0798e-02, -4.8180e-03, -1.1030e-02,\n",
160
+ " 7.1489e-03, 7.0376e-02, -4.2558e-02, -5.4193e-02, 6.0990e-03,\n",
161
+ " 1.5232e-02, 1.3667e-02, -1.5016e-02, -1.0382e-03, -6.4072e-03,\n",
162
+ " 2.3970e-03, 3.7884e-02, -1.7684e-02, 2.0192e-02, -2.1400e-02,\n",
163
+ " 1.6529e-02, 1.8982e-02, 1.6748e-02, -2.0919e-02, 1.2904e-02,\n",
164
+ " -1.5105e-02, -1.7961e-02, 2.2824e-03, 9.0103e-04, 1.3905e-02,\n",
165
+ " -5.2162e-02, 5.7747e-03, 6.7262e-03, 6.3685e-03, -1.2071e-02,\n",
166
+ " -2.7873e-02, -1.4171e-04, -4.8872e-02, -8.9744e-03, -1.0448e-02,\n",
167
+ " 4.9146e-02, -2.0365e-02, -6.8874e-02, 1.3715e-02, -2.8159e-02,\n",
168
+ " 5.1973e-03, -4.1494e-02, 1.7353e-02, -1.4510e-02, -4.5331e-03,\n",
169
+ " 1.0267e-02, -2.9127e-02, 1.0169e-02, -5.0776e-03, -2.0463e-02,\n",
170
+ " 1.6880e-02, 2.4789e-02, -3.2186e-02, -1.5043e-02, -9.5236e-03,\n",
171
+ " -1.8453e-02, 1.9968e-01, -3.1110e-02, -3.4481e-02, -5.3706e-03,\n",
172
+ " -2.3295e-02, -6.6525e-02, 1.5241e-02, -5.3700e-02, -1.3558e-02,\n",
173
+ " -7.4800e-02, 4.6305e-02, 4.3405e-03, 1.0513e-02, -1.4961e-02,\n",
174
+ " 1.2347e-01, -4.1887e-02, -2.9692e-02, -2.0832e-02, 2.5459e-03,\n",
175
+ " 1.5311e-02, -1.3357e-02, 1.3205e-02, 2.8943e-02, 4.9173e-02,\n",
176
+ " 3.3758e-02, 1.1087e-02, 4.2151e-02, 6.3205e-04, -4.3288e-02,\n",
177
+ " 2.3333e-02, 1.5167e-02, -1.0237e-02, -7.9236e-02, 4.3594e-03,\n",
178
+ " 3.1445e-02, 4.2794e-03, -9.3492e-03, -3.5418e-02, -1.9242e-02,\n",
179
+ " -3.0336e-02, 7.7880e-03, 6.6255e-02, -7.5213e-03, 2.5932e-02,\n",
180
+ " -1.7802e-02, 1.8590e-03, 5.3834e-03, 9.6787e-02, 2.8787e-02,\n",
181
+ " 9.1017e-04, -1.8586e-02, 2.2730e-02, -9.7814e-02, 4.2616e-02,\n",
182
+ " 4.0229e-02, -8.9988e-03, -2.0952e-02, 7.7816e-03, -4.0449e-04,\n",
183
+ " -1.3639e-02, -1.7206e-03, -9.1304e-03, 4.3670e-03, 1.9919e-02,\n",
184
+ " -2.0095e-02, -2.6256e-03, 3.0235e-02, 3.7728e-03, 6.3254e-04,\n",
185
+ " -6.9728e-02, 2.5881e-03, 1.0343e-02, 3.3831e-02, 2.2356e-03,\n",
186
+ " -2.7363e-02, 3.5232e-02, 5.3659e-02, -7.8222e-03, -2.0881e-03,\n",
187
+ " 2.2187e-02, 2.0626e-02, 3.6413e-02, -4.4460e-03, 4.6213e-02,\n",
188
+ " -1.4652e-03, 2.1768e-02, 3.3055e-03, -2.3867e-02, -2.7972e-02,\n",
189
+ " -6.7086e-02, 2.4510e-02, 4.0885e-02, -1.6748e-03, 1.2575e-02,\n",
190
+ " -2.0675e-04, -1.1889e-02, 4.2555e-03, -2.6686e-02, -9.5006e-03,\n",
191
+ " -1.3144e-02, 3.0939e-02, -1.9938e-02, 4.2527e-02, -1.4343e-02,\n",
192
+ " 5.5876e-03, 2.4495e-02, 3.9814e-03, 2.8102e-02, 4.3181e-02,\n",
193
+ " -1.7406e-02, -4.2736e-02, -8.1578e-03, -5.3989e-03, 2.9429e-03,\n",
194
+ " 4.3196e-02, -2.0857e-02, -3.0203e-02, -4.0288e-03, -4.4894e-02,\n",
195
+ " 2.7039e-02, 3.5724e-02, -1.4012e-02, -2.3949e-03, 1.4861e-02,\n",
196
+ " 3.1610e-02, 4.8555e-02, 1.8550e-02, 1.2663e-02, -6.1358e-03,\n",
197
+ " -4.1771e-02, 2.8252e-02, -1.1711e-02, -4.0601e-03, -2.9267e-02,\n",
198
+ " -3.0001e-02, 1.6215e-02], grad_fn=<DivBackward0>)\n"
199
+ ]
200
+ }
201
+ ],
202
+ "source": [
203
+ "from transformers import CLIPVisionModelWithProjection\n",
204
+ "\n",
205
+ "model = CLIPVisionModelWithProjection.from_pretrained(\"Diangle/clip4clip-webvid\")\n",
206
+ "model = model.eval()\n",
207
+ "visual_output = model(video)\n",
208
+ "\n",
209
+ "# Normalize each frame embedding, average the embeddings over all frames, then re-normalize the mean.\n",
210
+ "visual_output = visual_output[\"image_embeds\"]\n",
211
+ "visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n",
212
+ "visual_output = torch.mean(visual_output, dim=0)\n",
213
+ "visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n",
214
+ "print(visual_output)\n",
215
+ "\n",
216
+ " "
217
+ ]
218
+ }
219
+ ],
220
+ "metadata": {
221
+ "kernelspec": {
222
+ "display_name": "Python 3",
223
+ "language": "python",
224
+ "name": "python3"
225
+ },
226
+ "language_info": {
227
+ "codemirror_mode": {
228
+ "name": "ipython",
229
+ "version": 3
230
+ },
231
+ "file_extension": ".py",
232
+ "mimetype": "text/x-python",
233
+ "name": "python",
234
+ "nbconvert_exporter": "python",
235
+ "pygments_lexer": "ipython3",
236
+ "version": "3.10.9"
237
+ },
238
+ "orig_nbformat": 4
239
+ },
240
+ "nbformat": 4,
241
+ "nbformat_minor": 2
242
+ }
Notebooks/example/34721191.mp4 ADDED
Binary file (875 kB). View file