{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# GSI Technology Video Search Demo - Embedding Videos Notebook:\n", "\n", "The following Notebook will include code that demonstrates the process of video embedding.
\n", "It specifically focuses on embedding a single video using the [Diangle/clip4clip-webvid](https://huggingface.co/Diangle/clip4clip-webvid) model." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "example = './example/34721191.mp4'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode\n", "from PIL import Image\n", "import cv2\n", "import numpy as np\n", "import torch\n", "\n", "# Code to convert one video to few images. \n", "def video2image(video_path, frame_rate=1.0, size=224):\n", " def preprocess(size, n_px):\n", " return Compose([\n", " Resize(size, interpolation=InterpolationMode.BICUBIC), \n", " CenterCrop(size),\n", " lambda image: image.convert(\"RGB\"),\n", " ToTensor(),\n", " Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n", " ])(n_px)\n", " \n", " cap = cv2.VideoCapture(video_path)\n", " cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)\n", " frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n", " fps = int(cap.get(cv2.CAP_PROP_FPS))\n", " if fps < 1:\n", " images = np.zeros([3, size, size], dtype=np.float32) \n", " print(\"ERROR: problem reading video file: \", video_path)\n", " else:\n", " total_duration = (frameCount + fps - 1) // fps\n", " start_sec, end_sec = 0, total_duration\n", " interval = fps / frame_rate\n", " frames_idx = np.floor(np.arange(start_sec*fps, end_sec*fps, interval))\n", " ret = True \n", " images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)\n", " \n", " for i, idx in enumerate(frames_idx):\n", " cap.set(cv2.CAP_PROP_POS_FRAMES , idx)\n", " ret, frame = cap.read() \n", " if not ret: break\n", " frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) \n", " last_frame = i\n", " images[i,:,:,:] = preprocess(size, Image.fromarray(frame).convert(\"RGB\"))\n", " \n", " images = images[:last_frame+1]\n", " cap.release()\n", " video_frames = torch.tensor(images)\n", " return video_frames\n", " \n", "video = video2image(example)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([-2.9570e-02, 6.0339e-03, 1.7294e-02, -1.3951e-02, 4.8329e-02,\n", " 2.4099e-02, 3.3340e-02, 3.1769e-02, 2.1997e-03, 4.2602e-03,\n", " -1.3887e-02, 8.2744e-03, 2.5123e-03, -2.2163e-02, -4.1139e-02,\n", " -1.2101e-02, -6.1914e-02, 6.7091e-03, 4.2834e-02, -2.2604e-02,\n", " -2.7443e-02, 1.0600e-02, 2.9430e-03, 3.2580e-02, -1.3577e-02,\n", " 7.8084e-03, 1.2397e-02, -5.3404e-03, 1.4736e-02, -2.4564e-02,\n", " -5.4057e-02, 3.9507e-02, 1.2754e-02, 4.6864e-04, 7.4087e-03,\n", " 3.8710e-03, 7.9482e-03, 1.3444e-02, -1.7326e-02, -1.2486e-01,\n", " -8.4992e-02, -3.9097e-02, -2.1903e-02, -7.1480e-03, -2.7220e-03,\n", " 4.1397e-03, 1.7315e-02, 4.4724e-02, 9.1722e-04, 3.1429e-02,\n", " 3.8212e-02, -2.1133e-02, 2.4437e-03, -1.4371e-03, -2.9859e-03,\n", " 7.8939e-04, 2.4093e-02, -2.2199e-02, -3.9110e-02, 1.7673e-02,\n", " 1.1360e-01, 3.3466e-03, -1.9643e-02, 1.7798e-03, 1.5112e-02,\n", " -6.2003e-03, -2.0564e-02, 6.4936e-02, 6.6286e-02, -2.0585e-02,\n", " 2.0740e-02, 1.0476e-02, -5.9948e-03, -2.4672e-02, 2.3725e-02,\n", " -4.6442e-03, 1.8887e-02, 3.7517e-02, 3.1605e-02, -3.7756e-03,\n", " 2.7584e-02, 5.7234e-03, 3.4368e-02, 1.4564e-02, 2.6392e-02,\n", " -1.9975e-02, 1.2648e-01, -5.3093e-03, 7.3013e-02, 4.8827e-03,\n", " -2.8492e-02, -4.9734e-02, -6.6967e-01, 1.2463e-02, 2.4013e-02,\n", " 1.3702e-02, 2.9382e-02, 1.4373e-02, -2.1994e-02, 3.6824e-03,\n", " 2.9366e-02, -2.1474e-03, 1.7371e-02, -6.1958e-02, -4.6649e-02,\n", " -4.3063e-03, 1.0081e-01, -3.1598e-02, 9.4211e-03, -9.7909e-03,\n", " 4.4678e-02, -4.8716e-03, 1.8896e-02, 9.5822e-03, -2.3881e-02,\n", " -9.0785e-03, 5.4653e-03, 3.0017e-02, -3.0415e-02, -1.3150e-03,\n", " 2.9047e-02, 3.2315e-02, -1.0728e-02, 4.7503e-02, -4.0033e-02,\n", " 3.4482e-02, 6.2684e-02, 3.0337e-02, 5.0680e-02, -8.6022e-03,\n", " 1.5261e-02, 3.7766e-02, -2.4730e-02, 8.6131e-02, 4.5388e-02,\n", " 5.4677e-02, 3.9401e-02, 4.4164e-02, -5.2270e-02, -8.8473e-03,\n", " 8.1178e-03, -1.0574e-02, -7.6409e-05, -8.3209e-03, -8.1179e-04,\n", " 3.2574e-02, -1.4150e-02, -4.0937e-02, 1.0180e-02, 1.3868e-03,\n", " 3.4978e-02, -1.1991e-02, -2.1560e-02, 2.0833e-02, 3.8494e-02,\n", " 1.4916e-02, -1.5102e-02, -1.0009e-02, -9.6670e-03, 3.6516e-03,\n", " 2.6473e-02, -9.1190e-03, -1.9326e-02, 3.2072e-02, -2.9562e-02,\n", " -4.1949e-02, -9.4430e-03, 2.7654e-02, 3.1868e-02, 2.6336e-03,\n", " -1.6622e-02, -3.4676e-02, -3.4540e-02, 8.5971e-03, -9.4823e-03,\n", " -3.6754e-02, 4.9925e-02, 9.8040e-04, -6.7678e-02, 5.0645e-03,\n", " -7.5227e-03, 1.2880e-02, 5.5055e-02, -5.1705e-02, -6.1548e-02,\n", " 1.4440e-03, -6.8204e-03, -1.4279e-02, -2.8179e-02, -2.2386e-02,\n", " 5.2374e-02, -3.4718e-02, 5.3560e-03, -6.3553e-02, 8.3361e-02,\n", " -2.7192e-02, 4.2078e-02, 3.2605e-03, -5.6035e-02, -8.2745e-03,\n", " -2.8813e-02, 4.3161e-02, -5.0922e-02, 3.0529e-02, 2.0102e-02,\n", " 2.9533e-02, -7.8186e-03, -3.0819e-02, -2.1356e-02, -2.7967e-02,\n", " 2.4877e-02, 2.3300e-02, 2.8305e-02, 2.9761e-02, 1.2363e-02,\n", " -1.4158e-02, -1.1000e-02, 2.3479e-02, 4.8863e-02, -1.3325e-02,\n", " 1.2415e-02, -1.0494e-02, -5.3160e-04, -1.3253e-02, -2.4968e-03,\n", " 2.0370e-02, -5.9943e-03, -9.5419e-03, 5.9531e-03, -8.3129e-03,\n", " -4.0607e-03, 6.1272e-03, -2.9724e-02, -1.8579e-02, 1.2740e-02,\n", " -2.6391e-02, 4.1079e-03, -4.0331e-03, 3.4990e-02, -3.4697e-04,\n", " -9.6936e-03, -2.2701e-02, 3.2625e-02, 1.1973e-02, -3.9408e-02,\n", " -6.4848e-02, 4.3097e-02, 2.6910e-02, -3.9942e-02, 3.4112e-02,\n", " -7.8409e-03, -4.3240e-02, -1.6996e-02, 3.8101e-02, -3.8530e-02,\n", " 2.1452e-04, 3.7173e-02, 2.3474e-02, 1.9435e-03, -2.1596e-02,\n", " 1.2855e-02, 4.8854e-03, 2.1395e-02, -2.4349e-02, 7.3487e-03,\n", " -2.7641e-02, -1.5773e-02, 1.1367e-02, 8.7802e-03, 2.3783e-02,\n", " 3.3420e-02, 3.4498e-02, 2.2979e-02, -1.2473e-02, 3.1100e-02,\n", " 6.0752e-02, -2.5795e-02, 1.7830e-02, -1.3168e-02, 8.0613e-04,\n", " 1.3292e-02, 8.1109e-03, 2.1875e-03, -1.0863e-02, 3.8718e-02,\n", " 4.5967e-02, -1.2454e-01, 2.6564e-02, -4.4082e-04, 1.8394e-02,\n", " 2.9872e-02, 6.4751e-03, 5.4129e-03, 2.0823e-02, -4.9624e-02,\n", " -2.3234e-02, -5.7144e-02, -1.3117e-02, -5.3304e-02, -1.9084e-02,\n", " -1.9121e-02, 2.5556e-04, -3.9970e-02, -3.3640e-02, 1.0532e-02,\n", " 5.7862e-02, -4.0414e-02, 6.6390e-03, 1.6265e-03, 1.0555e-02,\n", " -5.1818e-03, -3.9941e-02, 8.6119e-02, 2.5038e-02, 1.1136e-02,\n", " -8.5421e-03, -2.0004e-02, 3.0798e-02, -4.8180e-03, -1.1030e-02,\n", " 7.1489e-03, 7.0376e-02, -4.2558e-02, -5.4193e-02, 6.0990e-03,\n", " 1.5232e-02, 1.3667e-02, -1.5016e-02, -1.0382e-03, -6.4072e-03,\n", " 2.3970e-03, 3.7884e-02, -1.7684e-02, 2.0192e-02, -2.1400e-02,\n", " 1.6529e-02, 1.8982e-02, 1.6748e-02, -2.0919e-02, 1.2904e-02,\n", " -1.5105e-02, -1.7961e-02, 2.2824e-03, 9.0103e-04, 1.3905e-02,\n", " -5.2162e-02, 5.7747e-03, 6.7262e-03, 6.3685e-03, -1.2071e-02,\n", " -2.7873e-02, -1.4171e-04, -4.8872e-02, -8.9744e-03, -1.0448e-02,\n", " 4.9146e-02, -2.0365e-02, -6.8874e-02, 1.3715e-02, -2.8159e-02,\n", " 5.1973e-03, -4.1494e-02, 1.7353e-02, -1.4510e-02, -4.5331e-03,\n", " 1.0267e-02, -2.9127e-02, 1.0169e-02, -5.0776e-03, -2.0463e-02,\n", " 1.6880e-02, 2.4789e-02, -3.2186e-02, -1.5043e-02, -9.5236e-03,\n", " -1.8453e-02, 1.9968e-01, -3.1110e-02, -3.4481e-02, -5.3706e-03,\n", " -2.3295e-02, -6.6525e-02, 1.5241e-02, -5.3700e-02, -1.3558e-02,\n", " -7.4800e-02, 4.6305e-02, 4.3405e-03, 1.0513e-02, -1.4961e-02,\n", " 1.2347e-01, -4.1887e-02, -2.9692e-02, -2.0832e-02, 2.5459e-03,\n", " 1.5311e-02, -1.3357e-02, 1.3205e-02, 2.8943e-02, 4.9173e-02,\n", " 3.3758e-02, 1.1087e-02, 4.2151e-02, 6.3205e-04, -4.3288e-02,\n", " 2.3333e-02, 1.5167e-02, -1.0237e-02, -7.9236e-02, 4.3594e-03,\n", " 3.1445e-02, 4.2794e-03, -9.3492e-03, -3.5418e-02, -1.9242e-02,\n", " -3.0336e-02, 7.7880e-03, 6.6255e-02, -7.5213e-03, 2.5932e-02,\n", " -1.7802e-02, 1.8590e-03, 5.3834e-03, 9.6787e-02, 2.8787e-02,\n", " 9.1017e-04, -1.8586e-02, 2.2730e-02, -9.7814e-02, 4.2616e-02,\n", " 4.0229e-02, -8.9988e-03, -2.0952e-02, 7.7816e-03, -4.0449e-04,\n", " -1.3639e-02, -1.7206e-03, -9.1304e-03, 4.3670e-03, 1.9919e-02,\n", " -2.0095e-02, -2.6256e-03, 3.0235e-02, 3.7728e-03, 6.3254e-04,\n", " -6.9728e-02, 2.5881e-03, 1.0343e-02, 3.3831e-02, 2.2356e-03,\n", " -2.7363e-02, 3.5232e-02, 5.3659e-02, -7.8222e-03, -2.0881e-03,\n", " 2.2187e-02, 2.0626e-02, 3.6413e-02, -4.4460e-03, 4.6213e-02,\n", " -1.4652e-03, 2.1768e-02, 3.3055e-03, -2.3867e-02, -2.7972e-02,\n", " -6.7086e-02, 2.4510e-02, 4.0885e-02, -1.6748e-03, 1.2575e-02,\n", " -2.0675e-04, -1.1889e-02, 4.2555e-03, -2.6686e-02, -9.5006e-03,\n", " -1.3144e-02, 3.0939e-02, -1.9938e-02, 4.2527e-02, -1.4343e-02,\n", " 5.5876e-03, 2.4495e-02, 3.9814e-03, 2.8102e-02, 4.3181e-02,\n", " -1.7406e-02, -4.2736e-02, -8.1578e-03, -5.3989e-03, 2.9429e-03,\n", " 4.3196e-02, -2.0857e-02, -3.0203e-02, -4.0288e-03, -4.4894e-02,\n", " 2.7039e-02, 3.5724e-02, -1.4012e-02, -2.3949e-03, 1.4861e-02,\n", " 3.1610e-02, 4.8555e-02, 1.8550e-02, 1.2663e-02, -6.1358e-03,\n", " -4.1771e-02, 2.8252e-02, -1.1711e-02, -4.0601e-03, -2.9267e-02,\n", " -3.0001e-02, 1.6215e-02], grad_fn=)\n" ] } ], "source": [ "from transformers import CLIPVisionModelWithProjection\n", "\n", "model = CLIPVisionModelWithProjection.from_pretrained(\"Diangle/clip4clip-webvid\")\n", "model = model.eval()\n", "visual_output = model(video)\n", "\n", "# Normalizing the embeddings and calculating mean between all embeddings. \n", "visual_output = visual_output[\"image_embeds\"]\n", "visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n", "visual_output = torch.mean(visual_output, dim=0)\n", "visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n", "print(visual_output)\n", "\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }