{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GSI Technology Video Search Demo - Embedding Videos Notebook:\n",
    "\n",
    "The following Notebook will include code that demonstrates the process of video embedding.<br>\n",
    "It specifically focuses on embedding a single video using the [Diangle/clip4clip-webvid](https://huggingface.co/Diangle/clip4clip-webvid) model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"Close-up women's hands scratch\"\n",
    "example = './example/34721191.mp4'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode\n",
    "from PIL import Image\n",
    "import cv2\n",
    "import numpy as np\n",
    "import torch\n",
    "\n",
    "# Code to convert one video to few images.  \n",
    "def video2image(video_path, frame_rate=1.0, size=224):\n",
    "    def preprocess(size, n_px):\n",
    "        return Compose([\n",
    "            Resize(size, interpolation=InterpolationMode.BICUBIC),            \n",
    "            CenterCrop(size),\n",
    "            lambda image: image.convert(\"RGB\"),\n",
    "            ToTensor(),\n",
    "            Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n",
    "        ])(n_px)\n",
    "    \n",
    "    cap = cv2.VideoCapture(video_path)\n",
    "    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)\n",
    "    frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
    "    fps = int(cap.get(cv2.CAP_PROP_FPS))\n",
    "    if fps < 1:\n",
    "        images = np.zeros([3, size, size], dtype=np.float32) \n",
    "        print(\"ERROR: problem reading video file: \", video_path)\n",
    "    else:\n",
    "        total_duration = (frameCount + fps - 1) // fps\n",
    "        start_sec, end_sec = 0, total_duration\n",
    "        interval = fps / frame_rate\n",
    "        frames_idx = np.floor(np.arange(start_sec*fps, end_sec*fps, interval))\n",
    "        ret = True     \n",
    "        images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)\n",
    "            \n",
    "        for i, idx in enumerate(frames_idx):\n",
    "            cap.set(cv2.CAP_PROP_POS_FRAMES , idx)\n",
    "            ret, frame = cap.read()    \n",
    "            if not ret: break\n",
    "            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)             \n",
    "            last_frame = i\n",
    "            images[i,:,:,:] = preprocess(size, Image.fromarray(frame).convert(\"RGB\"))\n",
    "            \n",
    "        images = images[:last_frame+1]\n",
    "    cap.release()\n",
    "    video_frames = torch.tensor(images)\n",
    "    return video_frames\n",
    "    \n",
    "video = video2image(example)"
   ]
  },
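  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (an added sketch, not part of the original flow): with the default `frame_rate=1.0` and `size=224`, `video2image` is expected to return a tensor of shape `[num_sampled_frames, 3, 224, 224]`, roughly one preprocessed frame per second of video."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check on the sampled frames (assumes the cell above ran successfully).\n",
    "print(video.shape)  # expected: torch.Size([num_frames, 3, 224, 224])\n",
    "assert video.ndim == 4 and video.shape[1:] == (3, 224, 224)"
   ]
  },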
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at Diangle/clip4clip-webvid were not used when initializing CLIPVisionModelWithProjection: ['text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'logit_scale', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 
'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_projection.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 
'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias']\n",
      "- This IS expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([-2.9570e-02,  6.0339e-03,  1.7294e-02, -1.3951e-02,  4.8329e-02,\n",
      "         2.4099e-02,  3.3340e-02,  3.1769e-02,  2.1997e-03,  4.2602e-03,\n",
      "        -1.3887e-02,  8.2744e-03,  2.5123e-03, -2.2163e-02, -4.1139e-02,\n",
      "        -1.2101e-02, -6.1914e-02,  6.7091e-03,  4.2834e-02, -2.2604e-02,\n",
      "        -2.7443e-02,  1.0600e-02,  2.9430e-03,  3.2580e-02, -1.3577e-02,\n",
      "         7.8084e-03,  1.2397e-02, -5.3404e-03,  1.4736e-02, -2.4564e-02,\n",
      "        -5.4057e-02,  3.9507e-02,  1.2754e-02,  4.6864e-04,  7.4087e-03,\n",
      "         3.8710e-03,  7.9482e-03,  1.3444e-02, -1.7326e-02, -1.2486e-01,\n",
      "        -8.4992e-02, -3.9097e-02, -2.1903e-02, -7.1480e-03, -2.7220e-03,\n",
      "         4.1397e-03,  1.7315e-02,  4.4724e-02,  9.1722e-04,  3.1429e-02,\n",
      "         3.8212e-02, -2.1133e-02,  2.4437e-03, -1.4371e-03, -2.9859e-03,\n",
      "         7.8939e-04,  2.4093e-02, -2.2199e-02, -3.9110e-02,  1.7673e-02,\n",
      "         1.1360e-01,  3.3466e-03, -1.9643e-02,  1.7798e-03,  1.5112e-02,\n",
      "        -6.2003e-03, -2.0564e-02,  6.4936e-02,  6.6286e-02, -2.0585e-02,\n",
      "         2.0740e-02,  1.0476e-02, -5.9948e-03, -2.4672e-02,  2.3725e-02,\n",
      "        -4.6442e-03,  1.8887e-02,  3.7517e-02,  3.1605e-02, -3.7756e-03,\n",
      "         2.7584e-02,  5.7234e-03,  3.4368e-02,  1.4564e-02,  2.6392e-02,\n",
      "        -1.9975e-02,  1.2648e-01, -5.3093e-03,  7.3013e-02,  4.8827e-03,\n",
      "        -2.8492e-02, -4.9734e-02, -6.6967e-01,  1.2463e-02,  2.4013e-02,\n",
      "         1.3702e-02,  2.9382e-02,  1.4373e-02, -2.1994e-02,  3.6824e-03,\n",
      "         2.9366e-02, -2.1474e-03,  1.7371e-02, -6.1958e-02, -4.6649e-02,\n",
      "        -4.3063e-03,  1.0081e-01, -3.1598e-02,  9.4211e-03, -9.7909e-03,\n",
      "         4.4678e-02, -4.8716e-03,  1.8896e-02,  9.5822e-03, -2.3881e-02,\n",
      "        -9.0785e-03,  5.4653e-03,  3.0017e-02, -3.0415e-02, -1.3150e-03,\n",
      "         2.9047e-02,  3.2315e-02, -1.0728e-02,  4.7503e-02, -4.0033e-02,\n",
      "         3.4482e-02,  6.2684e-02,  3.0337e-02,  5.0680e-02, -8.6022e-03,\n",
      "         1.5261e-02,  3.7766e-02, -2.4730e-02,  8.6131e-02,  4.5388e-02,\n",
      "         5.4677e-02,  3.9401e-02,  4.4164e-02, -5.2270e-02, -8.8473e-03,\n",
      "         8.1178e-03, -1.0574e-02, -7.6409e-05, -8.3209e-03, -8.1179e-04,\n",
      "         3.2574e-02, -1.4150e-02, -4.0937e-02,  1.0180e-02,  1.3868e-03,\n",
      "         3.4978e-02, -1.1991e-02, -2.1560e-02,  2.0833e-02,  3.8494e-02,\n",
      "         1.4916e-02, -1.5102e-02, -1.0009e-02, -9.6670e-03,  3.6516e-03,\n",
      "         2.6473e-02, -9.1190e-03, -1.9326e-02,  3.2072e-02, -2.9562e-02,\n",
      "        -4.1949e-02, -9.4430e-03,  2.7654e-02,  3.1868e-02,  2.6336e-03,\n",
      "        -1.6622e-02, -3.4676e-02, -3.4540e-02,  8.5971e-03, -9.4823e-03,\n",
      "        -3.6754e-02,  4.9925e-02,  9.8040e-04, -6.7678e-02,  5.0645e-03,\n",
      "        -7.5227e-03,  1.2880e-02,  5.5055e-02, -5.1705e-02, -6.1548e-02,\n",
      "         1.4440e-03, -6.8204e-03, -1.4279e-02, -2.8179e-02, -2.2386e-02,\n",
      "         5.2374e-02, -3.4718e-02,  5.3560e-03, -6.3553e-02,  8.3361e-02,\n",
      "        -2.7192e-02,  4.2078e-02,  3.2605e-03, -5.6035e-02, -8.2745e-03,\n",
      "        -2.8813e-02,  4.3161e-02, -5.0922e-02,  3.0529e-02,  2.0102e-02,\n",
      "         2.9533e-02, -7.8186e-03, -3.0819e-02, -2.1356e-02, -2.7967e-02,\n",
      "         2.4877e-02,  2.3300e-02,  2.8305e-02,  2.9761e-02,  1.2363e-02,\n",
      "        -1.4158e-02, -1.1000e-02,  2.3479e-02,  4.8863e-02, -1.3325e-02,\n",
      "         1.2415e-02, -1.0494e-02, -5.3160e-04, -1.3253e-02, -2.4968e-03,\n",
      "         2.0370e-02, -5.9943e-03, -9.5419e-03,  5.9531e-03, -8.3129e-03,\n",
      "        -4.0607e-03,  6.1272e-03, -2.9724e-02, -1.8579e-02,  1.2740e-02,\n",
      "        -2.6391e-02,  4.1079e-03, -4.0331e-03,  3.4990e-02, -3.4697e-04,\n",
      "        -9.6936e-03, -2.2701e-02,  3.2625e-02,  1.1973e-02, -3.9408e-02,\n",
      "        -6.4848e-02,  4.3097e-02,  2.6910e-02, -3.9942e-02,  3.4112e-02,\n",
      "        -7.8409e-03, -4.3240e-02, -1.6996e-02,  3.8101e-02, -3.8530e-02,\n",
      "         2.1452e-04,  3.7173e-02,  2.3474e-02,  1.9435e-03, -2.1596e-02,\n",
      "         1.2855e-02,  4.8854e-03,  2.1395e-02, -2.4349e-02,  7.3487e-03,\n",
      "        -2.7641e-02, -1.5773e-02,  1.1367e-02,  8.7802e-03,  2.3783e-02,\n",
      "         3.3420e-02,  3.4498e-02,  2.2979e-02, -1.2473e-02,  3.1100e-02,\n",
      "         6.0752e-02, -2.5795e-02,  1.7830e-02, -1.3168e-02,  8.0613e-04,\n",
      "         1.3292e-02,  8.1109e-03,  2.1875e-03, -1.0863e-02,  3.8718e-02,\n",
      "         4.5967e-02, -1.2454e-01,  2.6564e-02, -4.4082e-04,  1.8394e-02,\n",
      "         2.9872e-02,  6.4751e-03,  5.4129e-03,  2.0823e-02, -4.9624e-02,\n",
      "        -2.3234e-02, -5.7144e-02, -1.3117e-02, -5.3304e-02, -1.9084e-02,\n",
      "        -1.9121e-02,  2.5556e-04, -3.9970e-02, -3.3640e-02,  1.0532e-02,\n",
      "         5.7862e-02, -4.0414e-02,  6.6390e-03,  1.6265e-03,  1.0555e-02,\n",
      "        -5.1818e-03, -3.9941e-02,  8.6119e-02,  2.5038e-02,  1.1136e-02,\n",
      "        -8.5421e-03, -2.0004e-02,  3.0798e-02, -4.8180e-03, -1.1030e-02,\n",
      "         7.1489e-03,  7.0376e-02, -4.2558e-02, -5.4193e-02,  6.0990e-03,\n",
      "         1.5232e-02,  1.3667e-02, -1.5016e-02, -1.0382e-03, -6.4072e-03,\n",
      "         2.3970e-03,  3.7884e-02, -1.7684e-02,  2.0192e-02, -2.1400e-02,\n",
      "         1.6529e-02,  1.8982e-02,  1.6748e-02, -2.0919e-02,  1.2904e-02,\n",
      "        -1.5105e-02, -1.7961e-02,  2.2824e-03,  9.0103e-04,  1.3905e-02,\n",
      "        -5.2162e-02,  5.7747e-03,  6.7262e-03,  6.3685e-03, -1.2071e-02,\n",
      "        -2.7873e-02, -1.4171e-04, -4.8872e-02, -8.9744e-03, -1.0448e-02,\n",
      "         4.9146e-02, -2.0365e-02, -6.8874e-02,  1.3715e-02, -2.8159e-02,\n",
      "         5.1973e-03, -4.1494e-02,  1.7353e-02, -1.4510e-02, -4.5331e-03,\n",
      "         1.0267e-02, -2.9127e-02,  1.0169e-02, -5.0776e-03, -2.0463e-02,\n",
      "         1.6880e-02,  2.4789e-02, -3.2186e-02, -1.5043e-02, -9.5236e-03,\n",
      "        -1.8453e-02,  1.9968e-01, -3.1110e-02, -3.4481e-02, -5.3706e-03,\n",
      "        -2.3295e-02, -6.6525e-02,  1.5241e-02, -5.3700e-02, -1.3558e-02,\n",
      "        -7.4800e-02,  4.6305e-02,  4.3405e-03,  1.0513e-02, -1.4961e-02,\n",
      "         1.2347e-01, -4.1887e-02, -2.9692e-02, -2.0832e-02,  2.5459e-03,\n",
      "         1.5311e-02, -1.3357e-02,  1.3205e-02,  2.8943e-02,  4.9173e-02,\n",
      "         3.3758e-02,  1.1087e-02,  4.2151e-02,  6.3205e-04, -4.3288e-02,\n",
      "         2.3333e-02,  1.5167e-02, -1.0237e-02, -7.9236e-02,  4.3594e-03,\n",
      "         3.1445e-02,  4.2794e-03, -9.3492e-03, -3.5418e-02, -1.9242e-02,\n",
      "        -3.0336e-02,  7.7880e-03,  6.6255e-02, -7.5213e-03,  2.5932e-02,\n",
      "        -1.7802e-02,  1.8590e-03,  5.3834e-03,  9.6787e-02,  2.8787e-02,\n",
      "         9.1017e-04, -1.8586e-02,  2.2730e-02, -9.7814e-02,  4.2616e-02,\n",
      "         4.0229e-02, -8.9988e-03, -2.0952e-02,  7.7816e-03, -4.0449e-04,\n",
      "        -1.3639e-02, -1.7206e-03, -9.1304e-03,  4.3670e-03,  1.9919e-02,\n",
      "        -2.0095e-02, -2.6256e-03,  3.0235e-02,  3.7728e-03,  6.3254e-04,\n",
      "        -6.9728e-02,  2.5881e-03,  1.0343e-02,  3.3831e-02,  2.2356e-03,\n",
      "        -2.7363e-02,  3.5232e-02,  5.3659e-02, -7.8222e-03, -2.0881e-03,\n",
      "         2.2187e-02,  2.0626e-02,  3.6413e-02, -4.4460e-03,  4.6213e-02,\n",
      "        -1.4652e-03,  2.1768e-02,  3.3055e-03, -2.3867e-02, -2.7972e-02,\n",
      "        -6.7086e-02,  2.4510e-02,  4.0885e-02, -1.6748e-03,  1.2575e-02,\n",
      "        -2.0675e-04, -1.1889e-02,  4.2555e-03, -2.6686e-02, -9.5006e-03,\n",
      "        -1.3144e-02,  3.0939e-02, -1.9938e-02,  4.2527e-02, -1.4343e-02,\n",
      "         5.5876e-03,  2.4495e-02,  3.9814e-03,  2.8102e-02,  4.3181e-02,\n",
      "        -1.7406e-02, -4.2736e-02, -8.1578e-03, -5.3989e-03,  2.9429e-03,\n",
      "         4.3196e-02, -2.0857e-02, -3.0203e-02, -4.0288e-03, -4.4894e-02,\n",
      "         2.7039e-02,  3.5724e-02, -1.4012e-02, -2.3949e-03,  1.4861e-02,\n",
      "         3.1610e-02,  4.8555e-02,  1.8550e-02,  1.2663e-02, -6.1358e-03,\n",
      "        -4.1771e-02,  2.8252e-02, -1.1711e-02, -4.0601e-03, -2.9267e-02,\n",
      "        -3.0001e-02,  1.6215e-02], grad_fn=<DivBackward0>)\n"
     ]
    }
   ],
   "source": [
    "from transformers import CLIPVisionModelWithProjection\n",
    "\n",
    "model = CLIPVisionModelWithProjection.from_pretrained(\"Diangle/clip4clip-webvid\")\n",
    "model = model.eval()\n",
    "visual_output = model(video)\n",
    "\n",
    "# Normalizing the embeddings and calculating mean between all embeddings. \n",
    "visual_output = visual_output[\"image_embeds\"]\n",
    "visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n",
    "visual_output = torch.mean(visual_output, dim=0)\n",
    "visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n",
    "print(visual_output)\n",
    "\n",
    "    "
   ]
  },
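  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a follow-up sketch (an assumption, not part of the original notebook): the warning above lists the checkpoint's text-tower weights as unused by the vision model, so the example caption can presumably be embedded with `CLIPTextModelWithProjection` from the same repo and scored against the video embedding. Because both vectors are unit-normalized, their dot product is the cosine similarity used for retrieval."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import CLIPTokenizer, CLIPTextModelWithProjection\n",
    "\n",
    "# Sketch: embed the example caption with the matching text tower (assumed to be\n",
    "# available in the same Diangle/clip4clip-webvid repo) and score it against the\n",
    "# mean-pooled video embedding computed above.\n",
    "tokenizer = CLIPTokenizer.from_pretrained(\"Diangle/clip4clip-webvid\")\n",
    "text_model = CLIPTextModelWithProjection.from_pretrained(\"Diangle/clip4clip-webvid\").eval()\n",
    "\n",
    "inputs = tokenizer(\"Close-up women's hands scratch\", return_tensors=\"pt\")\n",
    "with torch.no_grad():\n",
    "    text_embeds = text_model(**inputs).text_embeds[0]\n",
    "text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)\n",
    "\n",
    "# Both vectors are unit-normalized, so the dot product equals cosine similarity.\n",
    "print(torch.dot(text_embeds, visual_output))"
   ]
  }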
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}