Commit a123ab1 by camenduru (1 parent: 845f73c)

thanks to ydshieh ❤
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ snowman.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,421 @@
1
+ ---
2
+ # For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
3
+ # Doc / guide: https://huggingface.co/docs/hub/model-cards
4
+ {}
5
+ ---
6
+ # Kosmos-2: Grounding Multimodal Large Language Models to the World
7
+
8
+ **(There is an ongoing effort to port `Kosmos-2` directly into `transformers`. This repository (remote code) may still need further bug fixes, including breaking changes.)**
9
+
10
+ <a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><figure><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="384"><figcaption><b>[An image of a snowman warming himself by a fire.]</b></figcaption></figure></a>
11
+
12
+
13
+ This Hub repository contains a Hugging Face `transformers` implementation of [the original Kosmos-2 model](https://github.com/microsoft/unilm/tree/master/kosmos-2) from Microsoft.
14
+
15
+ ## How to Get Started with the Model
16
+
17
+ Use the code below to get started with the model.
18
+
19
+ ```python
20
+ import requests
21
+
22
+ from PIL import Image
23
+ from transformers import AutoProcessor, AutoModelForVision2Seq
24
+
25
+
26
+ model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
27
+ processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
28
+
29
+ prompt = "<grounding>An image of"
30
+
31
+ url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.png"
32
+ image = Image.open(requests.get(url, stream=True).raw)
33
+
34
+ # The original Kosmos-2 demo saves the image first and then reloads it. For some images, this gives a slightly different image input and changes the generation outputs.
35
+ # Uncomment the following 2 lines if you want to match the original demo's outputs.
36
+ # (One example is the `two_dogs.jpg` from the demo)
37
+ # image.save("new_image.jpg")
38
+ # image = Image.open("new_image.jpg")
39
+
40
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
41
+
42
+ generated_ids = model.generate(
43
+ pixel_values=inputs["pixel_values"],
44
+ input_ids=inputs["input_ids"][:, :-1],
45
+ attention_mask=inputs["attention_mask"][:, :-1],
46
+ img_features=None,
47
+ img_attn_mask=inputs["img_attn_mask"][:, :-1],
48
+ use_cache=True,
49
+ max_new_tokens=64,
50
+ )
51
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
52
+
53
+ # Specify `cleanup_and_extract=False` in order to see the raw model generation.
54
+ processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
55
+
56
+ print(processed_text)
57
+ # `<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.`
58
+
59
+ # By default, the generated text is cleaned up and the entities are extracted.
60
+ processed_text, entities = processor.post_process_generation(generated_text)
61
+
62
+ print(processed_text)
63
+ # `An image of a snowman warming himself by a fire.`
64
+
65
+ print(entities)
66
+ # `[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]`
67
+ ```
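
The bounding boxes in `entities` are normalized to `[0, 1]` with respect to the image width and height. As a minimal sketch (reusing the `image` and `entities` from the example above), the first box can be converted to pixel coordinates like this:

```python
# Convert a normalized (x1, y1, x2, y2) box to pixel coordinates.
entity_name, text_span, boxes = entities[0]
x1, y1, x2, y2 = boxes[0]
w, h = image.size  # PIL gives (width, height)
pixel_box = (int(x1 * w), int(y1 * h), int(x2 * w), int(y2 * h))
print(entity_name, pixel_box)
```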
68
+
69
+ ## Draw the entities' bounding boxes on the image
70
+
71
+ Once you have the `entities`, you can use the following helper function to draw their bounding boxes on the image:
72
+
73
+ ```python
74
+ import cv2
75
+ import numpy as np
76
+ import os
77
+ import requests
78
+ import torch
79
+ import torchvision.transforms as T
80
+
81
+ from PIL import Image
82
+
83
+
84
+ def is_overlapping(rect1, rect2):
85
+ x1, y1, x2, y2 = rect1
86
+ x3, y3, x4, y4 = rect2
87
+ return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4)
88
+
89
+
90
+ def draw_entity_boxes_on_image(image, entities, show=False, save_path=None):
91
+ """_summary_
92
+ Args:
93
+ image (_type_): image or image path
94
+ collect_entity_location (_type_): _description_
95
+ """
96
+ if isinstance(image, Image.Image):
97
+ image_h = image.height
98
+ image_w = image.width
99
+ image = np.array(image)[:, :, [2, 1, 0]]
100
+ elif isinstance(image, str):
101
+ if os.path.exists(image):
102
+ pil_img = Image.open(image).convert("RGB")
103
+ image = np.array(pil_img)[:, :, [2, 1, 0]]
104
+ image_h = pil_img.height
105
+ image_w = pil_img.width
106
+ else:
107
+ raise ValueError(f"invaild image path, {image}")
108
+ elif isinstance(image, torch.Tensor):
109
+ # pdb.set_trace()
110
+ image_tensor = image.cpu()
111
+ reverse_norm_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:, None, None]
112
+ reverse_norm_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:, None, None]
113
+ image_tensor = image_tensor * reverse_norm_std + reverse_norm_mean
114
+ pil_img = T.ToPILImage()(image_tensor)
115
+ image_h = pil_img.height
116
+ image_w = pil_img.width
117
+ image = np.array(pil_img)[:, :, [2, 1, 0]]
118
+ else:
119
+ raise ValueError(f"invaild image format, {type(image)} for {image}")
120
+
121
+ if len(entities) == 0:
122
+ return image
123
+
124
+ new_image = image.copy()
125
+ previous_bboxes = []
126
+ # size of text
127
+ text_size = 1
128
+ # thickness of text
129
+ text_line = 1 # int(max(1 * min(image_h, image_w) / 512, 1))
130
+ box_line = 3
131
+ (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
132
+ base_height = int(text_height * 0.675)
133
+ text_offset_original = text_height - base_height
134
+ text_spaces = 3
135
+
136
+ for entity_name, (start, end), bboxes in entities:
137
+ for (x1_norm, y1_norm, x2_norm, y2_norm) in bboxes:
138
+ orig_x1, orig_y1, orig_x2, orig_y2 = int(x1_norm * image_w), int(y1_norm * image_h), int(x2_norm * image_w), int(y2_norm * image_h)
139
+ # draw bbox
140
+ # random color
141
+ color = tuple(np.random.randint(0, 255, size=3).tolist())
142
+ new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line)
143
+
144
+ l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1
145
+
146
+ x1 = orig_x1 - l_o
147
+ y1 = orig_y1 - l_o
148
+
149
+ if y1 < text_height + text_offset_original + 2 * text_spaces:
150
+ y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
151
+ x1 = orig_x1 + r_o
152
+
153
+ # add text background
154
+ (text_width, text_height), _ = cv2.getTextSize(f" {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
155
+ text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - (text_height + text_offset_original + 2 * text_spaces), x1 + text_width, y1
156
+
157
+ for prev_bbox in previous_bboxes:
158
+ while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox):
159
+ text_bg_y1 += (text_height + text_offset_original + 2 * text_spaces)
160
+ text_bg_y2 += (text_height + text_offset_original + 2 * text_spaces)
161
+ y1 += (text_height + text_offset_original + 2 * text_spaces)
162
+
163
+ if text_bg_y2 >= image_h:
164
+ text_bg_y1 = max(0, image_h - (text_height + text_offset_original + 2 * text_spaces))
165
+ text_bg_y2 = image_h
166
+ y1 = image_h
167
+ break
168
+
169
+ alpha = 0.5
170
+ for i in range(text_bg_y1, text_bg_y2):
171
+ for j in range(text_bg_x1, text_bg_x2):
172
+ if i < image_h and j < image_w:
173
+ if j < text_bg_x1 + 1.35 * c_width:
174
+ # original color
175
+ bg_color = color
176
+ else:
177
+ # white
178
+ bg_color = [255, 255, 255]
179
+ new_image[i, j] = (alpha * new_image[i, j] + (1 - alpha) * np.array(bg_color)).astype(np.uint8)
180
+
181
+ cv2.putText(
182
+ new_image, f" {entity_name}", (x1, y1 - text_offset_original - 1 * text_spaces), cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
183
+ )
184
+ # previous_locations.append((x1, y1))
185
+ previous_bboxes.append((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2))
186
+
187
+ pil_image = Image.fromarray(new_image[:, :, [2, 1, 0]])
188
+ if save_path:
189
+ pil_image.save(save_path)
190
+ if show:
191
+ pil_image.show()
192
+
193
+ return new_image
194
+
195
+
196
+ # (The same image from the previous code example)
197
+ url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.jpg"
198
+ image = Image.open(requests.get(url, stream=True).raw)
199
+
200
+ # From the previous code example
201
+ entities = [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
202
+
203
+ # Draw the bounding boxes
204
+ draw_entity_boxes_on_image(image, entities, show=True)
205
+ ```
206
+
207
+ Here is the annotated image:
208
+
209
+ <a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="500"></a>
210
+
211
+
212
+ ## Tasks
213
+
214
+ This model can perform different tasks by changing the prompt.
215
+
216
+ First, let's define a function to run a prompt.
217
+
218
+ ```python
219
+ import requests
220
+
221
+ from PIL import Image
222
+ from transformers import AutoProcessor, AutoModelForVision2Seq
223
+
224
+
225
+ model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
226
+ processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
227
+
228
+ url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.png"
229
+ image = Image.open(requests.get(url, stream=True).raw)
230
+
231
+ def run_example(prompt):
232
+
233
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
234
+ generated_ids = model.generate(
235
+ pixel_values=inputs["pixel_values"],
236
+ input_ids=inputs["input_ids"][:, :-1],
237
+ attention_mask=inputs["attention_mask"][:, :-1],
238
+ img_features=None,
239
+ img_attn_mask=inputs["img_attn_mask"][:, :-1],
240
+ use_cache=True,
241
+ max_new_tokens=64,
242
+ )
243
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
244
+ _processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
245
+ processed_text, entities = processor.post_process_generation(generated_text)
246
+ print(processed_text)
247
+ print(entities)
248
+ print(_processed_text)
249
+ ```
250
+
251
+ Here are the tasks `Kosmos-2` can perform:
252
+
253
+ ### Multimodal Grounding
254
+
255
+ #### • Phrase Grounding
256
+ ```python
257
+ prompt = "<grounding><phrase> a snowman</phrase>"
258
+ run_example(prompt)
259
+
260
+ # a snowman is warming himself by the fire
261
+ # [('a snowman', (0, 9), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('the fire', (32, 40), [(0.203125, 0.015625, 0.453125, 0.859375)])]
262
+
263
+ # <grounding><phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> is warming himself by<phrase> the fire</phrase><object><patch_index_0006><patch_index_0878></object>
264
+ ```
265
+
266
+ #### • Referring Expression Comprehension
267
+ ```python
268
+ prompt = "<grounding><phrase> a snowman next to a fire</phrase>"
269
+ run_example(prompt)
270
+
271
+ # a snowman next to a fire
272
+ # [('a snowman next to a fire', (0, 24), [(0.390625, 0.046875, 0.984375, 0.828125)])]
273
+
274
+ # <grounding><phrase> a snowman next to a fire</phrase><object><patch_index_0044><patch_index_0863></object>
275
+ ```
276
+
277
+ ### Multimodal Referring
278
+
279
+ #### • Referring Expression Generation
280
+ ```python
281
+ prompt = "<grounding><phrase> It</phrase><object><patch_index_0044><patch_index_0863></object> is"
282
+ run_example(prompt)
283
+
284
+ # It is snowman in a hat and scarf
285
+ # [('It', (0, 2), [(0.390625, 0.046875, 0.984375, 0.828125)])]
286
+
287
+ # <grounding><phrase> It</phrase><object><patch_index_0044><patch_index_0863></object> is snowman in a hat and scarf
288
+ ```
289
+
290
+ ### Perception-Language Tasks
291
+
292
+ #### • Grounded VQA
293
+ ```python
294
+ prompt = "<grounding> Question: What is special about this image? Answer:"
295
+ run_example(prompt)
296
+
297
+ # Question: What is special about this image? Answer: The image features a snowman sitting by a campfire in the snow.
298
+ # [('a snowman', (71, 80), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a campfire', (92, 102), [(0.109375, 0.640625, 0.546875, 0.984375)])]
299
+
300
+ # <grounding> Question: What is special about this image? Answer: The image features<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> sitting by<phrase> a campfire</phrase><object><patch_index_0643><patch_index_1009></object> in the snow.
301
+ ```
302
+
303
+ #### • Grounded VQA with multimodal referring via bounding boxes
304
+ ```python
305
+ prompt = "<grounding> Question: Where is<phrase> the fire</phrase><object><patch_index_0005><patch_index_0911></object> next to? Answer:"
306
+ run_example(prompt)
307
+
308
+ # Question: Where is the fire next to? Answer: Near the snowman.
309
+ # [('the fire', (19, 27), [(0.171875, 0.015625, 0.484375, 0.890625)]), ('the snowman', (50, 61), [(0.390625, 0.046875, 0.984375, 0.828125)])]
310
+
311
+ # <grounding> Question: Where is<phrase> the fire</phrase><object><patch_index_0005><patch_index_0911></object> next to? Answer: Near<phrase> the snowman</phrase><object><patch_index_0044><patch_index_0863></object>.
312
+ ```
313
+
314
+ ### Grounded Image Captioning
315
+
316
+ #### • Brief
317
+
318
+ ```python
319
+ prompt = "<grounding> An image of"
320
+ run_example(prompt)
321
+
322
+ # An image of a snowman warming himself by a campfire.
323
+ # [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a campfire', (41, 51), [(0.109375, 0.640625, 0.546875, 0.984375)])]
324
+
325
+ # <grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a campfire</phrase><object><patch_index_0643><patch_index_1009></object>.
326
+ ```
327
+
328
+ #### • Detailed
329
+
330
+ ```python
331
+ prompt = "<grounding> Describe this image in detail:"
332
+ run_example(prompt)
333
+
334
+ # Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is wearing a hat, scarf, and gloves, with a pot nearby and a cup
335
+ # [('a campfire', (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]), ('a hat', (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]), ('scarf', (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]), ('gloves', (127, 133), [(0.515625, 0.390625, 0.640625, 0.515625)]), ('a pot', (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)])]
336
+
337
+ # <grounding> Describe this image in detail: The image features a snowman sitting by<phrase> a campfire</phrase><object><patch_index_0005><patch_index_1007></object> in the snow. He is wearing<phrase> a hat</phrase><object><patch_index_0048><patch_index_0250></object>,<phrase> scarf</phrase><object><patch_index_0240><patch_index_0604></object>, and<phrase> gloves</phrase><object><patch_index_0400><patch_index_0532></object>, with<phrase> a pot</phrase><object><patch_index_0610><patch_index_0872></object> nearby and<phrase> a cup</phrase><object>
338
+ ```
339
+
340
+
341
+ ## Running the Flask Server
342
+ _flask_kosmos2.py_ shows the implementation of a Flask server for the model.
343
+ It allows the model to be used as a REST API.
344
+
345
+ After starting the server, you can send a POST request to `http://localhost:8005/process_prompt` with the following form data:
346
+ - `prompt`: For example `<grounding> an image of`
347
+ - `image`: The image file as binary data
348
+
349
+ This in turn will produce a reply with the following JSON format:
350
+ - `message`: The Kosmos-2 generated text
351
+ - `entities`: The extracted entities
352
+
353
+ An easy way to test this is through an application like Postman (make sure the image field is set to `File`), or with the small Python client sketch after the server code below.
354
+
355
+ ```python
356
+
357
+ from PIL import Image
358
+ from transformers import AutoProcessor, AutoModelForVision2Seq
359
+ from flask import Flask, request, jsonify
360
+ import json
361
+
362
+ app = Flask(__name__)
363
+
364
+ model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
365
+ processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
366
+
367
+
368
+ @app.route('/process_prompt', methods=['POST'])
369
+ def process_prompt():
370
+ try:
371
+ # Get the uploaded image data from the POST request
372
+ uploaded_file = request.files['image']
373
+ prompt = request.form.get('prompt')
374
+ image = Image.open(uploaded_file.stream)
375
+
376
+ print(image.size)
377
+
378
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
379
+
380
+ generated_ids = model.generate(
381
+ pixel_values=inputs["pixel_values"],
382
+ input_ids=inputs["input_ids"][:, :-1],
383
+ attention_mask=inputs["attention_mask"][:, :-1],
384
+ img_features=None,
385
+ img_attn_mask=inputs["img_attn_mask"][:, :-1],
386
+ use_cache=True,
387
+ max_new_tokens=64,
388
+ )
389
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
390
+
391
+ # By default, the generated text is cleaned up and the entities are extracted.
392
+ processed_text, entities = processor.post_process_generation(generated_text)
393
+ parsed_entities = entities_to_json(entities)
394
+ print(generated_text)
395
+ print(processed_text)
396
+ return jsonify({"message": processed_text, 'entities': parsed_entities})
397
+ except Exception as e:
398
+ return jsonify({"error": str(e)})
399
+
400
+
401
+ def entities_to_json(entities):
402
+ result = []
403
+ for e in entities:
404
+ label = e[0]
405
+ box_coords = e[1]  # (start, end) character span of the entity in the generated text
406
+ box_size = e[2][0]  # first normalized bounding box (x1, y1, x2, y2)
407
+ entity_result = {
408
+ "label": label,
409
+ "boundingBoxPosition": {"x": box_coords[0], "y": box_coords[1]},
410
+ "boundingBox": {"x_min": box_size[0], "y_min": box_size[1], "x_max": box_size[2], "y_max": box_size[3]}
411
+ }
412
+ print(entity_result)
413
+ result.append(entity_result)
414
+
415
+ return result
416
+
417
+
418
+ if __name__ == '__main__':
419
+ app.run(host='localhost', port=8005)
420
+
421
+ ```
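
For a quick programmatic test instead of Postman, here is a minimal client sketch using `requests`. It assumes the server above is running on `localhost:8005` and that the example image has been saved locally as `snowman.png` (a hypothetical local path):

```python
import requests

# Send prompt + image as multipart form data to the Flask endpoint defined above.
url = "http://localhost:8005/process_prompt"
data = {"prompt": "<grounding> An image of"}
with open("snowman.png", "rb") as f:
    files = {"image": ("snowman.png", f, "image/png")}
    response = requests.post(url, data=data, files=files)

result = response.json()
print(result["message"])   # cleaned-up generated text
print(result["entities"])  # extracted entities as JSON
```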
added_tokens.json ADDED
@@ -0,0 +1,1037 @@
1
+ {
2
+ "</chunk>": 64005,
3
+ "</delimiter_of_multi_objects/>": 64011,
4
+ "</doc>": 64002,
5
+ "</image>": 64004,
6
+ "</line>": 64006,
7
+ "</object>": 64010,
8
+ "</phrase>": 64008,
9
+ "<grounding>": 64012,
10
+ "<image>": 64003,
11
+ "<object>": 64009,
12
+ "<patch_index_0000>": 64013,
13
+ "<patch_index_0001>": 64014,
14
+ "<patch_index_0002>": 64015,
15
+ "<patch_index_0003>": 64016,
16
+ "<patch_index_0004>": 64017,
17
+ "<patch_index_0005>": 64018,
18
+ "<patch_index_0006>": 64019,
19
+ "<patch_index_0007>": 64020,
20
+ "<patch_index_0008>": 64021,
21
+ "<patch_index_0009>": 64022,
22
+ "<patch_index_0010>": 64023,
23
+ "<patch_index_0011>": 64024,
24
+ "<patch_index_0012>": 64025,
25
+ "<patch_index_0013>": 64026,
26
+ "<patch_index_0014>": 64027,
27
+ "<patch_index_0015>": 64028,
28
+ "<patch_index_0016>": 64029,
29
+ "<patch_index_0017>": 64030,
30
+ "<patch_index_0018>": 64031,
31
+ "<patch_index_0019>": 64032,
32
+ "<patch_index_0020>": 64033,
33
+ "<patch_index_0021>": 64034,
34
+ "<patch_index_0022>": 64035,
35
+ "<patch_index_0023>": 64036,
36
+ "<patch_index_0024>": 64037,
37
+ "<patch_index_0025>": 64038,
38
+ "<patch_index_0026>": 64039,
39
+ "<patch_index_0027>": 64040,
40
+ "<patch_index_0028>": 64041,
41
+ "<patch_index_0029>": 64042,
42
+ "<patch_index_0030>": 64043,
43
+ "<patch_index_0031>": 64044,
44
+ "<patch_index_0032>": 64045,
45
+ "<patch_index_0033>": 64046,
46
+ "<patch_index_0034>": 64047,
47
+ "<patch_index_0035>": 64048,
48
+ "<patch_index_0036>": 64049,
49
+ "<patch_index_0037>": 64050,
50
+ "<patch_index_0038>": 64051,
51
+ "<patch_index_0039>": 64052,
52
+ "<patch_index_0040>": 64053,
53
+ "<patch_index_0041>": 64054,
54
+ "<patch_index_0042>": 64055,
55
+ "<patch_index_0043>": 64056,
56
+ "<patch_index_0044>": 64057,
57
+ "<patch_index_0045>": 64058,
58
+ "<patch_index_0046>": 64059,
59
+ "<patch_index_0047>": 64060,
60
+ "<patch_index_0048>": 64061,
61
+ "<patch_index_0049>": 64062,
62
+ "<patch_index_0050>": 64063,
63
+ "<patch_index_0051>": 64064,
64
+ "<patch_index_0052>": 64065,
65
+ "<patch_index_0053>": 64066,
66
+ "<patch_index_0054>": 64067,
67
+ "<patch_index_0055>": 64068,
68
+ "<patch_index_0056>": 64069,
69
+ "<patch_index_0057>": 64070,
70
+ "<patch_index_0058>": 64071,
71
+ "<patch_index_0059>": 64072,
72
+ "<patch_index_0060>": 64073,
73
+ "<patch_index_0061>": 64074,
74
+ "<patch_index_0062>": 64075,
75
+ "<patch_index_0063>": 64076,
76
+ "<patch_index_0064>": 64077,
77
+ "<patch_index_0065>": 64078,
78
+ "<patch_index_0066>": 64079,
79
+ "<patch_index_0067>": 64080,
80
+ "<patch_index_0068>": 64081,
81
+ "<patch_index_0069>": 64082,
82
+ "<patch_index_0070>": 64083,
83
+ "<patch_index_0071>": 64084,
84
+ "<patch_index_0072>": 64085,
85
+ "<patch_index_0073>": 64086,
86
+ "<patch_index_0074>": 64087,
87
+ "<patch_index_0075>": 64088,
88
+ "<patch_index_0076>": 64089,
89
+ "<patch_index_0077>": 64090,
90
+ "<patch_index_0078>": 64091,
91
+ "<patch_index_0079>": 64092,
92
+ "<patch_index_0080>": 64093,
93
+ "<patch_index_0081>": 64094,
94
+ "<patch_index_0082>": 64095,
95
+ "<patch_index_0083>": 64096,
96
+ "<patch_index_0084>": 64097,
97
+ "<patch_index_0085>": 64098,
98
+ "<patch_index_0086>": 64099,
99
+ "<patch_index_0087>": 64100,
100
+ "<patch_index_0088>": 64101,
101
+ "<patch_index_0089>": 64102,
102
+ "<patch_index_0090>": 64103,
103
+ "<patch_index_0091>": 64104,
104
+ "<patch_index_0092>": 64105,
105
+ "<patch_index_0093>": 64106,
106
+ "<patch_index_0094>": 64107,
107
+ "<patch_index_0095>": 64108,
108
+ "<patch_index_0096>": 64109,
109
+ "<patch_index_0097>": 64110,
110
+ "<patch_index_0098>": 64111,
111
+ "<patch_index_0099>": 64112,
112
+ "<patch_index_0100>": 64113,
113
+ "<patch_index_0101>": 64114,
114
+ "<patch_index_0102>": 64115,
115
+ "<patch_index_0103>": 64116,
116
+ "<patch_index_0104>": 64117,
117
+ "<patch_index_0105>": 64118,
118
+ "<patch_index_0106>": 64119,
119
+ "<patch_index_0107>": 64120,
120
+ "<patch_index_0108>": 64121,
121
+ "<patch_index_0109>": 64122,
122
+ "<patch_index_0110>": 64123,
123
+ "<patch_index_0111>": 64124,
124
+ "<patch_index_0112>": 64125,
125
+ "<patch_index_0113>": 64126,
126
+ "<patch_index_0114>": 64127,
127
+ "<patch_index_0115>": 64128,
128
+ "<patch_index_0116>": 64129,
129
+ "<patch_index_0117>": 64130,
130
+ "<patch_index_0118>": 64131,
131
+ "<patch_index_0119>": 64132,
132
+ "<patch_index_0120>": 64133,
133
+ "<patch_index_0121>": 64134,
134
+ "<patch_index_0122>": 64135,
135
+ "<patch_index_0123>": 64136,
136
+ "<patch_index_0124>": 64137,
137
+ "<patch_index_0125>": 64138,
138
+ "<patch_index_0126>": 64139,
139
+ "<patch_index_0127>": 64140,
140
+ "<patch_index_0128>": 64141,
141
+ "<patch_index_0129>": 64142,
142
+ "<patch_index_0130>": 64143,
143
+ "<patch_index_0131>": 64144,
144
+ "<patch_index_0132>": 64145,
145
+ "<patch_index_0133>": 64146,
146
+ "<patch_index_0134>": 64147,
147
+ "<patch_index_0135>": 64148,
148
+ "<patch_index_0136>": 64149,
149
+ "<patch_index_0137>": 64150,
150
+ "<patch_index_0138>": 64151,
151
+ "<patch_index_0139>": 64152,
152
+ "<patch_index_0140>": 64153,
153
+ "<patch_index_0141>": 64154,
154
+ "<patch_index_0142>": 64155,
155
+ "<patch_index_0143>": 64156,
156
+ "<patch_index_0144>": 64157,
157
+ "<patch_index_0145>": 64158,
158
+ "<patch_index_0146>": 64159,
159
+ "<patch_index_0147>": 64160,
160
+ "<patch_index_0148>": 64161,
161
+ "<patch_index_0149>": 64162,
162
+ "<patch_index_0150>": 64163,
163
+ "<patch_index_0151>": 64164,
164
+ "<patch_index_0152>": 64165,
165
+ "<patch_index_0153>": 64166,
166
+ "<patch_index_0154>": 64167,
167
+ "<patch_index_0155>": 64168,
168
+ "<patch_index_0156>": 64169,
169
+ "<patch_index_0157>": 64170,
170
+ "<patch_index_0158>": 64171,
171
+ "<patch_index_0159>": 64172,
172
+ "<patch_index_0160>": 64173,
173
+ "<patch_index_0161>": 64174,
174
+ "<patch_index_0162>": 64175,
175
+ "<patch_index_0163>": 64176,
176
+ "<patch_index_0164>": 64177,
177
+ "<patch_index_0165>": 64178,
178
+ "<patch_index_0166>": 64179,
179
+ "<patch_index_0167>": 64180,
180
+ "<patch_index_0168>": 64181,
181
+ "<patch_index_0169>": 64182,
182
+ "<patch_index_0170>": 64183,
183
+ "<patch_index_0171>": 64184,
184
+ "<patch_index_0172>": 64185,
185
+ "<patch_index_0173>": 64186,
186
+ "<patch_index_0174>": 64187,
187
+ "<patch_index_0175>": 64188,
188
+ "<patch_index_0176>": 64189,
189
+ "<patch_index_0177>": 64190,
190
+ "<patch_index_0178>": 64191,
191
+ "<patch_index_0179>": 64192,
192
+ "<patch_index_0180>": 64193,
193
+ "<patch_index_0181>": 64194,
194
+ "<patch_index_0182>": 64195,
195
+ "<patch_index_0183>": 64196,
196
+ "<patch_index_0184>": 64197,
197
+ "<patch_index_0185>": 64198,
198
+ "<patch_index_0186>": 64199,
199
+ "<patch_index_0187>": 64200,
200
+ "<patch_index_0188>": 64201,
201
+ "<patch_index_0189>": 64202,
202
+ "<patch_index_0190>": 64203,
203
+ "<patch_index_0191>": 64204,
204
+ "<patch_index_0192>": 64205,
205
+ "<patch_index_0193>": 64206,
206
+ "<patch_index_0194>": 64207,
207
+ "<patch_index_0195>": 64208,
208
+ "<patch_index_0196>": 64209,
209
+ "<patch_index_0197>": 64210,
210
+ "<patch_index_0198>": 64211,
211
+ "<patch_index_0199>": 64212,
212
+ "<patch_index_0200>": 64213,
213
+ "<patch_index_0201>": 64214,
214
+ "<patch_index_0202>": 64215,
215
+ "<patch_index_0203>": 64216,
216
+ "<patch_index_0204>": 64217,
217
+ "<patch_index_0205>": 64218,
218
+ "<patch_index_0206>": 64219,
219
+ "<patch_index_0207>": 64220,
220
+ "<patch_index_0208>": 64221,
221
+ "<patch_index_0209>": 64222,
222
+ "<patch_index_0210>": 64223,
223
+ "<patch_index_0211>": 64224,
224
+ "<patch_index_0212>": 64225,
225
+ "<patch_index_0213>": 64226,
226
+ "<patch_index_0214>": 64227,
227
+ "<patch_index_0215>": 64228,
228
+ "<patch_index_0216>": 64229,
229
+ "<patch_index_0217>": 64230,
230
+ "<patch_index_0218>": 64231,
231
+ "<patch_index_0219>": 64232,
232
+ "<patch_index_0220>": 64233,
233
+ "<patch_index_0221>": 64234,
234
+ "<patch_index_0222>": 64235,
235
+ "<patch_index_0223>": 64236,
236
+ "<patch_index_0224>": 64237,
237
+ "<patch_index_0225>": 64238,
238
+ "<patch_index_0226>": 64239,
239
+ "<patch_index_0227>": 64240,
240
+ "<patch_index_0228>": 64241,
241
+ "<patch_index_0229>": 64242,
242
+ "<patch_index_0230>": 64243,
243
+ "<patch_index_0231>": 64244,
244
+ "<patch_index_0232>": 64245,
245
+ "<patch_index_0233>": 64246,
246
+ "<patch_index_0234>": 64247,
247
+ "<patch_index_0235>": 64248,
248
+ "<patch_index_0236>": 64249,
249
+ "<patch_index_0237>": 64250,
250
+ "<patch_index_0238>": 64251,
251
+ "<patch_index_0239>": 64252,
252
+ "<patch_index_0240>": 64253,
253
+ "<patch_index_0241>": 64254,
254
+ "<patch_index_0242>": 64255,
255
+ "<patch_index_0243>": 64256,
256
+ "<patch_index_0244>": 64257,
257
+ "<patch_index_0245>": 64258,
258
+ "<patch_index_0246>": 64259,
259
+ "<patch_index_0247>": 64260,
260
+ "<patch_index_0248>": 64261,
261
+ "<patch_index_0249>": 64262,
262
+ "<patch_index_0250>": 64263,
263
+ "<patch_index_0251>": 64264,
264
+ "<patch_index_0252>": 64265,
265
+ "<patch_index_0253>": 64266,
266
+ "<patch_index_0254>": 64267,
267
+ "<patch_index_0255>": 64268,
268
+ "<patch_index_0256>": 64269,
269
+ "<patch_index_0257>": 64270,
270
+ "<patch_index_0258>": 64271,
271
+ "<patch_index_0259>": 64272,
272
+ "<patch_index_0260>": 64273,
273
+ "<patch_index_0261>": 64274,
274
+ "<patch_index_0262>": 64275,
275
+ "<patch_index_0263>": 64276,
276
+ "<patch_index_0264>": 64277,
277
+ "<patch_index_0265>": 64278,
278
+ "<patch_index_0266>": 64279,
279
+ "<patch_index_0267>": 64280,
280
+ "<patch_index_0268>": 64281,
281
+ "<patch_index_0269>": 64282,
282
+ "<patch_index_0270>": 64283,
283
+ "<patch_index_0271>": 64284,
284
+ "<patch_index_0272>": 64285,
285
+ "<patch_index_0273>": 64286,
286
+ "<patch_index_0274>": 64287,
287
+ "<patch_index_0275>": 64288,
288
+ "<patch_index_0276>": 64289,
289
+ "<patch_index_0277>": 64290,
290
+ "<patch_index_0278>": 64291,
291
+ "<patch_index_0279>": 64292,
292
+ "<patch_index_0280>": 64293,
293
+ "<patch_index_0281>": 64294,
294
+ "<patch_index_0282>": 64295,
295
+ "<patch_index_0283>": 64296,
296
+ "<patch_index_0284>": 64297,
297
+ "<patch_index_0285>": 64298,
298
+ "<patch_index_0286>": 64299,
299
+ "<patch_index_0287>": 64300,
300
+ "<patch_index_0288>": 64301,
301
+ "<patch_index_0289>": 64302,
302
+ "<patch_index_0290>": 64303,
303
+ "<patch_index_0291>": 64304,
304
+ "<patch_index_0292>": 64305,
305
+ "<patch_index_0293>": 64306,
306
+ "<patch_index_0294>": 64307,
307
+ "<patch_index_0295>": 64308,
308
+ "<patch_index_0296>": 64309,
309
+ "<patch_index_0297>": 64310,
310
+ "<patch_index_0298>": 64311,
311
+ "<patch_index_0299>": 64312,
312
+ "<patch_index_0300>": 64313,
313
+ "<patch_index_0301>": 64314,
314
+ "<patch_index_0302>": 64315,
315
+ "<patch_index_0303>": 64316,
316
+ "<patch_index_0304>": 64317,
317
+ "<patch_index_0305>": 64318,
318
+ "<patch_index_0306>": 64319,
319
+ "<patch_index_0307>": 64320,
320
+ "<patch_index_0308>": 64321,
321
+ "<patch_index_0309>": 64322,
322
+ "<patch_index_0310>": 64323,
323
+ "<patch_index_0311>": 64324,
324
+ "<patch_index_0312>": 64325,
325
+ "<patch_index_0313>": 64326,
326
+ "<patch_index_0314>": 64327,
327
+ "<patch_index_0315>": 64328,
328
+ "<patch_index_0316>": 64329,
329
+ "<patch_index_0317>": 64330,
330
+ "<patch_index_0318>": 64331,
331
+ "<patch_index_0319>": 64332,
332
+ "<patch_index_0320>": 64333,
333
+ "<patch_index_0321>": 64334,
334
+ "<patch_index_0322>": 64335,
335
+ "<patch_index_0323>": 64336,
336
+ "<patch_index_0324>": 64337,
337
+ "<patch_index_0325>": 64338,
338
+ "<patch_index_0326>": 64339,
339
+ "<patch_index_0327>": 64340,
340
+ "<patch_index_0328>": 64341,
341
+ "<patch_index_0329>": 64342,
342
+ "<patch_index_0330>": 64343,
343
+ "<patch_index_0331>": 64344,
344
+ "<patch_index_0332>": 64345,
345
+ "<patch_index_0333>": 64346,
346
+ "<patch_index_0334>": 64347,
347
+ "<patch_index_0335>": 64348,
348
+ "<patch_index_0336>": 64349,
349
+ "<patch_index_0337>": 64350,
350
+ "<patch_index_0338>": 64351,
351
+ "<patch_index_0339>": 64352,
352
+ "<patch_index_0340>": 64353,
353
+ "<patch_index_0341>": 64354,
354
+ "<patch_index_0342>": 64355,
355
+ "<patch_index_0343>": 64356,
356
+ "<patch_index_0344>": 64357,
357
+ "<patch_index_0345>": 64358,
358
+ "<patch_index_0346>": 64359,
359
+ "<patch_index_0347>": 64360,
360
+ "<patch_index_0348>": 64361,
361
+ "<patch_index_0349>": 64362,
362
+ "<patch_index_0350>": 64363,
363
+ "<patch_index_0351>": 64364,
364
+ "<patch_index_0352>": 64365,
365
+ "<patch_index_0353>": 64366,
366
+ "<patch_index_0354>": 64367,
367
+ "<patch_index_0355>": 64368,
368
+ "<patch_index_0356>": 64369,
369
+ "<patch_index_0357>": 64370,
370
+ "<patch_index_0358>": 64371,
371
+ "<patch_index_0359>": 64372,
372
+ "<patch_index_0360>": 64373,
373
+ "<patch_index_0361>": 64374,
374
+ "<patch_index_0362>": 64375,
375
+ "<patch_index_0363>": 64376,
376
+ "<patch_index_0364>": 64377,
377
+ "<patch_index_0365>": 64378,
378
+ "<patch_index_0366>": 64379,
379
+ "<patch_index_0367>": 64380,
380
+ "<patch_index_0368>": 64381,
381
+ "<patch_index_0369>": 64382,
382
+ "<patch_index_0370>": 64383,
383
+ "<patch_index_0371>": 64384,
384
+ "<patch_index_0372>": 64385,
385
+ "<patch_index_0373>": 64386,
386
+ "<patch_index_0374>": 64387,
387
+ "<patch_index_0375>": 64388,
388
+ "<patch_index_0376>": 64389,
389
+ "<patch_index_0377>": 64390,
390
+ "<patch_index_0378>": 64391,
391
+ "<patch_index_0379>": 64392,
392
+ "<patch_index_0380>": 64393,
393
+ "<patch_index_0381>": 64394,
394
+ "<patch_index_0382>": 64395,
395
+ "<patch_index_0383>": 64396,
396
+ "<patch_index_0384>": 64397,
397
+ "<patch_index_0385>": 64398,
398
+ "<patch_index_0386>": 64399,
399
+ "<patch_index_0387>": 64400,
400
+ "<patch_index_0388>": 64401,
401
+ "<patch_index_0389>": 64402,
402
+ "<patch_index_0390>": 64403,
403
+ "<patch_index_0391>": 64404,
404
+ "<patch_index_0392>": 64405,
405
+ "<patch_index_0393>": 64406,
406
+ "<patch_index_0394>": 64407,
407
+ "<patch_index_0395>": 64408,
408
+ "<patch_index_0396>": 64409,
409
+ "<patch_index_0397>": 64410,
410
+ "<patch_index_0398>": 64411,
411
+ "<patch_index_0399>": 64412,
412
+ "<patch_index_0400>": 64413,
413
+ "<patch_index_0401>": 64414,
414
+ "<patch_index_0402>": 64415,
415
+ "<patch_index_0403>": 64416,
416
+ "<patch_index_0404>": 64417,
417
+ "<patch_index_0405>": 64418,
418
+ "<patch_index_0406>": 64419,
419
+ "<patch_index_0407>": 64420,
420
+ "<patch_index_0408>": 64421,
421
+ "<patch_index_0409>": 64422,
422
+ "<patch_index_0410>": 64423,
423
+ "<patch_index_0411>": 64424,
424
+ "<patch_index_0412>": 64425,
425
+ "<patch_index_0413>": 64426,
426
+ "<patch_index_0414>": 64427,
427
+ "<patch_index_0415>": 64428,
428
+ "<patch_index_0416>": 64429,
429
+ "<patch_index_0417>": 64430,
430
+ "<patch_index_0418>": 64431,
431
+ "<patch_index_0419>": 64432,
432
+ "<patch_index_0420>": 64433,
433
+ "<patch_index_0421>": 64434,
434
+ "<patch_index_0422>": 64435,
435
+ "<patch_index_0423>": 64436,
436
+ "<patch_index_0424>": 64437,
437
+ "<patch_index_0425>": 64438,
438
+ "<patch_index_0426>": 64439,
439
+ "<patch_index_0427>": 64440,
440
+ "<patch_index_0428>": 64441,
441
+ "<patch_index_0429>": 64442,
442
+ "<patch_index_0430>": 64443,
443
+ "<patch_index_0431>": 64444,
444
+ "<patch_index_0432>": 64445,
445
+ "<patch_index_0433>": 64446,
446
+ "<patch_index_0434>": 64447,
447
+ "<patch_index_0435>": 64448,
448
+ "<patch_index_0436>": 64449,
449
+ "<patch_index_0437>": 64450,
450
+ "<patch_index_0438>": 64451,
451
+ "<patch_index_0439>": 64452,
452
+ "<patch_index_0440>": 64453,
453
+ "<patch_index_0441>": 64454,
454
+ "<patch_index_0442>": 64455,
455
+ "<patch_index_0443>": 64456,
456
+ "<patch_index_0444>": 64457,
457
+ "<patch_index_0445>": 64458,
458
+ "<patch_index_0446>": 64459,
459
+ "<patch_index_0447>": 64460,
460
+ "<patch_index_0448>": 64461,
461
+ "<patch_index_0449>": 64462,
462
+ "<patch_index_0450>": 64463,
463
+ "<patch_index_0451>": 64464,
464
+ "<patch_index_0452>": 64465,
465
+ "<patch_index_0453>": 64466,
466
+ "<patch_index_0454>": 64467,
467
+ "<patch_index_0455>": 64468,
468
+ "<patch_index_0456>": 64469,
469
+ "<patch_index_0457>": 64470,
470
+ "<patch_index_0458>": 64471,
471
+ "<patch_index_0459>": 64472,
472
+ "<patch_index_0460>": 64473,
473
+ "<patch_index_0461>": 64474,
474
+ "<patch_index_0462>": 64475,
475
+ "<patch_index_0463>": 64476,
476
+ "<patch_index_0464>": 64477,
477
+ "<patch_index_0465>": 64478,
478
+ "<patch_index_0466>": 64479,
479
+ "<patch_index_0467>": 64480,
480
+ "<patch_index_0468>": 64481,
481
+ "<patch_index_0469>": 64482,
482
+ "<patch_index_0470>": 64483,
483
+ "<patch_index_0471>": 64484,
484
+ "<patch_index_0472>": 64485,
485
+ "<patch_index_0473>": 64486,
486
+ "<patch_index_0474>": 64487,
487
+ "<patch_index_0475>": 64488,
488
+ "<patch_index_0476>": 64489,
489
+ "<patch_index_0477>": 64490,
490
+ "<patch_index_0478>": 64491,
491
+ "<patch_index_0479>": 64492,
492
+ "<patch_index_0480>": 64493,
493
+ "<patch_index_0481>": 64494,
494
+ "<patch_index_0482>": 64495,
495
+ "<patch_index_0483>": 64496,
496
+ "<patch_index_0484>": 64497,
497
+ "<patch_index_0485>": 64498,
498
+ "<patch_index_0486>": 64499,
499
+ "<patch_index_0487>": 64500,
500
+ "<patch_index_0488>": 64501,
501
+ "<patch_index_0489>": 64502,
502
+ "<patch_index_0490>": 64503,
503
+ "<patch_index_0491>": 64504,
504
+ "<patch_index_0492>": 64505,
505
+ "<patch_index_0493>": 64506,
506
+ "<patch_index_0494>": 64507,
507
+ "<patch_index_0495>": 64508,
508
+ "<patch_index_0496>": 64509,
509
+ "<patch_index_0497>": 64510,
510
+ "<patch_index_0498>": 64511,
511
+ "<patch_index_0499>": 64512,
512
+ "<patch_index_0500>": 64513,
513
+ "<patch_index_0501>": 64514,
514
+ "<patch_index_0502>": 64515,
515
+ "<patch_index_0503>": 64516,
516
+ "<patch_index_0504>": 64517,
517
+ "<patch_index_0505>": 64518,
518
+ "<patch_index_0506>": 64519,
519
+ "<patch_index_0507>": 64520,
520
+ "<patch_index_0508>": 64521,
521
+ "<patch_index_0509>": 64522,
522
+ "<patch_index_0510>": 64523,
523
+ "<patch_index_0511>": 64524,
524
+ "<patch_index_0512>": 64525,
525
+ "<patch_index_0513>": 64526,
526
+ "<patch_index_0514>": 64527,
527
+ "<patch_index_0515>": 64528,
528
+ "<patch_index_0516>": 64529,
529
+ "<patch_index_0517>": 64530,
530
+ "<patch_index_0518>": 64531,
531
+ "<patch_index_0519>": 64532,
532
+ "<patch_index_0520>": 64533,
533
+ "<patch_index_0521>": 64534,
534
+ "<patch_index_0522>": 64535,
535
+ "<patch_index_0523>": 64536,
536
+ "<patch_index_0524>": 64537,
537
+ "<patch_index_0525>": 64538,
538
+ "<patch_index_0526>": 64539,
539
+ "<patch_index_0527>": 64540,
540
+ "<patch_index_0528>": 64541,
541
+ "<patch_index_0529>": 64542,
542
+ "<patch_index_0530>": 64543,
543
+ "<patch_index_0531>": 64544,
544
+ "<patch_index_0532>": 64545,
545
+ "<patch_index_0533>": 64546,
546
+ "<patch_index_0534>": 64547,
547
+ "<patch_index_0535>": 64548,
548
+ "<patch_index_0536>": 64549,
549
+ "<patch_index_0537>": 64550,
550
+ "<patch_index_0538>": 64551,
551
+ "<patch_index_0539>": 64552,
552
+ "<patch_index_0540>": 64553,
553
+ "<patch_index_0541>": 64554,
554
+ "<patch_index_0542>": 64555,
555
+ "<patch_index_0543>": 64556,
556
+ "<patch_index_0544>": 64557,
557
+ "<patch_index_0545>": 64558,
558
+ "<patch_index_0546>": 64559,
559
+ "<patch_index_0547>": 64560,
560
+ "<patch_index_0548>": 64561,
561
+ "<patch_index_0549>": 64562,
562
+ "<patch_index_0550>": 64563,
563
+ "<patch_index_0551>": 64564,
564
+ "<patch_index_0552>": 64565,
565
+ "<patch_index_0553>": 64566,
566
+ "<patch_index_0554>": 64567,
567
+ "<patch_index_0555>": 64568,
568
+ "<patch_index_0556>": 64569,
569
+ "<patch_index_0557>": 64570,
570
+ "<patch_index_0558>": 64571,
571
+ "<patch_index_0559>": 64572,
572
+ "<patch_index_0560>": 64573,
573
+ "<patch_index_0561>": 64574,
574
+ "<patch_index_0562>": 64575,
575
+ "<patch_index_0563>": 64576,
576
+ "<patch_index_0564>": 64577,
577
+ "<patch_index_0565>": 64578,
578
+ "<patch_index_0566>": 64579,
579
+ "<patch_index_0567>": 64580,
580
+ "<patch_index_0568>": 64581,
581
+ "<patch_index_0569>": 64582,
582
+ "<patch_index_0570>": 64583,
583
+ "<patch_index_0571>": 64584,
584
+ "<patch_index_0572>": 64585,
585
+ "<patch_index_0573>": 64586,
586
+ "<patch_index_0574>": 64587,
587
+ "<patch_index_0575>": 64588,
588
+ "<patch_index_0576>": 64589,
589
+ "<patch_index_0577>": 64590,
590
+ "<patch_index_0578>": 64591,
591
+ "<patch_index_0579>": 64592,
592
+ "<patch_index_0580>": 64593,
593
+ "<patch_index_0581>": 64594,
594
+ "<patch_index_0582>": 64595,
595
+ "<patch_index_0583>": 64596,
596
+ "<patch_index_0584>": 64597,
597
+ "<patch_index_0585>": 64598,
598
+ "<patch_index_0586>": 64599,
599
+ "<patch_index_0587>": 64600,
600
+ "<patch_index_0588>": 64601,
601
+ "<patch_index_0589>": 64602,
602
+ "<patch_index_0590>": 64603,
603
+ "<patch_index_0591>": 64604,
604
+ "<patch_index_0592>": 64605,
605
+ "<patch_index_0593>": 64606,
606
+ "<patch_index_0594>": 64607,
607
+ "<patch_index_0595>": 64608,
608
+ "<patch_index_0596>": 64609,
609
+ "<patch_index_0597>": 64610,
610
+ "<patch_index_0598>": 64611,
611
+ "<patch_index_0599>": 64612,
612
+ "<patch_index_0600>": 64613,
613
+ "<patch_index_0601>": 64614,
614
+ "<patch_index_0602>": 64615,
615
+ "<patch_index_0603>": 64616,
616
+ "<patch_index_0604>": 64617,
617
+ "<patch_index_0605>": 64618,
618
+ "<patch_index_0606>": 64619,
619
+ "<patch_index_0607>": 64620,
620
+ "<patch_index_0608>": 64621,
621
+ "<patch_index_0609>": 64622,
622
+ "<patch_index_0610>": 64623,
623
+ "<patch_index_0611>": 64624,
624
+ "<patch_index_0612>": 64625,
625
+ "<patch_index_0613>": 64626,
626
+ "<patch_index_0614>": 64627,
627
+ "<patch_index_0615>": 64628,
628
+ "<patch_index_0616>": 64629,
629
+ "<patch_index_0617>": 64630,
630
+ "<patch_index_0618>": 64631,
631
+ "<patch_index_0619>": 64632,
632
+ "<patch_index_0620>": 64633,
633
+ "<patch_index_0621>": 64634,
634
+ "<patch_index_0622>": 64635,
635
+ "<patch_index_0623>": 64636,
636
+ "<patch_index_0624>": 64637,
637
+ "<patch_index_0625>": 64638,
638
+ "<patch_index_0626>": 64639,
639
+ "<patch_index_0627>": 64640,
640
+ "<patch_index_0628>": 64641,
641
+ "<patch_index_0629>": 64642,
642
+ "<patch_index_0630>": 64643,
643
+ "<patch_index_0631>": 64644,
644
+ "<patch_index_0632>": 64645,
645
+ "<patch_index_0633>": 64646,
646
+ "<patch_index_0634>": 64647,
647
+ "<patch_index_0635>": 64648,
648
+ "<patch_index_0636>": 64649,
649
+ "<patch_index_0637>": 64650,
650
+ "<patch_index_0638>": 64651,
651
+ "<patch_index_0639>": 64652,
652
+ "<patch_index_0640>": 64653,
653
+ "<patch_index_0641>": 64654,
654
+ "<patch_index_0642>": 64655,
655
+ "<patch_index_0643>": 64656,
656
+ "<patch_index_0644>": 64657,
657
+ "<patch_index_0645>": 64658,
658
+ "<patch_index_0646>": 64659,
659
+ "<patch_index_0647>": 64660,
660
+ "<patch_index_0648>": 64661,
661
+ "<patch_index_0649>": 64662,
662
+ "<patch_index_0650>": 64663,
663
+ "<patch_index_0651>": 64664,
664
+ "<patch_index_0652>": 64665,
665
+ "<patch_index_0653>": 64666,
666
+ "<patch_index_0654>": 64667,
667
+ "<patch_index_0655>": 64668,
668
+ "<patch_index_0656>": 64669,
669
+ "<patch_index_0657>": 64670,
670
+ "<patch_index_0658>": 64671,
671
+ "<patch_index_0659>": 64672,
672
+ "<patch_index_0660>": 64673,
673
+ "<patch_index_0661>": 64674,
674
+ "<patch_index_0662>": 64675,
675
+ "<patch_index_0663>": 64676,
676
+ "<patch_index_0664>": 64677,
677
+ "<patch_index_0665>": 64678,
678
+ "<patch_index_0666>": 64679,
679
+ "<patch_index_0667>": 64680,
680
+ "<patch_index_0668>": 64681,
681
+ "<patch_index_0669>": 64682,
682
+ "<patch_index_0670>": 64683,
683
+ "<patch_index_0671>": 64684,
684
+ "<patch_index_0672>": 64685,
685
+ "<patch_index_0673>": 64686,
686
+ "<patch_index_0674>": 64687,
687
+ "<patch_index_0675>": 64688,
688
+ "<patch_index_0676>": 64689,
689
+ "<patch_index_0677>": 64690,
690
+ "<patch_index_0678>": 64691,
691
+ "<patch_index_0679>": 64692,
692
+ "<patch_index_0680>": 64693,
693
+ "<patch_index_0681>": 64694,
694
+ "<patch_index_0682>": 64695,
695
+ "<patch_index_0683>": 64696,
696
+ "<patch_index_0684>": 64697,
697
+ "<patch_index_0685>": 64698,
698
+ "<patch_index_0686>": 64699,
699
+ "<patch_index_0687>": 64700,
700
+ "<patch_index_0688>": 64701,
701
+ "<patch_index_0689>": 64702,
702
+ "<patch_index_0690>": 64703,
703
+ "<patch_index_0691>": 64704,
704
+ "<patch_index_0692>": 64705,
705
+ "<patch_index_0693>": 64706,
706
+ "<patch_index_0694>": 64707,
707
+ "<patch_index_0695>": 64708,
708
+ "<patch_index_0696>": 64709,
709
+ "<patch_index_0697>": 64710,
710
+ "<patch_index_0698>": 64711,
711
+ "<patch_index_0699>": 64712,
712
+ "<patch_index_0700>": 64713,
713
+ "<patch_index_0701>": 64714,
714
+ "<patch_index_0702>": 64715,
715
+ "<patch_index_0703>": 64716,
716
+ "<patch_index_0704>": 64717,
717
+ "<patch_index_0705>": 64718,
718
+ "<patch_index_0706>": 64719,
719
+ "<patch_index_0707>": 64720,
720
+ "<patch_index_0708>": 64721,
721
+ "<patch_index_0709>": 64722,
722
+ "<patch_index_0710>": 64723,
723
+ "<patch_index_0711>": 64724,
724
+ "<patch_index_0712>": 64725,
725
+ "<patch_index_0713>": 64726,
726
+ "<patch_index_0714>": 64727,
727
+ "<patch_index_0715>": 64728,
728
+ "<patch_index_0716>": 64729,
729
+ "<patch_index_0717>": 64730,
730
+ "<patch_index_0718>": 64731,
731
+ "<patch_index_0719>": 64732,
732
+ "<patch_index_0720>": 64733,
733
+ "<patch_index_0721>": 64734,
734
+ "<patch_index_0722>": 64735,
735
+ "<patch_index_0723>": 64736,
736
+ "<patch_index_0724>": 64737,
737
+ "<patch_index_0725>": 64738,
738
+ "<patch_index_0726>": 64739,
739
+ "<patch_index_0727>": 64740,
740
+ "<patch_index_0728>": 64741,
741
+ "<patch_index_0729>": 64742,
742
+ "<patch_index_0730>": 64743,
743
+ "<patch_index_0731>": 64744,
744
+ "<patch_index_0732>": 64745,
745
+ "<patch_index_0733>": 64746,
746
+ "<patch_index_0734>": 64747,
747
+ "<patch_index_0735>": 64748,
748
+ "<patch_index_0736>": 64749,
749
+ "<patch_index_0737>": 64750,
750
+ "<patch_index_0738>": 64751,
751
+ "<patch_index_0739>": 64752,
752
+ "<patch_index_0740>": 64753,
753
+ "<patch_index_0741>": 64754,
754
+ "<patch_index_0742>": 64755,
755
+ "<patch_index_0743>": 64756,
756
+ "<patch_index_0744>": 64757,
757
+ "<patch_index_0745>": 64758,
758
+ "<patch_index_0746>": 64759,
759
+ "<patch_index_0747>": 64760,
760
+ "<patch_index_0748>": 64761,
761
+ "<patch_index_0749>": 64762,
762
+ "<patch_index_0750>": 64763,
763
+ "<patch_index_0751>": 64764,
764
+ "<patch_index_0752>": 64765,
765
+ "<patch_index_0753>": 64766,
766
+ "<patch_index_0754>": 64767,
767
+ "<patch_index_0755>": 64768,
768
+ "<patch_index_0756>": 64769,
769
+ "<patch_index_0757>": 64770,
770
+ "<patch_index_0758>": 64771,
771
+ "<patch_index_0759>": 64772,
772
+ "<patch_index_0760>": 64773,
773
+ "<patch_index_0761>": 64774,
774
+ "<patch_index_0762>": 64775,
775
+ "<patch_index_0763>": 64776,
776
+ "<patch_index_0764>": 64777,
777
+ "<patch_index_0765>": 64778,
778
+ "<patch_index_0766>": 64779,
779
+ "<patch_index_0767>": 64780,
780
+ "<patch_index_0768>": 64781,
781
+ "<patch_index_0769>": 64782,
782
+ "<patch_index_0770>": 64783,
783
+ "<patch_index_0771>": 64784,
784
+ "<patch_index_0772>": 64785,
785
+ "<patch_index_0773>": 64786,
786
+ "<patch_index_0774>": 64787,
787
+ "<patch_index_0775>": 64788,
788
+ "<patch_index_0776>": 64789,
789
+ "<patch_index_0777>": 64790,
790
+ "<patch_index_0778>": 64791,
791
+ "<patch_index_0779>": 64792,
792
+ "<patch_index_0780>": 64793,
793
+ "<patch_index_0781>": 64794,
794
+ "<patch_index_0782>": 64795,
795
+ "<patch_index_0783>": 64796,
796
+ "<patch_index_0784>": 64797,
797
+ "<patch_index_0785>": 64798,
798
+ "<patch_index_0786>": 64799,
799
+ "<patch_index_0787>": 64800,
800
+ "<patch_index_0788>": 64801,
801
+ "<patch_index_0789>": 64802,
802
+ "<patch_index_0790>": 64803,
803
+ "<patch_index_0791>": 64804,
804
+ "<patch_index_0792>": 64805,
805
+ "<patch_index_0793>": 64806,
806
+ "<patch_index_0794>": 64807,
807
+ "<patch_index_0795>": 64808,
808
+ "<patch_index_0796>": 64809,
809
+ "<patch_index_0797>": 64810,
810
+ "<patch_index_0798>": 64811,
811
+ "<patch_index_0799>": 64812,
812
+ "<patch_index_0800>": 64813,
813
+ "<patch_index_0801>": 64814,
814
+ "<patch_index_0802>": 64815,
815
+ "<patch_index_0803>": 64816,
816
+ "<patch_index_0804>": 64817,
817
+ "<patch_index_0805>": 64818,
818
+ "<patch_index_0806>": 64819,
819
+ "<patch_index_0807>": 64820,
820
+ "<patch_index_0808>": 64821,
821
+ "<patch_index_0809>": 64822,
822
+ "<patch_index_0810>": 64823,
823
+ "<patch_index_0811>": 64824,
824
+ "<patch_index_0812>": 64825,
825
+ "<patch_index_0813>": 64826,
826
+ "<patch_index_0814>": 64827,
827
+ "<patch_index_0815>": 64828,
828
+ "<patch_index_0816>": 64829,
829
+ "<patch_index_0817>": 64830,
830
+ "<patch_index_0818>": 64831,
831
+ "<patch_index_0819>": 64832,
832
+ "<patch_index_0820>": 64833,
833
+ "<patch_index_0821>": 64834,
834
+ "<patch_index_0822>": 64835,
835
+ "<patch_index_0823>": 64836,
836
+ "<patch_index_0824>": 64837,
837
+ "<patch_index_0825>": 64838,
838
+ "<patch_index_0826>": 64839,
839
+ "<patch_index_0827>": 64840,
840
+ "<patch_index_0828>": 64841,
841
+ "<patch_index_0829>": 64842,
842
+ "<patch_index_0830>": 64843,
843
+ "<patch_index_0831>": 64844,
844
+ "<patch_index_0832>": 64845,
845
+ "<patch_index_0833>": 64846,
846
+ "<patch_index_0834>": 64847,
847
+ "<patch_index_0835>": 64848,
848
+ "<patch_index_0836>": 64849,
849
+ "<patch_index_0837>": 64850,
850
+ "<patch_index_0838>": 64851,
851
+ "<patch_index_0839>": 64852,
852
+ "<patch_index_0840>": 64853,
853
+ "<patch_index_0841>": 64854,
854
+ "<patch_index_0842>": 64855,
855
+ "<patch_index_0843>": 64856,
856
+ "<patch_index_0844>": 64857,
857
+ "<patch_index_0845>": 64858,
858
+ "<patch_index_0846>": 64859,
859
+ "<patch_index_0847>": 64860,
860
+ "<patch_index_0848>": 64861,
861
+ "<patch_index_0849>": 64862,
862
+ "<patch_index_0850>": 64863,
863
+ "<patch_index_0851>": 64864,
864
+ "<patch_index_0852>": 64865,
865
+ "<patch_index_0853>": 64866,
866
+ "<patch_index_0854>": 64867,
867
+ "<patch_index_0855>": 64868,
868
+ "<patch_index_0856>": 64869,
869
+ "<patch_index_0857>": 64870,
870
+ "<patch_index_0858>": 64871,
871
+ "<patch_index_0859>": 64872,
872
+ "<patch_index_0860>": 64873,
873
+ "<patch_index_0861>": 64874,
874
+ "<patch_index_0862>": 64875,
875
+ "<patch_index_0863>": 64876,
876
+ "<patch_index_0864>": 64877,
877
+ "<patch_index_0865>": 64878,
878
+ "<patch_index_0866>": 64879,
879
+ "<patch_index_0867>": 64880,
880
+ "<patch_index_0868>": 64881,
881
+ "<patch_index_0869>": 64882,
882
+ "<patch_index_0870>": 64883,
883
+ "<patch_index_0871>": 64884,
884
+ "<patch_index_0872>": 64885,
885
+ "<patch_index_0873>": 64886,
886
+ "<patch_index_0874>": 64887,
887
+ "<patch_index_0875>": 64888,
888
+ "<patch_index_0876>": 64889,
889
+ "<patch_index_0877>": 64890,
890
+ "<patch_index_0878>": 64891,
891
+ "<patch_index_0879>": 64892,
892
+ "<patch_index_0880>": 64893,
893
+ "<patch_index_0881>": 64894,
894
+ "<patch_index_0882>": 64895,
895
+ "<patch_index_0883>": 64896,
896
+ "<patch_index_0884>": 64897,
897
+ "<patch_index_0885>": 64898,
898
+ "<patch_index_0886>": 64899,
899
+ "<patch_index_0887>": 64900,
900
+ "<patch_index_0888>": 64901,
901
+ "<patch_index_0889>": 64902,
902
+ "<patch_index_0890>": 64903,
903
+   "<patch_index_0891>": 64904,
+   "<patch_index_0892>": 64905,
+   "<patch_index_0893>": 64906,
+   "<patch_index_0894>": 64907,
+   "<patch_index_0895>": 64908,
+   "<patch_index_0896>": 64909,
+   "<patch_index_0897>": 64910,
+   "<patch_index_0898>": 64911,
+   "<patch_index_0899>": 64912,
+   "<patch_index_0900>": 64913,
+   "<patch_index_0901>": 64914,
+   "<patch_index_0902>": 64915,
+   "<patch_index_0903>": 64916,
+   "<patch_index_0904>": 64917,
+   "<patch_index_0905>": 64918,
+   "<patch_index_0906>": 64919,
+   "<patch_index_0907>": 64920,
+   "<patch_index_0908>": 64921,
+   "<patch_index_0909>": 64922,
+   "<patch_index_0910>": 64923,
+   "<patch_index_0911>": 64924,
+   "<patch_index_0912>": 64925,
+   "<patch_index_0913>": 64926,
+   "<patch_index_0914>": 64927,
+   "<patch_index_0915>": 64928,
+   "<patch_index_0916>": 64929,
+   "<patch_index_0917>": 64930,
+   "<patch_index_0918>": 64931,
+   "<patch_index_0919>": 64932,
+   "<patch_index_0920>": 64933,
+   "<patch_index_0921>": 64934,
+   "<patch_index_0922>": 64935,
+   "<patch_index_0923>": 64936,
+   "<patch_index_0924>": 64937,
+   "<patch_index_0925>": 64938,
+   "<patch_index_0926>": 64939,
+   "<patch_index_0927>": 64940,
+   "<patch_index_0928>": 64941,
+   "<patch_index_0929>": 64942,
+   "<patch_index_0930>": 64943,
+   "<patch_index_0931>": 64944,
+   "<patch_index_0932>": 64945,
+   "<patch_index_0933>": 64946,
+   "<patch_index_0934>": 64947,
+   "<patch_index_0935>": 64948,
+   "<patch_index_0936>": 64949,
+   "<patch_index_0937>": 64950,
+   "<patch_index_0938>": 64951,
+   "<patch_index_0939>": 64952,
+   "<patch_index_0940>": 64953,
+   "<patch_index_0941>": 64954,
+   "<patch_index_0942>": 64955,
+   "<patch_index_0943>": 64956,
+   "<patch_index_0944>": 64957,
+   "<patch_index_0945>": 64958,
+   "<patch_index_0946>": 64959,
+   "<patch_index_0947>": 64960,
+   "<patch_index_0948>": 64961,
+   "<patch_index_0949>": 64962,
+   "<patch_index_0950>": 64963,
+   "<patch_index_0951>": 64964,
+   "<patch_index_0952>": 64965,
+   "<patch_index_0953>": 64966,
+   "<patch_index_0954>": 64967,
+   "<patch_index_0955>": 64968,
+   "<patch_index_0956>": 64969,
+   "<patch_index_0957>": 64970,
+   "<patch_index_0958>": 64971,
+   "<patch_index_0959>": 64972,
+   "<patch_index_0960>": 64973,
+   "<patch_index_0961>": 64974,
+   "<patch_index_0962>": 64975,
+   "<patch_index_0963>": 64976,
+   "<patch_index_0964>": 64977,
+   "<patch_index_0965>": 64978,
+   "<patch_index_0966>": 64979,
+   "<patch_index_0967>": 64980,
+   "<patch_index_0968>": 64981,
+   "<patch_index_0969>": 64982,
+   "<patch_index_0970>": 64983,
+   "<patch_index_0971>": 64984,
+   "<patch_index_0972>": 64985,
+   "<patch_index_0973>": 64986,
+   "<patch_index_0974>": 64987,
+   "<patch_index_0975>": 64988,
+   "<patch_index_0976>": 64989,
+   "<patch_index_0977>": 64990,
+   "<patch_index_0978>": 64991,
+   "<patch_index_0979>": 64992,
+   "<patch_index_0980>": 64993,
+   "<patch_index_0981>": 64994,
+   "<patch_index_0982>": 64995,
+   "<patch_index_0983>": 64996,
+   "<patch_index_0984>": 64997,
+   "<patch_index_0985>": 64998,
+   "<patch_index_0986>": 64999,
+   "<patch_index_0987>": 65000,
+   "<patch_index_0988>": 65001,
+   "<patch_index_0989>": 65002,
+   "<patch_index_0990>": 65003,
+   "<patch_index_0991>": 65004,
+   "<patch_index_0992>": 65005,
+   "<patch_index_0993>": 65006,
+   "<patch_index_0994>": 65007,
+   "<patch_index_0995>": 65008,
+   "<patch_index_0996>": 65009,
+   "<patch_index_0997>": 65010,
+   "<patch_index_0998>": 65011,
+   "<patch_index_0999>": 65012,
+   "<patch_index_1000>": 65013,
+   "<patch_index_1001>": 65014,
+   "<patch_index_1002>": 65015,
+   "<patch_index_1003>": 65016,
+   "<patch_index_1004>": 65017,
+   "<patch_index_1005>": 65018,
+   "<patch_index_1006>": 65019,
+   "<patch_index_1007>": 65020,
+   "<patch_index_1008>": 65021,
+   "<patch_index_1009>": 65022,
+   "<patch_index_1010>": 65023,
+   "<patch_index_1011>": 65024,
+   "<patch_index_1012>": 65025,
+   "<patch_index_1013>": 65026,
+   "<patch_index_1014>": 65027,
+   "<patch_index_1015>": 65028,
+   "<patch_index_1016>": 65029,
+   "<patch_index_1017>": 65030,
+   "<patch_index_1018>": 65031,
+   "<patch_index_1019>": 65032,
+   "<patch_index_1020>": 65033,
+   "<patch_index_1021>": 65034,
+   "<patch_index_1022>": 65035,
+   "<patch_index_1023>": 65036,
+   "<phrase>": 64007
+ }
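
The `<patch_index_xxxx>` entries above add 1024 location tokens (indices `0000`–`1023`) on top of the base vocabulary, which is why the text config below reports `vocab_size: 65037`. As a rough illustration (not a file in this commit), and assuming the 1024 bins form a 32×32 grid over the image as described in the Kosmos-2 paper, a normalized bounding box could be mapped to a pair of location tokens like this; the actual mapping used by the model lives in `processing_kosmos2.py`:

```python
def box_to_patch_index_tokens(x1, y1, x2, y2, num_bins=32):
    """Map a box with corners (x1, y1), (x2, y2), normalized to [0, 1],
    to the <patch_index_xxxx> tokens of its top-left and bottom-right bins.
    Assumes a num_bins x num_bins grid (32 x 32 = 1024 bins)."""
    def bin_index(x, y):
        col = min(int(x * num_bins), num_bins - 1)
        row = min(int(y * num_bins), num_bins - 1)
        return row * num_bins + col

    return (
        f"<patch_index_{bin_index(x1, y1):04d}>",
        f"<patch_index_{bin_index(x2, y2):04d}>",
    )


# e.g. a box covering roughly the right half of an image
print(box_to_patch_index_tokens(0.5, 0.1, 0.95, 0.9))
# ('<patch_index_0112>', '<patch_index_0926>')
```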
annotated_snowman.jpg ADDED
config.json ADDED
@@ -0,0 +1,173 @@
+ {
+   "_commit_hash": null,
+   "_name_or_path": "HF_Kosmos2",
+   "architectures": [
+     "Kosmos2ForConditionalGeneration"
+   ],
+   "latent_query_num": 64,
+   "model_type": "kosmos-2",
+   "auto_map": {
+     "AutoConfig": "configuration_kosmos2.Kosmos2Config",
+     "AutoModel": "modeling_kosmos2.Kosmos2Model",
+     "AutoModelForVision2Seq": "modeling_kosmos2.Kosmos2ForConditionalGeneration",
+     "AutoProcessor": "processing_kosmos2.Kosmos2Processor"
+   },
+   "text_config": {
+     "_name_or_path": "",
+     "activation_dropout": 0.0,
+     "activation_function": "gelu",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.1,
+     "attention_heads": 32,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.1,
+     "early_stopping": false,
+     "embed_dim": 2048,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "ffn_dim": 8192,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "gradient_checkpointing": false,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "layerdrop": 0.0,
+     "layers": 24,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 2048,
+     "min_length": 0,
+     "model_type": "kosmos_2_text_model",
+     "no_repeat_ngram_size": 3,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "scale_embedding": true,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.31.0.dev0",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 65037
+   },
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "kosmos_2_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.31.0.dev0",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   }
+ }
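
The `auto_map` block above is what lets the `Auto*` classes resolve to the remote-code files added in this commit. A minimal sketch (assuming network access to the Hub) of loading and inspecting the composite configuration:

```python
from transformers import AutoConfig

# trust_remote_code=True is needed because the classes live in this repo, not in `transformers` itself.
config = AutoConfig.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)

print(config.model_type)               # kosmos-2
print(config.latent_query_num)         # 64 latent queries carry image features into the text decoder
print(config.text_config.vocab_size)   # 65037, including the <patch_index_xxxx> tokens above
print(config.vision_config.image_size, config.vision_config.patch_size)  # 224 14
```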
configuration_kosmos2.py ADDED
@@ -0,0 +1,331 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ KOSMOS-2 model configuration"""
16
+
17
+ import copy
18
+ import os
19
+ from typing import Union
20
+
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from transformers.utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+ BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28
+ "microsoft/kosmos-2-patch14-224": (
29
+ "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/config.json"
30
+ ),
31
+ # See all KOSMOS-2 models at https://huggingface.co/models?filter=kosmos-2
32
+ }
33
+
34
+
35
+ class Kosmos2TextConfig(PretrainedConfig):
36
+ r"""
37
+ This is the configuration class to store the configuration of a [`Kosmos2TextModel`]. It is used to instantiate a KOSMOS-2 text decoder
38
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
39
+ defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2
40
+ [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
41
+
42
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
43
+ documentation from [`PretrainedConfig`] for more information.
44
+
45
+ Args:
46
+ vocab_size (`int`, *optional*, defaults to 65037):
47
+ Vocabulary size of the Kosmos2 model. Defines the number of different tokens that can be represented by the
48
+ `inputs_ids` passed when calling [`Kosmos2Model`].
49
+ embed_dim (`int`, *optional*, defaults to 2048):
50
+ Dimensionality of the layers and the pooler layer.
51
+ layers (`int`, *optional*, defaults to 24):
52
+ Number of hidden layers in the Transformer encoder.
53
+ attention_heads (`int`, *optional*, defaults to 32):
54
+ Number of attention heads for each attention layer in the Transformer encoder.
55
+ ffn_dim (`int`, *optional*, defaults to 8192):
56
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
57
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
58
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
59
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
60
+ dropout (`float`, *optional*, defaults to 0.1):
61
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
62
+ attention_dropout (`float`, *optional*, defaults to 0.1):
63
+ The dropout ratio for the attention probabilities.
64
+ activation_dropout (`float`, *optional*, defaults to 0.0):
65
+ The dropout ratio for activations inside the fully connected layer.
66
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
67
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
68
+ just in case (e.g., 512 or 1024 or 2048).
69
+ layerdrop (`float`, *optional*, defaults to 0.0):
70
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
71
+ for more details.
72
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
73
+ The epsilon used by the layer normalization layers.
74
+ scale_embedding (`bool`, *optional*, defaults to `True`):
75
+ Scale embeddings by diving by sqrt(embed_dim).
76
+ use_cache (`bool`, *optional*, defaults to `True`):
77
+ Whether or not the model should return the last key/values attentions (not used by all models).
78
+
79
+ Example:
80
+
81
+ ```python
82
+ >>> from transformers import Kosmos2TextConfig, Kosmos2TextModel
83
+
84
+ >>> # Initializing a Kosmos2TextConfig microsoft/kosmos-2-patch14-224 style configuration
85
+ >>> configuration = Kosmos2TextConfig()
86
+
87
+ >>> # Initializing a Kosmos2TextModel (with random weights) from the microsoft/kosmos-2-patch14-224 style configuration
88
+ >>> model = Kosmos2TextModel(configuration)
89
+
90
+ >>> # Accessing the model configuration
91
+ >>> configuration = model.config
92
+ ```"""
93
+ model_type = "kosmos_2_text_model"
94
+ keys_to_ignore_at_inference = ["past_key_values"]
95
+ attribute_map = {"num_attention_heads": "attention_heads", "hidden_size": "embed_dim"}
96
+
97
+ def __init__(
98
+ self,
99
+ vocab_size=65037,
100
+ max_position_embeddings=2048,
101
+ embed_dim=2048,
102
+ layers=24,
103
+ ffn_dim=8192,
104
+ attention_heads=32,
105
+ activation_function="gelu",
106
+ dropout=0.1,
107
+ attention_dropout=0.1,
108
+ activation_dropout=0.0,
109
+ layerdrop=0.0,
110
+ layer_norm_eps=1e-5,
111
+ scale_embedding=True,
112
+ use_cache=True,
113
+ pad_token_id=1,
114
+ bos_token_id=0,
115
+ eos_token_id=2,
116
+ **kwargs,
117
+ ):
118
+ super().__init__(
119
+ pad_token_id=pad_token_id,
120
+ bos_token_id=bos_token_id,
121
+ eos_token_id=eos_token_id,
122
+ **kwargs,
123
+ )
124
+
125
+ self.vocab_size = vocab_size
126
+ self.max_position_embeddings = max_position_embeddings
127
+ self.embed_dim = embed_dim
128
+ self.layers = layers
129
+ self.ffn_dim = ffn_dim
130
+ self.attention_heads = attention_heads
131
+ self.activation_function = activation_function
132
+ self.dropout = dropout
133
+ self.attention_dropout = attention_dropout
134
+ self.activation_dropout = activation_dropout
135
+ self.layerdrop = layerdrop
136
+ self.layer_norm_eps = layer_norm_eps
137
+ self.scale_embedding = scale_embedding
138
+ self.use_cache = use_cache
139
+
140
+ @classmethod
141
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
142
+ cls._set_token_in_kwargs(kwargs)
143
+
144
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
145
+
146
+ # get the text config dict if we are loading from Kosmos2Config
147
+ if config_dict.get("model_type") == "kosmos-2":
148
+ config_dict = config_dict["text_config"]
149
+
150
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
151
+ logger.warning(
152
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
153
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
154
+ )
155
+
156
+ return cls.from_dict(config_dict, **kwargs)
157
+
158
+
159
+ class Kosmos2VisionConfig(PretrainedConfig):
160
+ r"""
161
+ This is the configuration class to store the configuration of a [`Kosmos2VisionModel`]. It is used to instantiate a
162
+ KOSMOS-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
163
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the KOSMOS-2
164
+ [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
165
+
166
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
167
+ documentation from [`PretrainedConfig`] for more information.
168
+
169
+ Args:
170
+ hidden_size (`int`, *optional*, defaults to 1024):
171
+ Dimensionality of the encoder layers and the pooler layer.
172
+ intermediate_size (`int`, *optional*, defaults to 4096):
173
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
174
+ num_hidden_layers (`int`, *optional*, defaults to 24):
175
+ Number of hidden layers in the Transformer encoder.
176
+ num_attention_heads (`int`, *optional*, defaults to 16):
177
+ Number of attention heads for each attention layer in the Transformer encoder.
178
+ image_size (`int`, *optional*, defaults to 224):
179
+ The size (resolution) of each image.
180
+ patch_size (`int`, *optional*, defaults to 14):
181
+ The size (resolution) of each patch.
182
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
183
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
184
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
185
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
186
+ The epsilon used by the layer normalization layers.
187
+ attention_dropout (`float`, *optional*, defaults to 0.0):
188
+ The dropout ratio for the attention probabilities.
189
+ initializer_range (`float`, *optional*, defaults to 0.02):
190
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
191
+ initializer_factor (`float`, *optional*, defaults to 1):
192
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
193
+ testing).
194
+
195
+ Example:
196
+
197
+ ```python
198
+ >>> from transformers import Kosmos2VisionConfig, Kosmos2VisionModel
199
+
200
+ >>> # Initializing a Kosmos2VisionConfig with microsoft/kosmos-2-patch14-224 style configuration
201
+ >>> configuration = Kosmos2VisionConfig()
202
+
203
+ >>> # Initializing a Kosmos2VisionModel (with random weights) from the microsoft/kosmos-2-patch14-224 style configuration
204
+ >>> model = Kosmos2VisionModel(configuration)
205
+
206
+ >>> # Accessing the model configuration
207
+ >>> configuration = model.config
208
+ ```"""
209
+
210
+ model_type = "kosmos_2_vision_model"
211
+
212
+ def __init__(
213
+ self,
214
+ hidden_size=1024,
215
+ intermediate_size=4096,
216
+ projection_dim=512,
217
+ num_hidden_layers=24,
218
+ num_attention_heads=16,
219
+ num_channels=3,
220
+ image_size=224,
221
+ patch_size=14,
222
+ hidden_act="quick_gelu",
223
+ layer_norm_eps=1e-5,
224
+ attention_dropout=0.0,
225
+ initializer_range=0.02,
226
+ initializer_factor=1.0,
227
+ **kwargs,
228
+ ):
229
+ super().__init__(**kwargs)
230
+
231
+ self.hidden_size = hidden_size
232
+ self.intermediate_size = intermediate_size
233
+ self.projection_dim = projection_dim
234
+ self.num_hidden_layers = num_hidden_layers
235
+ self.num_attention_heads = num_attention_heads
236
+ self.num_channels = num_channels
237
+ self.patch_size = patch_size
238
+ self.image_size = image_size
239
+ self.initializer_range = initializer_range
240
+ self.initializer_factor = initializer_factor
241
+ self.attention_dropout = attention_dropout
242
+ self.layer_norm_eps = layer_norm_eps
243
+ self.hidden_act = hidden_act
244
+
245
+ @classmethod
246
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
247
+ cls._set_token_in_kwargs(kwargs)
248
+
249
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
250
+
251
+ # get the vision config dict if we are loading from Kosmos2Config
252
+ if config_dict.get("model_type") == "kosmos-2":
253
+ config_dict = config_dict["vision_config"]
254
+
255
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
256
+ logger.warning(
257
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
258
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
259
+ )
260
+
261
+ return cls.from_dict(config_dict, **kwargs)
262
+
263
+
264
+ class Kosmos2Config(PretrainedConfig):
265
+ r"""
266
+ This is the configuration class to store the configuration of a [`Kosmos2Model`]. It is used to instantiate a KOSMOS-2
267
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
268
+ defaults will yield a similar configuration to that of the KOSMOS-2
269
+ [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
270
+
271
+ Args:
272
+ text_config (`dict`, *optional*):
273
+ Dictionary of configuration options used to initialize [`Kosmos2TextConfig`].
274
+ vision_config (`dict`, *optional*):
275
+ Dictionary of configuration options used to initialize [`Kosmos2VisionConfig`].
276
+ latent_query_num (`int`, *optional*, defaults to 64):
277
+ The number of latent query tokens that represent the image features used in the text decoder component.
278
+ kwargs (*optional*):
279
+ Dictionary of keyword arguments.
280
+
281
+ Example:
282
+
283
+ ```python
284
+ >>> from transformers import Kosmos2Config, Kosmos2Model
285
+
286
+ >>> # Initializing a Kosmos-2 kosmos-2-patch14-224 style configuration
287
+ >>> configuration = Kosmos2Config()
288
+
289
+ >>> # Initializing a model (with random weights) from the kosmos-2-patch14-224 style configuration
290
+ >>> model = Kosmos2Model(configuration)
291
+
292
+ >>> # Accessing the model configuration
293
+ >>> configuration = model.config
294
+ ```"""
295
+ model_type = "kosmos-2"
296
+ is_composition = True
297
+
298
+ def __init__(
299
+ self,
300
+ text_config=None,
301
+ vision_config=None,
302
+ latent_query_num=64,
303
+ **kwargs,
304
+ ):
305
+ super().__init__(**kwargs)
306
+
307
+ if text_config is None:
308
+ text_config = {}
309
+ logger.info("`text_config` is `None`. Initializing the `Kosmos2TextConfig` with default values.")
310
+
311
+ if vision_config is None:
312
+ vision_config = {}
313
+ logger.info("`vision_config` is `None`. Initializing the `Kosmos2VisionConfig` with default values.")
314
+
315
+ self.text_config = Kosmos2TextConfig(**text_config)
316
+ self.vision_config = Kosmos2VisionConfig(**vision_config)
317
+
318
+ self.latent_query_num = latent_query_num
319
+
320
+ def to_dict(self):
321
+ """
322
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
323
+
324
+ Returns:
325
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
326
+ """
327
+ output = copy.deepcopy(self.__dict__)
328
+ output["text_config"] = self.text_config.to_dict()
329
+ output["vision_config"] = self.vision_config.to_dict()
330
+ output["model_type"] = self.__class__.model_type
331
+ return output
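
As a small sanity check of the composition logic above, here is a hedged sketch (assuming `configuration_kosmos2.py` is importable locally) that builds a tiny `Kosmos2Config` from plain dicts and round-trips it through `to_dict()`:

```python
from configuration_kosmos2 import Kosmos2Config  # assumes this file sits next to the script

# Build a deliberately tiny config from plain dicts; unspecified fields fall back to the defaults above.
config = Kosmos2Config(
    text_config={"layers": 2, "embed_dim": 256, "attention_heads": 4, "ffn_dim": 512},
    vision_config={"num_hidden_layers": 2, "hidden_size": 128, "num_attention_heads": 4, "intermediate_size": 256},
    latent_query_num=16,
)

serialized = config.to_dict()
assert serialized["model_type"] == "kosmos-2"
assert serialized["text_config"]["layers"] == 2
assert serialized["vision_config"]["hidden_size"] == 128
```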
draw_bboxes.py ADDED
@@ -0,0 +1,119 @@
+ import os
+ import numpy as np
+ import torch
+ from PIL import Image
+ import torchvision.transforms as T
+ import cv2
+ import requests
+
+
+ def is_overlapping(rect1, rect2):
+     x1, y1, x2, y2 = rect1
+     x3, y3, x4, y4 = rect2
+     return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4)
+
+
+ def draw_entity_boxes_on_image(image, entities, show=False, save_path=None):
+     """Draw the bounding boxes of the grounded entities on top of the image.
+     Args:
+         image: a `PIL.Image.Image`, an image path, or a CLIP-normalized `torch.Tensor`.
+         entities: list of `(entity_name, (start, end), bboxes)` tuples with normalized box coordinates.
+     """
+     if isinstance(image, Image.Image):
+         image_h = image.height
+         image_w = image.width
+         image = np.array(image)[:, :, [2, 1, 0]]
+     elif isinstance(image, str):
+         if os.path.exists(image):
+             pil_img = Image.open(image).convert("RGB")
+             image = np.array(pil_img)[:, :, [2, 1, 0]]
+             image_h = pil_img.height
+             image_w = pil_img.width
+         else:
+             raise ValueError(f"invalid image path, {image}")
+     elif isinstance(image, torch.Tensor):
+         # pdb.set_trace()
+         image_tensor = image.cpu()
+         reverse_norm_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:, None, None]
+         reverse_norm_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:, None, None]
+         image_tensor = image_tensor * reverse_norm_std + reverse_norm_mean
+         pil_img = T.ToPILImage()(image_tensor)
+         image_h = pil_img.height
+         image_w = pil_img.width
+         image = np.array(pil_img)[:, :, [2, 1, 0]]
+     else:
+         raise ValueError(f"invalid image format, {type(image)} for {image}")
+
+     if len(entities) == 0:
+         return image
+
+     new_image = image.copy()
+     previous_bboxes = []
+     # size of text
+     text_size = 2
+     # thickness of text
+     text_line = 1  # int(max(1 * min(image_h, image_w) / 512, 1))
+     box_line = 3
+     (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
+     base_height = int(text_height * 0.675)
+     text_offset_original = text_height - base_height
+     text_spaces = 3
+
+     for entity_name, (start, end), bboxes in entities:
+         for (x1_norm, y1_norm, x2_norm, y2_norm) in bboxes:
+             orig_x1, orig_y1, orig_x2, orig_y2 = int(x1_norm * image_w), int(y1_norm * image_h), int(x2_norm * image_w), int(y2_norm * image_h)
+             # draw bbox
+             # random color
+             color = tuple(np.random.randint(0, 255, size=3).tolist())
+             new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line)
+
+             l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1
+
+             x1 = orig_x1 - l_o
+             y1 = orig_y1 - l_o
+
+             if y1 < text_height + text_offset_original + 2 * text_spaces:
+                 y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
+                 x1 = orig_x1 + r_o
+
+             # add text background
+             (text_width, text_height), _ = cv2.getTextSize(f" {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
+             text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - (text_height + text_offset_original + 2 * text_spaces), x1 + text_width, y1
+
+             for prev_bbox in previous_bboxes:
+                 while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox):
+                     text_bg_y1 += (text_height + text_offset_original + 2 * text_spaces)
+                     text_bg_y2 += (text_height + text_offset_original + 2 * text_spaces)
+                     y1 += (text_height + text_offset_original + 2 * text_spaces)
+
+                     if text_bg_y2 >= image_h:
+                         text_bg_y1 = max(0, image_h - (text_height + text_offset_original + 2 * text_spaces))
+                         text_bg_y2 = image_h
+                         y1 = image_h
+                         break
+
+             alpha = 0.5
+             for i in range(text_bg_y1, text_bg_y2):
+                 for j in range(text_bg_x1, text_bg_x2):
+                     if i < image_h and j < image_w:
+                         if j < text_bg_x1 + 1.35 * c_width:
+                             # original color
+                             bg_color = color
+                         else:
+                             # white
+                             bg_color = [255, 255, 255]
+                         new_image[i, j] = (alpha * new_image[i, j] + (1 - alpha) * np.array(bg_color)).astype(np.uint8)
+
+             cv2.putText(
+                 new_image, f" {entity_name}", (x1, y1 - text_offset_original - 1 * text_spaces), cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
+             )
+             # previous_locations.append((x1, y1))
+             previous_bboxes.append((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2))
+
+     pil_image = Image.fromarray(new_image[:, :, [2, 1, 0]])
+     if save_path:
+         pil_image.save(save_path)
+     if show:
+         pil_image.show()
+
+     return new_image
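
A hedged usage sketch for the helper above (assuming `draw_bboxes.py` and an image are available locally); the entity names, character spans, and box coordinates below are illustrative values, not real model outputs:

```python
from PIL import Image

from draw_bboxes import draw_entity_boxes_on_image  # assumes this file is on the Python path

image = Image.open("snowman.png")  # the image shipped with this repo, downloaded locally

# Each entity is (name, (start, end) character span in the caption, list of normalized (x1, y1, x2, y2) boxes).
entities = [
    ("a snowman", (12, 21), [(0.39, 0.22, 0.85, 0.95)]),
    ("a fire", (41, 47), [(0.17, 0.60, 0.50, 0.95)]),
]

annotated = draw_entity_boxes_on_image(image, entities, save_path="my_annotated_snowman.jpg")
```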
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "no_repeat_ngram_size": 3,
+   "pad_token_id": 1,
+   "transformers_version": "4.32.0.dev0",
+   "use_cache": true
+ }
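
These defaults are picked up automatically by `generate()`; they can also be loaded and tweaked explicitly. A minimal sketch:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("ydshieh/kosmos-2-patch14-224")
print(gen_config.no_repeat_ngram_size)  # 3

# Example override (not part of the stored defaults); pass it via model.generate(..., generation_config=gen_config).
gen_config.max_new_tokens = 64
```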
image_processing_kosmos2.py ADDED
@@ -0,0 +1,304 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for Kosmos2."""
16
+
17
+ from typing import Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+
21
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
22
+ from transformers.image_transforms import (
23
+ convert_to_rgb,
24
+ get_resize_output_image_size,
25
+ resize,
26
+ to_channel_dimension_format,
27
+ )
28
+ from transformers.image_utils import (
29
+ OPENAI_CLIP_MEAN,
30
+ OPENAI_CLIP_STD,
31
+ ChannelDimension,
32
+ ImageInput,
33
+ PILImageResampling,
34
+ infer_channel_dimension_format,
35
+ make_list_of_images,
36
+ to_numpy_array,
37
+ valid_images,
38
+ )
39
+ from transformers.utils import TensorType, is_vision_available, logging
40
+
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+
45
+ if is_vision_available():
46
+ import PIL
47
+
48
+
49
+ class Kosmos2ImageProcessor(BaseImageProcessor):
50
+ r"""
51
+ Constructs a CLIP image processor.
52
+
53
+ Args:
54
+ do_resize (`bool`, *optional*, defaults to `True`):
55
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
56
+ `do_resize` in the `preprocess` method.
57
+ size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
58
+ Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
59
+ the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
60
+ method.
61
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
62
+ Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
63
+ do_center_crop (`bool`, *optional*, defaults to `True`):
64
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
65
+ `preprocess` method.
66
+ crop_size (`Dict[str, int]` *optional*, defaults to 224):
67
+ Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
68
+ method.
69
+ do_rescale (`bool`, *optional*, defaults to `True`):
70
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
71
+ the `preprocess` method.
72
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
73
+ Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
74
+ method.
75
+ do_normalize:
76
+ Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
77
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
78
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
79
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
80
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
81
+ Image standard deviation.
82
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
83
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
84
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
85
+ """
86
+
87
+ model_input_names = ["pixel_values"]
88
+
89
+ def __init__(
90
+ self,
91
+ do_resize: bool = True,
92
+ size: Dict[str, int] = None,
93
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
94
+ do_center_crop: bool = True,
95
+ crop_size: Dict[str, int] = None,
96
+ do_rescale: bool = True,
97
+ rescale_factor: Union[int, float] = 1 / 255,
98
+ do_normalize: bool = True,
99
+ image_mean: Optional[Union[float, List[float]]] = None,
100
+ image_std: Optional[Union[float, List[float]]] = None,
101
+ do_convert_rgb: bool = True,
102
+ **kwargs,
103
+ ) -> None:
104
+ super().__init__(**kwargs)
105
+ size = size if size is not None else {"shortest_edge": 224}
106
+ size = get_size_dict(size, default_to_square=False)
107
+ crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
108
+ crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
109
+
110
+ self.do_resize = do_resize
111
+ self.size = size
112
+ self.resample = resample
113
+ self.do_center_crop = do_center_crop
114
+ self.crop_size = crop_size
115
+ self.do_rescale = do_rescale
116
+ self.rescale_factor = rescale_factor
117
+ self.do_normalize = do_normalize
118
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
119
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
120
+ self.do_convert_rgb = do_convert_rgb
121
+
122
+ def resize(
123
+ self,
124
+ image: np.ndarray,
125
+ size: Dict[str, int],
126
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
127
+ data_format: Optional[Union[str, ChannelDimension]] = None,
128
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
129
+ **kwargs,
130
+ ) -> np.ndarray:
131
+ """
132
+ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
133
+ resized to keep the input aspect ratio.
134
+
135
+ Args:
136
+ image (`np.ndarray`):
137
+ Image to resize.
138
+ size (`Dict[str, int]`):
139
+ Size of the output image.
140
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
141
+ Resampling filter to use when resiizing the image.
142
+ data_format (`str` or `ChannelDimension`, *optional*):
143
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
144
+ input_data_format (`ChannelDimension` or `str`, *optional*):
145
+ The channel dimension format of the input image. If not provided, it will be inferred.
146
+ """
147
+ size = get_size_dict(size)
148
+ if "shortest_edge" not in size:
149
+ raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
150
+ output_size = get_resize_output_image_size(
151
+ image, size=size["shortest_edge"], input_data_format=input_data_format
152
+ )
153
+ return resize(
154
+ image,
155
+ size=output_size,
156
+ resample=resample,
157
+ data_format=data_format,
158
+ input_data_format=input_data_format,
159
+ **kwargs,
160
+ )
161
+
162
+ def preprocess(
163
+ self,
164
+ images: ImageInput,
165
+ do_resize: bool = None,
166
+ size: Dict[str, int] = None,
167
+ resample: PILImageResampling = None,
168
+ do_center_crop: bool = None,
169
+ crop_size: int = None,
170
+ do_rescale: bool = None,
171
+ rescale_factor: float = None,
172
+ do_normalize: bool = None,
173
+ image_mean: Optional[Union[float, List[float]]] = None,
174
+ image_std: Optional[Union[float, List[float]]] = None,
175
+ do_convert_rgb: bool = None,
176
+ return_tensors: Optional[Union[str, TensorType]] = None,
177
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
178
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
179
+ **kwargs,
180
+ ) -> PIL.Image.Image:
181
+ """
182
+ Preprocess an image or batch of images.
183
+
184
+ Args:
185
+ images (`ImageInput`):
186
+ Image to preprocess.
187
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
188
+ Whether to resize the image.
189
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
190
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
191
+ the longest edge resized to keep the input aspect ratio.
192
+ resample (`int`, *optional*, defaults to `self.resample`):
193
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
194
+ has an effect if `do_resize` is set to `True`.
195
+ do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
196
+ Whether to center crop the image.
197
+ crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
198
+ Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
199
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
200
+ Whether to rescale the image.
201
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
202
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
203
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
204
+ Whether to normalize the image.
205
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
206
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
207
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
208
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
209
+ `True`.
210
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
211
+ Whether to convert the image to RGB.
212
+ return_tensors (`str` or `TensorType`, *optional*):
213
+ The type of tensors to return. Can be one of:
214
+ - Unset: Return a list of `np.ndarray`.
215
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
216
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
217
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
218
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
219
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
220
+ The channel dimension format for the output image. Can be one of:
221
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
222
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
223
+ - Unset: Use the channel dimension format of the input image.
224
+ input_data_format (`ChannelDimension` or `str`, *optional*):
225
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
226
+ from the input image. Can be one of:
227
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
228
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
229
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
230
+ """
231
+ do_resize = do_resize if do_resize is not None else self.do_resize
232
+ size = size if size is not None else self.size
233
+ size = get_size_dict(size, param_name="size", default_to_square=False)
234
+ resample = resample if resample is not None else self.resample
235
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
236
+ crop_size = crop_size if crop_size is not None else self.crop_size
237
+ crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
238
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
239
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
240
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
241
+ image_mean = image_mean if image_mean is not None else self.image_mean
242
+ image_std = image_std if image_std is not None else self.image_std
243
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
244
+
245
+ images = make_list_of_images(images)
246
+
247
+ if not valid_images(images):
248
+ raise ValueError(
249
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
250
+ "torch.Tensor, tf.Tensor or jax.ndarray."
251
+ )
252
+
253
+ if do_resize and size is None:
254
+ raise ValueError("Size must be specified if do_resize is True.")
255
+
256
+ if do_center_crop and crop_size is None:
257
+ raise ValueError("Crop size must be specified if do_center_crop is True.")
258
+
259
+ if do_rescale and rescale_factor is None:
260
+ raise ValueError("Rescale factor must be specified if do_rescale is True.")
261
+
262
+ if do_normalize and (image_mean is None or image_std is None):
263
+ raise ValueError("Image mean and std must be specified if do_normalize is True.")
264
+
265
+ # PIL RGBA images are converted to RGB
266
+ if do_convert_rgb:
267
+ images = [convert_to_rgb(image) for image in images]
268
+
269
+ # All transformations expect numpy arrays.
270
+ images = [to_numpy_array(image) for image in images]
271
+
272
+ if input_data_format is None:
273
+ # We assume that all images have the same channel dimension format.
274
+ input_data_format = infer_channel_dimension_format(images[0])
275
+
276
+ if do_resize:
277
+ images = [
278
+ self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
279
+ for image in images
280
+ ]
281
+
282
+ if do_center_crop:
283
+ images = [
284
+ self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
285
+ ]
286
+
287
+ if do_rescale:
288
+ images = [
289
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
290
+ for image in images
291
+ ]
292
+
293
+ if do_normalize:
294
+ images = [
295
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
296
+ for image in images
297
+ ]
298
+
299
+ images = [
300
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
301
+ ]
302
+
303
+ data = {"pixel_values": images}
304
+ return BatchFeature(data=data, tensor_type=return_tensors)
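
A hedged sketch of using the image processor above directly (assuming `image_processing_kosmos2.py` is importable locally); with the defaults it reproduces the CLIP-style pipeline: resize the shortest edge to 224, center-crop to 224×224, rescale by 1/255, and normalize with the OpenAI CLIP mean/std:

```python
from PIL import Image

from image_processing_kosmos2 import Kosmos2ImageProcessor  # assumes this file is on the Python path

image_processor = Kosmos2ImageProcessor()  # CLIP-style defaults defined above
image = Image.open("snowman.png")          # any RGB image works here

pixel_values = image_processor(images=image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```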
modeling_kosmos2.py ADDED
@@ -0,0 +1,1747 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch KOSMOS-2 model."""
16
+
17
+
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+
26
+ from transformers.activations import ACT2FN
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutput,
29
+ BaseModelOutputWithPastAndCrossAttentions,
30
+ BaseModelOutputWithPooling,
31
+ CausalLMOutputWithCrossAttentions,
32
+ )
33
+ from transformers.modeling_utils import PreTrainedModel
34
+ from transformers.utils import (
35
+ ModelOutput,
36
+ add_start_docstrings,
37
+ add_start_docstrings_to_model_forward,
38
+ logging,
39
+ replace_return_docstrings,
40
+ )
41
+ from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig
42
+
43
+
44
+ logger = logging.get_logger(__name__)
45
+
46
+ _CHECKPOINT_FOR_DOC = "microsoft/kosmos-2-patch14-224"
47
+ _CONFIG_FOR_DOC = Kosmos2Config
48
+ _EXPECTED_OUTPUT_SHAPE = None
49
+
50
+
51
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
52
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
53
+ """
54
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
55
+ """
56
+ bsz, src_len = mask.size()
57
+ tgt_len = tgt_len if tgt_len is not None else src_len
58
+
59
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
60
+
61
+ inverted_mask = 1.0 - expanded_mask
62
+
63
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
64
+
65
+
66
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
67
+ def _make_causal_mask(
68
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
69
+ ):
70
+ """
71
+ Make causal mask used for bi-directional self-attention.
72
+ """
73
+ bsz, tgt_len = input_ids_shape
74
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
75
+ mask_cond = torch.arange(mask.size(-1), device=device)
76
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
77
+ mask = mask.to(dtype)
78
+
79
+ if past_key_values_length > 0:
80
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
81
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
82
+
83
+
84
+ # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
85
+ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
86
+ """
87
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
88
+ are ignored. This is modified from fairseq's `utils.make_positions`.
89
+
90
+ Args:
91
+ x: torch.Tensor x:
92
+
93
+ Returns: torch.Tensor
94
+ """
95
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
96
+ mask = input_ids.ne(padding_idx).int()
97
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
98
+ return incremental_indices.long() + padding_idx
99
+
100
+
101
+ KOSMOS2_START_DOCSTRING = r"""Kosmos-2"""
102
+ KOSMOS2_VISION_INPUTS_DOCSTRING = r"""Kosmos-2"""
103
+ KOSMOS2_TEXT_INPUTS_DOCSTRING = r"""Kosmos-2"""
104
+ KOSMOS2_INPUTS_DOCSTRING = r"""Kosmos-2"""
105
+
106
+
107
+ @dataclass
108
+ class Kosmos2ModelOutput(ModelOutput):
109
+ """
110
+ Base class for text model's outputs that also contains a pooling of the last hidden states.
111
+
112
+ Args:
113
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
114
+ Sequence of hidden-states at the output of the last layer of the model.
115
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
116
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
117
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
118
+
119
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
120
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
121
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
122
+ sequence_length)`.
123
+
124
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
125
+ heads.
126
+ image_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when being computed by the model):
127
+ Sequence of hidden-states at the output of `Kosmos2ImageToTextConnector`.
128
+ image_connector_attention (`tuple(torch.FloatTensor)`, *optional, returned when being computed by the model):
129
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
130
+ sequence_length)`.
131
+
132
+ Attentions weights given by `Kosmos2ImageToTextConnector`, after the attention softmax, used to compute the weighted average in the self-attention
133
+ heads.
134
+ vision_model_output(`BaseModelOutputWithPooling`, *optional*, returned when being computed by the model):
135
+ The output of the [`Kosmos2VisionModel`].
136
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
137
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
138
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
139
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
140
+ encoder_sequence_length, embed_size_per_head)`.
141
+
142
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
143
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
144
+ input) to speed up sequential decoding.
145
+ """
146
+
147
+ last_hidden_states: torch.FloatTensor = None
148
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
149
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
150
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
151
+ image_features: Optional[torch.FloatTensor] = None
152
+ image_connector_attention: Optional[Tuple[torch.FloatTensor]] = None
153
+ vision_model_output: BaseModelOutputWithPooling = None
154
+
155
+
156
+ @dataclass
157
+ class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
158
+ """
159
+ Model output class for `Kosmos2ForConditionalGeneration`.
160
+
161
+ Args:
162
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
163
+ Language modeling loss (for next-token prediction).
164
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
165
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
166
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
167
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
168
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
169
+
170
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
171
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
172
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
173
+ sequence_length)`.
174
+
175
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
176
+ heads.
177
+ image_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when being computed by the model):
178
+ Sequence of hidden-states at the output of `Kosmos2ImageToTextConnector`.
179
+ image_connector_attention (`tuple(torch.FloatTensor)`, *optional, returned when being computed by the model):
180
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
181
+ sequence_length)`.
182
+
183
+ Attentions weights given by `Kosmos2ImageToTextConnector`, after the attention softmax, used to compute the weighted average in the self-attention
184
+ heads.
185
+ vision_model_output(`BaseModelOutputWithPooling`, *optional*, returned when being computed by the model):
186
+ The output of the [`Kosmos2VisionModel`].
187
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
188
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
189
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
190
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
191
+ encoder_sequence_length, embed_size_per_head)`.
192
+
193
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
194
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
195
+ input) to speed up sequential decoding.
196
+ """
197
+
198
+ loss: Optional[torch.FloatTensor] = None
199
+ logits: torch.FloatTensor = None
200
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
201
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
202
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
203
+ image_features: Optional[torch.FloatTensor] = None
204
+ image_connector_attention: Optional[Tuple[torch.FloatTensor]] = None
205
+ vision_model_output: BaseModelOutputWithPooling = None
206
+
207
+
208
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Kosmos2
209
+ class Kosmos2VisionEmbeddings(nn.Module):
210
+ def __init__(self, config: Kosmos2VisionConfig):
211
+ super().__init__()
212
+ self.config = config
213
+ self.embed_dim = config.hidden_size
214
+ self.image_size = config.image_size
215
+ self.patch_size = config.patch_size
216
+
217
+ self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
218
+
219
+ self.patch_embedding = nn.Conv2d(
220
+ in_channels=config.num_channels,
221
+ out_channels=self.embed_dim,
222
+ kernel_size=self.patch_size,
223
+ stride=self.patch_size,
224
+ bias=False,
225
+ )
226
+
227
+ self.num_patches = (self.image_size // self.patch_size) ** 2
228
+ self.num_positions = self.num_patches + 1
229
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
230
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
231
+
232
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
233
+ batch_size = pixel_values.shape[0]
234
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
235
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
236
+
237
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1)
238
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
239
+ embeddings = embeddings + self.position_embedding(self.position_ids)
240
+ return embeddings
241
+
242
+
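+ # Illustrative shape note (assuming the 224x224 / patch-14 configuration this checkpoint uses):
+ #   pixel_values: (batch_size, 3, 224, 224)
+ #   patch_embeds after flatten/transpose: (batch_size, (224 // 14) ** 2, hidden_size) = (batch_size, 256, hidden_size)
+ #   embeddings with the prepended class token: (batch_size, 257, hidden_size)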
243
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->Kosmos2Vision
244
+ class Kosmos2VisionAttention(nn.Module):
245
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
246
+
247
+ def __init__(self, config):
248
+ super().__init__()
249
+ self.config = config
250
+ self.embed_dim = config.hidden_size
251
+ self.num_heads = config.num_attention_heads
252
+ self.head_dim = self.embed_dim // self.num_heads
253
+ if self.head_dim * self.num_heads != self.embed_dim:
254
+ raise ValueError(
255
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
256
+ f" {self.num_heads})."
257
+ )
258
+ self.scale = self.head_dim**-0.5
259
+ self.dropout = config.attention_dropout
260
+
261
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
262
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
263
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
264
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
265
+
266
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
267
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
268
+
269
+ def forward(
270
+ self,
271
+ hidden_states: torch.Tensor,
272
+ attention_mask: Optional[torch.Tensor] = None,
273
+ causal_attention_mask: Optional[torch.Tensor] = None,
274
+ output_attentions: Optional[bool] = False,
275
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
276
+ """Input shape: Batch x Time x Channel"""
277
+
278
+ bsz, tgt_len, embed_dim = hidden_states.size()
279
+
280
+ # get query proj
281
+ query_states = self.q_proj(hidden_states) * self.scale
282
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
283
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
284
+
285
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
286
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
287
+ key_states = key_states.view(*proj_shape)
288
+ value_states = value_states.view(*proj_shape)
289
+
290
+ src_len = key_states.size(1)
291
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
292
+
293
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
294
+ raise ValueError(
295
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
296
+ f" {attn_weights.size()}"
297
+ )
298
+
299
+ # apply the causal_attention_mask first
300
+ if causal_attention_mask is not None:
301
+ if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
302
+ raise ValueError(
303
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
304
+ f" {causal_attention_mask.size()}"
305
+ )
306
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
307
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
308
+
309
+ if attention_mask is not None:
310
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
311
+ raise ValueError(
312
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
313
+ )
314
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
315
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
316
+
317
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
318
+
319
+ if output_attentions:
320
+ # this operation is a bit awkward, but it's required to
321
+ # make sure that attn_weights keeps its gradient.
322
+ # In order to do so, attn_weights have to be reshaped
323
+ # twice and have to be reused in the following
324
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
325
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
326
+ else:
327
+ attn_weights_reshaped = None
328
+
329
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
330
+
331
+ attn_output = torch.bmm(attn_probs, value_states)
332
+
333
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
334
+ raise ValueError(
335
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
336
+ f" {attn_output.size()}"
337
+ )
338
+
339
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
340
+ attn_output = attn_output.transpose(1, 2)
341
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
342
+
343
+ attn_output = self.out_proj(attn_output)
344
+
345
+ return attn_output, attn_weights_reshaped
346
+
347
+
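+ # Illustrative note on the attention above: with `B = bsz * num_heads`, `attn_weights = softmax((Q * scale) @ K^T)`
+ # has shape (B, tgt_len, src_len); the causal mask is added first, then the padding mask, both as large
+ # negative biases before the softmax. `attn_output = attn_weights @ V` has shape (B, tgt_len, head_dim)
+ # and is reshaped back to (bsz, tgt_len, embed_dim) before `out_proj`.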
348
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Kosmos2Vision
349
+ class Kosmos2VisionMLP(nn.Module):
350
+ def __init__(self, config):
351
+ super().__init__()
352
+ self.config = config
353
+ self.activation_fn = ACT2FN[config.hidden_act]
354
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
355
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
356
+
357
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
358
+ hidden_states = self.fc1(hidden_states)
359
+ hidden_states = self.activation_fn(hidden_states)
360
+ hidden_states = self.fc2(hidden_states)
361
+ return hidden_states
362
+
363
+
364
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Kosmos2Vision
365
+ class Kosmos2VisionEncoderLayer(nn.Module):
366
+ def __init__(self, config: Kosmos2VisionConfig):
367
+ super().__init__()
368
+ self.embed_dim = config.hidden_size
369
+ self.self_attn = Kosmos2VisionAttention(config)
370
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
371
+ self.mlp = Kosmos2VisionMLP(config)
372
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
373
+
374
+ def forward(
375
+ self,
376
+ hidden_states: torch.Tensor,
377
+ attention_mask: torch.Tensor,
378
+ causal_attention_mask: torch.Tensor,
379
+ output_attentions: Optional[bool] = False,
380
+ ) -> Tuple[torch.FloatTensor]:
381
+ """
382
+ Args:
383
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
384
+ attention_mask (`torch.FloatTensor`): attention mask of size
385
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
386
+ causal_attention_mask (`torch.FloatTensor`): causal attention mask of size `(batch, 1, tgt_len, src_len)`.
387
+ output_attentions (`bool`, *optional*):
388
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
389
+ returned tensors for more detail.
390
+ """
391
+ residual = hidden_states
392
+
393
+ hidden_states = self.layer_norm1(hidden_states)
394
+ hidden_states, attn_weights = self.self_attn(
395
+ hidden_states=hidden_states,
396
+ attention_mask=attention_mask,
397
+ causal_attention_mask=causal_attention_mask,
398
+ output_attentions=output_attentions,
399
+ )
400
+ hidden_states = residual + hidden_states
401
+
402
+ residual = hidden_states
403
+ hidden_states = self.layer_norm2(hidden_states)
404
+ hidden_states = self.mlp(hidden_states)
405
+ hidden_states = residual + hidden_states
406
+
407
+ outputs = (hidden_states,)
408
+
409
+ if output_attentions:
410
+ outputs += (attn_weights,)
411
+
412
+ return outputs
413
+
414
+
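+ # Illustrative note: the encoder layer above uses a pre-LayerNorm residual layout, i.e. schematically
+ #     x = x + SelfAttention(layer_norm1(x))
+ #     x = x + MLP(layer_norm2(x))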
415
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Kosmos2Vision
416
+ class Kosmos2VisionEncoder(nn.Module):
417
+ """
418
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
419
+ [`Kosmos2VisionEncoderLayer`].
420
+
421
+ Args:
422
+ config: Kosmos2VisionConfig
423
+ """
424
+
425
+ def __init__(self, config: Kosmos2VisionConfig):
426
+ super().__init__()
427
+ self.config = config
428
+ self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
429
+ self.gradient_checkpointing = False
430
+
431
+ def forward(
432
+ self,
433
+ inputs_embeds,
434
+ attention_mask: Optional[torch.Tensor] = None,
435
+ causal_attention_mask: Optional[torch.Tensor] = None,
436
+ output_attentions: Optional[bool] = None,
437
+ output_hidden_states: Optional[bool] = None,
438
+ return_dict: Optional[bool] = None,
439
+ ) -> Union[Tuple, BaseModelOutput]:
440
+ r"""
441
+ Args:
442
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
443
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
444
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
445
+ than the model's internal embedding lookup matrix.
446
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
447
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
448
+
449
+ - 1 for tokens that are **not masked**,
450
+ - 0 for tokens that are **masked**.
451
+
452
+ [What are attention masks?](../glossary#attention-mask)
453
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
454
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
455
+
456
+ - 1 for tokens that are **not masked**,
457
+ - 0 for tokens that are **masked**.
458
+
459
+ [What are attention masks?](../glossary#attention-mask)
460
+ output_attentions (`bool`, *optional*):
461
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
462
+ returned tensors for more detail.
463
+ output_hidden_states (`bool`, *optional*):
464
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
465
+ for more detail.
466
+ return_dict (`bool`, *optional*):
467
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
468
+ """
469
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
470
+ output_hidden_states = (
471
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
472
+ )
473
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
474
+
475
+ encoder_states = () if output_hidden_states else None
476
+ all_attentions = () if output_attentions else None
477
+
478
+ hidden_states = inputs_embeds
479
+ for idx, encoder_layer in enumerate(self.layers):
480
+ if output_hidden_states:
481
+ encoder_states = encoder_states + (hidden_states,)
482
+ if self.gradient_checkpointing and self.training:
483
+
484
+ def create_custom_forward(module):
485
+ def custom_forward(*inputs):
486
+ return module(*inputs, output_attentions)
487
+
488
+ return custom_forward
489
+
490
+ layer_outputs = torch.utils.checkpoint.checkpoint(
491
+ create_custom_forward(encoder_layer),
492
+ hidden_states,
493
+ attention_mask,
494
+ causal_attention_mask,
495
+ )
496
+ else:
497
+ layer_outputs = encoder_layer(
498
+ hidden_states,
499
+ attention_mask,
500
+ causal_attention_mask,
501
+ output_attentions=output_attentions,
502
+ )
503
+
504
+ hidden_states = layer_outputs[0]
505
+
506
+ if output_attentions:
507
+ all_attentions = all_attentions + (layer_outputs[1],)
508
+
509
+ if output_hidden_states:
510
+ encoder_states = encoder_states + (hidden_states,)
511
+
512
+ if not return_dict:
513
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
514
+ return BaseModelOutput(
515
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
516
+ )
517
+
518
+
519
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer with CLIPVision->Kosmos2Vision,CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2Vision
520
+ class Kosmos2VisionTransformer(nn.Module):
521
+ def __init__(self, config: Kosmos2VisionConfig):
522
+ super().__init__()
523
+ self.config = config
524
+ embed_dim = config.hidden_size
525
+
526
+ self.embeddings = Kosmos2VisionEmbeddings(config)
527
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
528
+ self.encoder = Kosmos2VisionEncoder(config)
529
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
530
+
531
+ @add_start_docstrings_to_model_forward(KOSMOS2_VISION_INPUTS_DOCSTRING)
532
+ def forward(
533
+ self,
534
+ pixel_values: Optional[torch.FloatTensor] = None,
535
+ output_attentions: Optional[bool] = None,
536
+ output_hidden_states: Optional[bool] = None,
537
+ return_dict: Optional[bool] = None,
538
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
539
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
540
+ output_hidden_states = (
541
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
542
+ )
543
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
544
+
545
+ if pixel_values is None:
546
+ raise ValueError("You have to specify pixel_values")
547
+
548
+ hidden_states = self.embeddings(pixel_values)
549
+ hidden_states = self.pre_layrnorm(hidden_states)
550
+
551
+ encoder_outputs = self.encoder(
552
+ inputs_embeds=hidden_states,
553
+ output_attentions=output_attentions,
554
+ output_hidden_states=output_hidden_states,
555
+ return_dict=return_dict,
556
+ )
557
+
558
+ last_hidden_state = encoder_outputs[0]
559
+ pooled_output = last_hidden_state[:, 0, :]
560
+ pooled_output = self.post_layernorm(pooled_output)
561
+
562
+ if not return_dict:
563
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
564
+
565
+ return BaseModelOutputWithPooling(
566
+ last_hidden_state=last_hidden_state,
567
+ pooler_output=pooled_output,
568
+ hidden_states=encoder_outputs.hidden_states,
569
+ attentions=encoder_outputs.attentions,
570
+ )
571
+
572
+
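+ # Illustrative note: `pooler_output` above is the post-layernormed class-token state,
+ # `post_layernorm(last_hidden_state[:, 0, :])`, whereas `Kosmos2Model`/`Kosmos2ForConditionalGeneration`
+ # below apply `post_layernorm` to the whole `last_hidden_state` before the image-to-text connector.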
573
+ # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->Kosmos2
574
+ class Kosmos2TextSinusoidalPositionalEmbedding(nn.Module):
575
+ """This module produces sinusoidal positional embeddings of any length."""
576
+
577
+ def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
578
+ super().__init__()
579
+ self.offset = 2
580
+ self.embedding_dim = embedding_dim
581
+ self.padding_idx = padding_idx
582
+ self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
583
+
584
+ def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
585
+ emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
586
+ if hasattr(self, "weights"):
587
+ # in forward put the weights on the correct dtype and device of the param
588
+ emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
589
+
590
+ self.register_buffer("weights", emb_weights, persistent=False)
591
+
592
+ @staticmethod
593
+ def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
594
+ """
595
+ Build sinusoidal embeddings.
596
+
597
+ This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
598
+ "Attention Is All You Need".
599
+ """
600
+ half_dim = embedding_dim // 2
601
+ emb = math.log(10000) / (half_dim - 1)
602
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
603
+ emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
604
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
605
+ if embedding_dim % 2 == 1:
606
+ # zero pad
607
+ emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
608
+ if padding_idx is not None:
609
+ emb[padding_idx, :] = 0
610
+
611
+ return emb.to(torch.get_default_dtype())
612
+
613
+ @torch.no_grad()
614
+ def forward(
615
+ self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
616
+ ):
617
+ if input_ids is not None:
618
+ bsz, seq_len = input_ids.size()
619
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
620
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
621
+ input_ids.device
622
+ )
623
+ else:
624
+ bsz, seq_len = inputs_embeds.size()[:-1]
625
+ position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
626
+
627
+ # expand embeddings if needed
628
+ max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
629
+ if max_pos > self.weights.size(0):
630
+ self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
631
+
632
+ return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
633
+
634
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
635
+ """
636
+ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
637
+
638
+ Args:
639
+ inputs_embeds: torch.Tensor
640
+
641
+ Returns: torch.Tensor
642
+ """
643
+ input_shape = inputs_embeds.size()[:-1]
644
+ sequence_length = input_shape[1]
645
+
646
+ position_ids = torch.arange(
647
+ self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
648
+ )
649
+ return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
650
+
651
+
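+ # Illustrative sketch of the table built by `get_embedding` above:
+ #     freq_j = exp(-j * log(10000) / (half_dim - 1)),  j = 0 .. half_dim - 1
+ #     emb[pos] = [sin(pos * freq_0), ..., sin(pos * freq_{half_dim-1}), cos(pos * freq_0), ..., cos(pos * freq_{half_dim-1})]
+ # with the row at `padding_idx` zeroed out and `self.offset = 2` extra rows reserved, so position ids
+ # start after `padding_idx` (fairseq convention).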
652
+ # Similar to transformers.models.bart.modeling_bart.BartAttention with an additional `inner_attn_ln`.
653
+ class KosmosTextAttention(nn.Module):
654
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
655
+
656
+ def __init__(
657
+ self,
658
+ config,
659
+ embed_dim: int,
660
+ num_heads: int,
661
+ dropout: float = 0.0,
662
+ is_decoder: bool = False,
663
+ add_inner_attn_layernorm: bool = False,
664
+ bias: bool = True,
665
+ ):
666
+ super().__init__()
667
+ self.embed_dim = embed_dim
668
+ self.num_heads = num_heads
669
+ self.dropout = dropout
670
+ self.head_dim = embed_dim // num_heads
671
+
672
+ if (self.head_dim * num_heads) != self.embed_dim:
673
+ raise ValueError(
674
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
675
+ f" and `num_heads`: {num_heads})."
676
+ )
677
+ self.scaling = self.head_dim**-0.5
678
+ self.is_decoder = is_decoder
679
+
680
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
681
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
682
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
683
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
684
+
685
+ self.inner_attn_ln = None
686
+ if add_inner_attn_layernorm:
687
+ self.inner_attn_ln = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
688
+
689
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
690
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
691
+
692
+ def forward(
693
+ self,
694
+ hidden_states: torch.Tensor,
695
+ key_value_states: Optional[torch.Tensor] = None,
696
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
697
+ attention_mask: Optional[torch.Tensor] = None,
698
+ layer_head_mask: Optional[torch.Tensor] = None,
699
+ output_attentions: bool = False,
700
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
701
+ """Input shape: Batch x Time x Channel"""
702
+
703
+ # if key_value_states are provided this layer is used as a cross-attention layer
704
+ # for the decoder
705
+ is_cross_attention = key_value_states is not None
706
+
707
+ bsz, tgt_len, _ = hidden_states.size()
708
+
709
+ # get query proj
710
+ query_states = self.q_proj(hidden_states) * self.scaling
711
+ # get key, value proj
712
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
713
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
714
+ # the provided `key_value_states` to support prefix tuning
715
+ if (
716
+ is_cross_attention
717
+ and past_key_value is not None
718
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
719
+ ):
720
+ # reuse k,v, cross_attentions
721
+ key_states = past_key_value[0]
722
+ value_states = past_key_value[1]
723
+ elif is_cross_attention:
724
+ # cross_attentions
725
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
726
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
727
+ elif past_key_value is not None:
728
+ # reuse k, v, self_attention
729
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
730
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
731
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
732
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
733
+ else:
734
+ # self_attention
735
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
736
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
737
+
738
+ if self.is_decoder:
739
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
740
+ # Further calls to cross_attention layer can then reuse all cross-attention
741
+ # key/value_states (first "if" case)
742
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
743
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
744
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
745
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
746
+ past_key_value = (key_states, value_states)
747
+
748
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
749
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
750
+ key_states = key_states.reshape(*proj_shape)
751
+ value_states = value_states.reshape(*proj_shape)
752
+
753
+ src_len = key_states.size(1)
754
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
755
+
756
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
757
+ raise ValueError(
758
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
759
+ f" {attn_weights.size()}"
760
+ )
761
+
762
+ if attention_mask is not None:
763
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
764
+ raise ValueError(
765
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
766
+ )
767
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
768
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
769
+
770
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
771
+
772
+ if layer_head_mask is not None:
773
+ if layer_head_mask.size() != (self.num_heads,):
774
+ raise ValueError(
775
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
776
+ f" {layer_head_mask.size()}"
777
+ )
778
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
779
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
780
+
781
+ if output_attentions:
782
+ # this operation is a bit awkward, but it's required to
783
+ # make sure that attn_weights keeps its gradient.
784
+ # In order to do so, attn_weights have to be reshaped
785
+ # twice and have to be reused in the following
786
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
787
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
788
+ else:
789
+ attn_weights_reshaped = None
790
+
791
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
792
+
793
+ attn_output = torch.bmm(attn_probs, value_states)
794
+
795
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
796
+ raise ValueError(
797
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
798
+ f" {attn_output.size()}"
799
+ )
800
+
801
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
802
+ attn_output = attn_output.transpose(1, 2)
803
+
804
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
805
+ # partitioned across GPUs when using tensor-parallelism.
806
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
807
+
808
+ if self.inner_attn_ln is not None:
809
+ attn_output = self.inner_attn_ln(attn_output)
810
+
811
+ attn_output = self.out_proj(attn_output)
812
+
813
+ return attn_output, attn_weights_reshaped, past_key_value
814
+
815
+
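+ # Illustrative note: compared to the BART attention it is based on, the only structural change in
+ # `KosmosTextAttention` is the optional `inner_attn_ln` LayerNorm applied to `attn_output` just before
+ # `out_proj`. It is enabled for the decoder self-attention in `Kosmos2TextBlock`, but not for the
+ # cross-attention or for the image-to-text connector's attention.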
816
+ class Kosmos2TextFFN(nn.Module):
817
+ def __init__(self, config: Kosmos2TextConfig):
818
+ super().__init__()
819
+
820
+ self.dropout = config.dropout
821
+ self.activation_fn = ACT2FN[config.activation_function]
822
+ self.activation_dropout = config.activation_dropout
823
+
824
+ self.fc1 = nn.Linear(config.embed_dim, config.ffn_dim)
825
+ self.fc2 = nn.Linear(config.ffn_dim, config.embed_dim)
826
+
827
+ self.ffn_layernorm = nn.LayerNorm(config.ffn_dim, eps=config.layer_norm_eps)
828
+
829
+ def forward(self, hidden_states):
830
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
831
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
832
+ hidden_states = self.ffn_layernorm(hidden_states)
833
+ hidden_states = self.fc2(hidden_states)
834
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
835
+
836
+ return hidden_states
837
+
838
+
839
+ class Kosmos2TextBlock(nn.Module):
840
+ def __init__(self, config: Kosmos2TextConfig):
841
+ super().__init__()
842
+ self.embed_dim = config.embed_dim
843
+
844
+ self.self_attn = KosmosTextAttention(
845
+ config,
846
+ embed_dim=self.embed_dim,
847
+ num_heads=config.attention_heads,
848
+ dropout=config.attention_dropout,
849
+ is_decoder=True,
850
+ add_inner_attn_layernorm=True,
851
+ )
852
+ self.dropout = config.dropout
853
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
854
+
855
+ if config.add_cross_attention:
856
+ self.encoder_attn = KosmosTextAttention(
857
+ config,
858
+ embed_dim=self.embed_dim,
859
+ num_heads=config.attention_heads,
860
+ dropout=config.attention_dropout,
861
+ is_decoder=True,
862
+ add_inner_attn_layernorm=False,
863
+ )
864
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
865
+
866
+ self.ffn = Kosmos2TextFFN(config)
867
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
868
+
869
+ def forward(
870
+ self,
871
+ hidden_states: torch.Tensor,
872
+ attention_mask: Optional[torch.Tensor] = None,
873
+ encoder_hidden_states: Optional[torch.Tensor] = None,
874
+ encoder_attention_mask: Optional[torch.Tensor] = None,
875
+ layer_head_mask: Optional[torch.Tensor] = None,
876
+ cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
877
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
878
+ output_attentions: Optional[bool] = False,
879
+ use_cache: Optional[bool] = True,
880
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
881
+ residual = hidden_states
882
+
883
+ # Self Attention
884
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
885
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
886
+
887
+ hidden_states = self.self_attn_layer_norm(hidden_states)
888
+
889
+ # add present self-attn cache to positions 1,2 of present_key_value tuple
890
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
891
+ hidden_states=hidden_states,
892
+ past_key_value=self_attn_past_key_value,
893
+ attention_mask=attention_mask,
894
+ layer_head_mask=layer_head_mask,
895
+ output_attentions=output_attentions,
896
+ )
897
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
898
+ hidden_states = residual + hidden_states
899
+
900
+ # Cross-Attention Block
901
+ cross_attn_present_key_value = None
902
+ cross_attn_weights = None
903
+ if encoder_hidden_states is not None:
904
+ if not hasattr(self, "encoder_attn"):
905
+ raise ValueError(
906
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
907
+ " by setting `config.add_cross_attention=True`"
908
+ )
909
+
910
+ residual = hidden_states
911
+
912
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
913
+
914
+ # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
915
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
916
+ hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
917
+ hidden_states=hidden_states,
918
+ key_value_states=encoder_hidden_states,
919
+ attention_mask=encoder_attention_mask,
920
+ layer_head_mask=cross_attn_layer_head_mask,
921
+ past_key_value=cross_attn_past_key_value,
922
+ output_attentions=output_attentions,
923
+ )
924
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
925
+ hidden_states = residual + hidden_states
926
+
927
+ # add cross-attn to positions 3,4 of present_key_value tuple
928
+ present_key_value = present_key_value + cross_attn_present_key_value
929
+
930
+ # Fully Connected
931
+ residual = hidden_states
932
+
933
+ hidden_states = self.final_layer_norm(hidden_states)
934
+
935
+ # FFN
936
+ hidden_states = self.ffn(hidden_states)
937
+ hidden_states = residual + hidden_states
938
+
939
+ outputs = (hidden_states,)
940
+
941
+ if output_attentions:
942
+ outputs += (self_attn_weights, cross_attn_weights)
943
+
944
+ if use_cache:
945
+ outputs += (present_key_value,)
946
+
947
+ return outputs
948
+
949
+
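+ # Illustrative note on the cache layout returned by `Kosmos2TextBlock` when `use_cache=True`:
+ # `present_key_value` is `(self_k, self_v)` for pure self-attention, or `(self_k, self_v, cross_k, cross_v)`
+ # when cross-attention runs, each tensor of shape (batch_size, num_heads, seq_len, head_dim).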
950
+ class Kosmos2TextTransformer(nn.Module):
951
+ """
952
+ Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].
953
+
954
+ Args:
955
+ config: Kosmos2TextConfig
956
+ """
957
+
958
+ def __init__(self, config: Kosmos2TextConfig):
959
+ super().__init__()
960
+ self.config = config
961
+ self.dropout = config.dropout
962
+ self.layerdrop = config.layerdrop
963
+
964
+ self.embed_scale = math.sqrt(config.embed_dim) if config.scale_embedding else 1.0
965
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.embed_dim, padding_idx=config.pad_token_id)
966
+
967
+ self.embed_positions = Kosmos2TextSinusoidalPositionalEmbedding(
968
+ num_positions=config.max_position_embeddings,
969
+ embedding_dim=config.embed_dim,
970
+ padding_idx=config.pad_token_id,
971
+ )
972
+
973
+ self.layers = nn.ModuleList([Kosmos2TextBlock(config) for _ in range(config.layers)])
974
+ self.layer_norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
975
+
976
+ self.gradient_checkpointing = False
977
+
978
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
979
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
980
+ # create causal mask
981
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
982
+ combined_attention_mask = None
983
+ if input_shape[-1] > 1:
984
+ combined_attention_mask = _make_causal_mask(
985
+ input_shape,
986
+ inputs_embeds.dtype,
987
+ device=inputs_embeds.device,
988
+ past_key_values_length=past_key_values_length,
989
+ )
990
+
991
+ if attention_mask is not None:
992
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
993
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
994
+ inputs_embeds.device
995
+ )
996
+ combined_attention_mask = (
997
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
998
+ )
999
+
1000
+ return combined_attention_mask
1001
+
1002
+ def forward_embedding(
1003
+ self, input_ids, inputs_embeds=None, img_features=None, img_input_mask=None, past_key_values_length: int = 0
1004
+ ):
1005
+ # The argument `inputs_embeds` should be the one without being multiplied by `self.embed_scale`.
1006
+ if inputs_embeds is None:
1007
+ inputs_embeds = self.embed_tokens(input_ids)
1008
+
1009
+ if img_features is not None:
1010
+ inputs_embeds[img_input_mask.to(dtype=torch.bool)] = img_features.view(-1, img_features.size(-1))
1011
+
1012
+ inputs_embeds = inputs_embeds * self.embed_scale
1013
+
1014
+ # embed positions
1015
+ positions = self.embed_positions(
1016
+ input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length
1017
+ )
1018
+ positions = positions.to(inputs_embeds.device)
1019
+
1020
+ hidden_states = inputs_embeds + positions
1021
+
1022
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1023
+
1024
+ return hidden_states
1025
+
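+ # Illustrative note on `forward_embedding` above: `img_input_mask` is True exactly at the image
+ # placeholder positions of the prompt, and the matching rows of `inputs_embeds` are overwritten in
+ # place with the connector's image features before scaling by `embed_scale` and adding the sinusoidal
+ # positions.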
1026
+ def forward(
1027
+ self,
1028
+ input_ids: Optional[torch.Tensor] = None,
1029
+ attention_mask: Optional[torch.Tensor] = None,
1030
+ img_features: Optional[torch.Tensor] = None,
1031
+ img_attn_mask: Optional[torch.Tensor] = None,
1032
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1033
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1034
+ head_mask: Optional[torch.Tensor] = None,
1035
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1036
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1037
+ inputs_embeds: Optional[torch.Tensor] = None,
1038
+ use_cache: Optional[bool] = None,
1039
+ output_attentions: Optional[bool] = None,
1040
+ output_hidden_states: Optional[bool] = None,
1041
+ return_dict: Optional[bool] = None,
1042
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
1043
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1044
+ output_hidden_states = (
1045
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1046
+ )
1047
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1048
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1049
+
1050
+ if input_ids is not None and inputs_embeds is not None:
1051
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1052
+ elif input_ids is not None:
1053
+ input_shape = input_ids.shape
1054
+ input_ids = input_ids.view(-1, input_shape[-1])
1055
+ elif inputs_embeds is not None:
1056
+ input_shape = inputs_embeds.size()[:-1]
1057
+ else:
1058
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1059
+
1060
+ # past_key_values_length
1061
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
1062
+
1063
+ # We don't need image information when `past_key_values_length` > 0 (it is already encoded in the cache)
1064
+ if past_key_values_length > 0:
1065
+ img_features = None
1066
+ img_attn_mask = None
1067
+
1068
+ hidden_states = self.forward_embedding(
1069
+ input_ids=input_ids,
1070
+ inputs_embeds=inputs_embeds,
1071
+ img_features=img_features,
1072
+ img_input_mask=img_attn_mask,
1073
+ past_key_values_length=past_key_values_length,
1074
+ )
1075
+
1076
+ attention_mask = self._prepare_decoder_attention_mask(
1077
+ attention_mask, input_shape, hidden_states, past_key_values_length
1078
+ )
1079
+
1080
+ # expand encoder attention mask
1081
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
1082
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1083
+ encoder_attention_mask = _expand_mask(encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1])  # use hidden_states.dtype: inputs_embeds may be None when input_ids is passed
1084
+
1085
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1086
+
1087
+ if self.gradient_checkpointing and self.training:
1088
+ if use_cache:
1089
+ logger.warning_once(
1090
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1091
+ )
1092
+ use_cache = False
1093
+
1094
+ # decoder layers
1095
+ all_hidden_states = () if output_hidden_states else None
1096
+ all_self_attns = () if output_attentions else None
1097
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
1098
+ next_decoder_cache = () if use_cache else None
1099
+
1100
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
1101
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
1102
+ if attn_mask is not None:
1103
+ if attn_mask.size()[0] != (len(self.layers)):
1104
+ raise ValueError(
1105
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
1106
+ f" {head_mask.size()[0]}."
1107
+ )
1108
+
1109
+ for idx, decoder_layer in enumerate(self.layers):
1110
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
1111
+ if output_hidden_states:
1112
+ all_hidden_states += (hidden_states,)
1113
+ if self.training:
1114
+ dropout_probability = torch.rand([])
1115
+ if dropout_probability < self.layerdrop:
1116
+ continue
1117
+
1118
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
1119
+
1120
+ if self.gradient_checkpointing and self.training:
1121
+
1122
+ def create_custom_forward(module):
1123
+ def custom_forward(*inputs):
1124
+ # None for past_key_value
1125
+ return module(*inputs, output_attentions, use_cache)
1126
+
1127
+ return custom_forward
1128
+
1129
+ layer_outputs = torch.utils.checkpoint.checkpoint(
1130
+ create_custom_forward(decoder_layer),
1131
+ hidden_states,
1132
+ attention_mask,
1133
+ encoder_hidden_states,
1134
+ encoder_attention_mask,
1135
+ head_mask[idx] if head_mask is not None else None,
1136
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
1137
+ None,
1138
+ )
1139
+ else:
1140
+ layer_outputs = decoder_layer(
1141
+ hidden_states,
1142
+ attention_mask=attention_mask,
1143
+ encoder_hidden_states=encoder_hidden_states,
1144
+ encoder_attention_mask=encoder_attention_mask,
1145
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
1146
+ cross_attn_layer_head_mask=(
1147
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
1148
+ ),
1149
+ past_key_value=past_key_value,
1150
+ output_attentions=output_attentions,
1151
+ use_cache=use_cache,
1152
+ )
1153
+ hidden_states = layer_outputs[0]
1154
+
1155
+ if use_cache:
1156
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
1157
+
1158
+ if output_attentions:
1159
+ all_self_attns += (layer_outputs[1],)
1160
+
1161
+ if encoder_hidden_states is not None:
1162
+ all_cross_attentions += (layer_outputs[2],)
1163
+
1164
+ # add final layer norm
1165
+ hidden_states = self.layer_norm(hidden_states)
1166
+
1167
+ # add hidden states from the last decoder layer
1168
+ if output_hidden_states:
1169
+ all_hidden_states += (hidden_states,)
1170
+
1171
+ next_cache = next_decoder_cache if use_cache else None
1172
+ if not return_dict:
1173
+ return tuple(
1174
+ v
1175
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
1176
+ if v is not None
1177
+ )
1178
+ return BaseModelOutputWithPastAndCrossAttentions(
1179
+ last_hidden_state=hidden_states,
1180
+ past_key_values=next_cache,
1181
+ hidden_states=all_hidden_states,
1182
+ attentions=all_self_attns,
1183
+ cross_attentions=all_cross_attentions,
1184
+ )
1185
+
1186
+
1187
+ class Kosmos2PreTrainedModel(PreTrainedModel):
1188
+ """
1189
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
1190
+ models.
1191
+ """
1192
+
1193
+ config_class = Kosmos2Config
1194
+ supports_gradient_checkpointing = True
1195
+
1196
+
1197
+ @add_start_docstrings(
1198
+ """The vision model from KOSMOS-2 without any head or projection on top.""",
1199
+ KOSMOS2_START_DOCSTRING,
1200
+ )
1201
+ class Kosmos2VisionModel(Kosmos2PreTrainedModel):
1202
+ config_class = Kosmos2VisionConfig
1203
+ main_input_name = "pixel_values"
1204
+
1205
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2
1206
+ def __init__(self, config: Kosmos2VisionConfig):
1207
+ super().__init__(config)
1208
+ self.model = Kosmos2VisionTransformer(config)
1209
+ # Initialize weights and apply final processing
1210
+ self.post_init()
1211
+
1212
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.get_input_embeddings with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2
1213
+ def get_input_embeddings(self) -> nn.Module:
1214
+ return self.model.embeddings.patch_embedding
1215
+
1216
+ @add_start_docstrings_to_model_forward(KOSMOS2_VISION_INPUTS_DOCSTRING)
1217
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Kosmos2VisionConfig)
1218
+ def forward(
1219
+ self,
1220
+ pixel_values: Optional[torch.FloatTensor] = None,
1221
+ output_attentions: Optional[bool] = None,
1222
+ output_hidden_states: Optional[bool] = None,
1223
+ return_dict: Optional[bool] = None,
1224
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
1225
+ r"""
1226
+ Returns:
1227
+
1228
+ """
1229
+ return self.model(
1230
+ pixel_values=pixel_values,
1231
+ output_attentions=output_attentions,
1232
+ output_hidden_states=output_hidden_states,
1233
+ return_dict=return_dict,
1234
+ )
1235
+
1236
+
1237
+ @add_start_docstrings(
1238
+ """The text model from KOSMOS-2 without any head or projection on top.""",
1239
+ KOSMOS2_START_DOCSTRING,
1240
+ )
1241
+ class Kosmos2TextModel(Kosmos2PreTrainedModel):
1242
+ config_class = Kosmos2TextConfig
1243
+
1244
+ _no_split_modules = ["Kosmos2TextBlock"]
1245
+
1246
+ def __init__(self, config: Kosmos2TextConfig):
1247
+ super().__init__(config)
1248
+ self.model = Kosmos2TextTransformer(config)
1249
+ # Initialize weights and apply final processing
1250
+ self.post_init()
1251
+
1252
+ def get_input_embeddings(self) -> nn.Module:
1253
+ return self.model.embed_tokens
1254
+
1255
+ def set_input_embeddings(self, value):
1256
+ self.model.embed_tokens = value
1257
+
1258
+ @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING)
1259
+ @replace_return_docstrings(output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=Kosmos2TextConfig)
1260
+ def forward(
1261
+ self,
1262
+ input_ids: Optional[torch.Tensor] = None,
1263
+ attention_mask: Optional[torch.Tensor] = None,
1264
+ img_features: Optional[torch.Tensor] = None,
1265
+ img_attn_mask: Optional[torch.Tensor] = None,
1266
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1267
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1268
+ head_mask: Optional[torch.Tensor] = None,
1269
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1270
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1271
+ inputs_embeds: Optional[torch.Tensor] = None,
1272
+ use_cache: Optional[bool] = None,
1273
+ output_attentions: Optional[bool] = None,
1274
+ output_hidden_states: Optional[bool] = None,
1275
+ return_dict: Optional[bool] = None,
1276
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
1277
+ r"""
1278
+ Returns:
1279
+
1280
+ """
1281
+ return self.model(
1282
+ input_ids=input_ids,
1283
+ attention_mask=attention_mask,
1284
+ img_features=img_features,
1285
+ img_attn_mask=img_attn_mask,
1286
+ encoder_hidden_states=encoder_hidden_states,
1287
+ encoder_attention_mask=encoder_attention_mask,
1288
+ head_mask=head_mask,
1289
+ cross_attn_head_mask=cross_attn_head_mask,
1290
+ past_key_values=past_key_values,
1291
+ inputs_embeds=inputs_embeds,
1292
+ use_cache=use_cache,
1293
+ output_attentions=output_attentions,
1294
+ output_hidden_states=output_hidden_states,
1295
+ return_dict=return_dict,
1296
+ )
1297
+
1298
+
1299
+ @add_start_docstrings(
1300
+ """
1301
+ The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
1302
+ embeddings).
1303
+ """,
1304
+ KOSMOS2_START_DOCSTRING,
1305
+ )
1306
+ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel):
1307
+ config_class = Kosmos2TextConfig
1308
+ _tied_weights_keys = ["lm_head.weight"]
1309
+
1310
+ def __init__(self, config: Kosmos2TextConfig):
1311
+ super().__init__(config)
1312
+
1313
+ self.model = Kosmos2TextTransformer(config)
1314
+ self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)
1315
+
1316
+ # Initialize weights and apply final processing
1317
+ self.post_init()
1318
+
1319
+ def get_input_embeddings(self) -> nn.Module:
1320
+ return self.model.embed_tokens
1321
+
1322
+ def set_input_embeddings(self, value):
1323
+ self.model.embed_tokens = value
1324
+
1325
+ def get_output_embeddings(self) -> nn.Module:
1326
+ return self.lm_head
1327
+
1328
+ def set_output_embeddings(self, new_embeddings):
1329
+ self.lm_head = new_embeddings
1330
+
1331
+ @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING)
1332
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=Kosmos2TextConfig)
1333
+ def forward(
1334
+ self,
1335
+ input_ids: Optional[torch.Tensor] = None,
1336
+ attention_mask: Optional[torch.Tensor] = None,
1337
+ img_features: Optional[torch.Tensor] = None,
1338
+ img_attn_mask: Optional[torch.Tensor] = None,
1339
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1340
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1341
+ head_mask: Optional[torch.Tensor] = None,
1342
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1343
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1344
+ inputs_embeds: Optional[torch.Tensor] = None,
1345
+ labels: Optional[torch.LongTensor] = None,
1346
+ use_cache: Optional[bool] = None,
1347
+ output_attentions: Optional[bool] = None,
1348
+ output_hidden_states: Optional[bool] = None,
1349
+ return_dict: Optional[bool] = None,
1350
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
1351
+ r"""
1352
+ Returns:
1353
+
1354
+ """
1355
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1356
+
1357
+ if labels is not None:
1358
+ if use_cache:
1359
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
1360
+ use_cache = False
1361
+
1362
+ outputs = self.model(
1363
+ input_ids=input_ids,
1364
+ attention_mask=attention_mask,
1365
+ img_features=img_features,
1366
+ img_attn_mask=img_attn_mask,
1367
+ encoder_hidden_states=encoder_hidden_states,
1368
+ encoder_attention_mask=encoder_attention_mask,
1369
+ head_mask=head_mask,
1370
+ cross_attn_head_mask=cross_attn_head_mask,
1371
+ past_key_values=past_key_values,
1372
+ inputs_embeds=inputs_embeds,
1373
+ use_cache=use_cache,
1374
+ output_attentions=output_attentions,
1375
+ output_hidden_states=output_hidden_states,
1376
+ return_dict=return_dict,
1377
+ )
1378
+ logits = self.lm_head(outputs[0])
1379
+
1380
+ loss = None
1381
+ if labels is not None:
1382
+ # Shift so that tokens < n predict n
1383
+ shift_logits = logits[..., :-1, :].contiguous()
1384
+ shift_labels = labels[..., 1:].contiguous()
1385
+ # Flatten the tokens
1386
+ loss_fct = CrossEntropyLoss()
1387
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1388
+ shift_labels = shift_labels.view(-1)
1389
+ # Enable model parallelism
1390
+ shift_labels = shift_labels.to(shift_logits.device)
1391
+ loss = loss_fct(shift_logits, shift_labels)
1392
+
1393
+ if not return_dict:
1394
+ output = (logits,) + outputs[1:]
1395
+ return (loss,) + output if loss is not None else output
1396
+
1397
+ return CausalLMOutputWithCrossAttentions(
1398
+ loss=loss,
1399
+ logits=logits,
1400
+ past_key_values=outputs.past_key_values,
1401
+ hidden_states=outputs.hidden_states,
1402
+ attentions=outputs.attentions,
1403
+ cross_attentions=outputs.cross_attentions,
1404
+ )
1405
+
1406
+ def prepare_inputs_for_generation(
1407
+ self,
1408
+ input_ids,
1409
+ img_features,
1410
+ img_attn_mask,
1411
+ past_key_values=None,
1412
+ attention_mask=None,
1413
+ use_cache=None,
1414
+ **model_kwargs,
1415
+ ):
1416
+ input_shape = input_ids.shape
1417
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
1418
+ if attention_mask is None:
1419
+ attention_mask = input_ids.new_ones(input_shape)
1420
+
1421
+ # cut input_ids if past_key_values is used
1422
+ if past_key_values is not None:
1423
+ input_ids = input_ids[:, -1:]
1424
+ # the image info. is already encoded into the past keys/values
1425
+ img_features = None
1426
+ img_attn_mask = None
1427
+ elif img_attn_mask is not None:
1428
+ # appending `False` to `img_attn_mask` (because `input_ids` grows during generation)
1429
+ batch_size, seq_len = input_ids.size()
1430
+ mask_len = img_attn_mask.size()[-1]
1431
+ img_attn_mask = torch.cat(
1432
+ (img_attn_mask, torch.zeros(size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=input_ids.device)), dim=1
1433
+ )
1434
+
1435
+ return {
1436
+ "input_ids": input_ids,
1437
+ "img_features": img_features,
1438
+ "img_attn_mask": img_attn_mask,
1439
+ "past_key_values": past_key_values,
1440
+ "attention_mask": attention_mask,
1441
+ "use_cache": use_cache,
1442
+ }
1443
+
1444
+
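+ # Illustrative note on `prepare_inputs_for_generation` above: once `past_key_values` exist, the image
+ # features and `img_attn_mask` are dropped because the image information already lives in the cache;
+ # without a cache, `img_attn_mask` is right-padded with `False` so its length keeps matching the growing
+ # `input_ids` (newly generated positions are text, not image, positions).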
1445
+ class Kosmos2ImageToTextConnector(nn.Module):
1446
+ """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""
1447
+
1448
+ def __init__(self, config: Kosmos2Config):
1449
+ super().__init__()
1450
+ self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)
1451
+ self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))
1452
+
1453
+ self.x_attn = KosmosTextAttention(
1454
+ config.text_config,
1455
+ config.text_config.embed_dim,
1456
+ config.text_config.attention_heads,
1457
+ dropout=config.text_config.attention_dropout,
1458
+ is_decoder=False,
1459
+ add_inner_attn_layernorm=False,
1460
+ )
1461
+
1462
+ def forward(self, features):
1463
+ hidden_states = self.dense(features)
1464
+
1465
+ # shape = [batch, latent_query_num, h_dim]
1466
+ latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
1467
+ key_value_states = torch.cat([hidden_states, latent_query], dim=1)
1468
+
1469
+ hidden_states, attn_weights, _ = self.x_attn(
1470
+ hidden_states=latent_query,
1471
+ key_value_states=key_value_states,
1472
+ past_key_value=None,
1473
+ attention_mask=None,
1474
+ output_attentions=None,
1475
+ )
1476
+
1477
+ return hidden_states, attn_weights
1478
+
1479
+
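+ # Illustrative shape note for `Kosmos2ImageToTextConnector` (assuming the default `latent_query_num=64`
+ # and the 257-token vision sequence of this checkpoint): `features` of shape (batch_size, 257,
+ # vision_hidden_size) is projected to the text width, the 64 learned latent queries attend over the
+ # concatenation of the projected features and themselves, and the result of shape
+ # (batch_size, 64, text_embed_dim) is what `Kosmos2TextTransformer.forward_embedding` scatters into the
+ # text sequence.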
1480
+ @add_start_docstrings(
1481
+ """
1482
+ KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder (CLIP) and a language
1483
+ model.
1484
+ """,
1485
+ KOSMOS2_START_DOCSTRING,
1486
+ )
1487
+ class Kosmos2Model(Kosmos2PreTrainedModel):
1488
+ config_class = Kosmos2Config
1489
+
1490
+ def __init__(self, config: Kosmos2Config):
1491
+ super().__init__(config)
1492
+
1493
+ self.text_model = Kosmos2TextModel(config.text_config)
1494
+ self.vision_model = Kosmos2VisionModel(config.vision_config)
1495
+ self.image_to_text_connector = Kosmos2ImageToTextConnector(config)
1496
+
1497
+ # Initialize weights and apply final processing
1498
+ self.post_init()
1499
+
1500
+ def get_input_embeddings(self) -> nn.Module:
1501
+ return self.text_model.model.embed_tokens
1502
+
1503
+ def set_input_embeddings(self, value):
1504
+ self.text_model.model.embed_tokens = value
1505
+
1506
+ @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING)
1507
+ @replace_return_docstrings(output_type=Kosmos2ModelOutput, config_class=Kosmos2Config)
1508
+ def forward(
1509
+ self,
1510
+ pixel_values: Optional[torch.Tensor] = None,
1511
+ input_ids: Optional[torch.Tensor] = None,
1512
+ attention_mask: Optional[torch.Tensor] = None,
1513
+ img_attn_mask: Optional[torch.Tensor] = None,
1514
+ head_mask: Optional[torch.Tensor] = None,
1515
+ img_features: Optional[torch.Tensor] = None,
1516
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1517
+ inputs_embeds: Optional[torch.Tensor] = None,
1518
+ use_cache: Optional[bool] = None,
1519
+ output_attentions: Optional[bool] = None,
1520
+ output_hidden_states: Optional[bool] = None,
1521
+ return_dict: Optional[bool] = None,
1522
+ ) -> Union[Tuple, Kosmos2ModelOutput]:
1523
+ # TODO: Add this
1524
+ r"""
1525
+ Returns:
1526
+
1527
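+ Examples:
+
+ (Illustrative sketch only, written in the same style as the `Kosmos2ForConditionalGeneration` example
+ further below; it assumes `Kosmos2Model` is importable once the port into `transformers` is finished.)
+
+ ```python
+ >>> from PIL import Image
+ >>> from transformers import AutoProcessor, Kosmos2Model
+
+ >>> # NOTE: illustrative only; `Kosmos2Model` returns hidden states rather than generated text.
+ >>> model = Kosmos2Model.from_pretrained("ydshieh/kosmos-2-patch14-224")
+ >>> processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224")
+
+ >>> prompt = "<grounding> An image of"
+ >>> image = Image.open("snowman.jpg")
+
+ >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+ >>> outputs = model(
+ ...     pixel_values=inputs["pixel_values"],
+ ...     input_ids=inputs["input_ids"],
+ ...     attention_mask=inputs["attention_mask"],
+ ...     img_attn_mask=inputs["img_attn_mask"],
+ ... )
+ >>> last_hidden_state = outputs[0]  # (batch_size, sequence_length, text embed_dim)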
+ ```"""
1528
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1529
+ output_hidden_states = (
1530
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1531
+ )
1532
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1533
+
1534
+ vision_model_output = None
1535
+ image_connector_attention = None
1536
+ if img_features is None:
1537
+ if pixel_values is None:
1538
+ raise ValueError("You have to specify either `pixel_values` or `img_features`.")
1539
+
1540
+ vision_model_output = self.vision_model(pixel_values)
1541
+ # HF's CLIP has `last_hidden_state` without going through `post_layernorm`.
1542
+ # Here we need the whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
1543
+ img_features = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state)
1544
+ # normalized features
1545
+ img_features = nn.functional.normalize(img_features, dim=-1)
1546
+ img_features, image_connector_attention = self.image_to_text_connector(img_features)
1547
+
1548
+ outputs = self.text_model(
1549
+ input_ids=input_ids,
1550
+ attention_mask=attention_mask,
1551
+ img_features=img_features,
1552
+ img_attn_mask=img_attn_mask,
1553
+ head_mask=head_mask,
1554
+ past_key_values=past_key_values,
1555
+ inputs_embeds=inputs_embeds,
1556
+ use_cache=use_cache,
1557
+ output_attentions=output_attentions,
1558
+ output_hidden_states=output_hidden_states,
1559
+ return_dict=return_dict,
1560
+ )
1561
+
1562
+ if not return_dict:
1563
+ outputs = outputs + (img_features, image_connector_attention, vision_model_output)
1564
+ return tuple(output for output in outputs if output is not None)
1565
+
1566
+ return Kosmos2ModelOutput(
1567
+ last_hidden_states=outputs.last_hidden_state,
1568
+ past_key_values=outputs.past_key_values,
1569
+ hidden_states=outputs.hidden_states,
1570
+ attentions=outputs.attentions,
1571
+ image_features=img_features,
1572
+ image_connector_attention=image_connector_attention,
1573
+ vision_model_output=vision_model_output,
1574
+ )
1575
+
1576
+
1577
+ @add_start_docstrings(
1578
+ """
1579
+ KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder (CLIP)
1580
+ and a language model.
1581
+ """,
1582
+ KOSMOS2_START_DOCSTRING,
1583
+ )
1584
+ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel):
1585
+ config_class = Kosmos2Config
1586
+ _tied_weights_keys = ["text_model.lm_head.weight"]
1587
+
1588
+ def __init__(self, config: Kosmos2Config):
1589
+ super().__init__(config)
1590
+
1591
+ self.text_model = Kosmos2TextForCausalLM(config.text_config)
1592
+ self.vision_model = Kosmos2VisionModel(config.vision_config)
1593
+
1594
+ self.image_to_text_connector = Kosmos2ImageToTextConnector(config)
1595
+
1596
+ # Initialize weights and apply final processing
1597
+ self.post_init()
1598
+
1599
+ def get_input_embeddings(self) -> nn.Module:
1600
+ return self.text_model.model.embed_tokens
1601
+
1602
+ def set_input_embeddings(self, value):
1603
+ self.text_model.model.embed_tokens = value
1604
+
1605
+ def get_output_embeddings(self) -> nn.Module:
1606
+ return self.text_model.get_output_embeddings()
1607
+
1608
+ def set_output_embeddings(self, new_embeddings):
1609
+ self.text_model.set_output_embeddings(new_embeddings)
1610
+
1611
+ @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING)
1612
+ @replace_return_docstrings(output_type=Kosmos2ForConditionalGenerationModelOutput, config_class=Kosmos2Config)
1613
+ def forward(
1614
+ self,
1615
+ pixel_values: Optional[torch.Tensor] = None,
1616
+ img_attn_mask=None,
1617
+ input_ids: Optional[torch.Tensor] = None,
1618
+ attention_mask=None,
1619
+ head_mask: Optional[torch.Tensor] = None,
1620
+ img_features: Optional[List[torch.FloatTensor]] = None,
1621
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1622
+ inputs_embeds: Optional[torch.Tensor] = None,
1623
+ labels: Optional[torch.LongTensor] = None,
1624
+ use_cache: Optional[bool] = None,
1625
+ output_attentions: Optional[bool] = None,
1626
+ output_hidden_states: Optional[bool] = None,
1627
+ return_dict: Optional[bool] = None,
1628
+ ) -> Union[Tuple, Kosmos2ForConditionalGenerationModelOutput]:
1629
+ r"""
1630
+ Returns:
1631
+
1632
+ Examples:
1633
+
1634
+ ```python
1635
+ >>> from PIL import Image
1636
+ >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
1637
+
1638
+ >>> model = Kosmos2ForConditionalGeneration.from_pretrained("ydshieh/kosmos-2-patch14-224")
1639
+ >>> processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224")
1640
+
1641
+ >>> prompt = "<grounding> An image of"
1642
+ >>> image = Image.open("snowman.jpg")
1643
+
1644
+ >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
1645
+
1646
+ >>> generated_ids = model.generate(
1647
+ ... pixel_values=inputs["pixel_values"],
1648
+ ... input_ids=inputs["input_ids"][:, :-1],
1649
+ ... attention_mask=inputs["attention_mask"][:, :-1],
1650
+ ... img_features=None,
1651
+ ... img_attn_mask=inputs["img_attn_mask"][:, :-1],
1652
+ ... use_cache=True,
1653
+ ... max_new_tokens=64,
1654
+ ... )
1655
+
1656
+ >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
1657
+ >>> result = processor.post_process_generation(generated_text)
1658
+ >>> result
1659
+ <grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.
1660
+ ```"""
1661
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1662
+ output_hidden_states = (
1663
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1664
+ )
1665
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1666
+
1667
+ vision_model_output = None
1668
+ image_connector_attention = None
1669
+ if img_features is None:
1670
+ if pixel_values is None:
1671
+ raise ValueError("You have to specify either `pixel_values` or `img_features`.")
1672
+
1673
+ vision_model_output = self.vision_model(pixel_values)
1674
+ # HF's CLIP has `last_hidden_state` without going through `post_layernorm`.
1675
+ # Here we need the whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
1676
+ img_features = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state)
1677
+ # normalized features
1678
+ img_features = nn.functional.normalize(img_features, dim=-1)
1679
+ img_features, image_connector_attention = self.image_to_text_connector(img_features)
1680
+
1681
+ lm_outputs = self.text_model(
1682
+ input_ids=input_ids,
1683
+ attention_mask=attention_mask,
1684
+ img_features=img_features,
1685
+ img_attn_mask=img_attn_mask,
1686
+ head_mask=head_mask,
1687
+ past_key_values=past_key_values,
1688
+ inputs_embeds=inputs_embeds,
1689
+ labels=labels,
1690
+ use_cache=use_cache,
1691
+ output_attentions=output_attentions,
1692
+ output_hidden_states=output_hidden_states,
1693
+ return_dict=return_dict,
1694
+ )
1695
+
1696
+ if not return_dict:
1697
+ outputs = lm_outputs + (img_features, image_connector_attention, vision_model_output)
1698
+ return tuple(output for output in outputs if output is not None)
1699
+
1700
+ return Kosmos2ForConditionalGenerationModelOutput(
1701
+ loss=lm_outputs.loss,
1702
+ logits=lm_outputs.logits,
1703
+ past_key_values=lm_outputs.past_key_values,
1704
+ hidden_states=lm_outputs.hidden_states,
1705
+ attentions=lm_outputs.attentions,
1706
+ image_features=img_features,
1707
+ image_connector_attention=image_connector_attention,
1708
+ vision_model_output=vision_model_output,
1709
+ )
1710
+
1711
+ def generate(
1712
+ self,
1713
+ input_ids=None,
1714
+ attention_mask=None,
1715
+ img_features=None,
1716
+ inputs_embeds=None,
1717
+ pixel_values=None,
1718
+ **kwargs,
1719
+ ):
1720
+ # in order to allow `inputs` argument (as in `GenerationMixin`)
1721
+ inputs = kwargs.pop("inputs", None)
1722
+ if pixel_values is not None and inputs is not None:
1723
+ raise ValueError(
1724
+ f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed."
1725
+ f"Make sure to either pass `inputs` or pixel_values=..."
1726
+ )
1727
+ if pixel_values is None and inputs is not None:
1728
+ pixel_values = inputs
1729
+
1730
+ if img_features is None:
1731
+ vision_model_output = self.vision_model(pixel_values)
1732
+ # HF's CLIP has `last_hidden_state` without going through `post_layernorm`.
1733
+ # Here we need the whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
1734
+ img_features = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state)
1735
+ # normalized features
1736
+ img_features = nn.functional.normalize(img_features, dim=-1)
1737
+ img_features, image_connector_attention = self.image_to_text_connector(img_features)
1738
+
1739
+ output = self.text_model.generate(
1740
+ input_ids=input_ids,
1741
+ attention_mask=attention_mask,
1742
+ img_features=img_features,
1743
+ inputs_embeds=inputs_embeds,
1744
+ **kwargs,
1745
+ )
1746
+
1747
+ return output
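Since both `forward` and `generate` above only run the vision encoder when `img_features` is `None`, the image features can be computed once and reused across several prompts for the same image. Below is a minimal sketch (not part of the committed files), assuming the remote-code checkpoint from this repository; it simply mirrors the feature path used inside `generate` (vision model → `post_layernorm` → L2 normalization → image-to-text connector), and the second prompt is only an illustrative variant.

```python
import requests
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)

url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.png"
image = Image.open(requests.get(url, stream=True).raw)

prompts = ["<grounding>An image of", "<grounding>A photo of"]

with torch.no_grad():
    # Same steps as the `img_features is None` branch of `generate`
    pixel_values = processor(text=prompts[0], images=image, return_tensors="pt")["pixel_values"]
    vision_output = model.vision_model(pixel_values)
    img_features = model.vision_model.model.post_layernorm(vision_output.last_hidden_state)
    img_features = torch.nn.functional.normalize(img_features, dim=-1)
    img_features, _ = model.image_to_text_connector(img_features)

for prompt in prompts:
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"][:, :-1],
        attention_mask=inputs["attention_mask"][:, :-1],
        img_features=img_features,  # precomputed above, so `pixel_values` is not passed again
        img_attn_mask=inputs["img_attn_mask"][:, :-1],
        use_cache=True,
        max_new_tokens=64,
    )
    print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```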
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "Kosmos2ImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "processor_class": "Kosmos2Processor",
23
+ "auto_map": {
24
+ "AutoProcessor": "processing_kosmos2.Kosmos2Processor",
25
+ "AutoImageProcessor": "image_processing_kosmos2.Kosmos2ImageProcessor"
26
+ },
27
+ "resample": 3,
28
+ "rescale_factor": 0.00392156862745098,
29
+ "size": {
30
+ "shortest_edge": 224
31
+ }
32
+ }
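This configuration wires `AutoImageProcessor` and `AutoProcessor` to the remote-code classes above and reuses CLIP-style preprocessing: a 224×224 resize and center crop, bicubic resampling (`"resample": 3`), `rescale_factor` = 1/255, and the CLIP mean/std. A small sanity-check sketch (the repository id is the one used elsewhere in this card):

```python
import requests
from PIL import Image
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)

url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.png"
image = Image.open(requests.get(url, stream=True).raw)

pixel_values = image_processor(image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # (1, 3, 224, 224), given the crop_size above
```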
processing_kosmos2.py ADDED
@@ -0,0 +1,608 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Processor class for KOSMOS-2."""
16
+
17
+ import copy
18
+ import math
19
+ import re
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+
24
+ from transformers.image_processing_utils import BatchFeature
25
+ from transformers.image_utils import ImageInput, is_batched
26
+ from transformers.processing_utils import ProcessorMixin
27
+ from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
28
+ from transformers.utils import TensorType, is_tf_available, is_torch_available
29
+
30
+
31
+ if is_torch_available():
32
+ import torch
33
+
34
+ if is_tf_available():
35
+ import tensorflow as tf
36
+
37
+
38
+ BboxInput = Union[
39
+ List[Tuple[int, int]],
40
+ List[Tuple[float, float, float, float]],
41
+ List[List[Tuple[int, int]]],
42
+ List[List[Tuple[float, float, float, float]]],
43
+ ]
44
+
45
+
46
+ class Kosmos2Processor(ProcessorMixin):
47
+ r"""
48
+ Constructs a KOSMOS-2 processor which wraps a CLIP image processor and a KOSMOS-2 tokenizer into a single
49
+ processor.
50
+
51
+ [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`Kosmos2TokenizerFast`]. See the
52
+ docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`] for more information.
53
+
54
+ Args:
55
+ image_processor (`CLIPImageProcessor`):
56
+ An instance of [`CLIPImageProcessor`]. The image processor is a required input.
57
+ tokenizer (`Kosmos2TokenizerFast`):
58
+ An instance of [`Kosmos2TokenizerFast`]. The tokenizer is a required input.
59
+ """
60
+ attributes = ["image_processor", "tokenizer"]
61
+ # Better to use explicit classes if local code works
62
+ # image_processor_class = "Kosmos2ImageProcessor"
63
+ # tokenizer_class = ("Kosmos2Tokenizer", "Kosmos2TokenizerFast")
64
+
65
+ # To make remote code work
66
+ image_processor_class = "AutoImageProcessor"
67
+ tokenizer_class = "AutoTokenizer"
68
+
69
+ def __init__(self, image_processor, tokenizer):
70
+ tokenizer.return_token_type_ids = False
71
+ super().__init__(image_processor, tokenizer)
72
+ self.current_processor = self.image_processor
73
+
74
+ def __call__(
75
+ self,
76
+ images: ImageInput = None,
77
+ text: Union[TextInput, List[TextInput]] = None,
78
+ bboxes: BboxInput = None,
79
+ num_image_tokens: Optional[int] = 64,
80
+ first_image_token_id: Optional[int] = None,
81
+ add_special_tokens: bool = True,
82
+ padding: Union[bool, str, PaddingStrategy] = False,
83
+ truncation: Union[bool, str, TruncationStrategy] = None,
84
+ max_length: Optional[int] = None,
85
+ stride: int = 0,
86
+ pad_to_multiple_of: Optional[int] = None,
87
+ return_attention_mask: Optional[bool] = None,
88
+ return_overflowing_tokens: bool = False,
89
+ return_special_tokens_mask: bool = False,
90
+ return_offsets_mapping: bool = False,
91
+ return_token_type_ids: bool = False,
92
+ return_length: bool = False,
93
+ verbose: bool = True,
94
+ return_tensors: Optional[Union[str, TensorType]] = None,
95
+ **kwargs,
96
+ ) -> BatchFeature:
97
+ """
98
+ This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
99
+ [`Kosmos2TokenizerFast.__call__`] to prepare text for the model.
100
+
101
+ Please refer to the docstring of the above two methods for more information.
102
+ """
103
+ if text is None:
104
+ raise ValueError("You have to specify at least `text`.")
105
+
106
+ text = self.preprocess_text(text, images, bboxes, num_image_tokens=num_image_tokens)
107
+
108
+ encoding = BatchFeature()
109
+
110
+ text_encoding = self.tokenizer(
111
+ text=text,
112
+ add_special_tokens=add_special_tokens,
113
+ padding=padding,
114
+ truncation=truncation,
115
+ max_length=max_length,
116
+ stride=stride,
117
+ pad_to_multiple_of=pad_to_multiple_of,
118
+ return_attention_mask=return_attention_mask,
119
+ return_overflowing_tokens=return_overflowing_tokens,
120
+ return_special_tokens_mask=return_special_tokens_mask,
121
+ return_offsets_mapping=return_offsets_mapping,
122
+ return_token_type_ids=return_token_type_ids,
123
+ return_length=return_length,
124
+ verbose=verbose,
125
+ return_tensors=return_tensors,
126
+ **kwargs,
127
+ )
128
+ encoding.update(text_encoding)
129
+
130
+ if images is not None:
131
+ image_encoding = self.image_processor(images, return_tensors=return_tensors)
132
+ encoding.update(image_encoding)
133
+
134
+ # Use the id of the first token after <unk>
135
+ if first_image_token_id is None:
136
+ first_image_token_id = self.tokenizer.unk_token_id + 1
137
+
138
+ # To see if we need one more `0` (for `<s>`) at the beginning of `img_attn_mask`.
139
+ with_bos = add_special_tokens
140
+
141
+ # The first (actual) `<image>` token is always at the 1st or 2nd place (after `<s>` if any). Here we look
142
+ # for the second `<image>` token (which indicates the first image token).
143
+ start_index = int(with_bos) + 1
144
+
145
+ if return_tensors:
146
+ # change the ids for the fake `<image>` tokens in `input_ids`
147
+ input_ids = np.array(encoding["input_ids"])
148
+ input_ids[:, start_index : (start_index + num_image_tokens)] = np.arange(
149
+ first_image_token_id, first_image_token_id + num_image_tokens
150
+ )
151
+
152
+ batch_size, seq_len = input_ids.shape[:2]
153
+ img_attn_mask = []
154
+ if with_bos:
155
+ # for `<s>`
156
+ img_attn_mask.append(np.zeros(shape=(batch_size, 1), dtype=np.int64))
157
+ # for `<image>` (the real one)
158
+ img_attn_mask.append(np.zeros(shape=(batch_size, 1), dtype=np.int64))
159
+ # for image tokens
160
+ img_attn_mask.append(np.ones(shape=(batch_size, num_image_tokens), dtype=np.int64))
161
+ # for `</image>`
162
+ img_attn_mask.append(np.zeros(shape=(batch_size, 1), dtype=np.int64))
163
+ # trailing part (which are not related to the image)
164
+ seq_len -= int(with_bos) + 1 + num_image_tokens + 1
165
+ img_attn_mask.append(np.zeros(shape=(batch_size, seq_len), dtype=np.int64))
166
+
167
+ # concatenate along the sequence dimension
168
+ img_attn_mask = np.concatenate(img_attn_mask, axis=1)
169
+
170
+ # to the target tensor type
171
+ if return_tensors == "pt":
172
+ input_ids = torch.from_numpy(input_ids)
173
+ img_attn_mask = torch.from_numpy(img_attn_mask)
174
+ elif return_tensors == "tf":
175
+ input_ids = tf.convert_to_tensor(input_ids)
176
+ img_attn_mask = tf.convert_to_tensor(img_attn_mask)
177
+
178
+ encoding["input_ids"] = input_ids
179
+ encoding["img_attn_mask"] = img_attn_mask
180
+
181
+ else:
182
+ # Add `img_attn_mask`: the leading and trailing `0` are for `boi` and `eoi` tokens. The `1` indicates
183
+ # the places of image tokens.
184
+ image_token_ids = list(range(first_image_token_id, first_image_token_id + num_image_tokens))
185
+ base_img_attn_mask = [0] + [1] * num_image_tokens + [0]
186
+
187
+ # loop over `encoding["input_ids"]`
188
+ input_ids = []
189
+ img_attn_mask = []
190
+ all_input_ids = encoding["input_ids"]
191
+ # not batched -> (changed to) batch of size 1
192
+ if isinstance(text, str):
193
+ all_input_ids = [all_input_ids]
194
+ for text_ids in all_input_ids:
195
+ # change the ids for the fake `<image>` tokens in `input_ids`
196
+ text_ids = text_ids[:start_index] + image_token_ids + text_ids[start_index + num_image_tokens :]
197
+ input_ids.append(text_ids)
198
+
199
+ mask = copy.copy(base_img_attn_mask)
200
+ if with_bos:
201
+ # for `<s>`
202
+ mask = [0] + mask
203
+ # trailing part (which are not related to the image)
204
+ mask += [0] * (len(text_ids) - len(mask))
205
+ img_attn_mask.append(mask)
206
+
207
+ # un-batch if necessary
208
+ if isinstance(text, str):
209
+ input_ids = input_ids[0]
210
+ img_attn_mask = img_attn_mask[0]
211
+
212
+ encoding["input_ids"] = input_ids
213
+ encoding["img_attn_mask"] = img_attn_mask
214
+
215
+ return encoding
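As the mask-building code above lays out, for a single image the returned `img_attn_mask` is `0` for `<s>` and for the real `<image>`/`</image>` tokens, `1` for the `num_image_tokens` placeholder positions, and `0` for the trailing text. A quick sketch of inspecting that layout (same repository id and snowman image as elsewhere in this card):

```python
import requests
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)

url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.png"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text="<grounding>An image of", images=image, return_tensors="pt")

# layout: <s> | <image> | 64 image tokens | </image> | text tokens ...
print(inputs["input_ids"].shape, inputs["img_attn_mask"].shape)
print(inputs["img_attn_mask"][0, :4].tolist())  # [0, 0, 1, 1] -> image-token positions start at index 2
print(int(inputs["img_attn_mask"].sum()))       # 64, i.e. `num_image_tokens`
```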
216
+
217
+ def preprocess_text(
218
+ self,
219
+ texts: Union[TextInput, List[TextInput]],
220
+ images: ImageInput = None,
221
+ bboxes: BboxInput = None,
222
+ num_image_tokens: Optional[int] = 64,
223
+ ) -> Union[str, List[str]]:
224
+ """Add image and bounding box information to `texts` as image and patch index tokens.
225
+
226
+ Args:
227
+ texts (`Union[TextInput, List[TextInput]]`): The texts to be processed.
228
+ images (`ImageInput`, *optional*): The images associated to `texts`.
229
+ bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*): The bounding boxes associated to `texts`.
230
+ num_image_tokens (`int`, *optional*, defaults to 64): The number of image tokens (used as latent queries). This should correspond to the `latent_query_num` attribute in `Kosmos2Config`.
231
+
232
+ Returns:
233
+ `Union[TextInput, List[TextInput]]`: The processed texts with image and patch index tokens.
234
+ """
235
+ # These are fake `<image>` tokens enclosed between (the actual) `<image>` token and `</image>`.
236
+ img_tokens = ["<image>"] * num_image_tokens
237
+ img_info = " ".join(["<image>"] + img_tokens + ["</image>"])
238
+
239
+ def check_bboxes_for_single_text(bboxes):
240
+ """
241
+ Check `bboxes` for a single text example. It could be
242
+ - `None`: no bounding box associated to a text.
243
+ - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair
244
+ found in a text. This could be:
245
+ - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
246
+ - A tuple of 2 integers: A single bounding box specified by patch indices.
247
+ - A tuple of 4 floating point numbers: A single bounding box specified by (normalized) coordinates.
248
+ - A list containing the above 2 tuple types: Multiple bounding boxes for a
249
+ `<phrase> ... </phrase>` pair.
250
+ """
251
+ if bboxes is None:
252
+ return
253
+ elif not isinstance(bboxes, list):
254
+ raise ValueError("`bboxes` (for a single text example) should be `None` or a list.")
255
+
256
+ # `bbox` is the bounding boxes for a single <phrase> </phrase> pair
257
+ for bbox in bboxes:
258
+ if bbox is None:
259
+ continue
260
+ elif not isinstance(bbox, list):
261
+ bbox = [bbox]
262
+ for elt in bbox:
263
+ if not isinstance(elt, tuple) or not (
264
+ (len(elt) == 2 and all(isinstance(x, int) for x in elt))
265
+ or (len(elt) == 4 and all(isinstance(x, float) for x in elt))
266
+ ):
267
+ raise ValueError(
268
+ "Each element in `bboxes` (for a single text example) should be `None`, a tuple containing "
269
+ "2 integers or 4 float point numbers, or a list containing such tuples. Also "
270
+ "make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in "
271
+ "batches or both for a single example."
272
+ )
273
+
274
+ def preprocess_single(text, image, bboxes):
275
+ text = text.strip()
276
+ if image is not None:
277
+ # Add `<image> ... (fake) image tokens ... </image>`
278
+ text = f"{img_info} {text}"
279
+
280
+ # Add `<object> <patch_idx_xxxx> <patch_idx_yyy> </object>` after `<phrase> phrase text </phrase>`
281
+ text = self._insert_patch_index_tokens(text, bboxes)
282
+ text = self._add_remove_spaces_around_tag_tokens(text)
283
+
284
+ return text
285
+
286
+ # make batch to simplify processing logic
287
+ batched = True
288
+ if isinstance(texts, str):
289
+ batched = False
290
+ texts = [texts]
291
+
292
+ if images is None:
293
+ images = [None] * len(texts)
294
+ elif not is_batched(images):
295
+ images = [images]
296
+ if len(texts) != len(images):
297
+ raise ValueError(
298
+ f"The number of examples in `texts` and `images` should be the same. Got {len(texts)} v.s. {len(images)} instead."
299
+ )
300
+
301
+ if not batched:
302
+ check_bboxes_for_single_text(bboxes)
303
+ bboxes = [bboxes]
304
+ elif bboxes is not None:
305
+ if not isinstance(bboxes, list):
306
+ raise ValueError("`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.")
307
+ for x in bboxes:
308
+ check_bboxes_for_single_text(x)
309
+ else:
310
+ bboxes = [None] * len(texts)
311
+
312
+ if len(bboxes) != len(texts):
313
+ raise ValueError(
314
+ f"The number of examples in `texts` and `bboxes` should be the same. Got {len(texts)} v.s. {len(bboxes)} instead."
315
+ )
316
+
317
+ result = [preprocess_single(text, image, bbox) for text, image, bbox in zip(texts, images, bboxes)]
318
+ # un-batch if necessary
319
+ if not batched:
320
+ result = result[0]
321
+
322
+ return result
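Following the bbox formats documented above, `preprocess_text` (and therefore `__call__`, via its `bboxes` argument) turns each `<phrase> ... </phrase>` pair into phrase-plus-`<object>` markup with patch index tokens. A minimal sketch with normalized coordinates; the resulting indices assume the default `num_patch_index_tokens=1024`, i.e. a 32×32 grid:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)

processed = processor.preprocess_text(
    texts="<grounding> An image of<phrase> a snowman</phrase> in the garden",
    bboxes=[(0.25, 0.25, 0.75, 0.75)],  # one bbox for the single <phrase> ... </phrase> pair
)
print(processed)
# <grounding> An image of<phrase> a snowman</phrase><object><patch_index_0264><patch_index_0759></object> in the garden
```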
323
+
324
+ # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
325
+ def batch_decode(self, *args, **kwargs):
326
+ """
327
+ This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
328
+ refer to the docstring of this method for more information.
329
+ """
330
+ return self.tokenizer.batch_decode(*args, **kwargs)
331
+
332
+ # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
333
+ def decode(self, *args, **kwargs):
334
+ """
335
+ This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
336
+ to the docstring of this method for more information.
337
+ """
338
+ return self.tokenizer.decode(*args, **kwargs)
339
+
340
+ def post_process_generation(self, text, cleanup_and_extract=True):
341
+
342
+ caption = text.split("</image>")[-1]
343
+ if cleanup_and_extract:
344
+ return clean_text_and_extract_entities_with_bboxes(caption)
345
+ return caption
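`post_process_generation` drops everything up to `</image>` and, with the default `cleanup_and_extract=True`, returns the cleaned caption together with the extracted entities as `(phrase, character_span, normalized_bboxes)` tuples. A hedged sketch on the sample output shown in the `forward` docstring above (span and box numbers omitted for brevity):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)

generated_text = (
    "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object>"
    " warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
)

caption, entities = processor.post_process_generation(generated_text)
print(caption)   # "An image of a snowman warming himself by a fire."
print(entities)  # [("a snowman", (start, end), [(x1, y1, x2, y2)]), ("a fire", (start, end), [(x1, y1, x2, y2)])]
```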
346
+
347
+ @property
348
+ # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
349
+ def model_input_names(self):
350
+ tokenizer_input_names = self.tokenizer.model_input_names
351
+ image_processor_input_names = self.image_processor.model_input_names
352
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
353
+
354
+ def _insert_patch_index_tokens(self, text: str, bboxes: Union[List[Tuple[int]], List[Tuple[float]]]) -> str:
355
+ if bboxes is None or len(bboxes) == 0:
356
+ return text
357
+
358
+ matched_phrases = list(re.finditer(r"<phrase>.+?</phrase>", string=text))
359
+ if len(matched_phrases) != len(bboxes):
360
+ raise ValueError(
361
+ f"The number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got {len(matched_phrases)} v.s. {len(bboxes)} instead."
362
+ )
363
+
364
+ # insert object's patch index tokens
365
+ # the found `<phrase> ... </phrase>` pairs.
366
+ curr_pos = 0
367
+ buffer = []
368
+ for matched, bbox in zip(matched_phrases, bboxes):
369
+ _, end = matched.span()
370
+ buffer.append(text[curr_pos:end])
371
+ curr_pos = end
372
+ # A phrase without bbox
373
+ if bbox is None:
374
+ continue
375
+ # A phrase with a single bbox
376
+ if isinstance(bbox, tuple):
377
+ bbox = [bbox]
378
+ patch_index_strings = []
379
+ # A phrase could have multiple bboxes
380
+ for box in bbox:
381
+ patch_index_1, patch_index_2 = self._convert_bbox_to_patch_index_tokens(box)
382
+ patch_index_strings.append(f"{patch_index_1} {patch_index_2}")
383
+ position_str = " </delimiter_of_multi_objects/> ".join(patch_index_strings)
384
+ buffer.append(f"<object> {position_str} </object>")
385
+ # remaining
386
+ if curr_pos < len(text):
387
+ buffer.append(text[curr_pos:])
388
+
389
+ text = "".join(buffer)
390
+ return text
391
+
392
+ def _convert_bbox_to_patch_index_tokens(
393
+ self, bbox: Union[Tuple[int, int], Tuple[float, float, float, float]]
394
+ ) -> Tuple[str, str]:
395
+ # already computed patch indices
396
+ if len(bbox) == 2:
397
+ idx_1, idx_2 = bbox
398
+ # bbox specified with (normalized) coordinates
399
+ else:
400
+ # use `self.tokenizer` to get `num_patches_per_side`
401
+ num_patches_per_side = int(math.sqrt(self.tokenizer.num_patch_index_tokens))
402
+ idx_1, idx_2 = coordinate_to_patch_index(bbox, num_patches_per_side)
403
+
404
+ token_1 = f"<patch_index_{str(idx_1).zfill(4)}>"
405
+ token_2 = f"<patch_index_{str(idx_2).zfill(4)}>"
406
+
407
+ return token_1, token_2
408
+
409
+ def _add_remove_spaces_around_tag_tokens(self, text):
410
+ """
411
+ Remove spaces before tag tokens (e.g. `<x>`). Also ensure a space after a tag token, if it is not followed by
412
+ another tag token (this is not technically necessary, but good for a standard/consistent format). This avoids
413
+ the inconsistency of tokenization results between kosmos-2 slow and fast tokenizers.
414
+ """
415
+
416
+ tag_tokens = set(
417
+ self.tokenizer.tag_tokens
418
+ + [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.tokenizer.num_patch_index_tokens)]
419
+ )
420
+ pattern = "|".join(tag_tokens)
421
+ splits = re.split(rf"({pattern})", text)
422
+ # Don't keep the leading and trailing space if any
423
+ splits = [split for idx, split in enumerate(splits) if not (idx in [0, len(splits) - 1] and split == "")]
424
+
425
+ output = ""
426
+ prev_str_in_targets = False
427
+ for split in splits:
428
+ if split in tag_tokens:
429
+ prev_str_in_targets = True
430
+ output = output.rstrip() + split
431
+ else:
432
+ # we don't need to ensure a space before a normal token that follows a tag token, but having one
433
+ # keeps the format standard/consistent anyway.
434
+ if prev_str_in_targets and not split.startswith(" "):
435
+ output += " " + split
436
+ else:
437
+ output += split
438
+ prev_str_in_targets = False
439
+
440
+ return output
441
+
442
+
443
+ def coordinate_to_patch_index(bbox: Tuple[float, float, float, float], num_patches_per_side: int) -> Tuple[int, int]:
444
+ """Convert a bounding box to a pair of patch indices.
445
+
446
+ Args:
447
+ bbox (`Tuple[float, float, float, float]`):
448
+ The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left
449
+ and lower-right corners of the box. It should satisfy x2 > x1 and y2 > y1.
450
+ num_patches_per_side (`int`): the number of patches along each side.
451
+
452
+ Returns:
453
+ `Tuple[int, int]`: A pair of patch indices.
454
+ """
455
+ (x1, y1, x2, y2) = bbox
456
+
457
+ ul_x = math.floor(x1 * num_patches_per_side)
458
+ ul_y = math.floor(y1 * num_patches_per_side)
459
+
460
+ lr_x = math.ceil(x2 * num_patches_per_side - 1)
461
+ lr_y = math.ceil(y2 * num_patches_per_side - 1)
462
+
463
+ ul_idx = ul_y * num_patches_per_side + ul_x
464
+ lr_idx = lr_y * num_patches_per_side + lr_x
465
+
466
+ return ul_idx, lr_idx
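A quick worked example of the conversion above on the default 32×32 grid (run from a local clone of this repository so that `processing_kosmos2.py` is importable): a box covering the central half of the image maps to cells (8, 8) and (23, 23).

```python
from processing_kosmos2 import coordinate_to_patch_index

# floor(0.25 * 32) = 8    -> ul_idx = 8 * 32 + 8   = 264
# ceil(0.75 * 32 - 1) = 23 -> lr_idx = 23 * 32 + 23 = 759
assert coordinate_to_patch_index((0.25, 0.25, 0.75, 0.75), num_patches_per_side=32) == (264, 759)
```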
467
+
468
+
469
+ # copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L35C1-L75C38
470
+ # (with format modifications)
471
+ def patch_index_to_coordinate(ul_idx: int, lr_idx: int, num_patches_per_side: int):
472
+ """
473
+ Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
474
+ bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
475
+
476
+ Args:
477
+ ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
478
+ lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
479
+ num_patches_per_side (`int`): the number of patches along each side.
480
+
481
+ Returns:
482
+ `Tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
483
+ """
484
+ # Compute the size of each cell in the grid
485
+ cell_size = 1.0 / num_patches_per_side
486
+
487
+ # Compute the x and y indices of the upper-left and lower-right corners of the bounding box
488
+ ul_x = ul_idx % num_patches_per_side
489
+ ul_y = ul_idx // num_patches_per_side
490
+
491
+ lr_x = lr_idx % num_patches_per_side
492
+ lr_y = lr_idx // num_patches_per_side
493
+
494
+ # Compute the normalized coordinates of the bounding box
495
+ if ul_idx == lr_idx:
496
+ x1 = ul_x * cell_size
497
+ y1 = ul_y * cell_size
498
+ x2 = lr_x * cell_size + cell_size
499
+ y2 = lr_y * cell_size + cell_size
500
+ elif ul_x == lr_x or ul_y == lr_y:
501
+ x1 = ul_x * cell_size
502
+ y1 = ul_y * cell_size
503
+ x2 = lr_x * cell_size + cell_size
504
+ y2 = lr_y * cell_size + cell_size
505
+ else:
506
+ x1 = ul_x * cell_size + cell_size / 2
507
+ y1 = ul_y * cell_size + cell_size / 2
508
+ x2 = lr_x * cell_size + cell_size / 2
509
+ y2 = lr_y * cell_size + cell_size / 2
510
+
511
+ return x1, y1, x2, y2
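The inverse mapping is not exact: when the two corners fall in different rows and columns, it returns the centers of the corner cells, so round-tripping (0.25, 0.25, 0.75, 0.75) through `coordinate_to_patch_index` comes back slightly shifted.

```python
from processing_kosmos2 import patch_index_to_coordinate

# cell centers: 8/32 + 1/64 = 0.265625 and 23/32 + 1/64 = 0.734375
print(patch_index_to_coordinate(264, 759, num_patches_per_side=32))
# (0.265625, 0.265625, 0.734375, 0.734375)
```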
512
+
513
+
514
+ # copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L4-L33
515
+ # (with format modifications)
516
+ def extract_entities_with_patch_indices(text):
517
+ # The regular expression pattern for matching the required formats
518
+ pattern = r'(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>'
519
+
520
+ # Find all matches in the given string
521
+ matches = re.finditer(pattern, text)
522
+
523
+ # Initialize an empty list to store the valid patch_index combinations
524
+ entities_with_patch_indices = []
525
+
526
+ for match in matches:
527
+ # span of a `phrase` that is between <phrase> and </phrase>
528
+ span = match.span(2)
529
+ phrase_tag, phrase, match_content = match.groups()
530
+ if not phrase_tag:
531
+ phrase = None
532
+ # We take the starting position of `<object>`
533
+ span = (match.span(0)[0], match.span(0)[0])
534
+
535
+ # Split the match_content by the delimiter to get individual patch_index pairs
536
+ patch_index_pairs = match_content.split('</delimiter_of_multi_objects/>')
537
+
538
+ entity_bboxes = []
539
+ for pair in patch_index_pairs:
540
+ # Extract the xxxx and yyyy values from the patch_index pair
541
+ x = re.search(r'<patch_index_(\d+)>', pair)
542
+ y = re.search(r'<patch_index_(\d+)>', pair[1:])
543
+
544
+ if x and y:
545
+ if phrase:
546
+ entity_bboxes.append((int(x.group(1)), int(y.group(1))))
547
+ else:
548
+ entity_bboxes.append((int(x.group(1)), int(y.group(1))))
549
+
550
+ if phrase:
551
+ entities_with_patch_indices.append((phrase, span, entity_bboxes))
552
+ else:
553
+ for bbox in entity_bboxes:
554
+ # fake entity name
555
+ entity = f"<patch_index_{bbox[0]}><patch_index_{bbox[1]}>"
556
+ entities_with_patch_indices.append((entity, span, [bbox]))
557
+
558
+ return entities_with_patch_indices
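For reference, `extract_entities_with_patch_indices` on its own returns raw `(phrase, span_in_the_original_string, [patch index pairs])` tuples, before any tag removal or coordinate conversion (again assuming the module is importable from a local clone):

```python
from processing_kosmos2 import extract_entities_with_patch_indices

text = "<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object>"
print(extract_entities_with_patch_indices(text))
# [(' a snowman', (8, 18), [(44, 863)])]
```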
559
+
560
+
561
+ # TODO: Be careful
562
+ def remove_special_fields(text):
563
+ return re.sub('<.*?>', '', text)
564
+
565
+
566
+ def adjust_entity_positions(entity, text):
567
+
568
+ entity_name, (start, end) = entity
569
+ adjusted_start = len(remove_special_fields(text[:start]))
570
+ adjusted_end = len(remove_special_fields(text[:end]))
571
+ adjusted_entity = (entity_name, (adjusted_start, adjusted_end))
572
+ return adjusted_entity
573
+
574
+
575
+ # copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L77-L87
576
+ # (with format modifications)
577
+ def clean_text_and_extract_entities_with_bboxes(text, num_patches_per_side=32):
578
+
579
+ processed_text = remove_special_fields(text)
580
+
581
+ entities_with_patch_indices = extract_entities_with_patch_indices(text)
582
+ entities = []
583
+ for item in entities_with_patch_indices:
584
+ entity, bboxes = item[0:2], item[2]
585
+ adjusted_entity = adjust_entity_positions(entity, text)
586
+ bboxes_in_coords = list(map(lambda bbox: patch_index_to_coordinate(bbox[0], bbox[1], num_patches_per_side), bboxes))
587
+
588
+ entities.append(adjusted_entity + (bboxes_in_coords,))
589
+
590
+ def cleanup_spaces(text, entities):
591
+ new_text = text.strip()
592
+ leading_spaces = len(text) - len(text.lstrip())
593
+
594
+ new_entities = []
595
+ for entity_name, (start, end), bboxes in entities:
596
+
597
+ entity_name_leading_spaces = len(entity_name) - len(entity_name.lstrip())
598
+ entity_name_trailing_spaces = len(entity_name) - len(entity_name.rstrip())
599
+
600
+ start = start - leading_spaces + entity_name_leading_spaces
601
+ end = end - leading_spaces - entity_name_trailing_spaces
602
+ entity_name = entity_name.strip()
603
+
604
+ new_entities.append((entity_name, (start, end), bboxes))
605
+
606
+ return new_text, new_entities
607
+
608
+ return cleanup_spaces(processed_text, entities)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c7290a8c916513d3bc0cbda4f0b0d02dcc17db935df7da9b52d3917e47cde17
3
+ size 6658242717
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a60b4d1d1d8f70c8b2569c94540d4d9b7c694fd32e7a428ad0dcffaafaa3beb
3
+ size 1363614
snowman.jpg ADDED
snowman.png ADDED

Git LFS Details

  • SHA256: b97825997df04bd823207fd145331ffc3c3b62ec4e3a3adaac83c93debe87bdf
  • Pointer size: 132 Bytes
  • Size of remote file: 1.36 MB
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenization_kosmos2.py ADDED
@@ -0,0 +1,413 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Tokenization classes for KOSMOS-2 model."""
16
+
17
+
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import sentencepiece as spm
23
+
24
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
25
+ from transformers.utils import logging
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+ SPIECE_UNDERLINE = "▁"
31
+
32
+ VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
33
+
34
+ PRETRAINED_VOCAB_FILES_MAP = {
35
+ "vocab_file": {
36
+ "microsoft/kosmos-2-patch14-224": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/sentencepiece.bpe.model",
37
+ }
38
+ }
39
+
40
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
41
+ "microsoft/kosmos-2-patch14-224": 2048,
42
+ }
43
+
44
+
45
+ class Kosmos2Tokenizer(PreTrainedTokenizer):
46
+ """
47
+ Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
48
+ [SentencePiece](https://github.com/google/sentencepiece).
49
+
50
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
51
+ this superclass for more information regarding those methods.
52
+
53
+ Args:
54
+ vocab_file (`str`):
55
+ Path to the vocabulary file.
56
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
57
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
58
+
59
+ <Tip>
60
+
61
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
62
+ sequence. The token used is the `cls_token`.
63
+
64
+ </Tip>
65
+
66
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
67
+ The end of sequence token.
68
+
69
+ <Tip>
70
+
71
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
72
+ The token used is the `sep_token`.
73
+
74
+ </Tip>
75
+
76
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
77
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
78
+ sequence classification or for a text and a question for question answering. It is also used as the last
79
+ token of a sequence built with special tokens.
80
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
81
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
82
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
83
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
84
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
85
+ token instead.
86
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
87
+ The token used for padding, for example when batching sequences of different lengths.
88
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
89
+ The token used for masking values. This is the token used when training this model with masked language
90
+ modeling. This is the token which the model will try to predict.
91
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
92
+ Additional special tokens used by the tokenizer.
93
+ num_patch_index_tokens (`int`, *optional*, defaults to `1024`):
94
+ The number of tokens used to specify the patch indices of bounding boxes in an image. These tokens have the
95
+ format `<patch_index_xxxx>` where `xxxx` is an integer.
96
+ sp_model_kwargs (`dict`, *optional*):
97
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
98
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
99
+ to set:
100
+
101
+ - `enable_sampling`: Enable subword regularization.
102
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
103
+
104
+ - `nbest_size = {0,1}`: No sampling is performed.
105
+ - `nbest_size > 1`: samples from the nbest_size results.
106
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
107
+ using forward-filtering-and-backward-sampling algorithm.
108
+
109
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
110
+ BPE-dropout.
111
+
112
+ Attributes:
113
+ sp_model (`SentencePieceProcessor`):
114
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
115
+ """
116
+
117
+ vocab_files_names = VOCAB_FILES_NAMES
118
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
119
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
120
+ model_input_names = ["input_ids", "attention_mask"]
121
+
122
+ def __init__(
123
+ self,
124
+ vocab_file,
125
+ bos_token="<s>",
126
+ eos_token="</s>",
127
+ sep_token="</s>",
128
+ cls_token="<s>",
129
+ unk_token="<unk>",
130
+ pad_token="<pad>",
131
+ mask_token="<mask>",
132
+ num_patch_index_tokens=1024,
133
+ add_tag_and_patch_index_tokens=False,
134
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
135
+ **kwargs,
136
+ ) -> None:
137
+ # Mask token behave like a normal word, i.e. include the space before it
138
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
139
+
140
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
141
+
142
+ super().__init__(
143
+ bos_token=bos_token,
144
+ eos_token=eos_token,
145
+ unk_token=unk_token,
146
+ sep_token=sep_token,
147
+ cls_token=cls_token,
148
+ pad_token=pad_token,
149
+ mask_token=mask_token,
150
+ sp_model_kwargs=self.sp_model_kwargs,
151
+ **kwargs,
152
+ )
153
+
154
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
155
+ self.sp_model.Load(str(vocab_file))
156
+ self.vocab_file = vocab_file
157
+
158
+ # Original fairseq vocab and spm vocab must be "aligned":
159
+ # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
160
+ # -------- | ------- | ------- | ------ | ------- | ------ | ------ | ------ | ------ | ------- | ------
161
+ # fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | '.' | '_the' | ',' | '▁to' | '▁and' | '▁of'
162
+ # spm | '<unk>' | '<s>' | '</s>' | '.' | '_the' | ',' | '▁to' | '▁and' | '▁of' | '▁a'
163
+
164
+ # Mimic fairseq token-to-id alignment for the first 4 tokens
165
+ self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
166
+
167
+ # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
168
+ self.fairseq_offset = 1
169
+
170
+ self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
171
+ self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
172
+
173
+ self.eod_token = "</doc>"
174
+
175
+ self.boi_token = "<image>"
176
+ self.eoi_token = "</image>"
177
+
178
+ self.eoc_token = "</chunk>"
179
+ self.eol_token = "</line>"
180
+
181
+ self.bop_token = "<phrase>"
182
+ self.eop_token = "</phrase>"
183
+
184
+ self.boo_token = "<object>"
185
+ self.eoo_token = "</object>"
186
+
187
+ self.dom_token = "</delimiter_of_multi_objects/>"
188
+
189
+ self.grd_token = "<grounding>"
190
+
191
+ self.tag_tokens = [
192
+ self.eod_token,
193
+ self.boi_token,
194
+ self.eoi_token,
195
+ self.eoc_token,
196
+ self.eol_token,
197
+ self.bop_token,
198
+ self.eop_token,
199
+ self.boo_token,
200
+ self.eoo_token,
201
+ self.dom_token,
202
+ self.grd_token,
203
+ ]
204
+
205
+ self.num_patch_index_tokens = num_patch_index_tokens
206
+ patch_index_tokens = [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.num_patch_index_tokens)]
207
+
208
+ if add_tag_and_patch_index_tokens:
209
+ for idx, token in enumerate(self.tag_tokens + patch_index_tokens):
210
+ # we can't add them as special tokens, as the slow tokenizer doesn't save the information of a token
211
+ # being special when it is added through `add_tokens`, but the fast tokenizer is able to do so.
212
+ self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=True)
213
+
214
+ def _decode(
215
+ self,
216
+ token_ids: List[int],
217
+ skip_special_tokens: bool = False,
218
+ clean_up_tokenization_spaces: bool = None,
219
+ spaces_between_special_tokens: bool = True,
220
+ **kwargs,
221
+ ) -> str:
222
+ self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
223
+
224
+ filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
225
+
226
+ # To avoid mixing byte-level and unicode for byte-level BPE
227
+ # we need to build string separately for added tokens and byte-level tokens
228
+ # cf. https://github.com/huggingface/transformers/issues/1133
229
+ sub_texts = []
230
+ current_sub_text = []
231
+ is_first_current_sub_text = True
232
+ for token in filtered_tokens:
233
+ if skip_special_tokens and token in self.all_special_ids:
234
+ continue
235
+ if token in self.added_tokens_encoder:
236
+ if current_sub_text:
237
+ sub_text = self.convert_tokens_to_string(current_sub_text)
238
+ # `convert_tokens_to_string` removes the leading space, which is undesired if we are not at the
239
+ # beginning part of the text. We can't use `spaces_between_special_tokens` to add this space back
240
+ # neither, as it will also add a space before a tag/patch_index token (which is not the case with
241
+ # the fast tokenizer - it doesn't even support `spaces_between_special_tokens`), which is not the
242
+ # ideal output format.
243
+ # The condition `not spaces_between_special_tokens` is to avoid double spaces.
244
+ if not is_first_current_sub_text and not spaces_between_special_tokens:
245
+ sub_text = " " + sub_text
246
+ sub_texts.append(sub_text)
247
+ current_sub_text = []
248
+ is_first_current_sub_text = False
249
+ sub_texts.append(token)
250
+ else:
251
+ current_sub_text.append(token)
252
+ if current_sub_text:
253
+ sub_texts.append(self.convert_tokens_to_string(current_sub_text))
254
+
255
+ if spaces_between_special_tokens:
256
+ text = " ".join(sub_texts)
257
+ else:
258
+ text = "".join(sub_texts)
259
+
260
+ clean_up_tokenization_spaces = (
261
+ clean_up_tokenization_spaces
262
+ if clean_up_tokenization_spaces is not None
263
+ else self.clean_up_tokenization_spaces
264
+ )
265
+ if clean_up_tokenization_spaces:
266
+ clean_text = self.clean_up_tokenization(text)
267
+ return clean_text
268
+ else:
269
+ return text
270
+
271
+ def __getstate__(self):
272
+ state = self.__dict__.copy()
273
+ state["sp_model"] = None
274
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
275
+ return state
276
+
277
+ def __setstate__(self, d):
278
+ self.__dict__ = d
279
+
280
+ # for backward compatibility
281
+ if not hasattr(self, "sp_model_kwargs"):
282
+ self.sp_model_kwargs = {}
283
+
284
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
285
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
286
+
287
+ def build_inputs_with_special_tokens(
288
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
289
+ ) -> List[int]:
290
+ """
291
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
292
+ adding special tokens. An XLM-RoBERTa sequence has the following format:
293
+
294
+ - single sequence: `<s> X </s>`
295
+ - pair of sequences: `<s> A </s></s> B </s>`
296
+
297
+ Args:
298
+ token_ids_0 (`List[int]`):
299
+ List of IDs to which the special tokens will be added.
300
+ token_ids_1 (`List[int]`, *optional*):
301
+ Optional second list of IDs for sequence pairs.
302
+
303
+ Returns:
304
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
305
+ """
306
+
307
+ if token_ids_1 is None:
308
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
309
+ cls = [self.cls_token_id]
310
+ sep = [self.sep_token_id]
311
+ return cls + token_ids_0 + sep + sep + token_ids_1 + sep
312
+
313
+ def get_special_tokens_mask(
314
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
315
+ ) -> List[int]:
316
+ """
317
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
318
+ special tokens using the tokenizer `prepare_for_model` method.
319
+
320
+ Args:
321
+ token_ids_0 (`List[int]`):
322
+ List of IDs.
323
+ token_ids_1 (`List[int]`, *optional*):
324
+ Optional second list of IDs for sequence pairs.
325
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
326
+ Whether or not the token list is already formatted with special tokens for the model.
327
+
328
+ Returns:
329
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
330
+ """
331
+
332
+ if already_has_special_tokens:
333
+ return super().get_special_tokens_mask(
334
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
335
+ )
336
+
337
+ if token_ids_1 is None:
338
+ return [1] + ([0] * len(token_ids_0)) + [1]
339
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
340
+
341
+ def create_token_type_ids_from_sequences(
342
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
343
+ ) -> List[int]:
344
+ """
345
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
346
+ not make use of token type ids, therefore a list of zeros is returned.
347
+
348
+ Args:
349
+ token_ids_0 (`List[int]`):
350
+ List of IDs.
351
+ token_ids_1 (`List[int]`, *optional*):
352
+ Optional second list of IDs for sequence pairs.
353
+
354
+ Returns:
355
+ `List[int]`: List of zeros.
356
+
357
+ """
358
+
359
+ sep = [self.sep_token_id]
360
+ cls = [self.cls_token_id]
361
+
362
+ if token_ids_1 is None:
363
+ return len(cls + token_ids_0 + sep) * [0]
364
+ return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
365
+
366
+ @property
367
+ def vocab_size(self):
368
+ return len(self.sp_model) + self.fairseq_offset + 1 # Add the <mask> token
369
+
370
+ def get_vocab(self):
371
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
372
+ vocab.update(self.added_tokens_encoder)
373
+ return vocab
374
+
375
+ def _tokenize(self, text: str) -> List[str]:
376
+ return self.sp_model.encode(text, out_type=str)
377
+
378
+ def _convert_token_to_id(self, token):
379
+ """Converts a token (str) in an id using the vocab."""
380
+ if token in self.fairseq_tokens_to_ids:
381
+ return self.fairseq_tokens_to_ids[token]
382
+ spm_id = self.sp_model.PieceToId(token)
383
+
384
+ # Need to return unknown token if the SP model returned 0
385
+ return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
386
+
387
+ def _convert_id_to_token(self, index):
388
+ """Converts an index (integer) in a token (str) using the vocab."""
389
+ if index in self.fairseq_ids_to_tokens:
390
+ return self.fairseq_ids_to_tokens[index]
391
+ return self.sp_model.IdToPiece(index - self.fairseq_offset)
392
+
393
+ def convert_tokens_to_string(self, tokens):
394
+ """Converts a sequence of tokens (strings for sub-words) in a single string."""
395
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
396
+ return out_string
397
+
398
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
399
+ if not os.path.isdir(save_directory):
400
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
401
+ return
402
+ out_vocab_file = os.path.join(
403
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
404
+ )
405
+
406
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
407
+ copyfile(self.vocab_file, out_vocab_file)
408
+ elif not os.path.isfile(self.vocab_file):
409
+ with open(out_vocab_file, "wb") as fi:
410
+ content_spiece_model = self.sp_model.serialized_model_proto()
411
+ fi.write(content_spiece_model)
412
+
413
+ return (out_vocab_file,)
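With `add_tag_and_patch_index_tokens=False` by default, the tag and `<patch_index_xxxx>` tokens above are only added to the vocabulary on request. A hedged sketch of building the slow tokenizer straight from the SentencePiece model added in this commit (run from a local clone so both files are available):

```python
from tokenization_kosmos2 import Kosmos2Tokenizer

tokenizer = Kosmos2Tokenizer(
    vocab_file="sentencepiece.bpe.model",
    add_tag_and_patch_index_tokens=True,  # registers </image>, <phrase>, <object>, <patch_index_0000>, ... as single tokens
)

# Tag tokens are kept as single pieces instead of being split by SentencePiece
print(tokenizer.tokenize("<phrase> a snowman</phrase>"))
print(tokenizer.convert_tokens_to_ids("<patch_index_0044>"))
```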
tokenization_kosmos2_fast.py ADDED
@@ -0,0 +1,250 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Tokenization classes for KOSMOS-2 model."""
16
+
17
+
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import List, Optional, Tuple
21
+
22
+ from transformers.tokenization_utils import AddedToken
23
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
24
+ from transformers.utils import is_sentencepiece_available, logging
25
+
26
+
27
+ if is_sentencepiece_available():
28
+ from .tokenization_kosmos2 import Kosmos2Tokenizer
29
+ else:
30
+ Kosmos2TokenizerFast = None
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+ VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
36
+
37
+ PRETRAINED_VOCAB_FILES_MAP = {
38
+ "vocab_file": {
39
+ "microsoft/kosmos-2-patch14-224": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/sentencepiece.bpe.model",
40
+ }
41
+ }
42
+
43
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
44
+ "microsoft/kosmos-2-patch14-224": 2048,
45
+ }
46
+
47
+
48
+ class Kosmos2TokenizerFast(PreTrainedTokenizerFast):
49
+ """
50
+ Construct a "fast" KOSMOS-2 tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
51
+ [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
52
+ [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
53
+
54
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
55
+ refer to this superclass for more information regarding those methods.
56
+
57
+ Args:
58
+ vocab_file (`str`):
59
+ Path to the vocabulary file.
60
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
61
+ The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
62
+
63
+ <Tip>
64
+
65
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
66
+ sequence. The token used is the `cls_token`.
67
+
68
+ </Tip>
69
+
70
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
71
+ The end of sequence token.
72
+
73
+ <Tip>
74
+
75
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
76
+ The token used is the `sep_token`.
77
+
78
+ </Tip>
79
+
80
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
81
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
82
+ sequence classification or for a text and a question for question answering. It is also used as the last
83
+ token of a sequence built with special tokens.
84
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
85
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
86
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
87
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
88
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
89
+ token instead.
90
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
91
+ The token used for padding, for example when batching sequences of different lengths.
92
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
93
+ The token used for masking values. This is the token used when training this model with masked language
94
+ modeling. This is the token which the model will try to predict.
95
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
96
+ Additional special tokens used by the tokenizer.
97
+ num_patch_index_tokens (`int`, *optional*, defaults to `1024`):
98
+ The number of tokens used to specify the patch indices of bounding boxes in an image. These tokens have the
99
+ format `<patch_index_xxxx>` where `xxxx` is an integer.
100
+ """
101
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+     slow_tokenizer_class = Kosmos2Tokenizer
+
+     def __init__(
+         self,
+         vocab_file=None,
+         tokenizer_file=None,
+         bos_token="<s>",
+         eos_token="</s>",
+         sep_token="</s>",
+         cls_token="<s>",
+         unk_token="<unk>",
+         pad_token="<pad>",
+         mask_token="<mask>",
+         num_patch_index_tokens=1024,
+         add_tag_and_patch_index_tokens=False,
+         **kwargs,
+     ):
+         # The mask token behaves like a normal word, i.e. it includes the space before it
+         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+         super().__init__(
+             vocab_file,
+             tokenizer_file=tokenizer_file,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             mask_token=mask_token,
+             **kwargs,
+         )
+
+         self.vocab_file = vocab_file
+         self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+         self.eod_token = "</doc>"
+
+         self.boi_token = "<image>"
+         self.eoi_token = "</image>"
+
+         self.eoc_token = "</chunk>"
+         self.eol_token = "</line>"
+
+         self.bop_token = "<phrase>"
+         self.eop_token = "</phrase>"
+
+         self.boo_token = "<object>"
+         self.eoo_token = "</object>"
+
+         self.dom_token = "</delimiter_of_multi_objects/>"
+
+         self.grd_token = "<grounding>"
+
+         self.tag_tokens = [
+             self.eod_token,
+             self.boi_token,
+             self.eoi_token,
+             self.eoc_token,
+             self.eol_token,
+             self.bop_token,
+             self.eop_token,
+             self.boo_token,
+             self.eoo_token,
+             self.dom_token,
+             self.grd_token,
+         ]
+
+         self.num_patch_index_tokens = num_patch_index_tokens
+         patch_index_tokens = [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.num_patch_index_tokens)]
+
+         if add_tag_and_patch_index_tokens:
+             for token in self.tag_tokens + patch_index_tokens:
+                 # Use `special_tokens=False` here to match the behaviour of the slow tokenizer.
+                 self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=False)
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+         and adding special tokens. A KOSMOS-2 sequence has the following format (the same as XLM-RoBERTa):
+
+         - single sequence: `<s> X </s>`
+         - pair of sequences: `<s> A </s></s> B </s>`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. KOSMOS-2, like
+         XLM-RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of zeros.
+         """
+
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not self.can_save_slow_tokenizer:
+             raise ValueError(
+                 "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                 "tokenizer."
+             )
+
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         return (out_vocab_file,)
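
Not part of the commit itself, but a minimal usage sketch of the class added above may help reviewers. It assumes the repository's `tokenizer.json` (added below) has been downloaded into the working directory and that the module above is importable as `tokenization_kosmos2_fast`; those file and module paths are illustrative assumptions, not something this commit sets up.

```python
# Illustrative sketch only -- not part of this commit.
from tokenization_kosmos2_fast import Kosmos2TokenizerFast

# `tokenizer_file` points at the tokenizer.json added in this commit (assumed local path).
tokenizer = Kosmos2TokenizerFast(
    tokenizer_file="tokenizer.json",
    # Registers the tag tokens (</doc>, <image>, ..., <grounding>) and
    # <patch_index_0000> ... <patch_index_1023> in the vocabulary.
    add_tag_and_patch_index_tokens=True,
)

# build_inputs_with_special_tokens wraps a single sequence as `<s> X </s>`.
token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("An image of a snowman"))
print(tokenizer.build_inputs_with_special_tokens(token_ids))

# KOSMOS-2 does not use token type ids, so this is a list of zeros.
print(tokenizer.create_token_type_ids_from_sequences(token_ids))

# Patch-index tokens follow the `<patch_index_xxxx>` format documented above.
print(tokenizer.convert_tokens_to_ids("<patch_index_0000>"))
```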
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "processor_class": "Kosmos2Processor",
+   "sep_token": "</s>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "Kosmos2Tokenizer",
+   "unk_token": "<unk>",
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_kosmos2.Kosmos2Tokenizer",
+       "tokenization_kosmos2_fast.Kosmos2TokenizerFast"
+     ]
+   }
+ }
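
The `auto_map` block above is what lets `AutoTokenizer` resolve the custom tokenizer classes shipped in this repository when remote code is trusted. A minimal sketch, assuming the files in this commit are published under a repository id such as `ydshieh/kosmos-2-patch14-224` (illustrative; substitute the id of the repository this commit lands in):

```python
# Sketch of how the auto_map above is resolved -- not part of this commit.
from transformers import AutoTokenizer

# trust_remote_code=True lets AutoTokenizer import tokenization_kosmos2(_fast).py
# from the repository, as declared in the auto_map block of tokenizer_config.json.
repo_id = "ydshieh/kosmos-2-patch14-224"  # illustrative repository id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
print(type(tokenizer).__name__)  # Kosmos2TokenizerFast (Kosmos2Tokenizer if use_fast=False)
```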
two_dogs.jpg ADDED