camenduru commited on Sep 2, 2023

Commit

a123ab1

•

1 Parent(s): 845f73c

thanks to ydshieh ❤

Browse files

Files changed (22) hide show

.gitattributes +1 -0
README.md +421 -0
added_tokens.json +1037 -0
annotated_snowman.jpg +0 -0
config.json +173 -0
configuration_kosmos2.py +331 -0
draw_bboxes.py +119 -0
generation_config.json +9 -0
image_processing_kosmos2.py +304 -0
modeling_kosmos2.py +1747 -0
preprocessor_config.json +32 -0
processing_kosmos2.py +608 -0
pytorch_model.bin +3 -0
sentencepiece.bpe.model +3 -0
snowman.jpg +0 -0
snowman.png +3 -0
special_tokens_map.json +15 -0
tokenization_kosmos2.py +413 -0
tokenization_kosmos2_fast.py +250 -0
tokenizer.json +0 -0
tokenizer_config.json +27 -0
two_dogs.jpg +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+snowman.png filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,421 @@

+---
+# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/model-cards
+{}
+---
+# Kosmos-2: Grounding Multimodal Large Language Models to the World
+**(There is an ongoing effort to port `Kosmos-2` directly into `transformers`. This repository (remote code) might need some more bug fixes later, including breaking changes.)**
+<a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><figure><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="384"><figcaption><b>[An image of a snowman warming himself by a fire.]</b></figcaption></figure></a>
+This Hub repository contains a Hugging Face `transformers` implementation of [the original Kosmos-2 model](https://github.com/microsoft/unilm/tree/master/kosmos-2) from Microsoft.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+```python
+import requests
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForVision2Seq
+# Both loaders are called with `trust_remote_code=True` (this repository ships the model/processor code as remote code).
+model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
+prompt = "<grounding>An image of"
+url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.png"
+image = Image.open(requests.get(url, stream=True).raw)
+# The original Kosmos-2 demo saves the image first and then reloads it. For some images, this gives a slightly different image input and changes the generation outputs.
+# Uncomment the following 2 lines if you want to match the original demo's outputs.
+# (One example is the `two_dogs.jpg` from the demo)
+# image.save("new_image.jpg")
+# image = Image.open("new_image.jpg")
+inputs = processor(text=prompt, images=image, return_tensors="pt")
+# Note: `input_ids`, `attention_mask` and `img_attn_mask` are all passed without their last position (`[:, :-1]`).
+generated_ids = model.generate(
+    pixel_values=inputs["pixel_values"],
+    input_ids=inputs["input_ids"][:, :-1],
+    attention_mask=inputs["attention_mask"][:, :-1],
+    img_features=None,
+    img_attn_mask=inputs["img_attn_mask"][:, :-1],
+    use_cache=True,
+    max_new_tokens=64,
+)
+generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+# Specify `cleanup_and_extract=False` in order to see the raw model generation.
+processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
+print(processed_text)
+# `<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.`
+# By default, the generated text is cleaned up and the entities are extracted.
+processed_text, entities = processor.post_process_generation(generated_text)
+print(processed_text)
+# `An image of a snowman warming himself by a fire.`
+print(entities)
+# `[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]`
+```
+## Draw the bounding boxes of the entities on the image
+Once you have the `entities`, you can use the following helper function to draw their bounding boxes on the image:
+```python
+import cv2
+import numpy as np
+import os
+import requests
+import torch
+import torchvision.transforms as T
+from PIL import Image
+def is_overlapping(rect1, rect2):
+    """Return True when two `(x1, y1, x2, y2)` rectangles intersect; rectangles that only touch count as overlapping."""
+    left_a, top_a, right_a, bottom_a = rect1
+    left_b, top_b, right_b, bottom_b = rect2
+    # The rectangles overlap exactly when their extents overlap on both axes.
+    overlaps_horizontally = right_a >= left_b and left_a <= right_b
+    overlaps_vertically = bottom_a >= top_b and top_a <= bottom_b
+    return overlaps_horizontally and overlaps_vertically
+def draw_entity_boxes_on_image(image, entities, show=False, save_path=None):
+    """Draw every entity's bounding boxes, each with a name label, on an image.
+    Args:
+        image: a `PIL.Image.Image`, a path to an image file, or a `torch.Tensor`.
+            A tensor is de-normalized with the mean/std hard-coded below before drawing
+            (presumably a channel-first, CLIP-normalized image -- confirm against the caller).
+        entities: iterable of `(entity_name, (start, end), bboxes)` tuples, where each bbox is
+            `(x1, y1, x2, y2)` and is scaled by the image width/height (i.e. normalized coordinates).
+        show: if True, display the annotated image with `PIL.Image.Image.show`.
+        save_path: if given, the annotated image is saved to this path.
+    Returns:
+        The annotated image as a NumPy array with the channel order reversed (BGR).
+    Raises:
+        ValueError: if `image` is a path that does not exist, or is of an unsupported type.
+    """
+    if isinstance(image, Image.Image):
+        # Force 3 channels so that grayscale/palette images survive the channel swap below.
+        pil_img = image.convert("RGB")
+    elif isinstance(image, str):
+        if not os.path.exists(image):
+            raise ValueError(f"invaild image path, {image}")
+        pil_img = Image.open(image).convert("RGB")
+    elif isinstance(image, torch.Tensor):
+        image_tensor = image.cpu()
+        reverse_norm_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:, None, None]
+        reverse_norm_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:, None, None]
+        image_tensor = image_tensor * reverse_norm_std + reverse_norm_mean
+        pil_img = T.ToPILImage()(image_tensor)
+    else:
+        raise ValueError(f"invaild image format, {type(image)} for {image}")
+    image_h = pil_img.height
+    image_w = pil_img.width
+    # Reverse the channel order (RGB -> BGR) for the OpenCV drawing calls; `.copy()` gives a contiguous, writable array.
+    # With no entities the loops below do nothing, but `save_path` and `show` are still honoured.
+    new_image = np.array(pil_img)[:, :, [2, 1, 0]].copy()
+    previous_bboxes = []
+    # size of text
+    text_size = 1
+    # thickness of text
+    text_line = 1
+    # thickness of the box outline
+    box_line = 3
+    (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
+    base_height = int(text_height * 0.675)
+    text_offset_original = text_height - base_height
+    text_spaces = 3
+    # blending weight of the existing pixels under a label background
+    alpha = 0.5
+    for entity_name, _, bboxes in entities:
+        for (x1_norm, y1_norm, x2_norm, y2_norm) in bboxes:
+            orig_x1, orig_y1, orig_x2, orig_y2 = int(x1_norm * image_w), int(y1_norm * image_h), int(x2_norm * image_w), int(y2_norm * image_h)
+            # draw the bbox in a random color
+            color = tuple(np.random.randint(0, 255, size=3).tolist())
+            new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line)
+            l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1
+            x1 = orig_x1 - l_o
+            y1 = orig_y1 - l_o
+            # Not enough room above the box: put the label inside it instead.
+            if y1 < text_height + text_offset_original + 2 * text_spaces:
+                y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
+                x1 = orig_x1 + r_o
+            # add text background
+            (text_width, text_height), _ = cv2.getTextSize(f"  {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
+            label_height = text_height + text_offset_original + 2 * text_spaces
+            text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - label_height, x1 + text_width, y1
+            # Push the label down until it no longer covers a previously drawn label.
+            for prev_bbox in previous_bboxes:
+                while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox):
+                    text_bg_y1 += label_height
+                    text_bg_y2 += label_height
+                    y1 += label_height
+                    if text_bg_y2 >= image_h:
+                        # Ran off the bottom: pin the label to the bottom edge and stop shifting.
+                        text_bg_y1 = max(0, image_h - label_height)
+                        text_bg_y2 = image_h
+                        y1 = image_h
+                        break
+            # Blend the label background in one slice operation. The slice is clamped to the image so that
+            # negative coordinates cannot wrap around to the opposite edge through negative indexing.
+            row_lo, row_hi = max(text_bg_y1, 0), min(text_bg_y2, image_h)
+            col_lo, col_hi = max(text_bg_x1, 0), min(text_bg_x2, image_w)
+            if row_lo < row_hi and col_lo < col_hi:
+                cols = np.arange(col_lo, col_hi)
+                # The leading strip of the label uses the box color, the rest is white.
+                in_color_strip = (cols < text_bg_x1 + 1.35 * c_width)[:, None]
+                bg_colors = np.where(in_color_strip, np.array(color), np.array([255, 255, 255]))
+                region = new_image[row_lo:row_hi, col_lo:col_hi]
+                new_image[row_lo:row_hi, col_lo:col_hi] = (alpha * region + (1 - alpha) * bg_colors).astype(np.uint8)
+            cv2.putText(
+                new_image, f"  {entity_name}", (x1, y1 - text_offset_original - 1 * text_spaces), cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
+            )
+            previous_bboxes.append((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2))
+    # Reverse the channels again (BGR -> RGB) for PIL.
+    pil_image = Image.fromarray(new_image[:, :, [2, 1, 0]])
+    if save_path:
+        pil_image.save(save_path)
+    if show:
+        pil_image.show()
+    return new_image
+# (The same image from the previous code example)
+# NOTE(review): the previous example downloads `snowman.png`, while this URL points to `snowman.jpg` -- confirm both files hold the same picture.
+url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+# From the previous code example
+entities = [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
+# Draw the bounding boxes
+draw_entity_boxes_on_image(image, entities, show=True)
+```
+Here is the annotated image:
+<a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="500"></a>
+## Tasks
+This model is capable of performing different tasks through changing the prompts.
+First, let's define a function to run a prompt.
+```python
+import requests
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForVision2Seq
+model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
+url = "https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/snowman.png"
+image = Image.open(requests.get(url, stream=True).raw)
+def run_example(prompt):
+    """Generate text for `prompt` on the module-level `image` and print the cleaned text, the entities, then the raw text."""
+    batch = processor(text=prompt, images=image, return_tensors="pt")
+    # These three tensors are all passed to `generate` without their last position.
+    trimmed = {key: batch[key][:, :-1] for key in ("input_ids", "attention_mask", "img_attn_mask")}
+    output_ids = model.generate(
+        pixel_values=batch["pixel_values"],
+        img_features=None,
+        use_cache=True,
+        max_new_tokens=64,
+        **trimmed,
+    )
+    decoded = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
+    # Raw generation first, then the cleaned-up text together with the extracted entities.
+    raw_text = processor.post_process_generation(decoded, cleanup_and_extract=False)
+    clean_text, found_entities = processor.post_process_generation(decoded)
+    for item in (clean_text, found_entities, raw_text):
+        print(item)
+```
+Here are the tasks `Kosmos-2` could perform:
+### Multimodal Grounding
+#### • Phrase Grounding
+```python
+prompt = "<grounding><phrase> a snowman</phrase>"
+run_example(prompt)
+# a snowman is warming himself by the fire
+# [('a snowman', (0, 9), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('the fire', (32, 40), [(0.203125, 0.015625, 0.453125, 0.859375)])]
+# <grounding><phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> is warming himself by<phrase> the fire</phrase><object><patch_index_0006><patch_index_0878></object>
+```
+#### • Referring Expression Comprehension
+```python
+prompt = "<grounding><phrase> a snowman next to a fire</phrase>"
+run_example(prompt)
+# a snowman next to a fire
+# [('a snowman next to a fire', (0, 24), [(0.390625, 0.046875, 0.984375, 0.828125)])]
+# <grounding><phrase> a snowman next to a fire</phrase><object><patch_index_0044><patch_index_0863></object>
+```
+### Multimodal Referring
+#### • Referring expression generation
+```python
+prompt = "<grounding><phrase> It</phrase><object><patch_index_0044><patch_index_0863></object> is"
+run_example(prompt)
+# It is snowman in a hat and scarf
+# [('It', (0, 2), [(0.390625, 0.046875, 0.984375, 0.828125)])]
+# <grounding><phrase> It</phrase><object><patch_index_0044><patch_index_0863></object> is snowman in a hat and scarf
+```
+### Perception-Language Tasks
+#### • Grounded VQA
+```python
+prompt = "<grounding> Question: What is special about this image? Answer:"
+run_example(prompt)
+# Question: What is special about this image? Answer: The image features a snowman sitting by a campfire in the snow.
+# [('a snowman', (71, 80), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a campfire', (92, 102), [(0.109375, 0.640625, 0.546875, 0.984375)])]
+# <grounding> Question: What is special about this image? Answer: The image features<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> sitting by<phrase> a campfire</phrase><object><patch_index_0643><patch_index_1009></object> in the snow.
+```
+#### • Grounded VQA with multimodal referring via bounding boxes
+```python
+prompt = "<grounding> Question: Where is<phrase> the fire</phrase><object><patch_index_0005><patch_index_0911></object> next to? Answer:"
+run_example(prompt)
+# Question: Where is the fire next to? Answer: Near the snowman.
+# [('the fire', (19, 27), [(0.171875, 0.015625, 0.484375, 0.890625)]), ('the snowman', (50, 61), [(0.390625, 0.046875, 0.984375, 0.828125)])]
+# <grounding> Question: Where is<phrase> the fire</phrase><object><patch_index_0005><patch_index_0911></object> next to? Answer: Near<phrase> the snowman</phrase><object><patch_index_0044><patch_index_0863></object>.
+```
+### Grounded Image captioning
+#### • Brief
+```python
+prompt = "<grounding> An image of"
+run_example(prompt)
+# An image of a snowman warming himself by a campfire.
+# [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a campfire', (41, 51), [(0.109375, 0.640625, 0.546875, 0.984375)])]
+# <grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a campfire</phrase><object><patch_index_0643><patch_index_1009></object>.
+```
+#### • Detailed
+```python
+prompt = "<grounding> Describe this image in detail:"
+run_example(prompt)
+# Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is wearing a hat, scarf, and gloves, with a pot nearby and a cup
+# [('a campfire', (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]), ('a hat', (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]), ('scarf', (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]), ('gloves', (127, 133), [(0.515625, 0.390625, 0.640625, 0.515625)]), ('a pot', (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)])]
+# <grounding> Describe this image in detail: The image features a snowman sitting by<phrase> a campfire</phrase><object><patch_index_0005><patch_index_1007></object> in the snow. He is wearing<phrase> a hat</phrase><object><patch_index_0048><patch_index_0250></object>,<phrase> scarf</phrase><object><patch_index_0240><patch_index_0604></object>, and<phrase> gloves</phrase><object><patch_index_0400><patch_index_0532></object>, with<phrase> a pot</phrase><object><patch_index_0610><patch_index_0872></object> nearby and<phrase> a cup</phrase><object>
+```
+## Running the Flask Server
+_flask_kosmos2.py_ shows the implementation of a Flask server for the model.
+It allows the model to be accessed through a REST API.
+After starting the server, you can send a POST request to `http://localhost:8005/process_prompt` with the following form data:
+- `prompt`: For example `<grounding> an image of`
+- `image`: The image file as binary data
+This in turn will produce a reply with the following JSON format:
+- `message`: The Kosmos-2 generated text
+- `entities`: The extracted entities
+An easy way to test this is through an application like Postman. Make sure the image field is set to `File`.
+```python
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from flask import Flask, request, jsonify
+import json
+app = Flask(__name__)
+model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
+@app.route('/process_prompt', methods=['POST'])
+def process_prompt():
+    """Run the model on an uploaded image and a prompt, and reply with JSON.
+    Expects form data with an `image` file and a `prompt` field.
+    Returns:
+        On success, JSON with `message` (the cleaned-up generated text) and `entities`.
+        On failure, JSON with an `error` message and HTTP status 400 (missing or unreadable
+        request data) or 500 (the model/processor raised), instead of a misleading 200.
+    """
+    # Validate the request first so that client mistakes are reported as 400.
+    uploaded_file = request.files.get('image')
+    if uploaded_file is None:
+        return jsonify({"error": "missing form file 'image'"}), 400
+    prompt = request.form.get('prompt')
+    if prompt is None:
+        return jsonify({"error": "missing form field 'prompt'"}), 400
+    try:
+        image = Image.open(uploaded_file.stream)
+    except OSError as e:
+        # PIL reports unreadable/unsupported image data with an OSError subclass.
+        return jsonify({"error": str(e)}), 400
+    try:
+        print(image.size)
+        inputs = processor(text=prompt, images=image, return_tensors="pt")
+        generated_ids = model.generate(
+            pixel_values=inputs["pixel_values"],
+            input_ids=inputs["input_ids"][:, :-1],
+            attention_mask=inputs["attention_mask"][:, :-1],
+            img_features=None,
+            img_attn_mask=inputs["img_attn_mask"][:, :-1],
+            use_cache=True,
+            max_new_tokens=64,
+        )
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # By default, the generated text is cleaned up and the entities are extracted.
+        processed_text, entities = processor.post_process_generation(generated_text)
+        parsed_entities = entities_to_json(entities)
+        print(generated_text)
+        print(processed_text)
+        return jsonify({"message": processed_text, 'entities': parsed_entities})
+    except Exception as e:
+        # Top-level boundary of the request handler: report the failure to the client with an error status.
+        return jsonify({"error": str(e)}), 500
+def entities_to_json(entities):
+    """Convert `(label, span, bboxes)` entity tuples into JSON-serializable dicts.
+    Note that `boundingBoxPosition` carries the two values of the entity's second element
+    (the `(start, end)` pair shown in the README examples), and only the first bounding box
+    of each entity is exported, as `boundingBox`.
+    """
+    serialized = []
+    for entity in entities:
+        name = entity[0]
+        span = entity[1]
+        first_box = entity[2][0]
+        record = dict(
+            label=name,
+            boundingBoxPosition=dict(x=span[0], y=span[1]),
+            boundingBox=dict(x_min=first_box[0], y_min=first_box[1], x_max=first_box[2], y_max=first_box[3]),
+        )
+        print(record)
+        serialized.append(record)
+    return serialized
+if __name__ == '__main__':
+    app.run(host='localhost', port=8005)
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,1037 @@

+{
+  "</chunk>": 64005,
+  "</delimiter_of_multi_objects/>": 64011,
+  "</doc>": 64002,
+  "</image>": 64004,
+  "</line>": 64006,
+  "</object>": 64010,
+  "</phrase>": 64008,
+  "<grounding>": 64012,
+  "<image>": 64003,
+  "<object>": 64009,
+  "<patch_index_0000>": 64013,
+  "<patch_index_0001>": 64014,
+  "<patch_index_0002>": 64015,
+  "<patch_index_0003>": 64016,
+  "<patch_index_0004>": 64017,
+  "<patch_index_0005>": 64018,
+  "<patch_index_0006>": 64019,
+  "<patch_index_0007>": 64020,
+  "<patch_index_0008>": 64021,
+  "<patch_index_0009>": 64022,
+  "<patch_index_0010>": 64023,
+  "<patch_index_0011>": 64024,
+  "<patch_index_0012>": 64025,
+  "<patch_index_0013>": 64026,
+  "<patch_index_0014>": 64027,
+  "<patch_index_0015>": 64028,
+  "<patch_index_0016>": 64029,
+  "<patch_index_0017>": 64030,
+  "<patch_index_0018>": 64031,
+  "<patch_index_0019>": 64032,
+  "<patch_index_0020>": 64033,
+  "<patch_index_0021>": 64034,
+  "<patch_index_0022>": 64035,
+  "<patch_index_0023>": 64036,
+  "<patch_index_0024>": 64037,
+  "<patch_index_0025>": 64038,
+  "<patch_index_0026>": 64039,
+  "<patch_index_0027>": 64040,
+  "<patch_index_0028>": 64041,
+  "<patch_index_0029>": 64042,
+  "<patch_index_0030>": 64043,
+  "<patch_index_0031>": 64044,
+  "<patch_index_0032>": 64045,
+  "<patch_index_0033>": 64046,
+  "<patch_index_0034>": 64047,
+  "<patch_index_0035>": 64048,
+  "<patch_index_0036>": 64049,
+  "<patch_index_0037>": 64050,
+  "<patch_index_0038>": 64051,
+  "<patch_index_0039>": 64052,
+  "<patch_index_0040>": 64053,
+  "<patch_index_0041>": 64054,
+  "<patch_index_0042>": 64055,
+  "<patch_index_0043>": 64056,
+  "<patch_index_0044>": 64057,
+  "<patch_index_0045>": 64058,
+  "<patch_index_0046>": 64059,
+  "<patch_index_0047>": 64060,
+  "<patch_index_0048>": 64061,
+  "<patch_index_0049>": 64062,
+  "<patch_index_0050>": 64063,
+  "<patch_index_0051>": 64064,
+  "<patch_index_0052>": 64065,
+  "<patch_index_0053>": 64066,
+  "<patch_index_0054>": 64067,
+  "<patch_index_0055>": 64068,
+  "<patch_index_0056>": 64069,
+  "<patch_index_0057>": 64070,
+  "<patch_index_0058>": 64071,
+  "<patch_index_0059>": 64072,
+  "<patch_index_0060>": 64073,
+  "<patch_index_0061>": 64074,
+  "<patch_index_0062>": 64075,
+  "<patch_index_0063>": 64076,
+  "<patch_index_0064>": 64077,
+  "<patch_index_0065>": 64078,
+  "<patch_index_0066>": 64079,
+  "<patch_index_0067>": 64080,
+  "<patch_index_0068>": 64081,
+  "<patch_index_0069>": 64082,
+  "<patch_index_0070>": 64083,
+  "<patch_index_0071>": 64084,
+  "<patch_index_0072>": 64085,
+  "<patch_index_0073>": 64086,
+  "<patch_index_0074>": 64087,
+  "<patch_index_0075>": 64088,
+  "<patch_index_0076>": 64089,
+  "<patch_index_0077>": 64090,
+  "<patch_index_0078>": 64091,
+  "<patch_index_0079>": 64092,
+  "<patch_index_0080>": 64093,
+  "<patch_index_0081>": 64094,
+  "<patch_index_0082>": 64095,
+  "<patch_index_0083>": 64096,
+  "<patch_index_0084>": 64097,
+  "<patch_index_0085>": 64098,
+  "<patch_index_0086>": 64099,
+  "<patch_index_0087>": 64100,
+  "<patch_index_0088>": 64101,
+  "<patch_index_0089>": 64102,
+  "<patch_index_0090>": 64103,
+  "<patch_index_0091>": 64104,
+  "<patch_index_0092>": 64105,
+  "<patch_index_0093>": 64106,
+  "<patch_index_0094>": 64107,
+  "<patch_index_0095>": 64108,
+  "<patch_index_0096>": 64109,
+  "<patch_index_0097>": 64110,
+  "<patch_index_0098>": 64111,
+  "<patch_index_0099>": 64112,
+  "<patch_index_0100>": 64113,
+  "<patch_index_0101>": 64114,
+  "<patch_index_0102>": 64115,
+  "<patch_index_0103>": 64116,
+  "<patch_index_0104>": 64117,
+  "<patch_index_0105>": 64118,
+  "<patch_index_0106>": 64119,
+  "<patch_index_0107>": 64120,
+  "<patch_index_0108>": 64121,
+  "<patch_index_0109>": 64122,
+  "<patch_index_0110>": 64123,
+  "<patch_index_0111>": 64124,
+  "<patch_index_0112>": 64125,
+  "<patch_index_0113>": 64126,
+  "<patch_index_0114>": 64127,
+  "<patch_index_0115>": 64128,
+  "<patch_index_0116>": 64129,
+  "<patch_index_0117>": 64130,
+  "<patch_index_0118>": 64131,
+  "<patch_index_0119>": 64132,
+  "<patch_index_0120>": 64133,
+  "<patch_index_0121>": 64134,
+  "<patch_index_0122>": 64135,
+  "<patch_index_0123>": 64136,
+  "<patch_index_0124>": 64137,
+  "<patch_index_0125>": 64138,
+  "<patch_index_0126>": 64139,
+  "<patch_index_0127>": 64140,
+  "<patch_index_0128>": 64141,
+  "<patch_index_0129>": 64142,
+  "<patch_index_0130>": 64143,
+  "<patch_index_0131>": 64144,
+  "<patch_index_0132>": 64145,
+  "<patch_index_0133>": 64146,
+  "<patch_index_0134>": 64147,
+  "<patch_index_0135>": 64148,
+  "<patch_index_0136>": 64149,
+  "<patch_index_0137>": 64150,
+  "<patch_index_0138>": 64151,
+  "<patch_index_0139>": 64152,
+  "<patch_index_0140>": 64153,
+  "<patch_index_0141>": 64154,
+  "<patch_index_0142>": 64155,
+  "<patch_index_0143>": 64156,
+  "<patch_index_0144>": 64157,
+  "<patch_index_0145>": 64158,
+  "<patch_index_0146>": 64159,
+  "<patch_index_0147>": 64160,
+  "<patch_index_0148>": 64161,
+  "<patch_index_0149>": 64162,
+  "<patch_index_0150>": 64163,
+  "<patch_index_0151>": 64164,
+  "<patch_index_0152>": 64165,
+  "<patch_index_0153>": 64166,
+  "<patch_index_0154>": 64167,
+  "<patch_index_0155>": 64168,
+  "<patch_index_0156>": 64169,
+  "<patch_index_0157>": 64170,
+  "<patch_index_0158>": 64171,
+  "<patch_index_0159>": 64172,
+  "<patch_index_0160>": 64173,
+  "<patch_index_0161>": 64174,
+  "<patch_index_0162>": 64175,
+  "<patch_index_0163>": 64176,
+  "<patch_index_0164>": 64177,
+  "<patch_index_0165>": 64178,
+  "<patch_index_0166>": 64179,
+  "<patch_index_0167>": 64180,
+  "<patch_index_0168>": 64181,
+  "<patch_index_0169>": 64182,
+  "<patch_index_0170>": 64183,
+  "<patch_index_0171>": 64184,
+  "<patch_index_0172>": 64185,
+  "<patch_index_0173>": 64186,
+  "<patch_index_0174>": 64187,
+  "<patch_index_0175>": 64188,
+  "<patch_index_0176>": 64189,
+  "<patch_index_0177>": 64190,
+  "<patch_index_0178>": 64191,
+  "<patch_index_0179>": 64192,
+  "<patch_index_0180>": 64193,
+  "<patch_index_0181>": 64194,
+  "<patch_index_0182>": 64195,
+  "<patch_index_0183>": 64196,
+  "<patch_index_0184>": 64197,
+  "<patch_index_0185>": 64198,
+  "<patch_index_0186>": 64199,
+  "<patch_index_0187>": 64200,
+  "<patch_index_0188>": 64201,
+  "<patch_index_0189>": 64202,
+  "<patch_index_0190>": 64203,
+  "<patch_index_0191>": 64204,
+  "<patch_index_0192>": 64205,
+  "<patch_index_0193>": 64206,
+  "<patch_index_0194>": 64207,
+  "<patch_index_0195>": 64208,
+  "<patch_index_0196>": 64209,
+  "<patch_index_0197>": 64210,
+  "<patch_index_0198>": 64211,
+  "<patch_index_0199>": 64212,
+  "<patch_index_0200>": 64213,
+  "<patch_index_0201>": 64214,
+  "<patch_index_0202>": 64215,
+  "<patch_index_0203>": 64216,
+  "<patch_index_0204>": 64217,
+  "<patch_index_0205>": 64218,
+  "<patch_index_0206>": 64219,
+  "<patch_index_0207>": 64220,
+  "<patch_index_0208>": 64221,
+  "<patch_index_0209>": 64222,
+  "<patch_index_0210>": 64223,
+  "<patch_index_0211>": 64224,
+  "<patch_index_0212>": 64225,
+  "<patch_index_0213>": 64226,
+  "<patch_index_0214>": 64227,
+  "<patch_index_0215>": 64228,
+  "<patch_index_0216>": 64229,
+  "<patch_index_0217>": 64230,
+  "<patch_index_0218>": 64231,
+  "<patch_index_0219>": 64232,
+  "<patch_index_0220>": 64233,
+  "<patch_index_0221>": 64234,
+  "<patch_index_0222>": 64235,
+  "<patch_index_0223>": 64236,
+  "<patch_index_0224>": 64237,
+  "<patch_index_0225>": 64238,
+  "<patch_index_0226>": 64239,
+  "<patch_index_0227>": 64240,
+  "<patch_index_0228>": 64241,
+  "<patch_index_0229>": 64242,
+  "<patch_index_0230>": 64243,
+  "<patch_index_0231>": 64244,
+  "<patch_index_0232>": 64245,
+  "<patch_index_0233>": 64246,
+  "<patch_index_0234>": 64247,
+  "<patch_index_0235>": 64248,
+  "<patch_index_0236>": 64249,
+  "<patch_index_0237>": 64250,
+  "<patch_index_0238>": 64251,
+  "<patch_index_0239>": 64252,
+  "<patch_index_0240>": 64253,
+  "<patch_index_0241>": 64254,
+  "<patch_index_0242>": 64255,
+  "<patch_index_0243>": 64256,
+  "<patch_index_0244>": 64257,
+  "<patch_index_0245>": 64258,
+  "<patch_index_0246>": 64259,
+  "<patch_index_0247>": 64260,
+  "<patch_index_0248>": 64261,
+  "<patch_index_0249>": 64262,
+  "<patch_index_0250>": 64263,
+  "<patch_index_0251>": 64264,
+  "<patch_index_0252>": 64265,
+  "<patch_index_0253>": 64266,
+  "<patch_index_0254>": 64267,
+  "<patch_index_0255>": 64268,
+  "<patch_index_0256>": 64269,
+  "<patch_index_0257>": 64270,
+  "<patch_index_0258>": 64271,
+  "<patch_index_0259>": 64272,
+  "<patch_index_0260>": 64273,
+  "<patch_index_0261>": 64274,
+  "<patch_index_0262>": 64275,
+  "<patch_index_0263>": 64276,
+  "<patch_index_0264>": 64277,
+  "<patch_index_0265>": 64278,
+  "<patch_index_0266>": 64279,
+  "<patch_index_0267>": 64280,
+  "<patch_index_0268>": 64281,
+  "<patch_index_0269>": 64282,
+  "<patch_index_0270>": 64283,
+  "<patch_index_0271>": 64284,
+  "<patch_index_0272>": 64285,
+  "<patch_index_0273>": 64286,
+  "<patch_index_0274>": 64287,
+  "<patch_index_0275>": 64288,
+  "<patch_index_0276>": 64289,
+  "<patch_index_0277>": 64290,
+  "<patch_index_0278>": 64291,
+  "<patch_index_0279>": 64292,
+  "<patch_index_0280>": 64293,
+  "<patch_index_0281>": 64294,
+  "<patch_index_0282>": 64295,
+  "<patch_index_0283>": 64296,
+  "<patch_index_0284>": 64297,
+  "<patch_index_0285>": 64298,
+  "<patch_index_0286>": 64299,
+  "<patch_index_0287>": 64300,
+  "<patch_index_0288>": 64301,
+  "<patch_index_0289>": 64302,
+  "<patch_index_0290>": 64303,
+  "<patch_index_0291>": 64304,
+  "<patch_index_0292>": 64305,
+  "<patch_index_0293>": 64306,
+  "<patch_index_0294>": 64307,
+  "<patch_index_0295>": 64308,
+  "<patch_index_0296>": 64309,
+  "<patch_index_0297>": 64310,
+  "<patch_index_0298>": 64311,
+  "<patch_index_0299>": 64312,
+  "<patch_index_0300>": 64313,
+  "<patch_index_0301>": 64314,
+  "<patch_index_0302>": 64315,
+  "<patch_index_0303>": 64316,
+  "<patch_index_0304>": 64317,
+  "<patch_index_0305>": 64318,
+  "<patch_index_0306>": 64319,
+  "<patch_index_0307>": 64320,
+  "<patch_index_0308>": 64321,
+  "<patch_index_0309>": 64322,
+  "<patch_index_0310>": 64323,
+  "<patch_index_0311>": 64324,
+  "<patch_index_0312>": 64325,
+  "<patch_index_0313>": 64326,
+  "<patch_index_0314>": 64327,
+  "<patch_index_0315>": 64328,
+  "<patch_index_0316>": 64329,
+  "<patch_index_0317>": 64330,
+  "<patch_index_0318>": 64331,
+  "<patch_index_0319>": 64332,
+  "<patch_index_0320>": 64333,
+  "<patch_index_0321>": 64334,
+  "<patch_index_0322>": 64335,
+  "<patch_index_0323>": 64336,
+  "<patch_index_0324>": 64337,
+  "<patch_index_0325>": 64338,
+  "<patch_index_0326>": 64339,
+  "<patch_index_0327>": 64340,
+  "<patch_index_0328>": 64341,
+  "<patch_index_0329>": 64342,
+  "<patch_index_0330>": 64343,
+  "<patch_index_0331>": 64344,
+  "<patch_index_0332>": 64345,
+  "<patch_index_0333>": 64346,
+  "<patch_index_0334>": 64347,
+  "<patch_index_0335>": 64348,
+  "<patch_index_0336>": 64349,
+  "<patch_index_0337>": 64350,
+  "<patch_index_0338>": 64351,
+  "<patch_index_0339>": 64352,
+  "<patch_index_0340>": 64353,
+  "<patch_index_0341>": 64354,
+  "<patch_index_0342>": 64355,
+  "<patch_index_0343>": 64356,
+  "<patch_index_0344>": 64357,
+  "<patch_index_0345>": 64358,
+  "<patch_index_0346>": 64359,
+  "<patch_index_0347>": 64360,
+  "<patch_index_0348>": 64361,
+  "<patch_index_0349>": 64362,
+  "<patch_index_0350>": 64363,
+  "<patch_index_0351>": 64364,
+  "<patch_index_0352>": 64365,
+  "<patch_index_0353>": 64366,
+  "<patch_index_0354>": 64367,
+  "<patch_index_0355>": 64368,
+  "<patch_index_0356>": 64369,
+  "<patch_index_0357>": 64370,
+  "<patch_index_0358>": 64371,
+  "<patch_index_0359>": 64372,
+  "<patch_index_0360>": 64373,
+  "<patch_index_0361>": 64374,
+  "<patch_index_0362>": 64375,
+  "<patch_index_0363>": 64376,
+  "<patch_index_0364>": 64377,
+  "<patch_index_0365>": 64378,
+  "<patch_index_0366>": 64379,
+  "<patch_index_0367>": 64380,
+  "<patch_index_0368>": 64381,
+  "<patch_index_0369>": 64382,
+  "<patch_index_0370>": 64383,
+  "<patch_index_0371>": 64384,
+  "<patch_index_0372>": 64385,
+  "<patch_index_0373>": 64386,
+  "<patch_index_0374>": 64387,
+  "<patch_index_0375>": 64388,
+  "<patch_index_0376>": 64389,
+  "<patch_index_0377>": 64390,
+  "<patch_index_0378>": 64391,
+  "<patch_index_0379>": 64392,
+  "<patch_index_0380>": 64393,
+  "<patch_index_0381>": 64394,
+  "<patch_index_0382>": 64395,
+  "<patch_index_0383>": 64396,
+  "<patch_index_0384>": 64397,
+  "<patch_index_0385>": 64398,
+  "<patch_index_0386>": 64399,
+  "<patch_index_0387>": 64400,
+  "<patch_index_0388>": 64401,
+  "<patch_index_0389>": 64402,
+  "<patch_index_0390>": 64403,
+  "<patch_index_0391>": 64404,
+  "<patch_index_0392>": 64405,
+  "<patch_index_0393>": 64406,
+  "<patch_index_0394>": 64407,
+  "<patch_index_0395>": 64408,
+  "<patch_index_0396>": 64409,
+  "<patch_index_0397>": 64410,
+  "<patch_index_0398>": 64411,
+  "<patch_index_0399>": 64412,
+  "<patch_index_0400>": 64413,
+  "<patch_index_0401>": 64414,
+  "<patch_index_0402>": 64415,
+  "<patch_index_0403>": 64416,
+  "<patch_index_0404>": 64417,
+  "<patch_index_0405>": 64418,
+  "<patch_index_0406>": 64419,
+  "<patch_index_0407>": 64420,
+  "<patch_index_0408>": 64421,
+  "<patch_index_0409>": 64422,
+  "<patch_index_0410>": 64423,
+  "<patch_index_0411>": 64424,
+  "<patch_index_0412>": 64425,
+  "<patch_index_0413>": 64426,
+  "<patch_index_0414>": 64427,
+  "<patch_index_0415>": 64428,
+  "<patch_index_0416>": 64429,
+  "<patch_index_0417>": 64430,
+  "<patch_index_0418>": 64431,
+  "<patch_index_0419>": 64432,
+  "<patch_index_0420>": 64433,
+  "<patch_index_0421>": 64434,
+  "<patch_index_0422>": 64435,
+  "<patch_index_0423>": 64436,
+  "<patch_index_0424>": 64437,
+  "<patch_index_0425>": 64438,
+  "<patch_index_0426>": 64439,
+  "<patch_index_0427>": 64440,
+  "<patch_index_0428>": 64441,
+  "<patch_index_0429>": 64442,
+  "<patch_index_0430>": 64443,
+  "<patch_index_0431>": 64444,
+  "<patch_index_0432>": 64445,
+  "<patch_index_0433>": 64446,
+  "<patch_index_0434>": 64447,
+  "<patch_index_0435>": 64448,
+  "<patch_index_0436>": 64449,
+  "<patch_index_0437>": 64450,
+  "<patch_index_0438>": 64451,
+  "<patch_index_0439>": 64452,
+  "<patch_index_0440>": 64453,
+  "<patch_index_0441>": 64454,
+  "<patch_index_0442>": 64455,
+  "<patch_index_0443>": 64456,
+  "<patch_index_0444>": 64457,
+  "<patch_index_0445>": 64458,
+  "<patch_index_0446>": 64459,
+  "<patch_index_0447>": 64460,
+  "<patch_index_0448>": 64461,
+  "<patch_index_0449>": 64462,
+  "<patch_index_0450>": 64463,
+  "<patch_index_0451>": 64464,
+  "<patch_index_0452>": 64465,
+  "<patch_index_0453>": 64466,
+  "<patch_index_0454>": 64467,
+  "<patch_index_0455>": 64468,
+  "<patch_index_0456>": 64469,
+  "<patch_index_0457>": 64470,
+  "<patch_index_0458>": 64471,
+  "<patch_index_0459>": 64472,
+  "<patch_index_0460>": 64473,
+  "<patch_index_0461>": 64474,
+  "<patch_index_0462>": 64475,
+  "<patch_index_0463>": 64476,
+  "<patch_index_0464>": 64477,
+  "<patch_index_0465>": 64478,
+  "<patch_index_0466>": 64479,
+  "<patch_index_0467>": 64480,
+  "<patch_index_0468>": 64481,
+  "<patch_index_0469>": 64482,
+  "<patch_index_0470>": 64483,
+  "<patch_index_0471>": 64484,
+  "<patch_index_0472>": 64485,
+  "<patch_index_0473>": 64486,
+  "<patch_index_0474>": 64487,
+  "<patch_index_0475>": 64488,
+  "<patch_index_0476>": 64489,
+  "<patch_index_0477>": 64490,
+  "<patch_index_0478>": 64491,
+  "<patch_index_0479>": 64492,
+  "<patch_index_0480>": 64493,
+  "<patch_index_0481>": 64494,
+  "<patch_index_0482>": 64495,
+  "<patch_index_0483>": 64496,
+  "<patch_index_0484>": 64497,
+  "<patch_index_0485>": 64498,
+  "<patch_index_0486>": 64499,
+  "<patch_index_0487>": 64500,
+  "<patch_index_0488>": 64501,
+  "<patch_index_0489>": 64502,
+  "<patch_index_0490>": 64503,
+  "<patch_index_0491>": 64504,
+  "<patch_index_0492>": 64505,
+  "<patch_index_0493>": 64506,
+  "<patch_index_0494>": 64507,
+  "<patch_index_0495>": 64508,
+  "<patch_index_0496>": 64509,
+  "<patch_index_0497>": 64510,
+  "<patch_index_0498>": 64511,
+  "<patch_index_0499>": 64512,
+  "<patch_index_0500>": 64513,
+  "<patch_index_0501>": 64514,
+  "<patch_index_0502>": 64515,
+  "<patch_index_0503>": 64516,
+  "<patch_index_0504>": 64517,
+  "<patch_index_0505>": 64518,
+  "<patch_index_0506>": 64519,
+  "<patch_index_0507>": 64520,
+  "<patch_index_0508>": 64521,
+  "<patch_index_0509>": 64522,
+  "<patch_index_0510>": 64523,
+  "<patch_index_0511>": 64524,
+  "<patch_index_0512>": 64525,
+  "<patch_index_0513>": 64526,
+  "<patch_index_0514>": 64527,
+  "<patch_index_0515>": 64528,
+  "<patch_index_0516>": 64529,
+  "<patch_index_0517>": 64530,
+  "<patch_index_0518>": 64531,
+  "<patch_index_0519>": 64532,
+  "<patch_index_0520>": 64533,
+  "<patch_index_0521>": 64534,
+  "<patch_index_0522>": 64535,
+  "<patch_index_0523>": 64536,
+  "<patch_index_0524>": 64537,
+  "<patch_index_0525>": 64538,
+  "<patch_index_0526>": 64539,
+  "<patch_index_0527>": 64540,
+  "<patch_index_0528>": 64541,
+  "<patch_index_0529>": 64542,
+  "<patch_index_0530>": 64543,
+  "<patch_index_0531>": 64544,
+  "<patch_index_0532>": 64545,
+  "<patch_index_0533>": 64546,
+  "<patch_index_0534>": 64547,
+  "<patch_index_0535>": 64548,
+  "<patch_index_0536>": 64549,
+  "<patch_index_0537>": 64550,
+  "<patch_index_0538>": 64551,
+  "<patch_index_0539>": 64552,
+  "<patch_index_0540>": 64553,
+  "<patch_index_0541>": 64554,
+  "<patch_index_0542>": 64555,
+  "<patch_index_0543>": 64556,
+  "<patch_index_0544>": 64557,
+  "<patch_index_0545>": 64558,
+  "<patch_index_0546>": 64559,
+  "<patch_index_0547>": 64560,
+  "<patch_index_0548>": 64561,
+  "<patch_index_0549>": 64562,
+  "<patch_index_0550>": 64563,
+  "<patch_index_0551>": 64564,
+  "<patch_index_0552>": 64565,
+  "<patch_index_0553>": 64566,
+  "<patch_index_0554>": 64567,
+  "<patch_index_0555>": 64568,
+  "<patch_index_0556>": 64569,
+  "<patch_index_0557>": 64570,
+  "<patch_index_0558>": 64571,
+  "<patch_index_0559>": 64572,
+  "<patch_index_0560>": 64573,
+  "<patch_index_0561>": 64574,
+  "<patch_index_0562>": 64575,
+  "<patch_index_0563>": 64576,
+  "<patch_index_0564>": 64577,
+  "<patch_index_0565>": 64578,
+  "<patch_index_0566>": 64579,
+  "<patch_index_0567>": 64580,
+  "<patch_index_0568>": 64581,
+  "<patch_index_0569>": 64582,
+  "<patch_index_0570>": 64583,
+  "<patch_index_0571>": 64584,
+  "<patch_index_0572>": 64585,
+  "<patch_index_0573>": 64586,
+  "<patch_index_0574>": 64587,
+  "<patch_index_0575>": 64588,
+  "<patch_index_0576>": 64589,
+  "<patch_index_0577>": 64590,
+  "<patch_index_0578>": 64591,
+  "<patch_index_0579>": 64592,
+  "<patch_index_0580>": 64593,
+  "<patch_index_0581>": 64594,
+  "<patch_index_0582>": 64595,
+  "<patch_index_0583>": 64596,
+  "<patch_index_0584>": 64597,
+  "<patch_index_0585>": 64598,
+  "<patch_index_0586>": 64599,
+  "<patch_index_0587>": 64600,
+  "<patch_index_0588>": 64601,
+  "<patch_index_0589>": 64602,
+  "<patch_index_0590>": 64603,
+  "<patch_index_0591>": 64604,
+  "<patch_index_0592>": 64605,
+  "<patch_index_0593>": 64606,
+  "<patch_index_0594>": 64607,
+  "<patch_index_0595>": 64608,
+  "<patch_index_0596>": 64609,
+  "<patch_index_0597>": 64610,
+  "<patch_index_0598>": 64611,
+  "<patch_index_0599>": 64612,
+  "<patch_index_0600>": 64613,
+  "<patch_index_0601>": 64614,
+  "<patch_index_0602>": 64615,
+  "<patch_index_0603>": 64616,
+  "<patch_index_0604>": 64617,
+  "<patch_index_0605>": 64618,
+  "<patch_index_0606>": 64619,
+  "<patch_index_0607>": 64620,
+  "<patch_index_0608>": 64621,
+  "<patch_index_0609>": 64622,
+  "<patch_index_0610>": 64623,
+  "<patch_index_0611>": 64624,
+  "<patch_index_0612>": 64625,
+  "<patch_index_0613>": 64626,
+  "<patch_index_0614>": 64627,
+  "<patch_index_0615>": 64628,
+  "<patch_index_0616>": 64629,
+  "<patch_index_0617>": 64630,
+  "<patch_index_0618>": 64631,
+  "<patch_index_0619>": 64632,
+  "<patch_index_0620>": 64633,
+  "<patch_index_0621>": 64634,
+  "<patch_index_0622>": 64635,
+  "<patch_index_0623>": 64636,
+  "<patch_index_0624>": 64637,
+  "<patch_index_0625>": 64638,
+  "<patch_index_0626>": 64639,
+  "<patch_index_0627>": 64640,
+  "<patch_index_0628>": 64641,
+  "<patch_index_0629>": 64642,
+  "<patch_index_0630>": 64643,
+  "<patch_index_0631>": 64644,
+  "<patch_index_0632>": 64645,
+  "<patch_index_0633>": 64646,
+  "<patch_index_0634>": 64647,
+  "<patch_index_0635>": 64648,
+  "<patch_index_0636>": 64649,
+  "<patch_index_0637>": 64650,
+  "<patch_index_0638>": 64651,
+  "<patch_index_0639>": 64652,
+  "<patch_index_0640>": 64653,
+  "<patch_index_0641>": 64654,
+  "<patch_index_0642>": 64655,
+  "<patch_index_0643>": 64656,
+  "<patch_index_0644>": 64657,
+  "<patch_index_0645>": 64658,
+  "<patch_index_0646>": 64659,
+  "<patch_index_0647>": 64660,
+  "<patch_index_0648>": 64661,
+  "<patch_index_0649>": 64662,
+  "<patch_index_0650>": 64663,
+  "<patch_index_0651>": 64664,
+  "<patch_index_0652>": 64665,
+  "<patch_index_0653>": 64666,
+  "<patch_index_0654>": 64667,
+  "<patch_index_0655>": 64668,
+  "<patch_index_0656>": 64669,
+  "<patch_index_0657>": 64670,
+  "<patch_index_0658>": 64671,
+  "<patch_index_0659>": 64672,
+  "<patch_index_0660>": 64673,
+  "<patch_index_0661>": 64674,
+  "<patch_index_0662>": 64675,
+  "<patch_index_0663>": 64676,
+  "<patch_index_0664>": 64677,
+  "<patch_index_0665>": 64678,
+  "<patch_index_0666>": 64679,
+  "<patch_index_0667>": 64680,
+  "<patch_index_0668>": 64681,
+  "<patch_index_0669>": 64682,
+  "<patch_index_0670>": 64683,
+  "<patch_index_0671>": 64684,
+  "<patch_index_0672>": 64685,
+  "<patch_index_0673>": 64686,
+  "<patch_index_0674>": 64687,
+  "<patch_index_0675>": 64688,
+  "<patch_index_0676>": 64689,
+  "<patch_index_0677>": 64690,
+  "<patch_index_0678>": 64691,
+  "<patch_index_0679>": 64692,
+  "<patch_index_0680>": 64693,
+  "<patch_index_0681>": 64694,
+  "<patch_index_0682>": 64695,
+  "<patch_index_0683>": 64696,
+  "<patch_index_0684>": 64697,
+  "<patch_index_0685>": 64698,
+  "<patch_index_0686>": 64699,
+  "<patch_index_0687>": 64700,
+  "<patch_index_0688>": 64701,
+  "<patch_index_0689>": 64702,
+  "<patch_index_0690>": 64703,
+  "<patch_index_0691>": 64704,
+  "<patch_index_0692>": 64705,
+  "<patch_index_0693>": 64706,
+  "<patch_index_0694>": 64707,
+  "<patch_index_0695>": 64708,
+  "<patch_index_0696>": 64709,
+  "<patch_index_0697>": 64710,
+  "<patch_index_0698>": 64711,
+  "<patch_index_0699>": 64712,
+  "<patch_index_0700>": 64713,
+  "<patch_index_0701>": 64714,
+  "<patch_index_0702>": 64715,
+  "<patch_index_0703>": 64716,
+  "<patch_index_0704>": 64717,
+  "<patch_index_0705>": 64718,
+  "<patch_index_0706>": 64719,
+  "<patch_index_0707>": 64720,
+  "<patch_index_0708>": 64721,
+  "<patch_index_0709>": 64722,
+  "<patch_index_0710>": 64723,
+  "<patch_index_0711>": 64724,
+  "<patch_index_0712>": 64725,
+  "<patch_index_0713>": 64726,
+  "<patch_index_0714>": 64727,
+  "<patch_index_0715>": 64728,
+  "<patch_index_0716>": 64729,
+  "<patch_index_0717>": 64730,
+  "<patch_index_0718>": 64731,
+  "<patch_index_0719>": 64732,
+  "<patch_index_0720>": 64733,
+  "<patch_index_0721>": 64734,
+  "<patch_index_0722>": 64735,
+  "<patch_index_0723>": 64736,
+  "<patch_index_0724>": 64737,
+  "<patch_index_0725>": 64738,
+  "<patch_index_0726>": 64739,
+  "<patch_index_0727>": 64740,
+  "<patch_index_0728>": 64741,
+  "<patch_index_0729>": 64742,
+  "<patch_index_0730>": 64743,
+  "<patch_index_0731>": 64744,
+  "<patch_index_0732>": 64745,
+  "<patch_index_0733>": 64746,
+  "<patch_index_0734>": 64747,
+  "<patch_index_0735>": 64748,
+  "<patch_index_0736>": 64749,
+  "<patch_index_0737>": 64750,
+  "<patch_index_0738>": 64751,
+  "<patch_index_0739>": 64752,
+  "<patch_index_0740>": 64753,
+  "<patch_index_0741>": 64754,
+  "<patch_index_0742>": 64755,
+  "<patch_index_0743>": 64756,
+  "<patch_index_0744>": 64757,
+  "<patch_index_0745>": 64758,
+  "<patch_index_0746>": 64759,
+  "<patch_index_0747>": 64760,
+  "<patch_index_0748>": 64761,
+  "<patch_index_0749>": 64762,
+  "<patch_index_0750>": 64763,
+  "<patch_index_0751>": 64764,
+  "<patch_index_0752>": 64765,
+  "<patch_index_0753>": 64766,
+  "<patch_index_0754>": 64767,
+  "<patch_index_0755>": 64768,
+  "<patch_index_0756>": 64769,
+  "<patch_index_0757>": 64770,
+  "<patch_index_0758>": 64771,
+  "<patch_index_0759>": 64772,
+  "<patch_index_0760>": 64773,
+  "<patch_index_0761>": 64774,
+  "<patch_index_0762>": 64775,
+  "<patch_index_0763>": 64776,
+  "<patch_index_0764>": 64777,
+  "<patch_index_0765>": 64778,
+  "<patch_index_0766>": 64779,
+  "<patch_index_0767>": 64780,
+  "<patch_index_0768>": 64781,
+  "<patch_index_0769>": 64782,
+  "<patch_index_0770>": 64783,
+  "<patch_index_0771>": 64784,
+  "<patch_index_0772>": 64785,
+  "<patch_index_0773>": 64786,
+  "<patch_index_0774>": 64787,
+  "<patch_index_0775>": 64788,
+  "<patch_index_0776>": 64789,
+  "<patch_index_0777>": 64790,
+  "<patch_index_0778>": 64791,
+  "<patch_index_0779>": 64792,
+  "<patch_index_0780>": 64793,
+  "<patch_index_0781>": 64794,
+  "<patch_index_0782>": 64795,
+  "<patch_index_0783>": 64796,
+  "<patch_index_0784>": 64797,
+  "<patch_index_0785>": 64798,
+  "<patch_index_0786>": 64799,
+  "<patch_index_0787>": 64800,
+  "<patch_index_0788>": 64801,
+  "<patch_index_0789>": 64802,
+  "<patch_index_0790>": 64803,
+  "<patch_index_0791>": 64804,
+  "<patch_index_0792>": 64805,
+  "<patch_index_0793>": 64806,
+  "<patch_index_0794>": 64807,
+  "<patch_index_0795>": 64808,
+  "<patch_index_0796>": 64809,
+  "<patch_index_0797>": 64810,
+  "<patch_index_0798>": 64811,
+  "<patch_index_0799>": 64812,
+  "<patch_index_0800>": 64813,
+  "<patch_index_0801>": 64814,
+  "<patch_index_0802>": 64815,
+  "<patch_index_0803>": 64816,
+  "<patch_index_0804>": 64817,
+  "<patch_index_0805>": 64818,
+  "<patch_index_0806>": 64819,
+  "<patch_index_0807>": 64820,
+  "<patch_index_0808>": 64821,
+  "<patch_index_0809>": 64822,
+  "<patch_index_0810>": 64823,
+  "<patch_index_0811>": 64824,
+  "<patch_index_0812>": 64825,
+  "<patch_index_0813>": 64826,
+  "<patch_index_0814>": 64827,
+  "<patch_index_0815>": 64828,
+  "<patch_index_0816>": 64829,
+  "<patch_index_0817>": 64830,
+  "<patch_index_0818>": 64831,
+  "<patch_index_0819>": 64832,
+  "<patch_index_0820>": 64833,
+  "<patch_index_0821>": 64834,
+  "<patch_index_0822>": 64835,
+  "<patch_index_0823>": 64836,
+  "<patch_index_0824>": 64837,
+  "<patch_index_0825>": 64838,
+  "<patch_index_0826>": 64839,
+  "<patch_index_0827>": 64840,
+  "<patch_index_0828>": 64841,
+  "<patch_index_0829>": 64842,
+  "<patch_index_0830>": 64843,
+  "<patch_index_0831>": 64844,
+  "<patch_index_0832>": 64845,
+  "<patch_index_0833>": 64846,
+  "<patch_index_0834>": 64847,
+  "<patch_index_0835>": 64848,
+  "<patch_index_0836>": 64849,
+  "<patch_index_0837>": 64850,
+  "<patch_index_0838>": 64851,
+  "<patch_index_0839>": 64852,
+  "<patch_index_0840>": 64853,
+  "<patch_index_0841>": 64854,
+  "<patch_index_0842>": 64855,
+  "<patch_index_0843>": 64856,
+  "<patch_index_0844>": 64857,
+  "<patch_index_0845>": 64858,
+  "<patch_index_0846>": 64859,
+  "<patch_index_0847>": 64860,
+  "<patch_index_0848>": 64861,
+  "<patch_index_0849>": 64862,
+  "<patch_index_0850>": 64863,
+  "<patch_index_0851>": 64864,
+  "<patch_index_0852>": 64865,
+  "<patch_index_0853>": 64866,
+  "<patch_index_0854>": 64867,
+  "<patch_index_0855>": 64868,
+  "<patch_index_0856>": 64869,
+  "<patch_index_0857>": 64870,
+  "<patch_index_0858>": 64871,
+  "<patch_index_0859>": 64872,
+  "<patch_index_0860>": 64873,
+  "<patch_index_0861>": 64874,
+  "<patch_index_0862>": 64875,
+  "<patch_index_0863>": 64876,
+  "<patch_index_0864>": 64877,
+  "<patch_index_0865>": 64878,
+  "<patch_index_0866>": 64879,
+  "<patch_index_0867>": 64880,
+  "<patch_index_0868>": 64881,
+  "<patch_index_0869>": 64882,
+  "<patch_index_0870>": 64883,
+  "<patch_index_0871>": 64884,
+  "<patch_index_0872>": 64885,
+  "<patch_index_0873>": 64886,
+  "<patch_index_0874>": 64887,
+  "<patch_index_0875>": 64888,
+  "<patch_index_0876>": 64889,
+  "<patch_index_0877>": 64890,
+  "<patch_index_0878>": 64891,
+  "<patch_index_0879>": 64892,
+  "<patch_index_0880>": 64893,
+  "<patch_index_0881>": 64894,
+  "<patch_index_0882>": 64895,
+  "<patch_index_0883>": 64896,
+  "<patch_index_0884>": 64897,
+  "<patch_index_0885>": 64898,
+  "<patch_index_0886>": 64899,
+  "<patch_index_0887>": 64900,
+  "<patch_index_0888>": 64901,
+  "<patch_index_0889>": 64902,
+  "<patch_index_0890>": 64903,
+  "<patch_index_0891>": 64904,
+  "<patch_index_0892>": 64905,
+  "<patch_index_0893>": 64906,
+  "<patch_index_0894>": 64907,
+  "<patch_index_0895>": 64908,
+  "<patch_index_0896>": 64909,
+  "<patch_index_0897>": 64910,
+  "<patch_index_0898>": 64911,
+  "<patch_index_0899>": 64912,
+  "<patch_index_0900>": 64913,
+  "<patch_index_0901>": 64914,
+  "<patch_index_0902>": 64915,
+  "<patch_index_0903>": 64916,
+  "<patch_index_0904>": 64917,
+  "<patch_index_0905>": 64918,
+  "<patch_index_0906>": 64919,
+  "<patch_index_0907>": 64920,
+  "<patch_index_0908>": 64921,
+  "<patch_index_0909>": 64922,
+  "<patch_index_0910>": 64923,
+  "<patch_index_0911>": 64924,
+  "<patch_index_0912>": 64925,
+  "<patch_index_0913>": 64926,
+  "<patch_index_0914>": 64927,
+  "<patch_index_0915>": 64928,
+  "<patch_index_0916>": 64929,
+  "<patch_index_0917>": 64930,
+  "<patch_index_0918>": 64931,
+  "<patch_index_0919>": 64932,
+  "<patch_index_0920>": 64933,
+  "<patch_index_0921>": 64934,
+  "<patch_index_0922>": 64935,
+  "<patch_index_0923>": 64936,
+  "<patch_index_0924>": 64937,
+  "<patch_index_0925>": 64938,
+  "<patch_index_0926>": 64939,
+  "<patch_index_0927>": 64940,
+  "<patch_index_0928>": 64941,
+  "<patch_index_0929>": 64942,
+  "<patch_index_0930>": 64943,
+  "<patch_index_0931>": 64944,
+  "<patch_index_0932>": 64945,
+  "<patch_index_0933>": 64946,
+  "<patch_index_0934>": 64947,
+  "<patch_index_0935>": 64948,
+  "<patch_index_0936>": 64949,
+  "<patch_index_0937>": 64950,
+  "<patch_index_0938>": 64951,
+  "<patch_index_0939>": 64952,
+  "<patch_index_0940>": 64953,
+  "<patch_index_0941>": 64954,
+  "<patch_index_0942>": 64955,
+  "<patch_index_0943>": 64956,
+  "<patch_index_0944>": 64957,
+  "<patch_index_0945>": 64958,
+  "<patch_index_0946>": 64959,
+  "<patch_index_0947>": 64960,
+  "<patch_index_0948>": 64961,
+  "<patch_index_0949>": 64962,
+  "<patch_index_0950>": 64963,
+  "<patch_index_0951>": 64964,
+  "<patch_index_0952>": 64965,
+  "<patch_index_0953>": 64966,
+  "<patch_index_0954>": 64967,
+  "<patch_index_0955>": 64968,
+  "<patch_index_0956>": 64969,
+  "<patch_index_0957>": 64970,
+  "<patch_index_0958>": 64971,
+  "<patch_index_0959>": 64972,
+  "<patch_index_0960>": 64973,
+  "<patch_index_0961>": 64974,
+  "<patch_index_0962>": 64975,
+  "<patch_index_0963>": 64976,
+  "<patch_index_0964>": 64977,
+  "<patch_index_0965>": 64978,
+  "<patch_index_0966>": 64979,
+  "<patch_index_0967>": 64980,
+  "<patch_index_0968>": 64981,
+  "<patch_index_0969>": 64982,
+  "<patch_index_0970>": 64983,
+  "<patch_index_0971>": 64984,
+  "<patch_index_0972>": 64985,
+  "<patch_index_0973>": 64986,
+  "<patch_index_0974>": 64987,
+  "<patch_index_0975>": 64988,
+  "<patch_index_0976>": 64989,
+  "<patch_index_0977>": 64990,
+  "<patch_index_0978>": 64991,
+  "<patch_index_0979>": 64992,
+  "<patch_index_0980>": 64993,
+  "<patch_index_0981>": 64994,
+  "<patch_index_0982>": 64995,
+  "<patch_index_0983>": 64996,
+  "<patch_index_0984>": 64997,
+  "<patch_index_0985>": 64998,
+  "<patch_index_0986>": 64999,
+  "<patch_index_0987>": 65000,
+  "<patch_index_0988>": 65001,
+  "<patch_index_0989>": 65002,
+  "<patch_index_0990>": 65003,
+  "<patch_index_0991>": 65004,
+  "<patch_index_0992>": 65005,
+  "<patch_index_0993>": 65006,
+  "<patch_index_0994>": 65007,
+  "<patch_index_0995>": 65008,
+  "<patch_index_0996>": 65009,
+  "<patch_index_0997>": 65010,
+  "<patch_index_0998>": 65011,
+  "<patch_index_0999>": 65012,
+  "<patch_index_1000>": 65013,
+  "<patch_index_1001>": 65014,
+  "<patch_index_1002>": 65015,
+  "<patch_index_1003>": 65016,
+  "<patch_index_1004>": 65017,
+  "<patch_index_1005>": 65018,
+  "<patch_index_1006>": 65019,
+  "<patch_index_1007>": 65020,
+  "<patch_index_1008>": 65021,
+  "<patch_index_1009>": 65022,
+  "<patch_index_1010>": 65023,
+  "<patch_index_1011>": 65024,
+  "<patch_index_1012>": 65025,
+  "<patch_index_1013>": 65026,
+  "<patch_index_1014>": 65027,
+  "<patch_index_1015>": 65028,
+  "<patch_index_1016>": 65029,
+  "<patch_index_1017>": 65030,
+  "<patch_index_1018>": 65031,
+  "<patch_index_1019>": 65032,
+  "<patch_index_1020>": 65033,
+  "<patch_index_1021>": 65034,
+  "<patch_index_1022>": 65035,
+  "<patch_index_1023>": 65036,
+  "<phrase>": 64007
+}

annotated_snowman.jpg ADDED Viewed

config.json ADDED Viewed

	@@ -0,0 +1,173 @@

+{
+  "_commit_hash": null,
+  "_name_or_path": "HF_Kosmos2",
+  "architectures": [
+    "Kosmos2ForConditionalGeneration"
+  ],
+  "latent_query_num": 64,
+  "model_type": "kosmos-2",
+  "auto_map": {
+    "AutoConfig": "configuration_kosmos2.Kosmos2Config",
+    "AutoModel": "modeling_kosmos2.Kosmos2Model",
+    "AutoModelForVision2Seq": "modeling_kosmos2.Kosmos2ForConditionalGeneration",
+    "AutoProcessor": "processing_kosmos2.Kosmos2Processor"
+  },
+  "text_config": {
+    "_name_or_path": "",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.1,
+    "attention_heads": 32,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": false,
+    "embed_dim": 2048,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "ffn_dim": 8192,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gradient_checkpointing": false,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.0,
+    "layers": 24,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 2048,
+    "min_length": 0,
+    "model_type": "kosmos_2_text_model",
+    "no_repeat_ngram_size": 3,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": true,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.31.0.dev0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 65037
+  },
+  "torch_dtype": "float32",
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "kosmos_2_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.31.0.dev0",
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  }
+}

configuration_kosmos2.py ADDED Viewed

	@@ -0,0 +1,331 @@

+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" KOSMOS-2 model configuration"""
+import copy
+import os
+from typing import Union
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {  # NOTE(review): "BEIT_" prefix looks like a copy-paste leftover; the entries are KOSMOS-2 checkpoints. Confirm no importer relies on this name before renaming.
+    "microsoft/kosmos-2-patch14-224": (
+        "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/config.json"
+    ),
+    # See all KOSMOS-2 models at https://huggingface.co/models?filter=kosmos-2
+}
+class Kosmos2TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2TextModel`]. It is used to instantiate a KOSMOS-2 text decoder
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2
+    [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65037):
+            Vocabulary size of the Kosmos2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Kosmos2Model`].
+        embed_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the layers and the pooler layer.
+        layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer decoder.
+        attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Whether to scale the embeddings by sqrt(embed_dim).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+    Example:
+    ```python
+    >>> from transformers import Kosmos2TextConfig, Kosmos2TextModel
+    >>> # Initializing a Kosmos2TextConfig microsoft/kosmos-2-patch14-224 style configuration
+    >>> configuration = Kosmos2TextConfig()
+    >>> # Initializing a Kosmos2TextModel (with random weights) from the microsoft/kosmos-2-patch14-224 style configuration
+    >>> model = Kosmos2TextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "kosmos_2_text_model"  # compared against the loaded dict's "model_type" in from_pretrained
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "attention_heads", "hidden_size": "embed_dim"}  # common names -> the attribute names assigned in __init__
+    def __init__(  # Store the text-decoder hyperparameters; each argument is described in the class docstring.
+        self,
+        vocab_size=65037,
+        max_position_embeddings=2048,
+        embed_dim=2048,
+        layers=24,
+        ffn_dim=8192,
+        attention_heads=32,
+        activation_function="gelu",
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        layerdrop=0.0,
+        layer_norm_eps=1e-5,
+        scale_embedding=True,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs,  # anything not named above is forwarded unchanged to the base class
+    ):
+        super().__init__(  # the base class receives the special-token ids and the extra kwargs
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_dim = embed_dim  # listed in attribute_map under "hidden_size"
+        self.layers = layers
+        self.ffn_dim = ffn_dim
+        self.attention_heads = attention_heads  # listed in attribute_map under "num_attention_heads"
+        self.activation_function = activation_function
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.scale_embedding = scale_embedding
+        self.use_cache = use_cache
+    @classmethod  # Build a text config from a checkpoint whose config is either text-only or a full "kosmos-2" config.
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)  # inherited helper, not defined in this class; presumably normalizes auth-token kwargs (confirm)
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        # A composite config (model_type "kosmos-2") nests the text settings under "text_config"; unwrap it.
+        if config_dict.get("model_type") == "kosmos-2":
+            config_dict = config_dict["text_config"]
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:  # a mismatch only warns; loading still proceeds
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+        return cls.from_dict(config_dict, **kwargs)
+class Kosmos2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2VisionModel`]. It is used to instantiate a
+    KOSMOS-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the KOSMOS-2
+    [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+    Example:
+    ```python
+    >>> from transformers import Kosmos2VisionConfig, Kosmos2VisionModel
+    >>> # Initializing a Kosmos2VisionConfig with microsoft/kosmos-2-patch14-224 style configuration
+    >>> configuration = Kosmos2VisionConfig()
+    >>> # Initializing a Kosmos2VisionModel (with random weights) from the microsoft/kosmos-2-patch14-224 style configuration
+    >>> model = Kosmos2VisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "kosmos_2_vision_model"
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4096,
+        projection_dim=512,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=224,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        # get the vision config dict if we are loading from Kosmos2Config
+        if config_dict.get("model_type") == "kosmos-2":
+            config_dict = config_dict["vision_config"]
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+        return cls.from_dict(config_dict, **kwargs)
+class Kosmos2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2Model`]. It is used to instantiate a KOSMOS-2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the KOSMOS-2
+    [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Kosmos2TextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Kosmos2VisionConfig`].
+        latent_query_num (`int`, *optional*, defaults to 64):
+            The number of latent query tokens that represent the image features used in the text decoder component.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+    Example:
+    ```python
+    >>> from transformers import Kosmos2Config, Kosmos2Model
+    >>> # Initializing a Kosmos-2 kosmos-2-patch14-224 style configuration
+    >>> configuration = Kosmos2Config()
+    >>> # Initializing a model (with random weights) from the kosmos-2-patch14-224 style configuration
+    >>> model = Kosmos2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "kosmos-2"
+    is_composition = True
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        latent_query_num=64,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `Kosmos2TextConfig` with default values.")
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `Kosmos2VisionConfig` with default values.")
+        self.text_config = Kosmos2TextConfig(**text_config)
+        self.vision_config = Kosmos2VisionConfig(**vision_config)
+        self.latent_query_num = latent_query_num
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["text_config"] = self.text_config.to_dict()
+        output["vision_config"] = self.vision_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output

draw_bboxes.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import os
+import numpy as np
+import torch
+from PIL import Image
+import torchvision.transforms as T
+import cv2
+import requests
+def is_overlapping(rect1, rect2):
+    x1, y1, x2, y2 = rect1
+    x3, y3, x4, y4 = rect2
+    return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4)
+def draw_entity_boxes_on_image(image, entities, show=False, save_path=None):
+    """_summary_
+    Args:
+        image (_type_): image or image path
+        collect_entity_location (_type_): _description_
+    """
+    if isinstance(image, Image.Image):
+        image_h = image.height
+        image_w = image.width
+        image = np.array(image)[:, :, [2, 1, 0]]
+    elif isinstance(image, str):
+        if os.path.exists(image):
+            pil_img = Image.open(image).convert("RGB")
+            image = np.array(pil_img)[:, :, [2, 1, 0]]
+            image_h = pil_img.height
+            image_w = pil_img.width
+        else:
+            raise ValueError(f"invaild image path, {image}")
+    elif isinstance(image, torch.Tensor):
+        # pdb.set_trace()
+        image_tensor = image.cpu()
+        reverse_norm_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:, None, None]
+        reverse_norm_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:, None, None]
+        image_tensor = image_tensor * reverse_norm_std + reverse_norm_mean
+        pil_img = T.ToPILImage()(image_tensor)
+        image_h = pil_img.height
+        image_w = pil_img.width
+        image = np.array(pil_img)[:, :, [2, 1, 0]]
+    else:
+        raise ValueError(f"invaild image format, {type(image)} for {image}")
+    if len(entities) == 0:
+        return image
+    new_image = image.copy()
+    previous_bboxes = []
+    # size of text
+    text_size = 2
+    # thickness of text
+    text_line = 1  # int(max(1 * min(image_h, image_w) / 512, 1))
+    box_line = 3
+    (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
+    base_height = int(text_height * 0.675)
+    text_offset_original = text_height - base_height
+    text_spaces = 3
+    for entity_name, (start, end), bboxes in entities:
+        for (x1_norm, y1_norm, x2_norm, y2_norm) in bboxes:
+            orig_x1, orig_y1, orig_x2, orig_y2 = int(x1_norm * image_w), int(y1_norm * image_h), int(x2_norm * image_w), int(y2_norm * image_h)
+            # draw bbox
+            # random color
+            color = tuple(np.random.randint(0, 255, size=3).tolist())
+            new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line)
+            l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1
+            x1 = orig_x1 - l_o
+            y1 = orig_y1 - l_o
+            if y1 < text_height + text_offset_original + 2 * text_spaces:
+                y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
+                x1 = orig_x1 + r_o
+            # add text background
+            (text_width, text_height), _ = cv2.getTextSize(f"  {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
+            text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - (text_height + text_offset_original + 2 * text_spaces), x1 + text_width, y1
+            for prev_bbox in previous_bboxes:
+                while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox):
+                    text_bg_y1 += (text_height + text_offset_original + 2 * text_spaces)
+                    text_bg_y2 += (text_height + text_offset_original + 2 * text_spaces)
+                    y1 += (text_height + text_offset_original + 2 * text_spaces)
+                    if text_bg_y2 >= image_h:
+                        text_bg_y1 = max(0, image_h - (text_height + text_offset_original + 2 * text_spaces))
+                        text_bg_y2 = image_h
+                        y1 = image_h
+                        break
+            alpha = 0.5
+            for i in range(text_bg_y1, text_bg_y2):
+                for j in range(text_bg_x1, text_bg_x2):
+                    if i < image_h and j < image_w:
+                        if j < text_bg_x1 + 1.35 * c_width:
+                            # original color
+                            bg_color = color
+                        else:
+                            # white
+                            bg_color = [255, 255, 255]
+                        new_image[i, j] = (alpha * new_image[i, j] + (1 - alpha) * np.array(bg_color)).astype(np.uint8)
+            cv2.putText(
+                new_image, f"  {entity_name}", (x1, y1 - text_offset_original - 1 * text_spaces), cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
+            )
+            # previous_locations.append((x1, y1))
+            previous_bboxes.append((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2))
+    pil_image = Image.fromarray(new_image[:, :, [2, 1, 0]])
+    if save_path:
+        pil_image.save(save_path)
+    if show:
+        pil_image.show()
+    return new_image

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "no_repeat_ngram_size": 3,
+  "pad_token_id": 1,
+  "transformers_version": "4.32.0.dev0",
+  "use_cache": true
+}

image_processing_kosmos2.py ADDED Viewed

	@@ -0,0 +1,304 @@

+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Kosmos2."""
+from typing import Dict, List, Optional, Union
+import numpy as np
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from transformers.image_transforms import (
+    convert_to_rgb,
+    get_resize_output_image_size,
+    resize,
+    to_channel_dimension_format,
+)
+from transformers.image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from transformers.utils import TensorType, is_vision_available, logging
+logger = logging.get_logger(__name__)
+if is_vision_available():
+    import PIL
+class Kosmos2ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a CLIP image processor.
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`Dict[str, int]` *optional*, defaults to 224):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize:
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Image standard deviation.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+    model_input_names = ["pixel_values"]
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        crop_size: Dict[str, int] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 224}
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio.
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resiizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size = get_size_dict(size)
+        if "shortest_edge" not in size:
+            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
+        output_size = get_resize_output_image_size(
+            image, size=size["shortest_edge"], input_data_format=input_data_format
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_center_crop: bool = None,
+        crop_size: int = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+        Args:
+            images (`ImageInput`):
+                Image to preprocess.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+                the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size, param_name="size", default_to_square=False)
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        images = make_list_of_images(images)
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        if do_resize and size is None:
+            raise ValueError("Size must be specified if do_resize is True.")
+        if do_center_crop and crop_size is None:
+            raise ValueError("Crop size must be specified if do_center_crop is True.")
+        if do_rescale and rescale_factor is None:
+            raise ValueError("Rescale factor must be specified if do_rescale is True.")
+        if do_normalize and (image_mean is None or image_std is None):
+            raise ValueError("Image mean and std must be specified if do_normalize is True.")
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+        if do_center_crop:
+            images = [
+                self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
+            ]
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)

modeling_kosmos2.py ADDED Viewed

	@@ -0,0 +1,1747 @@

+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch KOSMOS-2 model."""
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPooling,
+    CausalLMOutputWithCrossAttentions,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig
+logger = logging.get_logger(__name__)
+_CHECKPOINT_FOR_DOC = "microsoft/kosmos-2-patch14-224"
+_CONFIG_FOR_DOC = Kosmos2Config
+_EXPECTED_OUTPUT_SHAPE = None
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    inverted_mask = 1.0 - expanded_mask
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
# Adapted from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Number the non-padding tokens of each row, starting at `padding_idx + 1`. Padding tokens keep the id
    `padding_idx`. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`): token ids; positions are counted along dimension 1.
        padding_idx (`int`): id of the padding token.
        past_key_values_length (`int`, *optional*, defaults to 0): offset added to every non-padding position.

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    not_padding = input_ids.ne(padding_idx).int()
    running_count = torch.cumsum(not_padding, dim=1).type_as(not_padding)
    # Multiplying by the mask zeroes the padded slots again after the offset was added.
    shifted = (running_count + past_key_values_length) * not_padding
    return shifted.long() + padding_idx
# Docstring templates. `KOSMOS2_VISION_INPUTS_DOCSTRING` is passed to `add_start_docstrings_to_model_forward`
# on `Kosmos2VisionTransformer.forward`; all four are currently the bare placeholder text "Kosmos-2".
KOSMOS2_START_DOCSTRING = r"""Kosmos-2"""
KOSMOS2_VISION_INPUTS_DOCSTRING = r"""Kosmos-2"""
KOSMOS2_TEXT_INPUTS_DOCSTRING = r"""Kosmos-2"""
KOSMOS2_INPUTS_DOCSTRING = r"""Kosmos-2"""
@dataclass
class Kosmos2ModelOutput(ModelOutput):
    """
    Output container holding the last hidden states together with the optional key/value cache, per-layer
    outputs and image-related outputs. The arguments below are listed in field declaration order.

    Args:
        last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model. Note that the field name is
            plural (`last_hidden_states`).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when being computed by the model):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextConnector`.
        image_connector_attention (`tuple(torch.FloatTensor)`, *optional*, returned when being computed by the model):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights given by `Kosmos2ImageToTextConnector`, after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        vision_model_output (`BaseModelOutputWithPooling`, *optional*, returned when being computed by the model):
            The output of the [`Kosmos2VisionModel`].
    """
    last_hidden_states: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_features: Optional[torch.FloatTensor] = None
    image_connector_attention: Optional[Tuple[torch.FloatTensor]] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None
@dataclass
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    """
    Model output class for `Kosmos2ForConditionalGeneration`. The arguments below are listed in field
    declaration order.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when being computed by the model):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextConnector`.
        image_connector_attention (`tuple(torch.FloatTensor)`, *optional*, returned when being computed by the model):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights given by `Kosmos2ImageToTextConnector`, after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        vision_model_output (`BaseModelOutputWithPooling`, *optional*, returned when being computed by the model):
            The output of the [`Kosmos2VisionModel`].
    """
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_features: Optional[torch.FloatTensor] = None
    image_connector_attention: Optional[Tuple[torch.FloatTensor]] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None
# Adapted from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Kosmos2
class Kosmos2VisionEmbeddings(nn.Module):
    """
    Turns pixel values into a token sequence: one learned class embedding followed by one embedding per image
    patch, with a learned position embedding added to every token.
    """

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
        # Kernel size and stride are both the patch size, so patches do not overlap.
        self.patch_embedding = nn.Conv2d(
            config.num_channels,
            self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        patches_per_side = self.image_size // self.patch_size
        self.num_patches = patches_per_side**2
        # One extra position for the class token.
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Embed `pixel_values` (batch first) into a tensor of shape `(batch, num_positions, embed_dim)`."""
        num_images = pixel_values.shape[0]
        # [batch, embed_dim, grid, grid] -> [batch, grid * grid, embed_dim]
        patch_tokens = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
        class_token = self.class_embedding.expand(num_images, 1, -1)
        tokens = torch.cat([class_token, patch_tokens], dim=1)
        return tokens + self.position_embedding(self.position_ids)
# Adapted from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->Kosmos2Vision
class Kosmos2VisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        """
        Args:
            config: object providing `hidden_size`, `num_attention_heads` and `attention_dropout`.

        Raises:
            ValueError: if `hidden_size` is not divisible by `num_attention_heads`.
        """
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Split channels into heads: `(bsz, seq_len, embed_dim)` -> `(bsz, num_heads, seq_len, head_dim)`."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Input shape: Batch x Time x Channel

        Args:
            hidden_states: input of shape `(bsz, tgt_len, embed_dim)`; queries, keys and values are all projected
                from it.
            attention_mask: optional additive mask of shape `(bsz, 1, tgt_len, src_len)`.
            causal_attention_mask: optional additive mask of the same shape, applied before `attention_mask`.
            output_attentions: whether to also return the attention probabilities.

        Returns:
            `(attn_output, attn_weights)` where `attn_output` has shape `(bsz, tgt_len, embed_dim)` and
            `attn_weights` has shape `(bsz, num_heads, tgt_len, src_len)`, or is `None` when
            `output_attentions` is false.

        Raises:
            ValueError: if a mask or an intermediate tensor does not have the expected shape.
        """
        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        # Fold the head axis into the batch axis so that a single `bmm` covers every head.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # Both masks are additive and share one expected shape; the causal mask is applied first.
        for mask in (causal_attention_mask, attention_mask):
            if mask is None:
                continue
            if mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            # Report the shape that is actually checked (head axis folded into the batch axis).
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Unfold the heads again and merge them back into the channel dimension.
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights_reshaped
# Adapted from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Kosmos2Vision
class Kosmos2VisionMLP(nn.Module):
    """Feed-forward block: `fc1`, then the activation selected by `config.hidden_act`, then `fc2`."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        model_dim, inner_dim = config.hidden_size, config.intermediate_size
        self.fc1 = nn.Linear(model_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, model_dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project up to `intermediate_size`, apply the activation and project back to `hidden_size`."""
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
# Adapted from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Kosmos2Vision
class Kosmos2VisionEncoderLayer(nn.Module):
    """
    Pre-norm transformer block: self-attention and an MLP, each fed a layer-normalized input and added back to
    its input through a residual connection.
    """

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        norm_eps = config.layer_norm_eps
        self.self_attn = Kosmos2VisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=norm_eps)
        self.mlp = Kosmos2VisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): additive mask of the same size, handed to the attention
                module together with `attention_mask`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        # Attention sub-block with residual connection.
        attn_output, attn_weights = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-block with residual connection.
        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
# Adapted from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Kosmos2Vision
class Kosmos2VisionEncoder(nn.Module):
    """
    Stack of `config.num_hidden_layers` [`Kosmos2VisionEncoderLayer`] blocks applied one after another.

    Args:
        config: Kosmos2VisionConfig
    """

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            self.layers.append(Kosmos2VisionEncoderLayer(config))
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded input sequence; it is the input of the first layer.
            attention_mask (`torch.Tensor`, *optional*):
                Passed unchanged to every layer. The attention module in this file adds it to the attention scores
                and requires shape `(batch_size, 1, sequence_length, sequence_length)`.
            causal_attention_mask (`torch.Tensor`, *optional*):
                Passed unchanged to every layer; same shape and additive semantics as `attention_mask`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. Defaults to
                `config.output_attentions`.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers (the input embeddings plus the output of
                every layer). Defaults to `config.output_hidden_states`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. Defaults to
                `config.use_return_dict`.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        use_checkpointing = self.gradient_checkpointing and self.training

        def run_checkpointed(layer, states):
            # Tensors go through `checkpoint` positionally; the `output_attentions` flag is bound in the closure.
            def custom_forward(*inputs):
                return layer(*inputs, output_attentions)

            return torch.utils.checkpoint.checkpoint(custom_forward, states, attention_mask, causal_attention_mask)

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                # Record the input of every layer; the final output is appended after the loop.
                encoder_states += (hidden_states,)

            if use_checkpointing:
                layer_outputs = run_checkpointed(encoder_layer, hidden_states)
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions += (layer_outputs[1],)

        if output_hidden_states:
            encoder_states += (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
            )
        # Tuple form drops the entries that were not requested.
        return tuple(value for value in (hidden_states, encoder_states, all_attentions) if value is not None)
# Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer with CLIPVision->Kosmos2Vision,CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2Vision
class Kosmos2VisionTransformer(nn.Module):
    """
    Vision tower: patch/class/position embeddings, a pre-encoder layer norm, the encoder stack, and a layer norm
    applied to the first token of the encoder output to form the pooled output.
    """

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        hidden_size = config.hidden_size
        norm_eps = config.layer_norm_eps

        self.embeddings = Kosmos2VisionEmbeddings(config)
        # The attribute name (including its spelling) is part of the parameter names, so it is kept as is.
        self.pre_layrnorm = nn.LayerNorm(hidden_size, eps=norm_eps)
        self.encoder = Kosmos2VisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(hidden_size, eps=norm_eps)

    @add_start_docstrings_to_model_forward(KOSMOS2_VISION_INPUTS_DOCSTRING)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        # Unset flags fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedded = self.pre_layrnorm(self.embeddings(pixel_values))
        encoder_outputs = self.encoder(
            inputs_embeds=embedded,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # Pooled output: the first token (the class embedding position), layer-normalized.
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# Adapted from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->Kosmos2
class Kosmos2TextSinusoidalPositionalEmbedding(nn.Module):
    """
    This module produces sinusoidal positional embeddings of any length.

    The table is kept in the non-persistent buffer `weights` and is rebuilt with more rows whenever `forward`
    needs a position beyond its current size.
    """

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Args:
            num_positions: initial number of positions; `self.offset` (2) extra rows are allocated on top.
            embedding_dim: size of each positional embedding.
            padding_idx: row that is zeroed in the table. `forward` performs arithmetic on it, so it has to be
                set for the module to be callable.
        """
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """(Re)build the embedding table and store it in the non-persistent buffer `weights`."""
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward put the weights on the correct dtype and device of the param
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".

        Returns:
            Tensor of shape `(num_embeddings, embedding_dim)` in the default dtype. Each row holds the sines
            followed by the cosines (plus one zero column when `embedding_dim` is odd); row `padding_idx` is
            all zeros when `padding_idx` is given.
        """
        half_dim = embedding_dim // 2
        # `half_dim - 1` is zero for `embedding_dim` 2 or 3; clamp the divisor so those sizes produce a single
        # frequency instead of raising ZeroDivisionError. Larger sizes are unaffected.
        emb = math.log(10000) / max(half_dim - 1, 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(
        self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
    ):
        """
        Look up the positional embeddings for a batch.

        Args:
            input_ids: token ids of shape `(bsz, seq_len)`; when given, padded tokens keep position `padding_idx`.
            inputs_embeds: used only when `input_ids` is `None`; positions are then purely sequential.
            past_key_values_length: offset added to the positions.

        Returns:
            Detached tensor of shape `(bsz, seq_len, embedding_dim)`.
        """
        if input_ids is not None:
            bsz, seq_len = input_ids.size()
            # Create the position ids from the input token ids. Any padded tokens remain padded.
            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
                input_ids.device
            )
        else:
            bsz, seq_len = inputs_embeds.size()[:-1]
            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)

        # expand embeddings if needed
        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor
            past_key_values_length: offset added to every position id.

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # Positions start right after `padding_idx`.
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
+#  Similar to transformers.models.bart.modeling_bart.BartAttention with an additional `inner_attn_ln`.
+class KosmosTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(
+        self,
+        config,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        add_inner_attn_layernorm: bool = False,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.inner_attn_ln = None
+        if add_inner_attn_layernorm:
+            self.inner_attn_ln = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, _ = hidden_states.size()
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        if self.inner_attn_ln is not None:
+            attn_output = self.inner_attn_ln(attn_output)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights_reshaped, past_key_value
+class Kosmos2TextFFN(nn.Module):
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__()
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(config.embed_dim, config.ffn_dim)
+        self.fc2 = nn.Linear(config.ffn_dim, config.embed_dim)
+        self.ffn_layernorm = nn.LayerNorm(config.ffn_dim, eps=config.layer_norm_eps)
+    def forward(self, hidden_states):
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.ffn_layernorm(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        return hidden_states
+class Kosmos2TextBlock(nn.Module):
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__()
+        self.embed_dim = config.embed_dim
+        self.self_attn = KosmosTextAttention(
+            config,
+            embed_dim=self.embed_dim,
+            num_heads=config.attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            add_inner_attn_layernorm=True,
+        )
+        self.dropout = config.dropout
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        if config.add_cross_attention:
+            self.encoder_attn = KosmosTextAttention(
+                config,
+                embed_dim=self.embed_dim,
+                num_heads=config.attention_heads,
+                dropout=config.attention_dropout,
+                is_decoder=True,
+                add_inner_attn_layernorm=False,
+            )
+            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.ffn = Kosmos2TextFFN(config)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            if not hasattr(self, "encoder_attn"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+            residual = hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        # FFN
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class Kosmos2TextTransformer(nn.Module):
+    """
+    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].
+    Args:
+        config: Kosmos2TextConfig
+    """
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__()
+        self.config = config
+        self.dropout = config.dropout
+        self.layerdrop = config.layerdrop
+        self.embed_scale = math.sqrt(config.embed_dim) if config.scale_embedding else 1.0
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.embed_dim, padding_idx=config.pad_token_id)
+        self.embed_positions = Kosmos2TextSinusoidalPositionalEmbedding(
+            num_positions=config.max_position_embeddings,
+            embedding_dim=config.embed_dim,
+            padding_idx=config.pad_token_id,
+        )
+        self.layers = nn.ModuleList([Kosmos2TextBlock(config) for _ in range(config.layers)])
+        self.layer_norm = nn.LayerNorm(config.embed_dim, config.layer_norm_eps)
+        self.gradient_checkpointing = False
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+    def forward_embedding(
+        self, input_ids, inputs_embeds=None, img_features=None, img_input_mask=None, past_key_values_length: int = 0
+    ):
+        # The argument `inputs_embeds` should be the one without being multiplied by `self.embed_scale`.
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if img_features is not None:
+            inputs_embeds[img_input_mask.to(dtype=torch.bool)] = img_features.view(-1, img_features.size(-1))
+        inputs_embeds = inputs_embeds * self.embed_scale
+        # embed positions
+        positions = self.embed_positions(
+            input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length
+        )
+        positions = positions.to(inputs_embeds.device)
+        hidden_states = inputs_embeds + positions
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        return hidden_states
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        img_features: Optional[torch.Tensor] = None,
+        img_attn_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.shape
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+        # We don't need img info. when `past_key_values_length` > 0
+        if past_key_values_length > 0:
+            img_features = None
+            img_attn_mask = None
+        hidden_states = self.forward_embedding(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            img_features=img_features,
+            img_input_mask=img_attn_mask,
+            past_key_values_length=past_key_values_length,
+        )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, hidden_states, past_key_values_length
+        )
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, use_cache)
+                    return custom_forward
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    head_mask[idx] if head_mask is not None else None,
+                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+                    ),
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+        # add final layer norm
+        hidden_states = self.layer_norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+class Kosmos2PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+
+    Base class of the KOSMOS-2 model classes in this file; it only sets class-level attributes.
+    """
+    # Default configuration class; the vision and text sub-classes override it with their own config class.
+    config_class = Kosmos2Config
+    # Declares gradient-checkpointing support. `Kosmos2TextTransformer.forward` checks its own
+    # `gradient_checkpointing` attribute.
+    # NOTE(review): nothing in this class flips that attribute — confirm how it gets enabled.
+    supports_gradient_checkpointing = True
+@add_start_docstrings(
+    """The vision model from KOSMOS-2 without any head or projection on top.""",
+    KOSMOS2_START_DOCSTRING,
+)
+class Kosmos2VisionModel(Kosmos2PreTrainedModel):
+    config_class = Kosmos2VisionConfig
+    main_input_name = "pixel_values"
+    # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2
+    def __init__(self, config: Kosmos2VisionConfig):
+        super().__init__(config)
+        self.model = Kosmos2VisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.get_input_embeddings with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embeddings.patch_embedding
+    @add_start_docstrings_to_model_forward(KOSMOS2_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Kosmos2VisionConfig)
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        r"""
+        Returns:
+        """
+        return self.model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+@add_start_docstrings(
+    """The text model from KOSMOS-2 without any head or projection on top.""",
+    KOSMOS2_START_DOCSTRING,
+)
+class Kosmos2TextModel(Kosmos2PreTrainedModel):
+    config_class = Kosmos2TextConfig
+    _no_split_modules = ["Kosmos2TextBlock"]
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__(config)
+        self.model = Kosmos2TextTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=Kosmos2TextConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        img_features: Optional[torch.Tensor] = None,
+        img_attn_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        Returns:
+        """
+        return self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            img_features=img_features,
+            img_attn_mask=img_attn_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+@add_start_docstrings(
+    """
+    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    KOSMOS2_START_DOCSTRING,
+)
+class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel):
+    config_class = Kosmos2TextConfig
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__(config)
+        self.model = Kosmos2TextTransformer(config)
+        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=Kosmos2TextConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        img_features: Optional[torch.Tensor] = None,
+        img_attn_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        Returns:
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            img_features=img_features,
+            img_attn_mask=img_attn_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = self.lm_head(outputs[0])
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        img_features,
+        img_attn_mask,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        **model_kwargs,
+    ):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+        # cut input_ids if past_key_values is used
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+            # the image info. is already encoded into the past keys/values
+            img_features = None
+            img_attn_mask = None
+        elif img_attn_mask is not None:
+            # appending `False` to `img_attn_mask` (because `input_ids` grows during generation)
+            batch_size, seq_len = input_ids.size()
+            mask_len = img_attn_mask.size()[-1]
+            img_attn_mask = torch.cat(
+                (img_attn_mask, torch.zeros(size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=input_ids.device)), dim=1
+            )
+        return {
+            "input_ids": input_ids,
+            "img_features": img_features,
+            "img_attn_mask": img_attn_mask,
+            "past_key_values": past_key_values,
+            "attention_mask": attention_mask,
+            "use_cache": use_cache,
+        }
+class Kosmos2ImageToTextConnector(nn.Module):
+    """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""
+    def __init__(self, config: Kosmos2Config):
+        super().__init__()
+        self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)
+        self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))
+        self.x_attn = KosmosTextAttention(
+            config.text_config,
+            config.text_config.embed_dim,
+            config.text_config.attention_heads,
+            dropout=config.text_config.attention_dropout,
+            is_decoder=False,
+            add_inner_attn_layernorm=False,
+        )
+    def forward(self, features):
+        hidden_states = self.dense(features)
+        # shape = [batch, latent_query_num, h_dim]
+        latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
+        key_value_states = torch.cat([hidden_states, latent_query], dim=1)
+        hidden_states, attn_weights, _ = self.x_attn(
+            hidden_states=latent_query,
+            key_value_states=key_value_states,
+            past_key_value=None,
+            attention_mask=None,
+            output_attentions=None,
+        )
+        return hidden_states, attn_weights
+@add_start_docstrings(
+    """
+    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder (CLIP) and a language
+    model.
+    """,
+    KOSMOS2_START_DOCSTRING,
+)
+class Kosmos2Model(Kosmos2PreTrainedModel):
+    config_class = Kosmos2Config
+    def __init__(self, config: Kosmos2Config):
+        super().__init__(config)
+        self.text_model = Kosmos2TextModel(config.text_config)
+        self.vision_model = Kosmos2VisionModel(config.vision_config)
+        self.image_to_text_connector = Kosmos2ImageToTextConnector(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.text_model.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Kosmos2ModelOutput, config_class=Kosmos2Config)
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        img_attn_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        img_features: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Kosmos2ModelOutput]:
+        # TODO: Add this
+        r"""
+        Returns:
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_model_output = None
+        image_connector_attention = None
+        if img_features is None:
+            if pixel_values is None:
+                raise ValueError("You have to specify either `pixel_values` or `img_features`.")
+            vision_model_output = self.vision_model(pixel_values)
+            # HF's CLIP has `last_hidden_state` without going through `post_layernorm`.
+            # Here we need the whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
+            img_features = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state)
+            # normalized features
+            img_features = nn.functional.normalize(img_features, dim=-1)
+            img_features, image_connector_attention = self.image_to_text_connector(img_features)
+        outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            img_features=img_features,
+            img_attn_mask=img_attn_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if not return_dict:
+            outputs = outputs + (img_features, image_connector_attention, vision_model_output)
+            return tuple(output for output in outputs if output is not None)
+        return Kosmos2ModelOutput(
+            last_hidden_states=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_features=img_features,
+            image_connector_attention=image_connector_attention,
+            vision_model_output=vision_model_output,
+        )
+@add_start_docstrings(
+    """
+    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder (CLIP)
+    and a language model.
+    """,
+    KOSMOS2_START_DOCSTRING,
+)
+class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel):
+    config_class = Kosmos2Config
+    _tied_weights_keys = ["text_model.lm_head.weight"]
+    def __init__(self, config: Kosmos2Config):
+        super().__init__(config)
+        self.text_model = Kosmos2TextForCausalLM(config.text_config)
+        self.vision_model = Kosmos2VisionModel(config.vision_config)
+        self.image_to_text_connector = Kosmos2ImageToTextConnector(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.text_model.model.embed_tokens = value
+    def get_output_embeddings(self) -> nn.Module:
+        return self.text_model.get_output_embeddings()
+    def set_output_embeddings(self, new_embeddings):
+        self.text_model.set_output_embeddings(new_embeddings)
+    @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Kosmos2ForConditionalGenerationModelOutput, config_class=Kosmos2Config)
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        img_attn_mask=None,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask=None,
+        head_mask: Optional[torch.Tensor] = None,
+        img_features: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Kosmos2ForConditionalGenerationModelOutput]:
+        r"""
+        Returns:
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
+        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("ydshieh/kosmos-2-patch14-224")
+        >>> processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224")
+        >>> prompt = "<grounding> An image of"
+        >>> image = Image.open("snowman.jpg")
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+        >>> generated_ids = model.generate(
+        ...     pixel_values=inputs["pixel_values"],
+        ...     input_ids=inputs["input_ids"][:, :-1],
+        ...     attention_mask=inputs["attention_mask"][:, :-1],
+        ...     img_features=None,
+        ...     img_attn_mask=inputs["img_attn_mask"][:, :-1],
+        ...     use_cache=True,
+        ...     max_new_tokens=64,
+        ... )
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        >>> result = processor.post_processor_generation(generated_text)
+        >>> result
+        <grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_model_output = None
+        image_connector_attention = None
+        if img_features is None:
+            if pixel_values is None:
+                raise ValueError("You have to specify either `pixel_values` or `img_features`.")
+            vision_model_output = self.vision_model(pixel_values)
+            # HF's CLIP has `last_hidden_state` without going through `post_layernorm`.
+            # Here we need the whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
+            img_features = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state)
+            # normalized features
+            img_features = nn.functional.normalize(img_features, dim=-1)
+            img_features, image_connector_attention = self.image_to_text_connector(img_features)
+        lm_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            img_features=img_features,
+            img_attn_mask=img_attn_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if not return_dict:
+            outputs = lm_outputs + (img_features, image_connector_attention, vision_model_output)
+            return tuple(output for output in outputs if output is not None)
+        return Kosmos2ForConditionalGenerationModelOutput(
+            loss=lm_outputs.loss,
+            logits=lm_outputs.logits,
+            past_key_values=lm_outputs.past_key_values,
+            hidden_states=lm_outputs.hidden_states,
+            attentions=lm_outputs.attentions,
+            image_features=img_features,
+            image_connector_attention=image_connector_attention,
+            vision_model_output=vision_model_output,
+        )
+    def generate(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        img_features=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        **kwargs,
+    ):
+        # in order to allow `inputs` argument (as in `GenerationMixin`)
+        inputs = kwargs.pop("inputs", None)
+        if pixel_values is not None and inputs is not None:
+            raise ValueError(
+                f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed."
+                f"Make sure to either pass `inputs` or pixel_values=..."
+            )
+        if pixel_values is None and inputs is not None:
+            pixel_values = inputs
+        if img_features is None:
+            vision_model_output = self.vision_model(pixel_values)
+            # HF's CLIP has `last_hidden_state` without going through `post_layernorm`.
+            # Here we need the whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
+            img_features = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state)
+            # normalized features
+            img_features = nn.functional.normalize(img_features, dim=-1)
+            img_features, image_connector_attention = self.image_to_text_connector(img_features)
+        output = self.text_model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            img_features=img_features,
+            input_embeds=inputs_embeds,
+            **kwargs,
+        )
+        return output

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Kosmos2ImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "processor_class": "Kosmos2Processor",
+  "auto_map": {
+    "AutoProcessor": "processing_kosmos2.Kosmos2Processor",
+    "AutoImageProcessor": "image_processing_kosmos2.Kosmos2ImageProcessor"
+  },
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

processing_kosmos2.py ADDED Viewed

	@@ -0,0 +1,608 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor class for KOSMOS-2."""
+import copy
+import math
+import re
+from typing import List, Optional, Tuple, Union
+import numpy as np
+from transformers.image_processing_utils import BatchFeature
+from transformers.image_utils import ImageInput, is_batched
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
+from transformers.utils import TensorType, is_tf_available, is_torch_available
+if is_torch_available():
+    import torch
+if is_tf_available():
+    import tensorflow as tf
+BboxInput = Union[
+    List[Tuple[int, int]],
+    List[Tuple[float, float, float, float]],
+    List[List[Tuple[int, int]]],
+    List[List[Tuple[float, float, float]]],
+]
+class Kosmos2Processor(ProcessorMixin):
+    r"""
+    Constructs an KOSMOS-2 processor which wraps a CLIP image processor and a KOSMOS-2 tokenizer into a single
+    processor.
+    [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`Kosmos2TokenizerFast`]. See the
+    docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`] for more information.
+    Args:
+        image_processor (`CLIPImageProcessor`):
+            An instance of [`CLIPImageProcessor`]. The image processor is a required input.
+        tokenizer (`Kosmos2TokenizerFast`):
+            An instance of ['Kosmos2TokenizerFast`]. The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    # Better to use explicit classes if local code works
+    # image_processor_class = "Kosmos2ImageProcessor"
+    # tokenizer_class = ("Kosmos2Tokenizer", "Kosmos2TokenizerFast")
+    # To make remote code work
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(self, image_processor, tokenizer):
+        tokenizer.return_token_type_ids = False
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, List[TextInput]] = None,
+        bboxes: BboxInput = None,
+        num_image_tokens: Optional[int] = 64,
+        first_image_token_id: Optional[int] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_token_type_ids: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
+        [`Kosmos2TokenizerFast.__call__`] to prepare text for the model.
+        Please refer to the docstring of the above two methods for more information.
+        """
+        if text is None:
+            raise ValueError("You have to specify at least `text`.")
+        text = self.preprocess_text(text, images, bboxes, num_image_tokens=num_image_tokens)
+        encoding = BatchFeature()
+        text_encoding = self.tokenizer(
+            text=text,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_token_type_ids=return_token_type_ids,
+            return_length=return_length,
+            verbose=verbose,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+        encoding.update(text_encoding)
+        if images is not None:
+            image_encoding = self.image_processor(images, return_tensors=return_tensors)
+            encoding.update(image_encoding)
+            # Use the id of the first token after <unk>
+            if first_image_token_id is None:
+                first_image_token_id = self.tokenizer.unk_token_id + 1
+            # To see if we need one more `0` (for `<s>`) at the beginning of `img_attn_mask`.
+            with_bos = add_special_tokens
+            # The first (actual) `<image>` token is always at the 1st or 2nd place (after `<s>` if any). Here we look
+            # for the second `<image>` token (which indicate the first image token).
+            start_index = int(with_bos) + 1
+            if return_tensors:
+                # change the ids for the fake `<image>` tokens in `input_ids`
+                input_ids = np.array(encoding["input_ids"])
+                input_ids[:, start_index : (start_index + num_image_tokens)] = np.arange(
+                    first_image_token_id, first_image_token_id + num_image_tokens
+                )
+                batch_size, seq_len = input_ids.shape[:2]
+                img_attn_mask = []
+                if with_bos:
+                    # for `<s>`
+                    img_attn_mask.append(np.zeros(shape=(batch_size, 1), dtype=np.int64))
+                # for `<image>` (the real one)
+                img_attn_mask.append(np.zeros(shape=(batch_size, 1), dtype=np.int64))
+                # for image tokens
+                img_attn_mask.append(np.ones(shape=(batch_size, 64), dtype=np.int64))
+                # for `</image>`
+                img_attn_mask.append(np.zeros(shape=(batch_size, 1), dtype=np.int64))
+                # trailing part (which are not related to the image)
+                seq_len -= int(with_bos) + 1 + num_image_tokens + 1
+                img_attn_mask.append(np.zeros(shape=(batch_size, seq_len), dtype=np.int64))
+                # concatenate along the sequence dimension
+                img_attn_mask = np.concatenate(img_attn_mask, axis=1)
+                # to the target tensor type
+                if return_tensors == "pt":
+                    input_ids = torch.from_numpy(input_ids)
+                    img_attn_mask = torch.from_numpy(img_attn_mask)
+                elif return_tensors == "tf":
+                    input_ids = tf.convert_to_tensor(input_ids)
+                    img_attn_mask = tf.convert_to_tensor(img_attn_mask)
+                encoding["input_ids"] = input_ids
+                encoding["img_attn_mask"] = img_attn_mask
+            else:
+                # Add `img_attn_mask`: the leading and trailing `0` are for `boi` and `eoi` tokens. The `1` indicates
+                # the places of image tokens.
+                image_token_ids = list(range(first_image_token_id, first_image_token_id + num_image_tokens))
+                base_img_attn_mask = [0] + [1] * num_image_tokens + [0]
+                # loop over `encoding["input_ids"]`
+                input_ids = []
+                img_attn_mask = []
+                all_input_ids = encoding["input_ids"]
+                # not batched -> (changed to) batch of size 1
+                if isinstance(text, str):
+                    all_input_ids = [all_input_ids]
+                for text_ids in all_input_ids:
+                    # change the ids for the fake `<image>` tokens in `input_ids`
+                    text_ids = text_ids[:start_index] + image_token_ids + text_ids[start_index + num_image_tokens :]
+                    input_ids.append(text_ids)
+                    mask = copy.copy(base_img_attn_mask)
+                    if with_bos:
+                        # for `<s>`
+                        mask = [0] + mask
+                    # trailing part (which are not related to the image)
+                    mask += [0] * (len(text_ids) - len(mask))
+                    img_attn_mask.append(mask)
+                # un-batch if necessary
+                if isinstance(text, str):
+                    input_ids = input_ids[0]
+                    img_attn_mask = img_attn_mask[0]
+                encoding["input_ids"] = input_ids
+                encoding["img_attn_mask"] = img_attn_mask
+        return encoding
+    def preprocess_text(
+        self,
+        texts: Union[TextInput, List[TextInput]],
+        images: ImageInput = None,
+        bboxes: BboxInput = None,
+        num_image_tokens: Optional[int] = 64,
+    ) -> Union[str, List[str]]:
+        """Add image and bounding box information to `texts` as image and patch index tokens.
+        Args:
+            texts (`Union[TextInput, List[TextInput]]`): The texts to be processed.
+            images (`ImageInput`, *optional*): The images associated to `texts`.
+            bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*): The bounding bboxes associated to `texts`.
+            num_image_tokens (`int`, *optional*, defaults to 64): The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num` attribute in `Kosmos2Config`.
+        Returns:
+            `Union[TextInput, List[TextInput]]`: The processed texts with image and patch index tokens.
+        """
+        # These are fake `<image>` tokens enclosed between (the actual) `<image>` token and `</image>`.
+        img_tokens = ["<image>"] * num_image_tokens
+        img_info = " ".join(["<image>"] + img_tokens + ["</image>"])
+        def check_bboxes_for_single_text(bboxes):
+            """
+            Check `bboxes` for a single text example. It could be
+                - `None`: no bounding box associated to a text.
+                - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair
+                  found in a text. This could be:
+                      - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
+                      - A tuple of 2 integers: A single bounding box specified by patch indices.
+                      - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
+                      - A list containing the above 2 tuple types: Multiple bounding boxes for a
+                       `<phrase> ... </phrase>` pair.
+            """
+            if bboxes is None:
+                return
+            elif not isinstance(bboxes, list):
+                raise ValueError("`bboxes` (for a single text example) should be `None` or a list.")
+            # `bbox` is the bounding boxes for a single <phrase> </phrase> pair
+            for bbox in bboxes:
+                if bbox is None:
+                    continue
+                elif not isinstance(bbox, list):
+                    bbox = [bbox]
+                for elt in bbox:
+                    if not isinstance(elt, tuple) or not (
+                        (len(elt) == 2 and all(isinstance(x, int) for x in elt))
+                        or (len(elt) == 4 and all(isinstance(x, float) for x in elt))
+                    ):
+                        raise ValueError(
+                            "Each element in `bboxes` (for a single text example) should be `None`, a tuple containing "
+                            "2 integers or 4 float point numbers, or a list containing such tuples. Also "
+                            "make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in "
+                            "batches or both for a single example."
+                        )
+        def preprocess_single(text, image, bboxes):
+            text = text.strip()
+            if image is not None:
+                # Add `<image> ... (fake) image tokens ... </image>`
+                text = f"{img_info} {text}"
+            # Add `<object> <patch_idx_xxxx> <patch_idx_yyy> </object>` after `<phrase> phrase text </phrase>`
+            text = self._insert_patch_index_tokens(text, bboxes)
+            text = self._add_remove_spaces_around_tag_tokens(text)
+            return text
+        # make batch to simplify processing logic
+        batched = True
+        if isinstance(texts, str):
+            batched = False
+            texts = [texts]
+        if images is None:
+            images = [None] * len(texts)
+        elif not is_batched(images):
+            images = [images]
+        if len(texts) != len(images):
+            raise ValueError(
+                f"The number of examples in `texts` and `images` should be the same. Got {len(texts)} v.s. {len(images)} instead."
+            )
+        if not batched:
+            check_bboxes_for_single_text(bboxes)
+            bboxes = [bboxes]
+        elif bboxes is not None:
+            if not isinstance(bboxes, list):
+                raise ValueError("`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.")
+            for x in bboxes:
+                check_bboxes_for_single_text(x)
+        else:
+            bboxes = [None] * len(texts)
+        if len(bboxes) != len(texts):
+            raise ValueError(
+                f"The number of examples in `texts` and `bboxes` should be the same. Got {len(texts)} v.s. {len(bboxes)} instead."
+            )
+        result = [preprocess_single(text, image, bbox) for text, image, bbox in zip(texts, images, bboxes)]
+        # un-batch if necessary
+        if not batched:
+            result = result[0]
+        return result
+    # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
+        to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    def post_process_generation(self, text, cleanup_and_extract=True):
+        caption = text.split("</image>")[-1]
+        if cleanup_and_extract:
+            return clean_text_and_extract_entities_with_bboxes(caption)
+        return caption
+    @property
+    # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+    def _insert_patch_index_tokens(self, text: str, bboxes: Union[List[Tuple[int]], List[Tuple[float]]]) -> str:
+        if bboxes is None or len(bboxes) == 0:
+            return text
+        matched_phrases = list(re.finditer(r"<phrase>.+?</phrase>", string=text))
+        if len(matched_phrases) != len(bboxes):
+            raise ValueError(
+                f"The number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got {len(matched_phrases)} v.s. {len(bboxes)} instead."
+            )
+        # insert object's patch index tokens
+        # the found `<phrase> ... </phrase>` pairs.
+        curr_pos = 0
+        buffer = []
+        for matched, bbox in zip(matched_phrases, bboxes):
+            _, end = matched.span()
+            buffer.append(text[curr_pos:end])
+            curr_pos = end
+            # A phrase without bbox
+            if bbox is None:
+                continue
+            # A phrase with a single bbox
+            if isinstance(bbox, tuple):
+                bbox = [bbox]
+            patch_index_strings = []
+            # A phrase could have multiple bboxes
+            for box in bbox:
+                patch_index_1, patch_index_2 = self._convert_bbox_to_patch_index_tokens(box)
+                patch_index_strings.append(f"{patch_index_1} {patch_index_2}")
+            position_str = " </delimiter_of_multi_objects/> ".join(patch_index_strings)
+            buffer.append(f"<object> {position_str} </object>")
+        # remaining
+        if curr_pos < len(text):
+            buffer.append(text[curr_pos:])
+        text = "".join(buffer)
+        return text
+    def _convert_bbox_to_patch_index_tokens(
+        self, bbox: Union[Tuple[int, int], Tuple[float, float, float, float]]
+    ) -> Tuple[str, str]:
+        # already computed patch indices
+        if len(bbox) == 2:
+            idx_1, idx_2 = bbox
+        # bbox specified with (normalized) coordinates
+        else:
+            # use `self.tokenizer` to get `num_patches_per_side`
+            num_patches_per_side = int(math.sqrt(self.tokenizer.num_patch_index_tokens))
+            idx_1, idx_2 = coordinate_to_patch_index(bbox, num_patches_per_side)
+        token_1 = f"<patch_index_{str(idx_1).zfill(4)}>"
+        token_2 = f"<patch_index_{str(idx_2).zfill(4)}>"
+        return token_1, token_2
+    def _add_remove_spaces_around_tag_tokens(self, text):
+        """
+        Remove spaces before tag tokens (e.g. `<x>`). Also ensure a space after a tag token, if it is not followed by
+        another tag token (this is not technically necessary, but good for a standard/consistent format). This avoids
+        the inconsistency of tokenization results between kosmos-2 slow and fast tokenizers.
+        """
+        tag_tokens = set(
+            self.tokenizer.tag_tokens
+            + [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.tokenizer.num_patch_index_tokens)]
+        )
+        pattern = "|".join(tag_tokens)
+        splits = re.split(rf"({pattern})", text)
+        # Don't keep the leading and trailing space if any
+        splits = [split for idx, split in enumerate(splits) if not (idx in [0, len(splits) - 1] and split == "")]
+        output = ""
+        prev_str_in_targets = False
+        for split in splits:
+            if split in tag_tokens:
+                prev_str_in_targets = True
+                output = output.rstrip() + split
+            else:
+                # we don't need to ensure a space before a normal token that is after a tag token. But having it and
+                # keeps a standard format is good anyway.
+                if prev_str_in_targets and not split.startswith(" "):
+                    output += " " + split
+                else:
+                    output += split
+                prev_str_in_targets = False
+        return output
+def coordinate_to_patch_index(bbox: Tuple[float, float, float, float], num_patches_per_side: int) -> Tuple[int, int]:
+    """Convert a bounding box to a pair of patch indices.
+    Args:
+        bbox (`Tuple[float, float, float, float]`):
+            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left
+            and lower-right corners of the box. It should have x2 > x1 and  y1 > y2.
+        num_patches_per_side (`int`): the number of patches along each side.
+    Returns:
+        `Tuple[int, int]`: A pair of patch indices.
+    """
+    (x1, y1, x2, y2) = bbox
+    ul_x = math.floor(x1 * num_patches_per_side)
+    ul_y = math.floor(y1 * num_patches_per_side)
+    lr_x = math.ceil(x2 * num_patches_per_side - 1)
+    lr_y = math.ceil(y2 * num_patches_per_side - 1)
+    ul_idx = ul_y * num_patches_per_side + ul_x
+    lr_idx = lr_y * num_patches_per_side + lr_x
+    return ul_idx, lr_idx
+# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L35C1-L75C38
+# (with format modifications)
+def patch_index_to_coordinate(ul_idx: int, lr_idx: int, num_patches_per_side: int):
+    """
+    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
+    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
+    Args:
+        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
+        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
+        num_patches_per_side (`int`): the number of patches along each side.
+    Returns:
+        `Tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
+    """
+    # Compute the size of each cell in the grid
+    cell_size = 1.0 / num_patches_per_side
+    # Compute the x and y indices of the upper-left and lower-right corners of the bounding box
+    ul_x = ul_idx % num_patches_per_side
+    ul_y = ul_idx // num_patches_per_side
+    lr_x = lr_idx % num_patches_per_side
+    lr_y = lr_idx // num_patches_per_side
+    # Compute the normalized coordinates of the bounding box
+    if ul_idx == lr_idx:
+        x1 = ul_x * cell_size
+        y1 = ul_y * cell_size
+        x2 = lr_x * cell_size + cell_size
+        y2 = lr_y * cell_size + cell_size
+    elif ul_x == lr_x or ul_y == lr_y:
+        x1 = ul_x * cell_size
+        y1 = ul_y * cell_size
+        x2 = lr_x * cell_size + cell_size
+        y2 = lr_y * cell_size + cell_size
+    else:
+        x1 = ul_x * cell_size + cell_size / 2
+        y1 = ul_y * cell_size + cell_size / 2
+        x2 = lr_x * cell_size + cell_size / 2
+        y2 = lr_y * cell_size + cell_size / 2
+    return x1, y1, x2, y2
+# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L4-L33
+# (with format modifications)
+def extract_entities_with_patch_indices(text):
+    # The regular expression pattern for matching the required formats
+    pattern = r'(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>'
+    # Find all matches in the given string
+    matches = re.finditer(pattern, text)
+    # Initialize an empty list to store the valid patch_index combinations
+    entities_with_patch_indices = []
+    for match in matches:
+        # span of a `phrase` that is between <phrase> and </phrase>
+        span = match.span(2)
+        phrase_tag, phrase, match_content = match.groups()
+        if not phrase_tag:
+            phrase = None
+            # We take the starting position of `<object>`
+            span = (match.span(0)[0], match.span(0)[0])
+        # Split the match_content by the delimiter to get individual patch_index pairs
+        patch_index_pairs = match_content.split('</delimiter_of_multi_objects/>')
+        entity_bboxes = []
+        for pair in patch_index_pairs:
+            # Extract the xxxx and yyyy values from the patch_index pair
+            x = re.search(r'<patch_index_(\d+)>', pair)
+            y = re.search(r'<patch_index_(\d+)>', pair[1:])
+            if x and y:
+                if phrase:
+                    entity_bboxes.append((int(x.group(1)), int(y.group(1))))
+                else:
+                    entity_bboxes.append((int(x.group(1)), int(y.group(1))))
+        if phrase:
+            entities_with_patch_indices.append((phrase, span, entity_bboxes))
+        else:
+            for bbox in entity_bboxes:
+                # fake entity name
+                entity = f"<patch_index_{bbox[0]}><patch_index_{bbox[1]}>"
+                entities_with_patch_indices.append((entity, span, [bbox]))
+    return entities_with_patch_indices
+# TODO: Be careful - this removes every `<...>` span, including angle-bracketed text that is not a special token.
+def remove_special_fields(text):
+    return re.sub('<.*?>', '', text)
+def adjust_entity_positions(entity, text):
+    entity_name, (start, end) = entity
+    adjusted_start = len(remove_special_fields(text[:start]))
+    adjusted_end = len(remove_special_fields(text[:end]))
+    adjusted_entity = (entity_name, (adjusted_start, adjusted_end))
+    return adjusted_entity
+# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L77-L87
+# (with format modifications)
+def clean_text_and_extract_entities_with_bboxes(text, num_patches_per_side=32):
+    processed_text = remove_special_fields(text)
+    entities_with_patch_indices = extract_entities_with_patch_indices(text)
+    entities = []
+    for item in entities_with_patch_indices:
+        entity, bboxes = item[0:2], item[2]
+        adjusted_entity = adjust_entity_positions(entity, text)
+        bboxes_in_coords = list(map(lambda bbox: patch_index_to_coordinate(bbox[0], bbox[1], num_patches_per_side), bboxes))
+        entities.append(adjusted_entity + (bboxes_in_coords,))
+    def cleanup_spaces(text, entities):
+        new_text = text.strip()
+        leading_spaces = len(text) - len(text.lstrip())
+        new_entities = []
+        for entity_name, (start, end), bboxes in entities:
+            entity_name_leading_spaces = len(entity_name) - len(entity_name.lstrip())
+            entity_name_trailing_spaces = len(entity_name) - len(entity_name.rstrip())
+            start = start - leading_spaces + entity_name_leading_spaces
+            end = end - leading_spaces - entity_name_trailing_spaces
+            entity_name = entity_name.strip()
+            new_entities.append((entity_name, (start, end), bboxes))
+        return new_text, new_entities
+    return cleanup_spaces(processed_text, entities)

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c7290a8c916513d3bc0cbda4f0b0d02dcc17db935df7da9b52d3917e47cde17
+size 6658242717

sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a60b4d1d1d8f70c8b2569c94540d4d9b7c694fd32e7a428ad0dcffaafaa3beb
+size 1363614

snowman.jpg ADDED Viewed

snowman.png ADDED Viewed

Git LFS Details

SHA256: b97825997df04bd823207fd145331ffc3c3b62ec4e3a3adaac83c93debe87bdf
Pointer size: 132 Bytes
Size of remote file: 1.36 MB

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenization_kosmos2.py ADDED Viewed

	@@ -0,0 +1,413 @@

+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization classes for KOSMOS-2 model."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+import sentencepiece as spm
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+# Module-level logger created through the `transformers` logging helper.
+logger = logging.get_logger(__name__)
+# Marker character that `convert_tokens_to_string` replaces with a plain space when rebuilding text.
+SPIECE_UNDERLINE = "▁"
+# File name under which `save_vocabulary` writes the SentencePiece model.
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+# URL of the vocabulary file for each known checkpoint name.
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "microsoft/kosmos-2-patch14-224": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/sentencepiece.bpe.model",
+    }
+}
+# Exposed on the tokenizer class as `max_model_input_sizes`, keyed by checkpoint name.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "microsoft/kosmos-2-patch14-224": 2048,
+}
+class Kosmos2Tokenizer(PreTrainedTokenizer):
+    """
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+            <Tip>
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+            </Tip>
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+            <Tip>
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+            </Tip>
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+        num_patch_index_tokens (`int`, *optional*, defaults to `1024`):
+            The number of tokens used to specify the patch indices of bounding boxes in an image. These tokens have the
+            format `<patch_index_xxxx>` where `xxxx` is an integer.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    Attributes:
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        num_patch_index_tokens=1024,
+        add_tag_and_patch_index_tokens=False,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |    4   |    5   |    6   |    7   |    8    |   9
+        # -------- | ------- | ------- | ------ | ------- | ------ | ------ | ------ | ------ | ------- | ------
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | '.'    | '_the' | ','    | '▁to'  | '▁and' | '▁of'
+        # spm      | '<unk>' | '<s>'   | '</s>' | '.'     | '_the' | ','    | '▁to' | '▁and' | '▁of'   | '▁a'
+        # Mimic fairseq token-to-id alignment for the first 4 token
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        self.eod_token = "</doc>"
+        self.boi_token = "<image>"
+        self.eoi_token = "</image>"
+        self.eoc_token = "</chunk>"
+        self.eol_token = "</line>"
+        self.bop_token = "<phrase>"
+        self.eop_token = "</phrase>"
+        self.boo_token = "<object>"
+        self.eoo_token = "</object>"
+        self.dom_token = "</delimiter_of_multi_objects/>"
+        self.grd_token = "<grounding>"
+        self.tag_tokens = [
+            self.eod_token,
+            self.boi_token,
+            self.eoi_token,
+            self.eoc_token,
+            self.eol_token,
+            self.bop_token,
+            self.eop_token,
+            self.boo_token,
+            self.eoo_token,
+            self.dom_token,
+            self.grd_token,
+        ]
+        self.num_patch_index_tokens = num_patch_index_tokens
+        patch_index_tokens = [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.num_patch_index_tokens)]
+        if add_tag_and_patch_index_tokens:
+            for idx, token in enumerate(self.tag_tokens + patch_index_tokens):
+                # we can't add them as special tokens, as the slow tokenizer doesn't save the information of a token
+                # being special when it is added through `add_tokens`, but the fast tokenizer is able to do so.
+                self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=True)
+    def _decode(
+        self,
+        token_ids: List[int],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = None,
+        spaces_between_special_tokens: bool = True,
+        **kwargs,
+    ) -> str:
+        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+        # To avoid mixing byte-level and unicode for byte-level BPT
+        # we need to build string separately for added tokens and byte-level tokens
+        # cf. https://github.com/huggingface/transformers/issues/1133
+        sub_texts = []
+        current_sub_text = []
+        is_first_current_sub_text = True
+        for token in filtered_tokens:
+            if skip_special_tokens and token in self.all_special_ids:
+                continue
+            if token in self.added_tokens_encoder:
+                if current_sub_text:
+                    sub_text = self.convert_tokens_to_string(current_sub_text)
+                    # `convert_tokens_to_string` removes the leading space, which is undesired if we are not at the
+                    # beginning part of the text. We can't use `spaces_between_special_tokens` to add this space back
+                    # neither, as it will also add a space before a tag/patch_index token (which is not the case with
+                    # the fast tokenizer - it doesn't even support `spaces_between_special_tokens`), which is not the
+                    # ideal output format.
+                    # The condition `not spaces_between_special_tokens` is to avoid double spaces.
+                    if not is_first_current_sub_text and not spaces_between_special_tokens:
+                        sub_text = " " + sub_text
+                    sub_texts.append(sub_text)
+                    current_sub_text = []
+                    is_first_current_sub_text = False
+                sub_texts.append(token)
+            else:
+                current_sub_text.append(token)
+        if current_sub_text:
+            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+        if spaces_between_special_tokens:
+            text = " ".join(sub_texts)
+        else:
+            text = "".join(sub_texts)
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            clean_text = self.clean_up_tokenization(text)
+            return clean_text
+        else:
+            return text
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+        return state
+    def __setstate__(self, d):
+        self.__dict__ = d
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + self.fairseq_offset + 1  # Add the <mask> token
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text: str) -> List[str]:
+        return self.sp_model.encode(text, out_type=str)
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        spm_id = self.sp_model.PieceToId(token)
+        # Need to return unknown token if the SP model returned 0
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)

tokenization_kosmos2_fast.py ADDED Viewed

	@@ -0,0 +1,250 @@

+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization classes for KOSMOS-2 model."""
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple
+from transformers.tokenization_utils import AddedToken
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import is_sentencepiece_available, logging
+if is_sentencepiece_available():
+    from .tokenization_kosmos2 import Kosmos2Tokenizer
+else:
+    Kosmos2TokenizerFast = None
+# Module-level logger created through the `transformers` logging helper.
+logger = logging.get_logger(__name__)
+# File names of the slow (SentencePiece) vocabulary and of the serialized fast tokenizer; exposed on the class
+# as `vocab_files_names`.
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
+# URL of the vocabulary file for each known checkpoint name.
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "microsoft/kosmos-2-patch14-224": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/sentencepiece.bpe.model",
+    }
+}
+# Exposed on the tokenizer class as `max_model_input_sizes`, keyed by checkpoint name.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "microsoft/kosmos-2-patch14-224": 2048,
+}
+class Kosmos2TokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" KOSMOS-2 tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+            <Tip>
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+            </Tip>
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+            <Tip>
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+            </Tip>
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+        num_patch_index_tokens (`int`, *optional*, defaults to `1024`):
+            The number of tokens used to specify the patch indices of bounding boxes in an image. These tokens have the
+            format `<patch_index_xxxx>` where `xxxx` is an integer.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = Kosmos2Tokenizer
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        num_patch_index_tokens=1024,
+        add_tag_and_patch_index_tokens=False,
+        **kwargs,
+    ):
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+        self.vocab_file = vocab_file
+        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+        self.eod_token = "</doc>"
+        self.boi_token = "<image>"
+        self.eoi_token = "</image>"
+        self.eoc_token = "</chunk>"
+        self.eol_token = "</line>"
+        self.bop_token = "<phrase>"
+        self.eop_token = "</phrase>"
+        self.boo_token = "<object>"
+        self.eoo_token = "</object>"
+        self.dom_token = "</delimiter_of_multi_objects/>"
+        self.grd_token = "<grounding>"
+        self.tag_tokens = [
+            self.eod_token,
+            self.boi_token,
+            self.eoi_token,
+            self.eoc_token,
+            self.eol_token,
+            self.bop_token,
+            self.eop_token,
+            self.boo_token,
+            self.eoo_token,
+            self.dom_token,
+            self.grd_token,
+        ]
+        self.num_patch_index_tokens = num_patch_index_tokens
+        patch_index_tokens = [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.num_patch_index_tokens)]
+        if add_tag_and_patch_index_tokens:
+            for idx, token in enumerate(self.tag_tokens + patch_index_tokens):
+                # we need to set `special_tokens=False` to be the same as in the slow tokenizer.
+                self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=False)
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        return (out_vocab_file,)

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "processor_class": "Kosmos2Processor",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "Kosmos2Tokenizer",
+  "unk_token": "<unk>",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_kosmos2.Kosmos2Tokenizer",
+      "tokenization_kosmos2_fast.Kosmos2TokenizerFast"
+    ]
+  }
+}

two_dogs.jpg ADDED Viewed