microsoft/kosmos-2.5 · Batch inference bug: all outputs in the same batch have the same prediction, even for different images

The model is great, but batch inference has a problem: even for different images, the outputs are the same — it seems to be running inference on just a single image.
This is my test code:

from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
from PIL import Image
import torch
from tqdm import tqdm
import os

DTYPE = torch.bfloat16

def initialize_model(model_name: str):
    """Load the checkpoint ``model_name`` together with its processor.

    The model is loaded through
    ``Kosmos2_5ForConditionalGeneration.from_pretrained`` with
    ``device_map="auto"`` and ``torch_dtype=DTYPE``; the processor is loaded
    through ``AutoProcessor.from_pretrained``.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, processor)`` tuple.
    """
    print(f"Initializing {model_name} model")
    load_options = {"device_map": "auto", "torch_dtype": DTYPE}
    kosmos = Kosmos2_5ForConditionalGeneration.from_pretrained(
        model_name, **load_options
    )
    return kosmos, AutoProcessor.from_pretrained(model_name)

def test_kosmos_batch(
    model,
    processor,
    image_dir: str,
    prompt: str = "<md>",
    batch_size: int = 1,
    device="cuda",
    dtype=torch.bfloat16,
    max_new_tokens: int = 1024,
) -> list:
    """Run batched generation over the files in ``image_dir`` and print the results.

    Args:
        model: Object whose ``generate(**inputs, max_new_tokens=...)`` is called
            once per batch.
        processor: Callable invoked with ``text``, ``images``,
            ``return_tensors="pt"`` and ``padding=True``; its ``batch_decode``
            turns the generated ids into strings.
        image_dir: Directory whose regular files are opened as images.
        prompt: Text prompt repeated once for every image of a batch.
        batch_size: Maximum number of images per ``generate`` call.
        device: Device every non-``None`` processor output is moved to.
        dtype: Dtype ``flattened_patches`` is cast to before generation.
        max_new_tokens: Forwarded unchanged to ``model.generate``.

    Returns:
        The decoded strings of all batches, concatenated in batch order.
        Batches follow the sorted order of the file paths.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # os.listdir returns entries in arbitrary order: sort them so that
    # "Output i" can be matched to a file and is stable across runs.
    # Sub-directories are skipped because they cannot be opened as images.
    image_paths = sorted(
        path
        for path in (os.path.join(image_dir, name) for name in os.listdir(image_dir))
        if os.path.isfile(path)
    )

    outputs = []
    for start in tqdm(range(0, len(image_paths), batch_size)):
        batch_paths = image_paths[start : start + batch_size]

        images = []
        for path in batch_paths:
            # Copy the pixel data inside the context manager so the underlying
            # file handle is released immediately instead of leaking.
            with Image.open(path) as img:
                images.append(img.copy())

        inputs = processor(
            text=[prompt] * len(images),
            images=images,
            return_tensors="pt",
            padding=True,
        )

        # "width" and "height" are not forwarded to generate(). Each key is
        # dropped independently, so a missing "width" no longer leaves
        # "height" behind.
        model_inputs = {
            key: value.to(device) if value is not None else None
            for key, value in inputs.items()
            if key not in ("width", "height")
        }
        model_inputs["flattened_patches"] = model_inputs["flattened_patches"].to(dtype)
        print("Processing inputs completed")

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
        )
        outputs.extend(
            processor.batch_decode(generated_ids, skip_special_tokens=True)
        )

    for i, output in enumerate(outputs):
        print("=========================================")
        print(f"Output {i}: {output}")

    return outputs

if __name__ == "__main__":
    # Script entry point: load the checkpoint, then run the batch test on the
    # local "./images" directory with three images per batch.
    checkpoint = "microsoft/kosmos-2.5"
    kosmos, kosmos_processor = initialize_model(checkpoint)
    print("Model initialized successfully!")
    test_kosmos_batch(
        kosmos,
        kosmos_processor,
        "./images",
        batch_size=3,
        max_new_tokens=50,
    )

The outputs all appear to be for the same image, which should not be the case:

=========================================
Output 0: <md>**Section 10.** **Effectivity.** This Circular shall take effect fifteen (15) calendar days following its publication either in the Official Gazette or in a newspaper of general circulation.

# Classification: GENERAL
=========================================
Output 1: <md>10\. *Effectivity*. This Circular shall take effect fifteen (15) calendar days following its publication either in the Official Gazette or in a newspaper of general circulation.

# Classification: GENERAL
=========================================
Output 2: <md># Section 10. Effectivity. This Circular shall take effect fifteen (15) calendar days following its publication in the Official Gazette or in a newspaper of general circulation.

# 1. The effectivity of this Circular shall be determined by the following