from datasets import Dataset, IterableDataset from PIL import Image # ChatML format templates = { "assistant": "<|im_start|>assistant\n{msg}<|im_end|>", # message by assistant "user": "<|im_start|>user\n{msg}<|im_end|>" # message by user } ds = Dataset.from_json("llava_instruct_150k.json", split="train") ds_stream = ds.to_iterable_dataset() def get_image(image_path): image_path = f"train2014/COCO_train2014_{image_path}" img = Image.open(image_path) return img def get_chatml_text(conversations): chatml_text = "" for conversation in conversations: role = conversation["from"] role = "user" if role == "human" else "assistant" content = conversation["value"] formatted_text = templates[role].format(msg=content) chatml_text += formatted_text + "\n" return chatml_text def instruct_data_generator(): for sample in ds_stream: image_path = sample["image"] conversations = sample["conversations"] image = get_image(image_path) text = get_chatml_text(conversations) yield {"text": text, "image": image} instruct_ds = IterableDataset.from_generator(generator=instruct_data_generator)