Spaces:

Riksarkivet
/

htr_demo

Running on T4

File size: 1,595 Bytes

import io
import os

import datasets
from PIL import Image


class DemoImages:
    """Process-wide singleton holding the demo images and their descriptions.

    The first construction loads the dataset, converts it to a pandas
    DataFrame and writes every image to disk. Later constructions return the
    same, already initialised object; the arguments they pass are ignored.

    Attributes set by ``__init__``:
        images_datasets: result of ``datasets.load_dataset(..., split="train")``.
        example_df: ``images_datasets.to_pandas()``.
        examples_list: list of ``[description, path_to_image]`` pairs.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the single shared instance, creating it on first use.

        Constructor arguments are accepted only so that
        ``DemoImages(url=..., cache_dir=...)`` is a valid call; ``__init__``
        consumes them. They are deliberately not forwarded to
        ``object.__new__``, which raises ``TypeError`` when a class that
        overrides ``__new__`` hands it extra arguments.
        """
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, url="Riksarkivet/test_images_demo", cache_dir="./helper/examples/.cache_images"):
        """Load the dataset and build the example list, once per process.

        Args:
            url: dataset identifier passed to ``datasets.load_dataset``.
            cache_dir: cache directory passed to ``datasets.load_dataset``.
        """
        # __init__ runs on every DemoImages(...) call even though __new__
        # returns the same object. Guard on the attribute that is assigned
        # last, so a run that failed part-way is retried instead of leaving
        # a half-initialised singleton behind.
        if hasattr(self, "examples_list"):
            return

        self.images_datasets = datasets.load_dataset(url, cache_dir=cache_dir, split="train")
        self.example_df = self.images_datasets.to_pandas()
        self.examples_list = self.convert_bytes_to_images()

    def convert_bytes_to_images(self, image_dir="./helper/examples/images"):
        """Write every image in ``self.example_df`` to disk as a JPEG.

        Assumes each row has an ``"image"`` cell exposing the encoded file
        under ``"bytes"`` and a ``"text"`` cell holding the description.

        Args:
            image_dir: directory that receives the ``image_<index>.jpg``
                files; created if it does not exist.

        Returns:
            A list of ``[description, path_to_image]`` pairs, one per row.
        """
        # Saving fails if the target directory is missing.
        os.makedirs(image_dir, exist_ok=True)

        examples_list = []
        for index, row in self.example_df.iterrows():
            path_to_image = f"{image_dir}/image_{index}.jpg"

            # The context manager releases the decoder's resources once the
            # file has been written.
            with Image.open(io.BytesIO(row["image"]["bytes"])) as image:
                # JPEG cannot store alpha or palette data; convert those
                # modes to RGB instead of letting save() raise.
                if image.mode in ("L", "RGB", "CMYK"):
                    image.save(path_to_image)
                else:
                    image.convert("RGB").save(path_to_image)

            examples_list.append([row["text"], path_to_image])

        return examples_list


if __name__ == "__main__":
    # Manual smoke test: load the demo dataset and print its first image.
    images_datasets = datasets.load_dataset("Riksarkivet/test_images_demo", cache_dir="./helper/examples/.cache_images")

    # Select the row before the column so that only one image is decoded;
    # indexing ["image"][0] would decode the entire image column first.
    print(images_datasets["train"][0]["image"])