Spaces:
Running
Running
import cv2 | |
import lmdb | |
import sys | |
from multiprocessing import Pool | |
from os import path as osp | |
from tqdm import tqdm | |
def make_lmdb_from_imgs(data_path, | |
lmdb_path, | |
img_path_list, | |
keys, | |
batch=5000, | |
compress_level=1, | |
multiprocessing_read=False, | |
n_thread=40, | |
map_size=None): | |
"""Make lmdb from images. | |
Contents of lmdb. The file structure is: | |
example.lmdb | |
βββ data.mdb | |
βββ lock.mdb | |
βββ meta_info.txt | |
The data.mdb and lock.mdb are standard lmdb files and you can refer to | |
https://lmdb.readthedocs.io/en/release/ for more details. | |
The meta_info.txt is a specified txt file to record the meta information | |
of our datasets. It will be automatically created when preparing | |
datasets by our provided dataset tools. | |
Each line in the txt file records 1)image name (with extension), | |
2)image shape, and 3)compression level, separated by a white space. | |
For example, the meta information could be: | |
`000_00000000.png (720,1280,3) 1`, which means: | |
1) image name (with extension): 000_00000000.png; | |
2) image shape: (720,1280,3); | |
3) compression level: 1 | |
We use the image name without extension as the lmdb key. | |
If `multiprocessing_read` is True, it will read all the images to memory | |
using multiprocessing. Thus, your server needs to have enough memory. | |
Args: | |
data_path (str): Data path for reading images. | |
lmdb_path (str): Lmdb save path. | |
img_path_list (str): Image path list. | |
keys (str): Used for lmdb keys. | |
batch (int): After processing batch images, lmdb commits. | |
Default: 5000. | |
compress_level (int): Compress level when encoding images. Default: 1. | |
multiprocessing_read (bool): Whether use multiprocessing to read all | |
the images to memory. Default: False. | |
n_thread (int): For multiprocessing. | |
map_size (int | None): Map size for lmdb env. If None, use the | |
estimated size from images. Default: None | |
""" | |
assert len(img_path_list) == len(keys), ('img_path_list and keys should have the same length, ' | |
f'but got {len(img_path_list)} and {len(keys)}') | |
print(f'Create lmdb for {data_path}, save to {lmdb_path}...') | |
print(f'Totoal images: {len(img_path_list)}') | |
if not lmdb_path.endswith('.lmdb'): | |
raise ValueError("lmdb_path must end with '.lmdb'.") | |
if osp.exists(lmdb_path): | |
print(f'Folder {lmdb_path} already exists. Exit.') | |
sys.exit(1) | |
if multiprocessing_read: | |
# read all the images to memory (multiprocessing) | |
dataset = {} # use dict to keep the order for multiprocessing | |
shapes = {} | |
print(f'Read images with multiprocessing, #thread: {n_thread} ...') | |
pbar = tqdm(total=len(img_path_list), unit='image') | |
def callback(arg): | |
"""get the image data and update pbar.""" | |
key, dataset[key], shapes[key] = arg | |
pbar.update(1) | |
pbar.set_description(f'Read {key}') | |
pool = Pool(n_thread) | |
for path, key in zip(img_path_list, keys): | |
pool.apply_async(read_img_worker, args=(osp.join(data_path, path), key, compress_level), callback=callback) | |
pool.close() | |
pool.join() | |
pbar.close() | |
print(f'Finish reading {len(img_path_list)} images.') | |
# create lmdb environment | |
if map_size is None: | |
# obtain data size for one image | |
img = cv2.imread(osp.join(data_path, img_path_list[0]), cv2.IMREAD_UNCHANGED) | |
_, img_byte = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level]) | |
data_size_per_img = img_byte.nbytes | |
print('Data size per image is: ', data_size_per_img) | |
data_size = data_size_per_img * len(img_path_list) | |
map_size = data_size * 10 | |
env = lmdb.open(lmdb_path, map_size=map_size) | |
# write data to lmdb | |
pbar = tqdm(total=len(img_path_list), unit='chunk') | |
txn = env.begin(write=True) | |
txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w') | |
for idx, (path, key) in enumerate(zip(img_path_list, keys)): | |
pbar.update(1) | |
pbar.set_description(f'Write {key}') | |
key_byte = key.encode('ascii') | |
if multiprocessing_read: | |
img_byte = dataset[key] | |
h, w, c = shapes[key] | |
else: | |
_, img_byte, img_shape = read_img_worker(osp.join(data_path, path), key, compress_level) | |
h, w, c = img_shape | |
txn.put(key_byte, img_byte) | |
# write meta information | |
txt_file.write(f'{key}.png ({h},{w},{c}) {compress_level}\n') | |
if idx % batch == 0: | |
txn.commit() | |
txn = env.begin(write=True) | |
pbar.close() | |
txn.commit() | |
env.close() | |
txt_file.close() | |
print('\nFinish writing lmdb.') | |
def read_img_worker(path, key, compress_level): | |
"""Read image worker. | |
Args: | |
path (str): Image path. | |
key (str): Image key. | |
compress_level (int): Compress level when encoding images. | |
Returns: | |
str: Image key. | |
byte: Image byte. | |
tuple[int]: Image shape. | |
""" | |
img = cv2.imread(path, cv2.IMREAD_UNCHANGED) | |
if img.ndim == 2: | |
h, w = img.shape | |
c = 1 | |
else: | |
h, w, c = img.shape | |
_, img_byte = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level]) | |
return (key, img_byte, (h, w, c)) | |
class LmdbMaker(): | |
"""LMDB Maker. | |
Args: | |
lmdb_path (str): Lmdb save path. | |
map_size (int): Map size for lmdb env. Default: 1024 ** 4, 1TB. | |
batch (int): After processing batch images, lmdb commits. | |
Default: 5000. | |
compress_level (int): Compress level when encoding images. Default: 1. | |
""" | |
def __init__(self, lmdb_path, map_size=1024**4, batch=5000, compress_level=1): | |
if not lmdb_path.endswith('.lmdb'): | |
raise ValueError("lmdb_path must end with '.lmdb'.") | |
if osp.exists(lmdb_path): | |
print(f'Folder {lmdb_path} already exists. Exit.') | |
sys.exit(1) | |
self.lmdb_path = lmdb_path | |
self.batch = batch | |
self.compress_level = compress_level | |
self.env = lmdb.open(lmdb_path, map_size=map_size) | |
self.txn = self.env.begin(write=True) | |
self.txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w') | |
self.counter = 0 | |
def put(self, img_byte, key, img_shape): | |
self.counter += 1 | |
key_byte = key.encode('ascii') | |
self.txn.put(key_byte, img_byte) | |
# write meta information | |
h, w, c = img_shape | |
self.txt_file.write(f'{key}.png ({h},{w},{c}) {self.compress_level}\n') | |
if self.counter % self.batch == 0: | |
self.txn.commit() | |
self.txn = self.env.begin(write=True) | |
def close(self): | |
self.txn.commit() | |
self.env.close() | |
self.txt_file.close() | |