New `HUBDatasetStats()` class (#8716)
Browse files* New `HUBDatasetStats()` class
Usage examples:
```
from utils.dataloaders import *
stats = HUBDatasetStats('coco128.yaml', autodownload=True) # method 1
stats = HUBDatasetStats('path/to/coco128_with_yaml.zip') # method 2
stats.get_json(save=False)
stats.process_images()
```
@kalenmike
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- utils/dataloaders.py +70 -76
@@ -977,21 +977,35 @@ def verify_image_label(args):
|
|
977 |
return [None, None, None, None, nm, nf, ne, nc, msg]
|
978 |
|
979 |
|
980 |
-
|
981 |
""" Return dataset statistics dictionary with images and instances counts per split per class
|
982 |
To run in parent directory: export PYTHONPATH="$PWD/yolov5"
|
983 |
-
Usage1: from utils.dataloaders import *;
|
984 |
-
Usage2: from utils.dataloaders import *;
|
985 |
Arguments
|
986 |
path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
|
987 |
autodownload: Attempt to download dataset if not found locally
|
988 |
-
verbose: Print stats dictionary
|
989 |
"""
|
990 |
|
991 |
-
def
|
992 |
-
#
|
993 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
994 |
|
|
|
995 |
def _find_yaml(dir):
|
996 |
# Return data.yaml file
|
997 |
files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
|
@@ -1002,7 +1016,7 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
|
|
1002 |
assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
|
1003 |
return files[0]
|
1004 |
|
1005 |
-
def _unzip(path):
|
1006 |
# Unzip data.zip
|
1007 |
if not str(path).endswith('.zip'): # path is data.yaml
|
1008 |
return False, None, path
|
@@ -1010,11 +1024,11 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
|
|
1010 |
ZipFile(path).extractall(path=path.parent) # unzip
|
1011 |
dir = path.with_suffix('') # dataset directory == zip name
|
1012 |
assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
|
1013 |
-
return True, str(dir), _find_yaml(dir) # zipped, data_dir, yaml_path
|
1014 |
|
1015 |
-
def _hub_ops(f, max_dim=1920):
|
1016 |
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
|
1017 |
-
f_new = im_dir / Path(f).name # dataset-hub image filename
|
1018 |
try: # use PIL
|
1019 |
im = Image.open(f)
|
1020 |
r = max_dim / max(im.height, im.width) # ratio
|
@@ -1030,69 +1044,49 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
|
|
1030 |
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
|
1031 |
cv2.imwrite(str(f_new), im)
|
1032 |
|
1033 |
-
|
1034 |
-
|
1035 |
-
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
-
|
1041 |
-
|
1042 |
-
|
1043 |
-
|
1044 |
-
|
1045 |
-
|
1046 |
-
|
1047 |
-
stats[split] =
|
1048 |
-
|
1049 |
-
|
1050 |
-
|
1051 |
-
|
1052 |
-
|
1053 |
-
|
1054 |
-
|
1055 |
-
|
1056 |
-
|
1057 |
-
|
1058 |
-
|
1059 |
-
|
1060 |
-
|
1061 |
-
|
1062 |
-
'
|
1063 |
-
|
1064 |
-
|
1065 |
-
|
1066 |
-
|
1067 |
-
|
1068 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1069 |
pass
|
1070 |
-
|
1071 |
-
|
1072 |
-
stats_path = hub_dir / 'stats.json'
|
1073 |
-
if profile:
|
1074 |
-
for _ in range(1):
|
1075 |
-
file = stats_path.with_suffix('.npy')
|
1076 |
-
t1 = time.time()
|
1077 |
-
np.save(file, stats)
|
1078 |
-
t2 = time.time()
|
1079 |
-
x = np.load(file, allow_pickle=True)
|
1080 |
-
print(f'stats.npy times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
|
1081 |
-
|
1082 |
-
file = stats_path.with_suffix('.json')
|
1083 |
-
t1 = time.time()
|
1084 |
-
with open(file, 'w') as f:
|
1085 |
-
json.dump(stats, f) # save stats *.json
|
1086 |
-
t2 = time.time()
|
1087 |
-
with open(file) as f:
|
1088 |
-
x = json.load(f) # load hyps dict
|
1089 |
-
print(f'stats.json times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
|
1090 |
-
|
1091 |
-
# Save, print and return
|
1092 |
-
if hub:
|
1093 |
-
print(f'Saving {stats_path.resolve()}...')
|
1094 |
-
with open(stats_path, 'w') as f:
|
1095 |
-
json.dump(stats, f) # save stats.json
|
1096 |
-
if verbose:
|
1097 |
-
print(json.dumps(stats, indent=2, sort_keys=False))
|
1098 |
-
return stats
|
|
|
977 |
return [None, None, None, None, nm, nf, ne, nc, msg]
|
978 |
|
979 |
|
980 |
+
class HUBDatasetStats():
|
981 |
""" Return dataset statistics dictionary with images and instances counts per split per class
|
982 |
To run in parent directory: export PYTHONPATH="$PWD/yolov5"
|
983 |
+
Usage1: from utils.dataloaders import *; HUBDatasetStats('coco128.yaml', autodownload=True)
|
984 |
+
Usage2: from utils.dataloaders import *; HUBDatasetStats('path/to/coco128_with_yaml.zip')
|
985 |
Arguments
|
986 |
path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
|
987 |
autodownload: Attempt to download dataset if not found locally
|
|
|
988 |
"""
|
989 |
|
990 |
+
def __init__(self, path='coco128.yaml', autodownload=False):
    # Resolve the input (zip archive or data.yaml) and load the dataset dict
    is_zip, root_dir, yaml_file = self._unzip(Path(path))
    try:
        with open(check_yaml(yaml_file), errors='ignore') as fh:
            dataset = yaml.safe_load(fh)  # dataset description dict
            if is_zip:
                dataset['path'] = root_dir  # point 'path' at the extracted directory
    except Exception as e:
        raise Exception("error/HUB/dataset_stats/yaml_load") from e

    check_dataset(dataset, autodownload)  # download dataset if missing
    hub_dir = Path(dataset['path'] + '-hub')
    self.hub_dir = hub_dir
    self.im_dir = hub_dir / 'images'
    self.im_dir.mkdir(parents=True, exist_ok=True)  # create <dataset>-hub/images
    self.stats = {'nc': dataset['nc'], 'names': dataset['names']}  # seed statistics dictionary
    self.data = dataset
|
1007 |
|
1008 |
+
@staticmethod
|
1009 |
def _find_yaml(dir):
|
1010 |
# Return data.yaml file
|
1011 |
files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
|
|
|
1016 |
assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
|
1017 |
return files[0]
|
1018 |
|
1019 |
+
def _unzip(self, path):
|
1020 |
# Unzip data.zip
|
1021 |
if not str(path).endswith('.zip'): # path is data.yaml
|
1022 |
return False, None, path
|
|
|
1024 |
ZipFile(path).extractall(path=path.parent) # unzip
|
1025 |
dir = path.with_suffix('') # dataset directory == zip name
|
1026 |
assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
|
1027 |
+
return True, str(dir), self._find_yaml(dir) # zipped, data_dir, yaml_path
|
1028 |
|
1029 |
+
def _hub_ops(self, f, max_dim=1920):
|
1030 |
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
|
1031 |
+
f_new = self.im_dir / Path(f).name # dataset-hub image filename
|
1032 |
try: # use PIL
|
1033 |
im = Image.open(f)
|
1034 |
r = max_dim / max(im.height, im.width) # ratio
|
|
|
1044 |
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
|
1045 |
cv2.imwrite(str(f_new), im)
|
1046 |
|
1047 |
+
def get_json(self, save=False, verbose=False):
    # Build (and optionally save/print) the dataset statistics JSON for Ultralytics HUB
    def _format(labels):
        # Cast class ids to int and round coordinates to 4 decimal places
        return [[int(cls), *(round(coord, 4) for coord in coords)] for cls, *coords in labels]

    for split in 'train', 'val', 'test':
        if self.data.get(split) is None:
            self.stats[split] = None  # split not defined in data.yaml
            continue
        ds = LoadImagesAndLabels(self.data[split])  # load split
        counts = np.array([
            np.bincount(lb[:, 0].astype(int), minlength=self.data['nc'])
            for lb in tqdm(ds.labels, total=ds.n, desc='Statistics')])  # per-image class histograms
        self.stats[split] = {
            'instance_stats': {
                'total': int(counts.sum()),
                'per_class': counts.sum(0).tolist()},
            'image_stats': {
                'total': ds.n,
                'unlabelled': int(np.all(counts == 0, 1).sum()),
                'per_class': (counts > 0).sum(0).tolist()},
            'labels': [{str(Path(k).name): _format(v.tolist())} for k, v in zip(ds.im_files, ds.labels)]}

    # Save, print and return
    if save:
        out_file = self.hub_dir / 'stats.json'
        print(f'Saving {out_file.resolve()}...')
        with open(out_file, 'w') as f:
            json.dump(self.stats, f)  # save stats.json
    if verbose:
        print(json.dumps(self.stats, indent=2, sort_keys=False))
    return self.stats
|
1081 |
+
|
1082 |
+
def process_images(self):
    """Compress all dataset images into ``self.im_dir`` for Ultralytics HUB.

    Iterates the train/val/test splits (skipping any split absent from the
    data dict) and runs ``self._hub_ops`` over each image file in a thread
    pool of NUM_THREADS workers.

    Returns:
        Path: the directory the processed images were written to (``self.im_dir``).
    """
    for split in 'train', 'val', 'test':
        if self.data.get(split) is None:
            continue  # split not defined in data.yaml
        dataset = LoadImagesAndLabels(self.data[split])  # load dataset
        desc = f'{split} images'
        # Use the pool as a context manager so worker threads are always
        # released (the original leaked the ThreadPool: never closed/joined).
        with ThreadPool(NUM_THREADS) as pool:
            for _ in tqdm(pool.imap(self._hub_ops, dataset.im_files), total=dataset.n, desc=desc):
                pass
    print(f'Done. All images saved to {self.im_dir}')
    return self.im_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|