glenn-jocher pre-commit-ci[bot] committed on
Commit
b367860
1 Parent(s): a6f197a

New `HUBDatasetStats()` class (#8716)

Browse files

* New `HUBDatasetStats()` class

Usage examples:
```
from utils.dataloaders import *

stats = HUBDatasetStats('coco128.yaml', autodownload=True) # method 1
stats = HUBDatasetStats('path/to/coco128_with_yaml.zip') # method 2

stats.get_json(save=False)
stats.process_images()
```

@kalenmike

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

Files changed (1) hide show
  1. utils/dataloaders.py +70 -76
utils/dataloaders.py CHANGED
@@ -977,21 +977,35 @@ def verify_image_label(args):
977
  return [None, None, None, None, nm, nf, ne, nc, msg]
978
 
979
 
980
- def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
981
  """ Return dataset statistics dictionary with images and instances counts per split per class
982
  To run in parent directory: export PYTHONPATH="$PWD/yolov5"
983
- Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True)
984
- Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip')
985
  Arguments
986
  path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
987
  autodownload: Attempt to download dataset if not found locally
988
- verbose: Print stats dictionary
989
  """
990
 
991
- def _round_labels(labels):
992
- # Update labels to integer class and 6 decimal place floats
993
- return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
994
 
 
995
  def _find_yaml(dir):
996
  # Return data.yaml file
997
  files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
@@ -1002,7 +1016,7 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
1002
  assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
1003
  return files[0]
1004
 
1005
- def _unzip(path):
1006
  # Unzip data.zip
1007
  if not str(path).endswith('.zip'): # path is data.yaml
1008
  return False, None, path
@@ -1010,11 +1024,11 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
1010
  ZipFile(path).extractall(path=path.parent) # unzip
1011
  dir = path.with_suffix('') # dataset directory == zip name
1012
  assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
1013
- return True, str(dir), _find_yaml(dir) # zipped, data_dir, yaml_path
1014
 
1015
- def _hub_ops(f, max_dim=1920):
1016
  # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
1017
- f_new = im_dir / Path(f).name # dataset-hub image filename
1018
  try: # use PIL
1019
  im = Image.open(f)
1020
  r = max_dim / max(im.height, im.width) # ratio
@@ -1030,69 +1044,49 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
1030
  im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
1031
  cv2.imwrite(str(f_new), im)
1032
 
1033
- zipped, data_dir, yaml_path = _unzip(Path(path))
1034
- try:
1035
- with open(check_yaml(yaml_path), errors='ignore') as f:
1036
- data = yaml.safe_load(f) # data dict
1037
- if zipped:
1038
- data['path'] = data_dir # TODO: should this be dir.resolve()?`
1039
- except Exception:
1040
- raise Exception("error/HUB/dataset_stats/yaml_load")
1041
-
1042
- check_dataset(data, autodownload) # download dataset if missing
1043
- hub_dir = Path(data['path'] + ('-hub' if hub else ''))
1044
- stats = {'nc': data['nc'], 'names': data['names']} # statistics dictionary
1045
- for split in 'train', 'val', 'test':
1046
- if data.get(split) is None:
1047
- stats[split] = None # i.e. no test set
1048
- continue
1049
- x = []
1050
- dataset = LoadImagesAndLabels(data[split]) # load dataset
1051
- for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics'):
1052
- x.append(np.bincount(label[:, 0].astype(int), minlength=data['nc']))
1053
- x = np.array(x) # shape(128x80)
1054
- stats[split] = {
1055
- 'instance_stats': {
1056
- 'total': int(x.sum()),
1057
- 'per_class': x.sum(0).tolist()},
1058
- 'image_stats': {
1059
- 'total': dataset.n,
1060
- 'unlabelled': int(np.all(x == 0, 1).sum()),
1061
- 'per_class': (x > 0).sum(0).tolist()},
1062
- 'labels': [{
1063
- str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
1064
-
1065
- if hub:
1066
- im_dir = hub_dir / 'images'
1067
- im_dir.mkdir(parents=True, exist_ok=True)
1068
- for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
 
 
 
 
 
 
 
1069
  pass
1070
-
1071
- # Profile
1072
- stats_path = hub_dir / 'stats.json'
1073
- if profile:
1074
- for _ in range(1):
1075
- file = stats_path.with_suffix('.npy')
1076
- t1 = time.time()
1077
- np.save(file, stats)
1078
- t2 = time.time()
1079
- x = np.load(file, allow_pickle=True)
1080
- print(f'stats.npy times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
1081
-
1082
- file = stats_path.with_suffix('.json')
1083
- t1 = time.time()
1084
- with open(file, 'w') as f:
1085
- json.dump(stats, f) # save stats *.json
1086
- t2 = time.time()
1087
- with open(file) as f:
1088
- x = json.load(f) # load hyps dict
1089
- print(f'stats.json times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
1090
-
1091
- # Save, print and return
1092
- if hub:
1093
- print(f'Saving {stats_path.resolve()}...')
1094
- with open(stats_path, 'w') as f:
1095
- json.dump(stats, f) # save stats.json
1096
- if verbose:
1097
- print(json.dumps(stats, indent=2, sort_keys=False))
1098
- return stats
 
977
  return [None, None, None, None, nm, nf, ne, nc, msg]
978
 
979
 
980
+ class HUBDatasetStats():
981
  """ Return dataset statistics dictionary with images and instances counts per split per class
982
  To run in parent directory: export PYTHONPATH="$PWD/yolov5"
983
+ Usage1: from utils.dataloaders import *; HUBDatasetStats('coco128.yaml', autodownload=True)
984
+ Usage2: from utils.dataloaders import *; HUBDatasetStats('path/to/coco128_with_yaml.zip')
985
  Arguments
986
  path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
987
  autodownload: Attempt to download dataset if not found locally
 
988
  """
989
 
990
+ def __init__(self, path='coco128.yaml', autodownload=False):
991
+ # Initialize class
992
+ zipped, data_dir, yaml_path = self._unzip(Path(path))
993
+ try:
994
+ with open(check_yaml(yaml_path), errors='ignore') as f:
995
+ data = yaml.safe_load(f) # data dict
996
+ if zipped:
997
+ data['path'] = data_dir
998
+ except Exception as e:
999
+ raise Exception("error/HUB/dataset_stats/yaml_load") from e
1000
+
1001
+ check_dataset(data, autodownload) # download dataset if missing
1002
+ self.hub_dir = Path(data['path'] + '-hub')
1003
+ self.im_dir = self.hub_dir / 'images'
1004
+ self.im_dir.mkdir(parents=True, exist_ok=True) # makes /images
1005
+ self.stats = {'nc': data['nc'], 'names': data['names']} # statistics dictionary
1006
+ self.data = data
1007
 
1008
+ @staticmethod
1009
  def _find_yaml(dir):
1010
  # Return data.yaml file
1011
  files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
 
1016
  assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
1017
  return files[0]
1018
 
1019
+ def _unzip(self, path):
1020
  # Unzip data.zip
1021
  if not str(path).endswith('.zip'): # path is data.yaml
1022
  return False, None, path
 
1024
  ZipFile(path).extractall(path=path.parent) # unzip
1025
  dir = path.with_suffix('') # dataset directory == zip name
1026
  assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
1027
+ return True, str(dir), self._find_yaml(dir) # zipped, data_dir, yaml_path
1028
 
1029
+ def _hub_ops(self, f, max_dim=1920):
1030
  # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
1031
+ f_new = self.im_dir / Path(f).name # dataset-hub image filename
1032
  try: # use PIL
1033
  im = Image.open(f)
1034
  r = max_dim / max(im.height, im.width) # ratio
 
1044
  im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
1045
  cv2.imwrite(str(f_new), im)
1046
 
1047
+ def get_json(self, save=False, verbose=False):
1048
+ # Return dataset JSON for Ultralytics HUB
1049
+ def _round(labels):
1050
+ # Update labels to integer class and 6 decimal place floats
1051
+ return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
1052
+
1053
+ for split in 'train', 'val', 'test':
1054
+ if self.data.get(split) is None:
1055
+ self.stats[split] = None # i.e. no test set
1056
+ continue
1057
+ dataset = LoadImagesAndLabels(self.data[split]) # load dataset
1058
+ x = np.array([
1059
+ np.bincount(label[:, 0].astype(int), minlength=self.data['nc'])
1060
+ for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics')]) # shape(128x80)
1061
+ self.stats[split] = {
1062
+ 'instance_stats': {
1063
+ 'total': int(x.sum()),
1064
+ 'per_class': x.sum(0).tolist()},
1065
+ 'image_stats': {
1066
+ 'total': dataset.n,
1067
+ 'unlabelled': int(np.all(x == 0, 1).sum()),
1068
+ 'per_class': (x > 0).sum(0).tolist()},
1069
+ 'labels': [{
1070
+ str(Path(k).name): _round(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
1071
+
1072
+ # Save, print and return
1073
+ if save:
1074
+ stats_path = self.hub_dir / 'stats.json'
1075
+ print(f'Saving {stats_path.resolve()}...')
1076
+ with open(stats_path, 'w') as f:
1077
+ json.dump(self.stats, f) # save stats.json
1078
+ if verbose:
1079
+ print(json.dumps(self.stats, indent=2, sort_keys=False))
1080
+ return self.stats
1081
+
1082
+ def process_images(self):
1083
+ # Compress images for Ultralytics HUB
1084
+ for split in 'train', 'val', 'test':
1085
+ if self.data.get(split) is None:
1086
+ continue
1087
+ dataset = LoadImagesAndLabels(self.data[split]) # load dataset
1088
+ desc = f'{split} images'
1089
+ for _ in tqdm(ThreadPool(NUM_THREADS).imap(self._hub_ops, dataset.im_files), total=dataset.n, desc=desc):
1090
  pass
1091
+ print(f'Done. All images saved to {self.im_dir}')
1092
+ return self.im_dir