glenn-jocher pre-commit-ci[bot] committed on
Commit
c23a441
1 Parent(s): 6adc53b

Improved `dataset_stats()` YAML checks (#8125)

Browse files

* Update dataloaders.py

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

Files changed (1) hide show
  1. utils/dataloaders.py +23 -12
utils/dataloaders.py CHANGED
@@ -859,7 +859,7 @@ def flatten_recursive(path=DATASETS_DIR / 'coco128'):
859
  shutil.copyfile(file, new_path / Path(file).name)
860
 
861
 
862
- def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import *; extract_boxes()
863
  # Convert detection dataset into classification dataset, with one directory per class
864
  path = Path(path) # images dir
865
  shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing
@@ -895,7 +895,7 @@ def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import
895
 
896
  def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
897
  """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files
898
- Usage: from utils.datasets import *; autosplit()
899
  Arguments
900
  path: Path to images directory
901
  weights: Train, val, test weights (list, tuple)
@@ -972,29 +972,40 @@ def verify_image_label(args):
972
  def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
973
  """ Return dataset statistics dictionary with images and instances counts per split per class
974
  To run in parent directory: export PYTHONPATH="$PWD/yolov5"
975
- Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', autodownload=True)
976
- Usage2: from utils.datasets import *; dataset_stats('path/to/coco128_with_yaml.zip')
977
  Arguments
978
  path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
979
  autodownload: Attempt to download dataset if not found locally
980
  verbose: Print stats dictionary
981
  """
982
 
983
- def round_labels(labels):
984
  # Update labels to integer class and 6 decimal place floats
985
  return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
986
 
987
- def unzip(path):
988
- # Unzip data.zip TODO: CONSTRAINT: path/to/abc.zip MUST unzip to 'path/to/abc/'
 
 
 
 
 
 
 
 
 
 
989
  if str(path).endswith('.zip'): # path is data.zip
990
  assert Path(path).is_file(), f'Error unzipping {path}, file not found'
991
  ZipFile(path).extractall(path=path.parent) # unzip
992
  dir = path.with_suffix('') # dataset directory == zip name
993
- return True, str(dir), next(dir.rglob('*.yaml')) # zipped, data_dir, yaml_path
 
994
  else: # path is data.yaml
995
  return False, None, path
996
 
997
- def hub_ops(f, max_dim=1920):
998
  # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
999
  f_new = im_dir / Path(f).name # dataset-hub image filename
1000
  try: # use PIL
@@ -1012,7 +1023,7 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
1012
  im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
1013
  cv2.imwrite(str(f_new), im)
1014
 
1015
- zipped, data_dir, yaml_path = unzip(Path(path))
1016
  with open(check_yaml(yaml_path), errors='ignore') as f:
1017
  data = yaml.safe_load(f) # data dict
1018
  if zipped:
@@ -1038,12 +1049,12 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
1038
  'unlabelled': int(np.all(x == 0, 1).sum()),
1039
  'per_class': (x > 0).sum(0).tolist()},
1040
  'labels': [{
1041
- str(Path(k).name): round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
1042
 
1043
  if hub:
1044
  im_dir = hub_dir / 'images'
1045
  im_dir.mkdir(parents=True, exist_ok=True)
1046
- for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
1047
  pass
1048
 
1049
  # Profile
 
859
  shutil.copyfile(file, new_path / Path(file).name)
860
 
861
 
862
+ def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.dataloaders import *; extract_boxes()
863
  # Convert detection dataset into classification dataset, with one directory per class
864
  path = Path(path) # images dir
865
  shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing
 
895
 
896
  def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
897
  """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files
898
+ Usage: from utils.dataloaders import *; autosplit()
899
  Arguments
900
  path: Path to images directory
901
  weights: Train, val, test weights (list, tuple)
 
972
  def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
973
  """ Return dataset statistics dictionary with images and instances counts per split per class
974
  To run in parent directory: export PYTHONPATH="$PWD/yolov5"
975
+ Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True)
976
+ Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip')
977
  Arguments
978
  path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
979
  autodownload: Attempt to download dataset if not found locally
980
  verbose: Print stats dictionary
981
  """
982
 
983
+ def _round_labels(labels):
984
  # Update labels to integer class and 6 decimal place floats
985
  return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
986
 
987
+ def _find_yaml(dir):
988
+ # Return data.yaml file
989
+ files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
990
+ assert files, f'No *.yaml file found in {dir}'
991
+ if len(files) > 1:
992
+ files = [f for f in files if f.stem == dir.stem] # prefer *.yaml files that match dir name
993
+ assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'
994
+ assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
995
+ return files[0]
996
+
997
+ def _unzip(path):
998
+ # Unzip data.zip
999
  if str(path).endswith('.zip'): # path is data.zip
1000
  assert Path(path).is_file(), f'Error unzipping {path}, file not found'
1001
  ZipFile(path).extractall(path=path.parent) # unzip
1002
  dir = path.with_suffix('') # dataset directory == zip name
1003
+ assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
1004
+ return True, str(dir), _find_yaml(dir) # zipped, data_dir, yaml_path
1005
  else: # path is data.yaml
1006
  return False, None, path
1007
 
1008
+ def _hub_ops(f, max_dim=1920):
1009
  # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
1010
  f_new = im_dir / Path(f).name # dataset-hub image filename
1011
  try: # use PIL
 
1023
  im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
1024
  cv2.imwrite(str(f_new), im)
1025
 
1026
+ zipped, data_dir, yaml_path = _unzip(Path(path))
1027
  with open(check_yaml(yaml_path), errors='ignore') as f:
1028
  data = yaml.safe_load(f) # data dict
1029
  if zipped:
 
1049
  'unlabelled': int(np.all(x == 0, 1).sum()),
1050
  'per_class': (x > 0).sum(0).tolist()},
1051
  'labels': [{
1052
+ str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
1053
 
1054
  if hub:
1055
  im_dir = hub_dir / 'images'
1056
  im_dir.mkdir(parents=True, exist_ok=True)
1057
+ for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
1058
  pass
1059
 
1060
  # Profile