glenn-jocher committed on
Commit
8c6f9e1
1 Parent(s): 850970e

Update `dataset_stats()` for zipped datasets (#3926)

Browse files

* Update `dataset_stats()` for zipped datasets

@KalenMike

* cleanup

Files changed (1) hide show
  1. utils/datasets.py +17 -3
utils/datasets.py CHANGED
@@ -888,9 +888,11 @@ def verify_image_label(args):
888
 
889
  def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
890
  """ Return dataset statistics dictionary with images and instances counts per split per class
891
- Usage: from utils.datasets import *; dataset_stats('coco128.yaml', verbose=True)
 
 
892
  Arguments
893
- path: Path to data.yaml
894
  autodownload: Attempt to download dataset if not found locally
895
  verbose: Print stats dictionary
896
  """
@@ -899,8 +901,20 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
899
  # Update labels to integer class and 6 decimal place floats
900
  return [[int(c), *[round(x, 6) for x in points]] for c, *points in labels]
901
 
902
- with open(check_file(path)) as f:
 
 
 
 
 
 
 
 
 
 
903
  data = yaml.safe_load(f) # data dict
 
 
904
  check_dataset(data, autodownload) # download dataset if missing
905
  nc = data['nc'] # number of classes
906
  stats = {'nc': nc, 'names': data['names']} # statistics dictionary
 
888
 
889
  def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
890
  """ Return dataset statistics dictionary with images and instances counts per split per class
891
+ Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', verbose=True)
892
+ Usage2: from utils.datasets import *; dataset_stats('../datasets/coco128.zip', verbose=True)
893
+
894
  Arguments
895
+ path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
896
  autodownload: Attempt to download dataset if not found locally
897
  verbose: Print stats dictionary
898
  """
 
901
  # Update labels to integer class and 6 decimal place floats
902
  return [[int(c), *[round(x, 6) for x in points]] for c, *points in labels]
903
 
904
+ def unzip(path):
905
+ # Unzip data.zip TODO: CONSTRAINT: path/to/abc.zip MUST unzip to 'path/to/abc/'
906
+ if str(path).endswith('.zip'): # path is data.zip
907
+ assert os.system(f'unzip -q {path} -d {path.parent}') == 0, f'Error unzipping {path}'
908
+ data_dir = path.with_suffix('') # dataset directory
909
+ return True, data_dir, list(data_dir.rglob('*.yaml'))[0] # zipped, data_dir, yaml_path
910
+ else: # path is data.yaml
911
+ return False, None, path
912
+
913
+ zipped, data_dir, yaml_path = unzip(Path(path))
914
+ with open(check_file(yaml_path)) as f:
915
  data = yaml.safe_load(f) # data dict
916
+ if zipped:
917
+ data['path'] = data_dir # TODO: should this be dir.resolve()?
918
  check_dataset(data, autodownload) # download dataset if missing
919
  nc = data['nc'] # number of classes
920
  stats = {'nc': nc, 'names': data['names']} # statistics dictionary