Commit
•
c23a441
1
Parent(s):
6adc53b
Improved `dataset_stats()` YAML checks (#8125)
Browse files
* Update dataloaders.py
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- utils/dataloaders.py +23 -12
utils/dataloaders.py
CHANGED
@@ -859,7 +859,7 @@ def flatten_recursive(path=DATASETS_DIR / 'coco128'):
|
|
859 |
shutil.copyfile(file, new_path / Path(file).name)
|
860 |
|
861 |
|
862 |
-
def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import *; extract_boxes()
|
863 |
# Convert detection dataset into classification dataset, with one directory per class
|
864 |
path = Path(path) # images dir
|
865 |
shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing
|
@@ -895,7 +895,7 @@ def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import
|
|
895 |
|
896 |
def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
|
897 |
""" Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files
|
898 |
-
Usage: from utils.datasets import *; autosplit()
|
899 |
Arguments
|
900 |
path: Path to images directory
|
901 |
weights: Train, val, test weights (list, tuple)
|
@@ -972,29 +972,40 @@ def verify_image_label(args):
|
|
972 |
def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
|
973 |
""" Return dataset statistics dictionary with images and instances counts per split per class
|
974 |
To run in parent directory: export PYTHONPATH="$PWD/yolov5"
|
975 |
-
Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', autodownload=True)
|
976 |
-
Usage2: from utils.datasets import *; dataset_stats('path/to/coco128_with_yaml.zip')
|
977 |
Arguments
|
978 |
path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
|
979 |
autodownload: Attempt to download dataset if not found locally
|
980 |
verbose: Print stats dictionary
|
981 |
"""
|
982 |
|
983 |
-
def
|
984 |
# Update labels to integer class and 6 decimal place floats
|
985 |
return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
|
986 |
|
987 |
-
def
|
988 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
989 |
if str(path).endswith('.zip'): # path is data.zip
|
990 |
assert Path(path).is_file(), f'Error unzipping {path}, file not found'
|
991 |
ZipFile(path).extractall(path=path.parent) # unzip
|
992 |
dir = path.with_suffix('') # dataset directory == zip name
|
993 |
-
|
|
|
994 |
else: # path is data.yaml
|
995 |
return False, None, path
|
996 |
|
997 |
-
def
|
998 |
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
|
999 |
f_new = im_dir / Path(f).name # dataset-hub image filename
|
1000 |
try: # use PIL
|
@@ -1012,7 +1023,7 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
|
|
1012 |
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
|
1013 |
cv2.imwrite(str(f_new), im)
|
1014 |
|
1015 |
-
zipped, data_dir, yaml_path =
|
1016 |
with open(check_yaml(yaml_path), errors='ignore') as f:
|
1017 |
data = yaml.safe_load(f) # data dict
|
1018 |
if zipped:
|
@@ -1038,12 +1049,12 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
|
|
1038 |
'unlabelled': int(np.all(x == 0, 1).sum()),
|
1039 |
'per_class': (x > 0).sum(0).tolist()},
|
1040 |
'labels': [{
|
1041 |
-
str(Path(k).name):
|
1042 |
|
1043 |
if hub:
|
1044 |
im_dir = hub_dir / 'images'
|
1045 |
im_dir.mkdir(parents=True, exist_ok=True)
|
1046 |
-
for _ in tqdm(ThreadPool(NUM_THREADS).imap(
|
1047 |
pass
|
1048 |
|
1049 |
# Profile
|
|
|
859 |
shutil.copyfile(file, new_path / Path(file).name)
|
860 |
|
861 |
|
862 |
+
def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.dataloaders import *; extract_boxes()
|
863 |
# Convert detection dataset into classification dataset, with one directory per class
|
864 |
path = Path(path) # images dir
|
865 |
shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing
|
|
|
895 |
|
896 |
def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
|
897 |
""" Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files
|
898 |
+
Usage: from utils.dataloaders import *; autosplit()
|
899 |
Arguments
|
900 |
path: Path to images directory
|
901 |
weights: Train, val, test weights (list, tuple)
|
|
|
972 |
def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
|
973 |
""" Return dataset statistics dictionary with images and instances counts per split per class
|
974 |
To run in parent directory: export PYTHONPATH="$PWD/yolov5"
|
975 |
+
Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True)
|
976 |
+
Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip')
|
977 |
Arguments
|
978 |
path: Path to data.yaml or data.zip (with data.yaml inside data.zip)
|
979 |
autodownload: Attempt to download dataset if not found locally
|
980 |
verbose: Print stats dictionary
|
981 |
"""
|
982 |
|
983 |
+
def _round_labels(labels):
|
984 |
# Update labels to integer class and 6 decimal place floats
|
985 |
return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
|
986 |
|
987 |
+
def _find_yaml(dir):
|
988 |
+
# Return data.yaml file
|
989 |
+
files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive
|
990 |
+
assert files, f'No *.yaml file found in {dir}'
|
991 |
+
if len(files) > 1:
|
992 |
+
files = [f for f in files if f.stem == dir.stem] # prefer *.yaml files that match dir name
|
993 |
+
assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'
|
994 |
+
assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
|
995 |
+
return files[0]
|
996 |
+
|
997 |
+
def _unzip(path):
|
998 |
+
# Unzip data.zip
|
999 |
if str(path).endswith('.zip'): # path is data.zip
|
1000 |
assert Path(path).is_file(), f'Error unzipping {path}, file not found'
|
1001 |
ZipFile(path).extractall(path=path.parent) # unzip
|
1002 |
dir = path.with_suffix('') # dataset directory == zip name
|
1003 |
+
assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
|
1004 |
+
return True, str(dir), _find_yaml(dir) # zipped, data_dir, yaml_path
|
1005 |
else: # path is data.yaml
|
1006 |
return False, None, path
|
1007 |
|
1008 |
+
def _hub_ops(f, max_dim=1920):
|
1009 |
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
|
1010 |
f_new = im_dir / Path(f).name # dataset-hub image filename
|
1011 |
try: # use PIL
|
|
|
1023 |
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
|
1024 |
cv2.imwrite(str(f_new), im)
|
1025 |
|
1026 |
+
zipped, data_dir, yaml_path = _unzip(Path(path))
|
1027 |
with open(check_yaml(yaml_path), errors='ignore') as f:
|
1028 |
data = yaml.safe_load(f) # data dict
|
1029 |
if zipped:
|
|
|
1049 |
'unlabelled': int(np.all(x == 0, 1).sum()),
|
1050 |
'per_class': (x > 0).sum(0).tolist()},
|
1051 |
'labels': [{
|
1052 |
+
str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
|
1053 |
|
1054 |
if hub:
|
1055 |
im_dir = hub_dir / 'images'
|
1056 |
im_dir.mkdir(parents=True, exist_ok=True)
|
1057 |
+
for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
|
1058 |
pass
|
1059 |
|
1060 |
# Profile
|