def predict(img, conf, iou):
    """Detect surface defects in an image and chart how many of each class were found.

    Args:
        img: Input image as supplied by the Gradio ``'image'`` input.
        conf: Confidence threshold forwarded to ``model.predict``.
        iou: IoU threshold forwarded to ``model.predict``.

    Returns:
        tuple: ``(annotated_image, count_chart)`` -- two PIL images: the first
        result with its detections drawn on it, and a bar chart with the number
        of detections per defect category.
    """
    # List position == class id reported by the detector.
    # NOTE(review): presumably the training label order -- confirm against the model's names mapping.
    categories = ['crazing', 'inclusion', 'patches', 'pitted_surface', 'rolled_inscale', 'scratches']

    results = model.predict(img, conf=conf, iou=iou)
    result = results[0]  # only the first result is counted and drawn

    # Tally detections per class id; ids outside the known categories are ignored.
    counts = [0] * len(categories)
    for class_id in result.boxes.cls:
        index = int(class_id)
        if 0 <= index < len(categories):
            counts[index] += 1

    # Draw the bar chart through the Axes API instead of pyplot's implicit
    # "current figure" state, so the calls always target this request's figure.
    fig, ax = plt.subplots()
    try:
        ax.bar(categories, counts)
        ax.set_title('Category-Count')
        # Keep the familiar 0-5 range for small counts, but grow the axis so
        # bars taller than 5 are not clipped.
        ax.set_ylim(0, max(5, max(counts) + 1))
        ax.set_xlabel('Category')
        ax.set_ylabel('Count')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

        # Render the chart into an in-memory PNG.
        buf = io.BytesIO()
        FigureCanvas(fig).print_png(buf)
    finally:
        plt.close(fig)  # always release the figure, even if rendering fails

    # Turn the PNG bytes into a PIL image for the second output.
    buf.seek(0)
    image_png = Image.open(buf)

    # Annotated detection image: plot() yields a BGR array, reverse the channels to get RGB.
    im_bgr = result.plot()
    im_rgb = Image.fromarray(im_bgr[..., ::-1])

    return im_rgb, image_png
outputs=["image", 'image'], fn=predict, title=title, description=des, + examples=[["example1.jpg", base_conf, base_iou], + ["example2.jpg", base_conf, base_iou], + ["example3.jpg", base_conf, base_iou]]) +interface.launch() diff --git a/detect-best.pt b/detect-best.pt new file mode 100644 index 0000000000000000000000000000000000000000..b706642c2e6f67acd5ed3e48407bfb0361b3d8f8 --- /dev/null +++ b/detect-best.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62b790537841a3f4a29d3cf6c3a7effcea9000cdf769e87829e8feee0f39b383 +size 8385200 diff --git a/example1.jpg b/example1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e1a47a98778dba2e2f94d9fba1d0629d0fe84eb5 Binary files /dev/null and b/example1.jpg differ diff --git a/example2.jpg b/example2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..394beffad2bebf9c845f60b1bad5d4517a8fff54 Binary files /dev/null and b/example2.jpg differ diff --git a/example3.jpg b/example3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4d3bf16e87847a94b9a3ca572f30c3e905ce9b36 Binary files /dev/null and b/example3.jpg differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6856f7d335983b15d3d798c1901e62822f63b2d --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +ultralytics \ No newline at end of file diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa113835e884a10581dd4442a86679a839614426 --- /dev/null +++ b/ultralytics/__init__.py @@ -0,0 +1,13 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +__version__ = '8.0.147' + +from ultralytics.hub import start +from ultralytics.models import RTDETR, SAM, YOLO +from ultralytics.models.fastsam import FastSAM +from ultralytics.models.nas import NAS +from ultralytics.utils import SETTINGS as settings +from ultralytics.utils.checks import check_yolo as 
checks +from ultralytics.utils.downloads import download + +__all__ = '__version__', 'YOLO', 'NAS', 'SAM', 'FastSAM', 'RTDETR', 'checks', 'download', 'start', 'settings' # allow simpler import diff --git a/ultralytics/__pycache__/__init__.cpython-310.pyc b/ultralytics/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78a92a179e6fa947275b3c43858ef63a90195656 Binary files /dev/null and b/ultralytics/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/__pycache__/__init__.cpython-39.pyc b/ultralytics/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..221810889acaf2046d4075f9590d3ce855b46654 Binary files /dev/null and b/ultralytics/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/cfg/__init__.py b/ultralytics/cfg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..414ca587238d5289803cfb6d1577dd1924824d2f --- /dev/null +++ b/ultralytics/cfg/__init__.py @@ -0,0 +1,441 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import contextlib +import re +import shutil +import sys +from difflib import get_close_matches +from pathlib import Path +from types import SimpleNamespace +from typing import Dict, List, Union + +from ultralytics.utils import (DEFAULT_CFG, DEFAULT_CFG_DICT, DEFAULT_CFG_PATH, LOGGER, ROOT, SETTINGS, SETTINGS_YAML, + IterableSimpleNamespace, __version__, checks, colorstr, deprecation_warn, yaml_load, + yaml_print) + +# Define valid tasks and modes +MODES = 'train', 'val', 'predict', 'export', 'track', 'benchmark' +TASKS = 'detect', 'segment', 'classify', 'pose' +TASK2DATA = {'detect': 'coco8.yaml', 'segment': 'coco8-seg.yaml', 'classify': 'imagenet100', 'pose': 'coco8-pose.yaml'} +TASK2MODEL = { + 'detect': 'yolov8n.pt', + 'segment': 'yolov8n-seg.pt', + 'classify': 'yolov8n-cls.pt', + 'pose': 'yolov8n-pose.pt'} +TASK2METRIC = { + 'detect': 'metrics/mAP50-95(B)', + 'segment': 
'metrics/mAP50-95(M)', + 'classify': 'metrics/accuracy_top1', + 'pose': 'metrics/mAP50-95(P)'} + +CLI_HELP_MSG = \ + f""" + Arguments received: {str(['yolo'] + sys.argv[1:])}. Ultralytics 'yolo' commands use the following syntax: + + yolo TASK MODE ARGS + + Where TASK (optional) is one of {TASKS} + MODE (required) is one of {MODES} + ARGS (optional) are any number of custom 'arg=value' pairs like 'imgsz=320' that override defaults. + See all ARGS at https://docs.ultralytics.com/usage/cfg or with 'yolo cfg' + + 1. Train a detection model for 10 epochs with an initial learning_rate of 0.01 + yolo train data=coco128.yaml model=yolov8n.pt epochs=10 lr0=0.01 + + 2. Predict a YouTube video using a pretrained segmentation model at image size 320: + yolo predict model=yolov8n-seg.pt source='https://youtu.be/Zgi9g1ksQHc' imgsz=320 + + 3. Val a pretrained detection model at batch-size 1 and image size 640: + yolo val model=yolov8n.pt data=coco128.yaml batch=1 imgsz=640 + + 4. Export a YOLOv8n classification model to ONNX format at image size 224 by 128 (no TASK required) + yolo export model=yolov8n-cls.pt format=onnx imgsz=224,128 + + 5. 
def cfg2dict(cfg):
    """
    Normalise a configuration object into a dictionary.

    Args:
        cfg (str | Path | SimpleNamespace | dict): A YAML file path, a namespace, or an existing dictionary.

    Returns:
        cfg (dict): The YAML contents for a path, the attribute dictionary for a namespace; any other
            object is returned unchanged.
    """
    if isinstance(cfg, SimpleNamespace):
        return vars(cfg)  # namespace -> its attribute dict
    if isinstance(cfg, (str, Path)):
        return yaml_load(cfg)  # file path -> parsed YAML
    return cfg
def get_cfg(cfg: Union[str, Path, Dict, SimpleNamespace] = DEFAULT_CFG_DICT, overrides: Dict = None):
    """
    Load and merge configuration data from a file or dictionary, then validate it.

    Args:
        cfg (str | Path | Dict | SimpleNamespace): Configuration data.
        overrides (str | Dict | optional): Overrides in the form of a file name or a dictionary. Default is None.

    Returns:
        (IterableSimpleNamespace): Namespace holding the merged configuration.

    Raises:
        TypeError: If a float, fraction, int or bool key holds a value of the wrong type.
        ValueError: If a fraction key holds a value outside 0.0 - 1.0.
    """
    # Work on a shallow copy: the normalisation below assigns into the dict, and without
    # overrides that dict would otherwise be the caller's object (or the shared default).
    cfg = dict(cfg2dict(cfg))

    # Merge overrides
    if overrides:
        overrides = cfg2dict(overrides)
        check_dict_alignment(cfg, overrides)
        cfg = {**cfg, **overrides}  # merge cfg and overrides dicts (prefer overrides)

    # Special handling for numeric project/name
    for k in 'project', 'name':
        if k in cfg and isinstance(cfg[k], (int, float)):
            cfg[k] = str(cfg[k])
    if cfg.get('name') == 'model':  # assign model to 'name' arg
        # 'model' may be present but None, so coerce to str before splitting off the suffix
        cfg['name'] = str(cfg.get('model') or '').split('.')[0]
        LOGGER.warning(f"WARNING ⚠️ 'name=model' automatically updated to 'name={cfg['name']}'.")

    # Type and Value checks
    for k, v in cfg.items():
        if v is not None:  # None values may be from optional args
            if k in CFG_FLOAT_KEYS and not isinstance(v, (int, float)):
                raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
                                f"Valid '{k}' types are int (i.e. '{k}=0') or float (i.e. '{k}=0.5')")
            elif k in CFG_FRACTION_KEYS:
                if not isinstance(v, (int, float)):
                    raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
                                    f"Valid '{k}' types are int (i.e. '{k}=0') or float (i.e. '{k}=0.5')")
                if not (0.0 <= v <= 1.0):
                    raise ValueError(f"'{k}={v}' is an invalid value. "
                                     f"Valid '{k}' values are between 0.0 and 1.0.")
            elif k in CFG_INT_KEYS and not isinstance(v, int):
                raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
                                f"'{k}' must be an int (i.e. '{k}=8')")
            elif k in CFG_BOOL_KEYS and not isinstance(v, bool):
                raise TypeError(f"'{k}={v}' is of invalid type {type(v).__name__}. "
                                f"'{k}' must be a bool (i.e. '{k}=True' or '{k}=False')")

    # Return instance
    return IterableSimpleNamespace(**cfg)
'{k}=True' or '{k}=False')") + + # Return instance + return IterableSimpleNamespace(**cfg) + + +def _handle_deprecation(custom): + """Hardcoded function to handle deprecated config keys""" + + for key in custom.copy().keys(): + if key == 'hide_labels': + deprecation_warn(key, 'show_labels') + custom['show_labels'] = custom.pop('hide_labels') == 'False' + if key == 'hide_conf': + deprecation_warn(key, 'show_conf') + custom['show_conf'] = custom.pop('hide_conf') == 'False' + if key == 'line_thickness': + deprecation_warn(key, 'line_width') + custom['line_width'] = custom.pop('line_thickness') + + return custom + + +def check_dict_alignment(base: Dict, custom: Dict, e=None): + """ + This function checks for any mismatched keys between a custom configuration list and a base configuration list. + If any mismatched keys are found, the function prints out similar keys from the base list and exits the program. + + Args: + custom (dict): a dictionary of custom configuration options + base (dict): a dictionary of base configuration options + """ + custom = _handle_deprecation(custom) + base_keys, custom_keys = (set(x.keys()) for x in (base, custom)) + mismatched = [k for k in custom_keys if k not in base_keys] + if mismatched: + string = '' + for x in mismatched: + matches = get_close_matches(x, base_keys) # key list + matches = [f'{k}={base[k]}' if base.get(k) is not None else k for k in matches] + match_str = f'Similar arguments are i.e. {matches}.' if matches else '' + string += f"'{colorstr('red', 'bold', x)}' is not a valid YOLO argument. {match_str}\n" + raise SyntaxError(string + CLI_HELP_MSG) from e + + +def merge_equals_args(args: List[str]) -> List[str]: + """ + Merges arguments around isolated '=' args in a list of strings. + The function considers cases where the first argument ends with '=' or the second starts with '=', + as well as when the middle one is an equals sign. + + Args: + args (List[str]): A list of strings where each element is an argument. 
def merge_equals_args(args: List[str]) -> List[str]:
    """
    Merge arguments around isolated '=' signs in a list of strings.

    Three spellings are normalised to a single 'arg=val' item: ['arg', '=', 'val'],
    ['arg=', 'val'] and ['arg', '=val']. The input list is left unmodified.

    Args:
        args (List[str]): A list of strings where each element is an argument.

    Returns:
        List[str]: A new list of strings where the arguments around isolated '=' are merged.
    """
    merged = []
    i, n = 0, len(args)
    while i < n:
        arg = args[i]
        has_next = i < n - 1
        if arg == '=' and i > 0 and has_next:  # merge ['arg', '=', 'val']
            merged[-1] += f'={args[i + 1]}'
            i += 2  # the value was consumed together with the '='
        elif arg.endswith('=') and has_next and '=' not in args[i + 1]:  # merge ['arg=', 'val']
            merged.append(f'{arg}{args[i + 1]}')
            i += 2
        elif arg.startswith('=') and i > 0:  # merge ['arg', '=val']
            merged[-1] += arg
            i += 1
        else:
            merged.append(arg)
            i += 1
    return merged


def handle_yolo_hub(args: List[str]) -> None:
    """
    Handle Ultralytics HUB command-line interface (CLI) commands.

    Dispatches on the first argument: 'login' (optionally followed by an API key) or 'logout'.
    Any other input, including an empty list, does nothing.

    Args:
        args (List[str]): A list of command line arguments

    Example:
        ```bash
        python my_script.py hub login your_api_key
        ```
    """
    from ultralytics import hub

    command = args[0] if args else None  # no sub-command given: fall through without raising IndexError
    if command == 'login':
        key = args[1] if len(args) > 1 else ''
        # Log in to Ultralytics HUB using the provided API key
        hub.login(key)
    elif command == 'logout':
        # Log out from Ultralytics HUB
        hub.logout()
def handle_yolo_settings(args: List[str]) -> None:
    """
    Handle YOLO settings command-line interface (CLI) commands.

    With 'reset' as the first argument the settings file is deleted and the settings are reset;
    otherwise every argument is parsed as a 'key=value' pair and stored in the settings.
    The settings file is printed in all cases.

    Args:
        args (List[str]): A list of command line arguments for YOLO settings management.

    Example:
        ```bash
        python my_script.py yolo settings reset
        ```
    """
    if any(args):
        if args[0] == 'reset':
            SETTINGS_YAML.unlink()  # delete the settings file
            SETTINGS.reset()  # create new settings
            LOGGER.info('Settings reset successfully')  # inform the user that settings have been reset
        else:  # save a new setting
            new = dict(parse_key_value_pair(a) for a in args)
            check_dict_alignment(SETTINGS, new)
            SETTINGS.update(new)

    yaml_print(SETTINGS_YAML)  # print the current settings


def parse_key_value_pair(pair):
    """
    Parse one 'key=value' pair and return the key and the converted value.

    Whitespace around the key and the value (i.e. around the '=' sign) is ignored.

    Args:
        pair (str): A string of the form 'key=value'.

    Returns:
        (tuple): The key as a string and the value converted by smart_value().

    Raises:
        ValueError: If ``pair`` contains no '=' sign.
        AssertionError: If the value part is empty.
    """
    k, v = pair.split('=', 1)  # split on first '=' sign
    k, v = k.strip(), v.strip()  # remove spaces around equals sign
    if not v:
        # Raised explicitly (same type callers already catch) so the check survives `python -O`
        raise AssertionError(f"missing '{k}' value")
    return k, smart_value(v)


def smart_value(v):
    """
    Convert a string to an underlying type such as int, float, bool, etc.

    'none', 'true' and 'false' (case-insensitive) map to None, True and False. Anything else is
    evaluated as a Python expression; if that fails the original string is returned unchanged.
    """
    lowered = v.lower()
    if lowered == 'none':
        return None
    if lowered == 'true':
        return True
    if lowered == 'false':
        return False
    with contextlib.suppress(Exception):
        # SECURITY NOTE(review): eval() executes arbitrary expressions taken from the command line.
        # Acceptable for a local CLI, but never route untrusted input through this function.
        return eval(v)
    return v
def entrypoint(debug=''):
    """
    Parse 'yolo' command-line arguments and run the requested command.

    The arguments may contain:
    - a task (one of TASKS) and a mode (one of MODES)
    - a special command such as 'help', 'checks', 'version', 'settings', 'cfg', 'hub', 'login' or 'copy-cfg'
    - any number of 'arg=value' overrides for the default configuration

    A special command is executed immediately. Otherwise the model is built from the collected overrides and
    the method named after the mode is called on it.

    Args:
        debug (str): Optional full command line (e.g. 'yolo predict model=yolov8n.pt') used instead of sys.argv.

    Raises:
        SyntaxError: If an argument is unknown, or is a valid key that is missing its '=value' part.
        ValueError: If the task or mode is invalid.
    """
    args = (debug.split(' ') if debug else sys.argv)[1:]
    if not args:  # no arguments passed
        LOGGER.info(CLI_HELP_MSG)
        return

    special = {
        'help': lambda: LOGGER.info(CLI_HELP_MSG),
        'checks': checks.check_yolo,
        'version': lambda: LOGGER.info(__version__),
        'settings': lambda: handle_yolo_settings(args[1:]),
        'cfg': lambda: yaml_print(DEFAULT_CFG_PATH),
        'hub': lambda: handle_yolo_hub(args[1:]),
        'login': lambda: handle_yolo_hub(args),
        'copy-cfg': copy_default_cfg}
    full_args_dict = {**DEFAULT_CFG_DICT, **{k: None for k in TASKS}, **{k: None for k in MODES}, **special}

    # Define common mis-uses of special commands, i.e. -h, -help, --help
    special.update({k[0]: v for k, v in special.items()})  # first-letter shortcuts
    # Several commands share a first letter and the last one wins above, which would map 'h' to the
    # 'hub' handler; pin it to 'help' so that 'yolo -h' prints the help text.
    special['h'] = special['help']
    special.update({k[:-1]: v for k, v in special.items() if len(k) > 1 and k.endswith('s')})  # singular
    special = {**special, **{f'-{k}': v for k, v in special.items()}, **{f'--{k}': v for k, v in special.items()}}

    overrides = {}  # basic overrides, i.e. imgsz=320
    for a in merge_equals_args(args):  # merge spaces around '=' sign
        if a.startswith('--'):
            LOGGER.warning(f"WARNING ⚠️ '{a}' does not require leading dashes '--', updating to '{a[2:]}'.")
            a = a[2:]
        if a.endswith(','):
            LOGGER.warning(f"WARNING ⚠️ '{a}' does not require trailing comma ',', updating to '{a[:-1]}'.")
            a = a[:-1]
        if '=' in a:
            try:
                k, v = parse_key_value_pair(a)
                if k == 'cfg':  # custom.yaml passed
                    LOGGER.info(f'Overriding {DEFAULT_CFG_PATH} with {v}')
                    # NOTE(review): this replaces any overrides collected before 'cfg=' -- confirm intended
                    overrides = {k: val for k, val in yaml_load(checks.check_yaml(v)).items() if k != 'cfg'}
                else:
                    overrides[k] = v
            except (NameError, SyntaxError, ValueError, AssertionError) as e:
                check_dict_alignment(full_args_dict, {a: ''}, e)

        elif a in TASKS:
            overrides['task'] = a
        elif a in MODES:
            overrides['mode'] = a
        elif a.lower() in special:
            special[a.lower()]()
            return
        elif a in DEFAULT_CFG_DICT and isinstance(DEFAULT_CFG_DICT[a], bool):
            overrides[a] = True  # auto-True for default bool args, i.e. 'yolo show' sets show=True
        elif a in DEFAULT_CFG_DICT:
            raise SyntaxError(f"'{colorstr('red', 'bold', a)}' is a valid YOLO argument but is missing an '=' sign "
                              f"to set its value, i.e. try '{a}={DEFAULT_CFG_DICT[a]}'\n{CLI_HELP_MSG}")
        else:
            check_dict_alignment(full_args_dict, {a: ''})

    # Check keys
    check_dict_alignment(full_args_dict, overrides)

    # Mode
    mode = overrides.get('mode')
    if mode is None:
        mode = DEFAULT_CFG.mode or 'predict'
        LOGGER.warning(f"WARNING ⚠️ 'mode' is missing. Valid modes are {MODES}. Using default 'mode={mode}'.")
    elif mode not in MODES:
        if mode not in ('checks', checks):
            raise ValueError(f"Invalid 'mode={mode}'. Valid modes are {MODES}.\n{CLI_HELP_MSG}")
        LOGGER.warning("WARNING ⚠️ 'yolo mode=checks' is deprecated. Use 'yolo checks' instead.")
        checks.check_yolo()
        return

    # Task
    task = overrides.pop('task', None)
    if task:
        if task not in TASKS:
            raise ValueError(f"Invalid 'task={task}'. Valid tasks are {TASKS}.\n{CLI_HELP_MSG}")
        if 'model' not in overrides:
            overrides['model'] = TASK2MODEL[task]

    # Model
    model = overrides.pop('model', DEFAULT_CFG.model)
    if model is None:
        model = 'yolov8n.pt'
        LOGGER.warning(f"WARNING ⚠️ 'model' is missing. Using default 'model={model}'.")
    overrides['model'] = model
    if 'rtdetr' in model.lower():  # guess architecture
        from ultralytics import RTDETR
        model = RTDETR(model)  # no task argument
    elif 'fastsam' in model.lower():  # must be tested before the plain 'sam' substring check
        from ultralytics import FastSAM
        model = FastSAM(model)
    elif 'sam' in model.lower():
        from ultralytics import SAM
        model = SAM(model)
    else:
        from ultralytics import YOLO
        model = YOLO(model, task=task)
    if isinstance(overrides.get('pretrained'), str):
        model.load(overrides['pretrained'])

    # Task Update
    if task != model.task:
        if task:
            LOGGER.warning(f"WARNING ⚠️ conflicting 'task={task}' passed with 'task={model.task}' model. "
                           f"Ignoring 'task={task}' and updating to 'task={model.task}' to match model.")
        task = model.task

    # Mode
    if mode in ('predict', 'track') and 'source' not in overrides:
        # Prefer the configured default source; only then fall back to the bundled assets folder,
        # and to the sample image URL when that folder does not exist.
        if DEFAULT_CFG.source:
            overrides['source'] = DEFAULT_CFG.source
        elif (ROOT / 'assets').exists():
            overrides['source'] = ROOT / 'assets'
        else:
            overrides['source'] = 'https://ultralytics.com/images/bus.jpg'
        LOGGER.warning(f"WARNING ⚠️ 'source' is missing. Using default 'source={overrides['source']}'.")
    elif mode in ('train', 'val'):
        if 'data' not in overrides:
            overrides['data'] = TASK2DATA.get(task or DEFAULT_CFG.task, DEFAULT_CFG.data)
            LOGGER.warning(f"WARNING ⚠️ 'data' is missing. Using default 'data={overrides['data']}'.")
    elif mode == 'export':
        if 'format' not in overrides:
            overrides['format'] = DEFAULT_CFG.format or 'torchscript'
            LOGGER.warning(f"WARNING ⚠️ 'format' is missing. Using default 'format={overrides['format']}'.")

    # Run command in python
    getattr(model, mode)(**overrides)  # default args from model
def copy_default_cfg():
    """Copy the default configuration file into the current working directory with a '_copy' suffix."""
    copy_name = DEFAULT_CFG_PATH.name.replace('.yaml', '_copy.yaml')
    new_file = Path.cwd() / copy_name
    shutil.copy2(DEFAULT_CFG_PATH, new_file)
    message = (f'{DEFAULT_CFG_PATH} copied to {new_file}\n'
               f"Example YOLO command with this new custom cfg:\n yolo cfg='{new_file}' imgsz=320 batch=8")
    LOGGER.info(message)
detect, segment, classify, pose +mode: train # (str) YOLO mode, i.e. train, val, predict, export, track, benchmark + +# Train settings ------------------------------------------------------------------------------------------------------- +model: # (str, optional) path to model file, i.e. yolov8n.pt, yolov8n.yaml +data: # (str, optional) path to data file, i.e. coco128.yaml +epochs: 100 # (int) number of epochs to train for +patience: 50 # (int) epochs to wait for no observable improvement for early stopping of training +batch: -1 # (int) number of images per batch (-1 for AutoBatch) +imgsz: 640 # (int | list) input images size as int for train and val modes, or list[w,h] for predict and export modes +save: True # (bool) save train checkpoints and predict results +save_period: -1 # (int) Save checkpoint every x epochs (disabled if < 1) +cache: False # (bool) True/ram, disk or False. Use cache for data loading +device: cpu # (int | str | list, optional) device to run on, i.e. cuda device=0 or device=0,1,2,3 or device=cpu +workers: 2 # (int) number of worker threads for data loading (per RANK if DDP) +project: # (str, optional) project name +name: # (str, optional) experiment name, results saved to 'project/name' directory +exist_ok: True # (bool) whether to overwrite existing experiment +pretrained: True # (bool | str) whether to use a pretrained model (bool) or a model to load weights from (str) +optimizer: auto # (str) optimizer to use, choices=[SGD, Adam, Adamax, AdamW, NAdam, RAdam, RMSProp, auto] +verbose: True # (bool) whether to print verbose output +seed: 0 # (int) random seed for reproducibility +deterministic: True # (bool) whether to enable deterministic mode +single_cls: False # (bool) train multi-class data as single-class +rect: False # (bool) rectangular training if mode='train' or rectangular validation if mode='val' +cos_lr: False # (bool) use cosine learning rate scheduler +close_mosaic: 10 # (int) disable mosaic augmentation for final epochs 
+resume: False # (bool) resume training from last checkpoint +amp: False # (bool) Automatic Mixed Precision (AMP) training, choices=[True, False], True runs AMP check +fraction: 1.0 # (float) dataset fraction to train on (default is 1.0, all images in train set) +profile: False # (bool) profile ONNX and TensorRT speeds during training for loggers +# Segmentation +overlap_mask: True # (bool) masks should overlap during training (segment train only) +mask_ratio: 4 # (int) mask downsample ratio (segment train only) +# Classification +dropout: 0.0 # (float) use dropout regularization (classify train only) + +# Val/Test settings ---------------------------------------------------------------------------------------------------- +val: True # (bool) validate/test during training +split: val # (str) dataset split to use for validation, i.e. 'val', 'test' or 'train' +save_json: True # (bool) save results to JSON file +save_hybrid: False # (bool) save hybrid version of labels (labels + additional predictions) +conf: # (float, optional) object confidence threshold for detection (default 0.25 predict, 0.001 val) +iou: 0.7 # (float) intersection over union (IoU) threshold for NMS +max_det: 300 # (int) maximum number of detections per image +half: False # (bool) use half precision (FP16) +dnn: False # (bool) use OpenCV DNN for ONNX inference +plots: True # (bool) save plots during train/val + +# Prediction settings -------------------------------------------------------------------------------------------------- +source: # (str, optional) source directory for images or videos +show: False # (bool) show results if possible +save_txt: False # (bool) save results as .txt file +save_conf: False # (bool) save results with confidence scores +save_crop: False # (bool) save cropped images with results +show_labels: True # (bool) show object labels in plots +show_conf: True # (bool) show object confidence scores in plots +vid_stride: 1 # (int) video frame-rate stride +line_width: # (int, 
optional) line width of the bounding boxes, auto if missing +visualize: False # (bool) visualize model features +augment: False # (bool) apply image augmentation to prediction sources +agnostic_nms: False # (bool) class-agnostic NMS +classes: # (int | list[int], optional) filter results by class, i.e. class=0, or class=[0,2,3] +retina_masks: False # (bool) use high-resolution segmentation masks +boxes: True # (bool) Show boxes in segmentation predictions + +# Export settings ------------------------------------------------------------------------------------------------------ +format: torchscript # (str) format to export to, choices at https://docs.ultralytics.com/modes/export/#export-formats +keras: False # (bool) use Keras +optimize: False # (bool) TorchScript: optimize for mobile +int8: False # (bool) CoreML/TF INT8 quantization +dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes +simplify: False # (bool) ONNX: simplify model +opset: # (int, optional) ONNX: opset version +workspace: 4 # (int) TensorRT: workspace size (GB) +nms: False # (bool) CoreML: add NMS + +# Hyperparameters ------------------------------------------------------------------------------------------------------ +lr0: 0.01 # (float) initial learning rate (i.e. 
SGD=1E-2, Adam=1E-3) +lrf: 0.01 # (float) final learning rate (lr0 * lrf) +momentum: 0.937 # (float) SGD momentum/Adam beta1 +weight_decay: 0.0005 # (float) optimizer weight decay 5e-4 +warmup_epochs: 3.0 # (float) warmup epochs (fractions ok) +warmup_momentum: 0.8 # (float) warmup initial momentum +warmup_bias_lr: 0.1 # (float) warmup initial bias lr +box: 7.5 # (float) box loss gain +cls: 0.5 # (float) cls loss gain (scale with pixels) +dfl: 1.5 # (float) dfl loss gain +pose: 12.0 # (float) pose loss gain +kobj: 1.0 # (float) keypoint obj loss gain +label_smoothing: 0.0 # (float) label smoothing (fraction) +nbs: 64 # (int) nominal batch size +hsv_h: 0.015 # (float) image HSV-Hue augmentation (fraction) +hsv_s: 0.7 # (float) image HSV-Saturation augmentation (fraction) +hsv_v: 0.4 # (float) image HSV-Value augmentation (fraction) +degrees: 0.0 # (float) image rotation (+/- deg) +translate: 0.1 # (float) image translation (+/- fraction) +scale: 0.5 # (float) image scale (+/- gain) +shear: 0.0 # (float) image shear (+/- deg) +perspective: 0.0 # (float) image perspective (+/- fraction), range 0-0.001 +flipud: 0.0 # (float) image flip up-down (probability) +fliplr: 0.5 # (float) image flip left-right (probability) +mosaic: 1.0 # (float) image mosaic (probability) +mixup: 0.0 # (float) image mixup (probability) +copy_paste: 0.0 # (float) segment copy-paste (probability) + +# Custom config.yaml --------------------------------------------------------------------------------------------------- +cfg: # (str, optional) for overriding defaults.yaml +save_dir: ./runs/train1 # 自己设置路径 +# Tracker settings ------------------------------------------------------------------------------------------------------ +tracker: botsort.yaml # (str) tracker type, choices=[botsort.yaml, bytetrack.yaml] diff --git a/ultralytics/cfg/models/v8/yolov8.yaml b/ultralytics/cfg/models/v8/yolov8.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..86c86d9bcd6966293563d7f88a054d853e92f00c --- /dev/null +++ b/ultralytics/cfg/models/v8/yolov8.yaml @@ -0,0 +1,46 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +# YOLOv8 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect + +# Parameters +nc: 1 # number of classes +scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n' + # [depth, width, max_channels] + n: [0.33, 0.25, 1024] # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs + s: [0.33, 0.50, 1024] # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs + m: [0.67, 0.75, 768] # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs + l: [1.00, 1.00, 512] # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs + x: [1.00, 1.25, 512] # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs + +# YOLOv8.0n backbone +backbone: + # [from, repeats, module, args] + - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2 + - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4 + - [-1, 3, C2f, [128, True]] + - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8 + - [-1, 6, C2f, [256, True]] + - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16 + - [-1, 6, C2f, [512, True]] + - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32 + - [-1, 3, C2f, [1024, True]] + - [-1, 1, SPPF, [1024, 5]] # 9 + +# YOLOv8.0n head +head: + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [[-1, 6], 1, Concat, [1]] # cat backbone P4 + - [-1, 3, C2f, [512]] # 12 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [[-1, 4], 1, Concat, [1]] # cat backbone P3 + - [-1, 3, C2f, [256]] # 15 (P3/8-small) + + - [-1, 1, Conv, [256, 3, 2]] + - [[-1, 12], 1, Concat, [1]] # cat head P4 + - [-1, 3, C2f, [512]] # 18 (P4/16-medium) + + - [-1, 1, Conv, [512, 3, 2]] + - [[-1, 9], 1, Concat, [1]] # cat head P5 + - [-1, 3, C2f, [1024]] # 21 (P5/32-large) + + - 
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8 object detection model with P3-P5 outputs and an ECAAttention layer appended after every C2f block of the head.
# For Usage examples see https://docs.ultralytics.com/tasks/detect

# Parameters
nc: 9  # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
  # [depth, width, max_channels]
  # NOTE(review): the summaries below are identical to the figures in the stock yolov8.yaml, so they do not
  # account for the four ECAAttention layers added in this file - re-measure before quoting them.
  n: [0.33, 0.25, 1024]  # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs
  s: [0.33, 0.50, 1024]  # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs
  m: [0.67, 0.75, 768]  # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs
  l: [1.00, 1.00, 512]  # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
  x: [1.00, 1.25, 512]  # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]]  # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]]  # 1-P2/4
  - [-1, 3, C2f, [128, True]]  # 2
  - [-1, 1, Conv, [256, 3, 2]]  # 3-P3/8
  - [-1, 6, C2f, [256, True]]  # 4
  - [-1, 1, Conv, [512, 3, 2]]  # 5-P4/16
  - [-1, 6, C2f, [512, True]]  # 6
  - [-1, 1, Conv, [1024, 3, 2]]  # 7-P5/32
  - [-1, 3, C2f, [1024, True]]  # 8
  - [-1, 1, SPPF, [1024, 5]]  # 9

# YOLOv8.0n head
# Layer numbers continue from the backbone (one index per list entry), so the head starts at 10.
# The Detect layer reads the attention outputs (17, 21, 25), not the C2f outputs (16, 20, 24).
head:
  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 10
  - [[-1, 6], 1, Concat, [1]]  # 11 cat backbone P4
  - [-1, 3, C2f, [512]]  # 12
  - [-1, 1, ECAAttention, [512]]  # 13

  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 14
  - [[-1, 4], 1, Concat, [1]]  # 15 cat backbone P3
  - [-1, 3, C2f, [256]]  # 16
  - [-1, 1, ECAAttention, [256]]  # 17 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]  # 18
  # NOTE(review): the skip input below is layer 12 (C2f output before attention), not layer 13 - confirm intended.
  - [[-1, 12], 1, Concat, [1]]  # 19 cat head P4
  - [-1, 3, C2f, [512]]  # 20
  - [-1, 1, ECAAttention, [512]]  # 21 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]  # 22
  - [[-1, 9], 1, Concat, [1]]  # 23 cat head P5
  - [-1, 3, C2f, [1024]]  # 24
  - [-1, 1, ECAAttention, [1024]]  # 25 (P5/32-large)

  - [[17, 21, 25], 1, Detect, [nc]]  # 26 Detect(P3, P4, P5)
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8 object detection model with P3-P5 outputs, GhostConv downsampling at backbone layers 3, 5 and 7, and a
# ResBlock_CBAM layer appended after every C2f block of the head.
# For Usage examples see https://docs.ultralytics.com/tasks/detect

# Parameters
nc: 9  # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
  # [depth, width, max_channels]
  # NOTE(review): the summaries below are identical to the figures in the stock yolov8.yaml, so they do not
  # account for the GhostConv / ResBlock_CBAM changes made in this file - re-measure before quoting them.
  n: [0.33, 0.25, 1024]  # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs
  s: [0.33, 0.50, 1024]  # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs
  m: [0.67, 0.75, 768]  # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs
  l: [1.00, 1.00, 512]  # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
  x: [1.00, 1.25, 512]  # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]]  # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]]  # 1-P2/4
  - [-1, 3, C2f, [128, True]]  # 2
  - [-1, 1, GhostConv, [256, 3, 2]]  # 3-P3/8
  - [-1, 6, C2f, [256, True]]  # 4
  - [-1, 1, GhostConv, [512, 3, 2]]  # 5-P4/16
  - [-1, 6, C2f, [512, True]]  # 6
  - [-1, 1, GhostConv, [1024, 3, 2]]  # 7-P5/32
  - [-1, 3, C2f, [1024, True]]  # 8
  - [-1, 1, SPPF, [1024, 5]]  # 9

# YOLOv8.0n head
# Layer numbers continue from the backbone (one index per list entry), so the head starts at 10.
# The Detect layer reads the attention outputs (17, 21, 25), not the C2f outputs (16, 20, 24).
head:
  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 10
  - [[-1, 6], 1, Concat, [1]]  # 11 cat backbone P4
  - [-1, 3, C2f, [512]]  # 12
  - [-1, 1, ResBlock_CBAM, [512]]  # 13

  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 14
  - [[-1, 4], 1, Concat, [1]]  # 15 cat backbone P3
  - [-1, 3, C2f, [256]]  # 16
  - [-1, 1, ResBlock_CBAM, [256]]  # 17 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]  # 18
  # NOTE(review): the skip input below is layer 12 (C2f output before attention), not layer 13 - confirm intended.
  - [[-1, 12], 1, Concat, [1]]  # 19 cat head P4
  - [-1, 3, C2f, [512]]  # 20
  - [-1, 1, ResBlock_CBAM, [512]]  # 21 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]  # 22
  - [[-1, 9], 1, Concat, [1]]  # 23 cat head P5
  - [-1, 3, C2f, [1024]]  # 24
  - [-1, 1, ResBlock_CBAM, [1024]]  # 25 (P5/32-large)

  - [[17, 21, 25], 1, Detect, [nc]]  # 26 Detect(P3, P4, P5)
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8 object detection model with P3-P5 outputs and a ShuffleAttention layer appended after every C2f block of
# the head.
# For Usage examples see https://docs.ultralytics.com/tasks/detect

# Parameters
nc: 9  # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
  # [depth, width, max_channels]
  # NOTE(review): the summaries below are identical to the figures in the stock yolov8.yaml, so they do not
  # account for the four ShuffleAttention layers added in this file - re-measure before quoting them.
  n: [0.33, 0.25, 1024]  # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs
  s: [0.33, 0.50, 1024]  # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs
  m: [0.67, 0.75, 768]  # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs
  l: [1.00, 1.00, 512]  # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
  x: [1.00, 1.25, 512]  # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]]  # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]]  # 1-P2/4
  - [-1, 3, C2f, [128, True]]  # 2
  - [-1, 1, Conv, [256, 3, 2]]  # 3-P3/8
  - [-1, 6, C2f, [256, True]]  # 4
  - [-1, 1, Conv, [512, 3, 2]]  # 5-P4/16
  - [-1, 6, C2f, [512, True]]  # 6
  - [-1, 1, Conv, [1024, 3, 2]]  # 7-P5/32
  - [-1, 3, C2f, [1024, True]]  # 8
  - [-1, 1, SPPF, [1024, 5]]  # 9

# YOLOv8.0n head
# Layer numbers continue from the backbone (one index per list entry), so the head starts at 10.
# The Detect layer reads the attention outputs (17, 21, 25), not the C2f outputs (16, 20, 24).
head:
  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 10
  - [[-1, 6], 1, Concat, [1]]  # 11 cat backbone P4
  - [-1, 3, C2f, [512]]  # 12
  - [-1, 1, ShuffleAttention, [512]]  # 13

  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 14
  - [[-1, 4], 1, Concat, [1]]  # 15 cat backbone P3
  - [-1, 3, C2f, [256]]  # 16
  - [-1, 1, ShuffleAttention, [256]]  # 17 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]  # 18
  # NOTE(review): the skip input below is layer 12 (C2f output before attention), not layer 13 - confirm intended.
  - [[-1, 12], 1, Concat, [1]]  # 19 cat head P4
  - [-1, 3, C2f, [512]]  # 20
  - [-1, 1, ShuffleAttention, [512]]  # 21 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]  # 22
  - [[-1, 9], 1, Concat, [1]]  # 23 cat head P5
  - [-1, 3, C2f, [1024]]  # 24
  - [-1, 1, ShuffleAttention, [1024]]  # 25 (P5/32-large)

  - [[17, 21, 25], 1, Detect, [nc]]  # 26 Detect(P3, P4, P5)
+new_track_thresh: 0.6 # threshold for init new track if the detection does not match any tracks +track_buffer: 30 # buffer to calculate the time when to remove tracks +match_thresh: 0.8 # threshold for matching tracks +# min_box_area: 10 # threshold for min box areas(for tracker evaluation, not used for now) +# mot20: False # for tracker evaluation(not used for now) diff --git a/ultralytics/data/__init__.py b/ultralytics/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f5dbb2b754f2a840a43d7850db306ccf8c1f0c6 --- /dev/null +++ b/ultralytics/data/__init__.py @@ -0,0 +1,8 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .base import BaseDataset +from .build import build_dataloader, build_yolo_dataset, load_inference_source +from .dataset import ClassificationDataset, SemanticDataset, YOLODataset + +__all__ = ('BaseDataset', 'ClassificationDataset', 'SemanticDataset', 'YOLODataset', 'build_yolo_dataset', + 'build_dataloader', 'load_inference_source') diff --git a/ultralytics/data/__pycache__/__init__.cpython-310.pyc b/ultralytics/data/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4363c9def4b1e081d76fabe5afdb66c5f40cde23 Binary files /dev/null and b/ultralytics/data/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/data/__pycache__/__init__.cpython-39.pyc b/ultralytics/data/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68e42d109fedfaa19fe6971aea5c08f0cae5538d Binary files /dev/null and b/ultralytics/data/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/data/__pycache__/augment.cpython-310.pyc b/ultralytics/data/__pycache__/augment.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc0c8b510fa60d04b21719b37b298f25025ac309 Binary files /dev/null and b/ultralytics/data/__pycache__/augment.cpython-310.pyc differ diff --git 
a/ultralytics/data/__pycache__/augment.cpython-39.pyc b/ultralytics/data/__pycache__/augment.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aaeb874f7a65e6db2de3baf021b3d3f1a10a0512 Binary files /dev/null and b/ultralytics/data/__pycache__/augment.cpython-39.pyc differ diff --git a/ultralytics/data/__pycache__/base.cpython-310.pyc b/ultralytics/data/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c6159ec4618fd080b63c76e57f05a486d138beb Binary files /dev/null and b/ultralytics/data/__pycache__/base.cpython-310.pyc differ diff --git a/ultralytics/data/__pycache__/base.cpython-39.pyc b/ultralytics/data/__pycache__/base.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6903f5103443fb9325bf2b4a8b1407d2e25a462 Binary files /dev/null and b/ultralytics/data/__pycache__/base.cpython-39.pyc differ diff --git a/ultralytics/data/__pycache__/build.cpython-310.pyc b/ultralytics/data/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14c67246e0acc28926a825e7abc09c50010b7f33 Binary files /dev/null and b/ultralytics/data/__pycache__/build.cpython-310.pyc differ diff --git a/ultralytics/data/__pycache__/build.cpython-39.pyc b/ultralytics/data/__pycache__/build.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa5ca2e24bdaa89866da16e4700a1e6533c1e755 Binary files /dev/null and b/ultralytics/data/__pycache__/build.cpython-39.pyc differ diff --git a/ultralytics/data/__pycache__/dataset.cpython-310.pyc b/ultralytics/data/__pycache__/dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3b8612064b3ce922e194dc71e2ae5fe65b37a2e Binary files /dev/null and b/ultralytics/data/__pycache__/dataset.cpython-310.pyc differ diff --git a/ultralytics/data/__pycache__/dataset.cpython-39.pyc b/ultralytics/data/__pycache__/dataset.cpython-39.pyc new file 
def auto_annotate(data, det_model='yolov8x.pt', sam_model='sam_b.pt', device='', output_dir=None):
    """
    Automatically annotates images using a YOLO object detection model and a SAM segmentation model.

    For every detection result, the detected boxes are passed to the SAM model as prompts and the resulting
    normalized segment polygons are written to '<output_dir>/<image stem>.txt', one line per segment in the form
    '<class_id> x1 y1 x2 y2 ...'. Images with no detections produce no label file, and empty segments are skipped.

    Args:
        data (str): Path to a folder containing images to be annotated.
        det_model (str, optional): Pre-trained YOLO detection model. Defaults to 'yolov8x.pt'.
        sam_model (str, optional): Pre-trained SAM segmentation model. Defaults to 'sam_b.pt'.
        device (str, optional): Device to run the models on. Defaults to an empty string (CPU or GPU, if available).
        output_dir (str | None | optional): Directory to save the annotated results.
            Defaults to a 'labels' folder in the same directory as 'data'.
    """
    det_model = YOLO(det_model)
    sam_model = SAM(sam_model)

    # Resolve the output directory once as a Path so label paths can be built without string concatenation.
    output_dir = Path(output_dir) if output_dir else Path(str(data)).parent / 'labels'
    output_dir.mkdir(exist_ok=True, parents=True)

    det_results = det_model(data, stream=True, device=device)

    for result in det_results:
        class_ids = result.boxes.cls.int().tolist()  # noqa
        if not class_ids:
            continue  # nothing detected: no SAM call and no label file for this image

        boxes = result.boxes.xyxy  # Boxes object for bbox outputs
        sam_results = sam_model(result.orig_img, bboxes=boxes, verbose=False, save=False, device=device)
        segments = sam_results[0].masks.xyn  # noqa

        label_file = output_dir / f'{Path(result.path).stem}.txt'
        with open(label_file, 'w') as f:
            for i, s in enumerate(segments):
                if len(s) == 0:  # len() rather than truthiness: a multi-element array has no single truth value
                    continue
                coords = map(str, s.reshape(-1).tolist())
                f.write(f'{class_ids[i]} ' + ' '.join(coords) + '\n')
# TODO: we might need a BaseTransform to make all these augments be compatible with both classification and semantic
class BaseTransform:
    """
    Skeleton transform whose three hooks are no-ops until a subclass overrides them.

    Calling an instance runs the image, instance and semantic hooks in that order on the same 'labels' object.
    The hooks' return values are ignored, so overrides are expected to modify 'labels' in place.
    """

    def __init__(self) -> None:
        pass

    def apply_image(self, labels):
        """Applies image transformation to labels."""
        pass

    def apply_instances(self, labels):
        """Applies transformations to input 'labels' and returns object instances."""
        pass

    def apply_semantic(self, labels):
        """Applies semantic segmentation to an image."""
        pass

    def __call__(self, labels):
        """
        Applies label transformations to an image, instances and semantic masks.

        Returns:
            The same 'labels' object that was passed in, so the transform can be chained inside Compose
            (which forwards each transform's return value to the next one).
        """
        self.apply_image(labels)
        self.apply_instances(labels)
        self.apply_semantic(labels)
        return labels


class Compose:
    """Chains several transforms and applies them in order, feeding each output to the next transform."""

    def __init__(self, transforms):
        """Initializes the Compose object with a list of transforms."""
        self.transforms = transforms

    def __call__(self, data):
        """Applies a series of transformations to input data and returns the final result."""
        for t in self.transforms:
            data = t(data)
        return data

    def append(self, transform):
        """Appends a new transform to the existing list of transforms."""
        self.transforms.append(transform)

    def tolist(self):
        """Returns the underlying list of transforms (the stored object itself, not a copy)."""
        return self.transforms

    def __repr__(self):
        """Return string representation of object, with one transform per line."""
        body = ''.join(f'\n    {t}' for t in self.transforms)
        return f'{self.__class__.__name__}({body}\n)'
class Mosaic(BaseMixTransform):
    """
    Mosaic augmentation.

    This class performs mosaic augmentation by combining multiple (4 or 9) images into a single mosaic image.
    The augmentation is applied to a dataset with a given probability.

    Attributes:
        dataset: The dataset on which the mosaic augmentation is applied.
        imgsz (int, optional): Image size (height and width) after mosaic pipeline of a single image. Default to 640.
        p (float, optional): Probability of applying the mosaic augmentation. Must be in the range 0-1. Default to 1.0.
        n (int, optional): The grid size, either 4 (for 2x2) or 9 (for 3x3).
        border (tuple): Two equal negative offsets of '-imgsz // 2', stored in the output labels as 'mosaic_border'.
    """

    def __init__(self, dataset, imgsz=640, p=1.0, n=4):
        """Initializes the object with a dataset, image size, probability, and border."""
        # NOTE: validation uses assert statements, so these checks are skipped when Python runs with -O.
        assert 0 <= p <= 1.0, f'The probability should be in range [0, 1], but got {p}.'
        assert n in (4, 9), 'grid must be equal to 4 or 9.'
        super().__init__(dataset=dataset, p=p)
        self.dataset = dataset
        self.imgsz = imgsz
        self.border = (-imgsz // 2, -imgsz // 2)  # width, height
        self.n = n

    def get_indexes(self, buffer=True):
        """
        Return 'n - 1' random dataset indexes for the extra images of the mosaic.

        Args:
            buffer (bool): If True, sample (with replacement) from 'dataset.buffer'; otherwise draw uniformly from
                the whole dataset index range.
        """
        if buffer:  # select images from buffer
            return random.choices(list(self.dataset.buffer), k=self.n - 1)
        else:  # select any images
            return [random.randint(0, len(self.dataset) - 1) for _ in range(self.n - 1)]

    def _mix_transform(self, labels):
        """Apply the 2x2 or 3x3 mosaic (depending on 'n') to the input image and labels."""
        assert labels.get('rect_shape', None) is None, 'rect and mosaic are mutually exclusive.'
        assert len(labels.get('mix_labels', [])), 'There are no other images for mosaic augment.'
        return self._mosaic4(labels) if self.n == 4 else self._mosaic9(labels)

    def _mosaic4(self, labels):
        """
        Create a 2x2 image mosaic.

        The four images ('labels' plus its three 'mix_labels') are pasted around a random center onto a
        (2 * imgsz, 2 * imgsz) canvas filled with 114; each patch's labels are shifted by its paste offset.
        """
        mosaic_labels = []
        s = self.imgsz
        # Both border entries are -imgsz // 2, so each coordinate is drawn from roughly [0.5 * s, 1.5 * s].
        yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.border)  # mosaic center y, x
        for i in range(4):
            labels_patch = labels if i == 0 else labels['mix_labels'][i - 1]
            # Load image
            img = labels_patch['img']
            h, w = labels_patch.pop('resized_shape')

            # Place img in img4
            # '*a' coordinates index the canvas (img4); '*b' coordinates index the source patch (img).
            if i == 0:  # top left
                img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
            elif i == 1:  # top right
                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
            elif i == 2:  # bottom left
                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
            elif i == 3:  # bottom right
                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

            img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
            # Offset of the patch origin inside the canvas; used to shift this patch's labels.
            padw = x1a - x1b
            padh = y1a - y1b

            labels_patch = self._update_labels(labels_patch, padw, padh)
            mosaic_labels.append(labels_patch)
        final_labels = self._cat_labels(mosaic_labels)
        final_labels['img'] = img4
        return final_labels

    def _mosaic9(self, labels):
        """
        Create a 3x3 image mosaic.

        The nine images ('labels' plus its eight 'mix_labels') are placed around the first (center) image on a
        (3 * imgsz, 3 * imgsz) canvas filled with 114; the canvas is then cropped by 'imgsz // 2' on every side.
        """
        mosaic_labels = []
        s = self.imgsz
        hp, wp = -1, -1  # height, width previous
        for i in range(9):
            labels_patch = labels if i == 0 else labels['mix_labels'][i - 1]
            # Load image
            img = labels_patch['img']
            h, w = labels_patch.pop('resized_shape')

            # Place img in img9
            # h0/w0 are the center image's size; hp/wp are the previously placed image's size.
            if i == 0:  # center
                img9 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8)  # base image with 9 tiles
                h0, w0 = h, w
                c = s, s, s + w, s + h  # xmin, ymin, xmax, ymax (base) coordinates
            elif i == 1:  # top
                c = s, s - h, s + w, s
            elif i == 2:  # top right
                c = s + wp, s - h, s + wp + w, s
            elif i == 3:  # right
                c = s + w0, s, s + w0 + w, s + h
            elif i == 4:  # bottom right
                c = s + w0, s + hp, s + w0 + w, s + hp + h
            elif i == 5:  # bottom
                c = s + w0 - w, s + h0, s + w0, s + h0 + h
            elif i == 6:  # bottom left
                c = s + w0 - wp - w, s + h0, s + w0 - wp, s + h0 + h
            elif i == 7:  # left
                c = s - w, s + h0 - h, s, s + h0
            elif i == 8:  # top left
                c = s - w, s + h0 - hp - h, s, s + h0 - hp

            padw, padh = c[:2]
            x1, y1, x2, y2 = (max(x, 0) for x in c)  # allocate coords

            # Image
            img9[y1:y2, x1:x2] = img[y1 - padh:, x1 - padw:]  # img9[ymin:ymax, xmin:xmax]
            hp, wp = h, w  # height, width previous for next iteration

            # Labels assuming imgsz*2 mosaic size
            # The (negative) border is added so label coordinates refer to the cropped canvas returned below.
            labels_patch = self._update_labels(labels_patch, padw + self.border[0], padh + self.border[1])
            mosaic_labels.append(labels_patch)
        final_labels = self._cat_labels(mosaic_labels)

        # Crop -border pixels from every side (rows use border[0], columns border[1]; both entries are equal).
        final_labels['img'] = img9[-self.border[0]:self.border[0], -self.border[1]:self.border[1]]
        return final_labels

    @staticmethod
    def _update_labels(labels, padw, padh):
        """Convert the patch's instances to xyxy, denormalize them by the patch image size and shift by the padding."""
        nh, nw = labels['img'].shape[:2]
        labels['instances'].convert_bbox(format='xyxy')
        labels['instances'].denormalize(nw, nh)
        labels['instances'].add_padding(padw, padh)
        return labels

    def _cat_labels(self, mosaic_labels):
        """
        Return labels with mosaic border instances clipped.

        Concatenates the classes and instances of all patches, clips instances to the (2 * imgsz, 2 * imgsz)
        mosaic frame and drops zero-area boxes. 'im_file' and 'ori_shape' are taken from the first patch.
        Returns an empty dict when 'mosaic_labels' is empty.
        """
        if len(mosaic_labels) == 0:
            return {}
        cls = []
        instances = []
        imgsz = self.imgsz * 2  # mosaic imgsz
        for labels in mosaic_labels:
            cls.append(labels['cls'])
            instances.append(labels['instances'])
        final_labels = {
            'im_file': mosaic_labels[0]['im_file'],
            'ori_shape': mosaic_labels[0]['ori_shape'],
            'resized_shape': (imgsz, imgsz),
            'cls': np.concatenate(cls, 0),
            'instances': Instances.concatenate(instances, axis=0),
            'mosaic_border': self.border}  # final_labels
        final_labels['instances'].clip(imgsz, imgsz)
        # presumably a keep-mask/index for the surviving boxes; it is used to filter 'cls' the same way
        good = final_labels['instances'].remove_zero_area_boxes()
        final_labels['cls'] = final_labels['cls'][good]
        return final_labels
degrees + self.translate = translate + self.scale = scale + self.shear = shear + self.perspective = perspective + # Mosaic border + self.border = border + self.pre_transform = pre_transform + + def affine_transform(self, img, border): + """Center.""" + C = np.eye(3, dtype=np.float32) + + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = -img.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3, dtype=np.float32) + P[2, 0] = random.uniform(-self.perspective, self.perspective) # x perspective (about y) + P[2, 1] = random.uniform(-self.perspective, self.perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3, dtype=np.float32) + a = random.uniform(-self.degrees, self.degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - self.scale, 1 + self.scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3, dtype=np.float32) + S[0, 1] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-self.shear, self.shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3, dtype=np.float32) + T[0, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[0] # x translation (pixels) + T[1, 2] = random.uniform(0.5 - self.translate, 0.5 + self.translate) * self.size[1] # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT + # Affine image + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if self.perspective: + img = cv2.warpPerspective(img, M, dsize=self.size, borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=self.size, borderValue=(114, 114, 114)) + return img, M, s + + def apply_bboxes(self, bboxes, M): + """ + Apply affine to bboxes only. 
+ + Args: + bboxes (ndarray): list of bboxes, xyxy format, with shape (num_bboxes, 4). + M (ndarray): affine matrix. + + Returns: + new_bboxes (ndarray): bboxes after affine, [num_bboxes, 4]. + """ + n = len(bboxes) + if n == 0: + return bboxes + + xy = np.ones((n * 4, 3), dtype=bboxes.dtype) + xy[:, :2] = bboxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + xy = (xy[:, :2] / xy[:, 2:3] if self.perspective else xy[:, :2]).reshape(n, 8) # perspective rescale or affine + + # Create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + return np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1)), dtype=bboxes.dtype).reshape(4, n).T + + def apply_segments(self, segments, M): + """ + Apply affine to segments and generate new bboxes from segments. + + Args: + segments (ndarray): list of segments, [num_samples, 500, 2]. + M (ndarray): affine matrix. + + Returns: + new_segments (ndarray): list of segments after affine, [num_samples, 500, 2]. + new_bboxes (ndarray): bboxes after affine, [N, 4]. + """ + n, num = segments.shape[:2] + if n == 0: + return [], segments + + xy = np.ones((n * num, 3), dtype=segments.dtype) + segments = segments.reshape(-1, 2) + xy[:, :2] = segments + xy = xy @ M.T # transform + xy = xy[:, :2] / xy[:, 2:3] + segments = xy.reshape(n, -1, 2) + bboxes = np.stack([segment2box(xy, self.size[0], self.size[1]) for xy in segments], 0) + return bboxes, segments + + def apply_keypoints(self, keypoints, M): + """ + Apply affine to keypoints. + + Args: + keypoints (ndarray): keypoints, [N, 17, 3]. + M (ndarray): affine matrix. + + Return: + new_keypoints (ndarray): keypoints after affine, [N, 17, 3]. 
+ """ + n, nkpt = keypoints.shape[:2] + if n == 0: + return keypoints + xy = np.ones((n * nkpt, 3), dtype=keypoints.dtype) + visible = keypoints[..., 2].reshape(n * nkpt, 1) + xy[:, :2] = keypoints[..., :2].reshape(n * nkpt, 2) + xy = xy @ M.T # transform + xy = xy[:, :2] / xy[:, 2:3] # perspective rescale or affine + out_mask = (xy[:, 0] < 0) | (xy[:, 1] < 0) | (xy[:, 0] > self.size[0]) | (xy[:, 1] > self.size[1]) + visible[out_mask] = 0 + return np.concatenate([xy, visible], axis=-1).reshape(n, nkpt, 3) + + def __call__(self, labels): + """ + Affine images and targets. + + Args: + labels (dict): a dict of `bboxes`, `segments`, `keypoints`. + """ + if self.pre_transform and 'mosaic_border' not in labels: + labels = self.pre_transform(labels) + labels.pop('ratio_pad', None) # do not need ratio pad + + img = labels['img'] + cls = labels['cls'] + instances = labels.pop('instances') + # Make sure the coord formats are right + instances.convert_bbox(format='xyxy') + instances.denormalize(*img.shape[:2][::-1]) + + border = labels.pop('mosaic_border', self.border) + self.size = img.shape[1] + border[1] * 2, img.shape[0] + border[0] * 2 # w, h + # M is affine matrix + # scale for func:`box_candidates` + img, M, scale = self.affine_transform(img, border) + + bboxes = self.apply_bboxes(instances.bboxes, M) + + segments = instances.segments + keypoints = instances.keypoints + # Update bboxes if there are segments. 
+ if len(segments): + bboxes, segments = self.apply_segments(segments, M) + + if keypoints is not None: + keypoints = self.apply_keypoints(keypoints, M) + new_instances = Instances(bboxes, segments, keypoints, bbox_format='xyxy', normalized=False) + # Clip + new_instances.clip(*self.size) + + # Filter instances + instances.scale(scale_w=scale, scale_h=scale, bbox_only=True) + # Make the bboxes have the same scale with new_bboxes + i = self.box_candidates(box1=instances.bboxes.T, + box2=new_instances.bboxes.T, + area_thr=0.01 if len(segments) else 0.10) + labels['instances'] = new_instances[i] + labels['cls'] = cls[i] + labels['img'] = img + labels['resized_shape'] = img.shape[:2] + return labels + + def box_candidates(self, box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16): # box1(4,n), box2(4,n) + # Compute box candidates: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr) # candidates + + +class RandomHSV: + + def __init__(self, hgain=0.5, sgain=0.5, vgain=0.5) -> None: + self.hgain = hgain + self.sgain = sgain + self.vgain = vgain + + def __call__(self, labels): + """Applies random horizontal or vertical flip to an image with a given probability.""" + img = labels['img'] + if self.hgain or self.sgain or self.vgain: + r = np.random.uniform(-1, 1, 3) * [self.hgain, self.sgain, self.vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=r.dtype) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), 
cv2.LUT(val, lut_val))) + cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + return labels + + +class RandomFlip: + + def __init__(self, p=0.5, direction='horizontal', flip_idx=None) -> None: + assert direction in ['horizontal', 'vertical'], f'Support direction `horizontal` or `vertical`, got {direction}' + assert 0 <= p <= 1.0 + + self.p = p + self.direction = direction + self.flip_idx = flip_idx + + def __call__(self, labels): + """Resize image and padding for detection, instance segmentation, pose.""" + img = labels['img'] + instances = labels.pop('instances') + instances.convert_bbox(format='xywh') + h, w = img.shape[:2] + h = 1 if instances.normalized else h + w = 1 if instances.normalized else w + + # Flip up-down + if self.direction == 'vertical' and random.random() < self.p: + img = np.flipud(img) + instances.flipud(h) + if self.direction == 'horizontal' and random.random() < self.p: + img = np.fliplr(img) + instances.fliplr(w) + # For keypoints + if self.flip_idx is not None and instances.keypoints is not None: + instances.keypoints = np.ascontiguousarray(instances.keypoints[:, self.flip_idx, :]) + labels['img'] = np.ascontiguousarray(img) + labels['instances'] = instances + return labels + + +class LetterBox: + """Resize image and padding for detection, instance segmentation, pose.""" + + def __init__(self, new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, center=True, stride=32): + """Initialize LetterBox object with specific parameters.""" + self.new_shape = new_shape + self.auto = auto + self.scaleFill = scaleFill + self.scaleup = scaleup + self.stride = stride + self.center = center # Put the image in the middle or top-left + + def __call__(self, labels=None, image=None): + """Return updated labels and image with added border.""" + if labels is None: + labels = {} + img = labels.get('img') if image is None else image + shape = img.shape[:2] # current shape [height, width] + new_shape = labels.pop('rect_shape', 
self.new_shape) + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not self.scaleup: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if self.auto: # minimum rectangle + dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride) # wh padding + elif self.scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + if self.center: + dw /= 2 # divide padding into 2 sides + dh /= 2 + if labels.get('ratio_pad'): + labels['ratio_pad'] = (labels['ratio_pad'], (dw, dh)) # for evaluation + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1)) + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, + value=(114, 114, 114)) # add border + + if len(labels): + labels = self._update_labels(labels, ratio, dw, dh) + labels['img'] = img + labels['resized_shape'] = new_shape + return labels + else: + return img + + def _update_labels(self, labels, ratio, padw, padh): + """Update labels.""" + labels['instances'].convert_bbox(format='xyxy') + labels['instances'].denormalize(*labels['img'].shape[:2][::-1]) + labels['instances'].scale(*ratio) + labels['instances'].add_padding(padw, padh) + return labels + + +class CopyPaste: + + def __init__(self, p=0.5) -> None: + self.p = p + + def __call__(self, labels): + """Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, 
xyxy).""" + im = labels['img'] + cls = labels['cls'] + h, w = im.shape[:2] + instances = labels.pop('instances') + instances.convert_bbox(format='xyxy') + instances.denormalize(w, h) + if self.p and len(instances.segments): + n = len(instances) + _, w, _ = im.shape # height, width, channels + im_new = np.zeros(im.shape, np.uint8) + + # Calculate ioa first then select indexes randomly + ins_flip = deepcopy(instances) + ins_flip.fliplr(w) + + ioa = bbox_ioa(ins_flip.bboxes, instances.bboxes) # intersection over area, (N, M) + indexes = np.nonzero((ioa < 0.30).all(1))[0] # (N, ) + n = len(indexes) + for j in random.sample(list(indexes), k=round(self.p * n)): + cls = np.concatenate((cls, cls[[j]]), axis=0) + instances = Instances.concatenate((instances, ins_flip[[j]]), axis=0) + cv2.drawContours(im_new, instances.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED) + + result = cv2.flip(im, 1) # augment segments (flip left-right) + i = cv2.flip(im_new, 1).astype(bool) + im[i] = result[i] # cv2.imwrite('debug.jpg', im) # debug + + labels['img'] = im + labels['cls'] = cls + labels['instances'] = instances + return labels + + +class Albumentations: + """YOLOv8 Albumentations class (optional, only used if package is installed)""" + + def __init__(self, p=1.0): + """Initialize the transform object for YOLO bbox formatted params.""" + self.p = p + self.transform = None + prefix = colorstr('albumentations: ') + try: + import albumentations as A + + check_version(A.__version__, '1.0.3', hard=True) # version requirement + + T = [ + A.Blur(p=0.01), + A.MedianBlur(p=0.01), + A.ToGray(p=0.01), + A.CLAHE(p=0.01), + A.RandomBrightnessContrast(p=0.0), + A.RandomGamma(p=0.0), + A.ImageCompression(quality_lower=75, p=0.0)] # transforms + self.transform = A.Compose(T, bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels'])) + + LOGGER.info(prefix + ', '.join(f'{x}'.replace('always_apply=False, ', '') for x in T if x.p)) + except ImportError: # package not installed, 
skip + pass + except Exception as e: + LOGGER.info(f'{prefix}{e}') + + def __call__(self, labels): + """Generates object detections and returns a dictionary with detection results.""" + im = labels['img'] + cls = labels['cls'] + if len(cls): + labels['instances'].convert_bbox('xywh') + labels['instances'].normalize(*im.shape[:2][::-1]) + bboxes = labels['instances'].bboxes + # TODO: add supports of segments and keypoints + if self.transform and random.random() < self.p: + new = self.transform(image=im, bboxes=bboxes, class_labels=cls) # transformed + if len(new['class_labels']) > 0: # skip update if no bbox in new im + labels['img'] = new['image'] + labels['cls'] = np.array(new['class_labels']) + bboxes = np.array(new['bboxes'], dtype=np.float32) + labels['instances'].update(bboxes=bboxes) + return labels + + +# TODO: technically this is not an augmentation, maybe we should put this to another files +class Format: + + def __init__(self, + bbox_format='xywh', + normalize=True, + return_mask=False, + return_keypoint=False, + mask_ratio=4, + mask_overlap=True, + batch_idx=True): + self.bbox_format = bbox_format + self.normalize = normalize + self.return_mask = return_mask # set False when training detection only + self.return_keypoint = return_keypoint + self.mask_ratio = mask_ratio + self.mask_overlap = mask_overlap + self.batch_idx = batch_idx # keep the batch indexes + + def __call__(self, labels): + """Return formatted image, classes, bounding boxes & keypoints to be used by 'collate_fn'.""" + img = labels.pop('img') + h, w = img.shape[:2] + cls = labels.pop('cls') + instances = labels.pop('instances') + instances.convert_bbox(format=self.bbox_format) + instances.denormalize(w, h) + nl = len(instances) + + if self.return_mask: + if nl: + masks, instances, cls = self._format_segments(instances, cls, w, h) + masks = torch.from_numpy(masks) + else: + masks = torch.zeros(1 if self.mask_overlap else nl, img.shape[0] // self.mask_ratio, + img.shape[1] // 
self.mask_ratio) + labels['masks'] = masks + if self.normalize: + instances.normalize(w, h) + labels['img'] = self._format_img(img) + labels['cls'] = torch.from_numpy(cls) if nl else torch.zeros(nl) + labels['bboxes'] = torch.from_numpy(instances.bboxes) if nl else torch.zeros((nl, 4)) + if self.return_keypoint: + labels['keypoints'] = torch.from_numpy(instances.keypoints) + # Then we can use collate_fn + if self.batch_idx: + labels['batch_idx'] = torch.zeros(nl) + return labels + + def _format_img(self, img): + """Format the image for YOLOv5 from Numpy array to PyTorch tensor.""" + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)[::-1]) + img = torch.from_numpy(img) + return img + + def _format_segments(self, instances, cls, w, h): + """convert polygon points to bitmap.""" + segments = instances.segments + if self.mask_overlap: + masks, sorted_idx = polygons2masks_overlap((h, w), segments, downsample_ratio=self.mask_ratio) + masks = masks[None] # (640, 640) -> (1, 640, 640) + instances = instances[sorted_idx] + cls = cls[sorted_idx] + else: + masks = polygons2masks((h, w), segments, color=1, downsample_ratio=self.mask_ratio) + + return masks, instances, cls + + +def v8_transforms(dataset, imgsz, hyp, stretch=False): + """Convert images to a size suitable for YOLOv8 training.""" + pre_transform = Compose([ + Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic), + CopyPaste(p=hyp.copy_paste), + RandomPerspective( + degrees=hyp.degrees, + translate=hyp.translate, + scale=hyp.scale, + shear=hyp.shear, + perspective=hyp.perspective, + pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)), + )]) + flip_idx = dataset.data.get('flip_idx', []) # for keypoints augmentation + if dataset.use_keypoints: + kpt_shape = dataset.data.get('kpt_shape', None) + if len(flip_idx) == 0 and hyp.fliplr > 0.0: + hyp.fliplr = 0.0 + LOGGER.warning("WARNING ⚠️ No 'flip_idx' array defined in data.yaml, setting augmentation 
'fliplr=0.0'") + elif flip_idx and (len(flip_idx) != kpt_shape[0]): + raise ValueError(f'data.yaml flip_idx={flip_idx} length must be equal to kpt_shape[0]={kpt_shape[0]}') + + return Compose([ + pre_transform, + MixUp(dataset, pre_transform=pre_transform, p=hyp.mixup), + Albumentations(p=1.0), + RandomHSV(hgain=hyp.hsv_h, sgain=hyp.hsv_s, vgain=hyp.hsv_v), + RandomFlip(direction='vertical', p=hyp.flipud), + RandomFlip(direction='horizontal', p=hyp.fliplr, flip_idx=flip_idx)]) # transforms + + +# Classification augmentations ----------------------------------------------------------------------------------------- +def classify_transforms(size=224, mean=(0.0, 0.0, 0.0), std=(1.0, 1.0, 1.0)): # IMAGENET_MEAN, IMAGENET_STD + # Transforms to apply if albumentations not installed + if not isinstance(size, int): + raise TypeError(f'classify_transforms() size {size} must be integer, not (list, tuple)') + if any(mean) or any(std): + return T.Compose([CenterCrop(size), ToTensor(), T.Normalize(mean, std, inplace=True)]) + else: + return T.Compose([CenterCrop(size), ToTensor()]) + + +def hsv2colorjitter(h, s, v): + """Map HSV (hue, saturation, value) jitter into ColorJitter values (brightness, contrast, saturation, hue)""" + return v, v, s, h + + +def classify_albumentations( + augment=True, + size=224, + scale=(0.08, 1.0), + hflip=0.5, + vflip=0.0, + hsv_h=0.015, # image HSV-Hue augmentation (fraction) + hsv_s=0.7, # image HSV-Saturation augmentation (fraction) + hsv_v=0.4, # image HSV-Value augmentation (fraction) + mean=(0.0, 0.0, 0.0), # IMAGENET_MEAN + std=(1.0, 1.0, 1.0), # IMAGENET_STD + auto_aug=False, +): + """YOLOv8 classification Albumentations (optional, only used if package is installed).""" + prefix = colorstr('albumentations: ') + try: + import albumentations as A + from albumentations.pytorch import ToTensorV2 + + check_version(A.__version__, '1.0.3', hard=True) # version requirement + if augment: # Resize and crop + T = [A.RandomResizedCrop(height=size, 
width=size, scale=scale)] + if auto_aug: + # TODO: implement AugMix, AutoAug & RandAug in albumentations + LOGGER.info(f'{prefix}auto augmentations are currently not supported') + else: + if hflip > 0: + T += [A.HorizontalFlip(p=hflip)] + if vflip > 0: + T += [A.VerticalFlip(p=vflip)] + if any((hsv_h, hsv_s, hsv_v)): + T += [A.ColorJitter(*hsv2colorjitter(hsv_h, hsv_s, hsv_v))] # brightness, contrast, saturation, hue + else: # Use fixed crop for eval set (reproducibility) + T = [A.SmallestMaxSize(max_size=size), A.CenterCrop(height=size, width=size)] + T += [A.Normalize(mean=mean, std=std), ToTensorV2()] # Normalize and convert to Tensor + LOGGER.info(prefix + ', '.join(f'{x}'.replace('always_apply=False, ', '') for x in T if x.p)) + return A.Compose(T) + + except ImportError: # package not installed, skip + pass + except Exception as e: + LOGGER.info(f'{prefix}{e}') + + +class ClassifyLetterBox: + """YOLOv8 LetterBox class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()])""" + + def __init__(self, size=(640, 640), auto=False, stride=32): + """Resizes image and crops it to center with max dimensions 'h' and 'w'.""" + super().__init__() + self.h, self.w = (size, size) if isinstance(size, int) else size + self.auto = auto # pass max size integer, automatically solve for short side using stride + self.stride = stride # used with auto + + def __call__(self, im): # im = np.array HWC + imh, imw = im.shape[:2] + r = min(self.h / imh, self.w / imw) # ratio of new/old + h, w = round(imh * r), round(imw * r) # resized image + hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w)) if self.auto else self.h, self.w + top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1) + im_out = np.full((self.h, self.w, 3), 114, dtype=im.dtype) + im_out[top:top + h, left:left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR) + return im_out + + +class CenterCrop: + """YOLOv8 CenterCrop class for image preprocessing, i.e. 
T.Compose([CenterCrop(size), ToTensor()])""" + + def __init__(self, size=640): + """Converts an image from numpy array to PyTorch tensor.""" + super().__init__() + self.h, self.w = (size, size) if isinstance(size, int) else size + + def __call__(self, im): # im = np.array HWC + imh, imw = im.shape[:2] + m = min(imh, imw) # min dimension + top, left = (imh - m) // 2, (imw - m) // 2 + return cv2.resize(im[top:top + m, left:left + m], (self.w, self.h), interpolation=cv2.INTER_LINEAR) + + +class ToTensor: + """YOLOv8 ToTensor class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()]).""" + + def __init__(self, half=False): + """Initialize YOLOv8 ToTensor object with optional half-precision support.""" + super().__init__() + self.half = half + + def __call__(self, im): # im = np.array HWC in BGR order + im = np.ascontiguousarray(im.transpose((2, 0, 1))[::-1]) # HWC to CHW -> BGR to RGB -> contiguous + im = torch.from_numpy(im) # to torch + im = im.half() if self.half else im.float() # uint8 to fp16/32 + im /= 255.0 # 0-255 to 0.0-1.0 + return im diff --git a/ultralytics/data/base.py b/ultralytics/data/base.py new file mode 100644 index 0000000000000000000000000000000000000000..3705e907749eeee0ee5fd529b9cf5a3a140d8a3c --- /dev/null +++ b/ultralytics/data/base.py @@ -0,0 +1,287 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import glob +import math +import os +import random +from copy import deepcopy +from multiprocessing.pool import ThreadPool +from pathlib import Path +from typing import Optional + +import cv2 +import numpy as np +import psutil +from torch.utils.data import Dataset +from tqdm import tqdm + +from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM_BAR_FORMAT + +from .utils import HELP_URL, IMG_FORMATS + + +class BaseDataset(Dataset): + """ + Base dataset class for loading and processing image data. + + Args: + img_path (str): Path to the folder containing images. + imgsz (int, optional): Image size. 
Defaults to 640. + cache (bool, optional): Cache images to RAM or disk during training. Defaults to False. + augment (bool, optional): If True, data augmentation is applied. Defaults to True. + hyp (dict, optional): Hyperparameters to apply data augmentation. Defaults to None. + prefix (str, optional): Prefix to print in log messages. Defaults to ''. + rect (bool, optional): If True, rectangular training is used. Defaults to False. + batch_size (int, optional): Size of batches. Defaults to None. + stride (int, optional): Stride. Defaults to 32. + pad (float, optional): Padding. Defaults to 0.0. + single_cls (bool, optional): If True, single class training is used. Defaults to False. + classes (list): List of included classes. Default is None. + fraction (float): Fraction of dataset to utilize. Default is 1.0 (use all data). + + Attributes: + im_files (list): List of image file paths. + labels (list): List of label data dictionaries. + ni (int): Number of images in the dataset. + ims (list): List of loaded images. + npy_files (list): List of numpy file paths. + transforms (callable): Image transformation function. 
+ """ + + def __init__(self, + img_path, + imgsz=640, + cache=False, + augment=True, + hyp=DEFAULT_CFG, + prefix='', + rect=False, + batch_size=16, + stride=32, + pad=0.5, + single_cls=False, + classes=None, + fraction=1.0): + super().__init__() + self.img_path = img_path + self.imgsz = imgsz + self.augment = augment + self.single_cls = single_cls + self.prefix = prefix + self.fraction = fraction + self.im_files = self.get_img_files(self.img_path) + self.labels = self.get_labels() + self.update_labels(include_class=classes) # single_cls and include_class + self.ni = len(self.labels) # number of images + self.rect = rect + self.batch_size = batch_size + self.stride = stride + self.pad = pad + if self.rect: + assert self.batch_size is not None + self.set_rectangle() + + # Buffer thread for mosaic images + self.buffer = [] # buffer size = batch size + self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0 + + # Cache stuff + if cache == 'ram' and not self.check_cache_ram(): + cache = False + self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni + self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files] + if cache: + self.cache_images(cache) + + # Transforms + self.transforms = self.build_transforms(hyp=hyp) + + def get_img_files(self, img_path): + """Read image files.""" + try: + f = [] # image files + for p in img_path if isinstance(img_path, list) else [img_path]: + p = Path(p) # os-agnostic + if p.is_dir(): # dir + f += glob.glob(str(p / '**' / '*.*'), recursive=True) + # F = list(p.rglob('*.*')) # pathlib + elif p.is_file(): # file + with open(p) as t: + t = t.read().strip().splitlines() + parent = str(p.parent) + os.sep + f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + # F += [p.parent / x.lstrip(os.sep) for x in t] # local to global path (pathlib) + else: + raise FileNotFoundError(f'{self.prefix}{p} does not exist') + im_files = 
sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS) + # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS]) # pathlib + assert im_files, f'{self.prefix}No images found' + except Exception as e: + raise FileNotFoundError(f'{self.prefix}Error loading data from {img_path}\n{HELP_URL}') from e + if self.fraction < 1: + im_files = im_files[:round(len(im_files) * self.fraction)] + return im_files + + def update_labels(self, include_class: Optional[list]): + """include_class, filter labels to include only these classes (optional).""" + include_class_array = np.array(include_class).reshape(1, -1) + for i in range(len(self.labels)): + if include_class is not None: + cls = self.labels[i]['cls'] + bboxes = self.labels[i]['bboxes'] + segments = self.labels[i]['segments'] + keypoints = self.labels[i]['keypoints'] + j = (cls == include_class_array).any(1) + self.labels[i]['cls'] = cls[j] + self.labels[i]['bboxes'] = bboxes[j] + if segments: + self.labels[i]['segments'] = [segments[si] for si, idx in enumerate(j) if idx] + if keypoints is not None: + self.labels[i]['keypoints'] = keypoints[j] + if self.single_cls: + self.labels[i]['cls'][:, 0] = 0 + + def load_image(self, i): + """Loads 1 image from dataset index 'i', returns (im, resized hw).""" + im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i] + if im is None: # not cached in RAM + if fn.exists(): # load npy + im = np.load(fn) + else: # read image + im = cv2.imread(f) # BGR + if im is None: + raise FileNotFoundError(f'Image Not Found {f}') + h0, w0 = im.shape[:2] # orig hw + r = self.imgsz / max(h0, w0) # ratio + if r != 1: # if sizes are not equal + interp = cv2.INTER_LINEAR if (self.augment or r > 1) else cv2.INTER_AREA + im = cv2.resize(im, (min(math.ceil(w0 * r), self.imgsz), min(math.ceil(h0 * r), self.imgsz)), + interpolation=interp) + + # Add to buffer if training with augmentations + if self.augment: + self.ims[i], self.im_hw0[i], self.im_hw[i] = 
im, (h0, w0), im.shape[:2] # im, hw_original, hw_resized + self.buffer.append(i) + if len(self.buffer) >= self.max_buffer_length: + j = self.buffer.pop(0) + self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None + + return im, (h0, w0), im.shape[:2] + + return self.ims[i], self.im_hw0[i], self.im_hw[i] + + def cache_images(self, cache): + """Cache images to memory or disk.""" + b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes + fcn = self.cache_images_to_disk if cache == 'disk' else self.load_image + with ThreadPool(NUM_THREADS) as pool: + results = pool.imap(fcn, range(self.ni)) + pbar = tqdm(enumerate(results), total=self.ni, bar_format=TQDM_BAR_FORMAT, disable=LOCAL_RANK > 0) + for i, x in pbar: + if cache == 'disk': + b += self.npy_files[i].stat().st_size + else: # 'ram' + self.ims[i], self.im_hw0[i], self.im_hw[i] = x # im, hw_orig, hw_resized = load_image(self, i) + b += self.ims[i].nbytes + pbar.desc = f'{self.prefix}Caching images ({b / gb:.1f}GB {cache})' + pbar.close() + + def cache_images_to_disk(self, i): + """Saves an image as an *.npy file for faster loading.""" + f = self.npy_files[i] + if not f.exists(): + np.save(f.as_posix(), cv2.imread(self.im_files[i])) + + def check_cache_ram(self, safety_margin=0.5): + """Check image caching requirements vs available memory.""" + b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes + n = min(self.ni, 30) # extrapolate from 30 random images + for _ in range(n): + im = cv2.imread(random.choice(self.im_files)) # sample image + ratio = self.imgsz / max(im.shape[0], im.shape[1]) # max(h, w) # ratio + b += im.nbytes * ratio ** 2 + mem_required = b * self.ni / n * (1 + safety_margin) # GB required to cache dataset into RAM + mem = psutil.virtual_memory() + cache = mem_required < mem.available # to cache or not to cache, that is the question + if not cache: + LOGGER.info(f'{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images ' + f'with {int(safety_margin * 100)}% 
def set_rectangle(self):
    """Sorts images by aspect ratio and computes one padded, stride-aligned shape per batch."""
    batch_index = np.floor(np.arange(self.ni) / self.batch_size).astype(int)  # batch each image falls into
    num_batches = batch_index[-1] + 1

    hw = np.array([lb.pop('shape') for lb in self.labels])  # 'shape' is removed from every label here
    aspect = hw[:, 0] / hw[:, 1]  # first column over second column
    order = aspect.argsort()
    self.im_files = [self.im_files[k] for k in order]
    self.labels = [self.labels[k] for k in order]
    aspect = aspect[order]

    def unit_shape(ratios):
        """Relative shape for one batch; stays [1, 1] unless all ratios lie on one side of 1."""
        lo, hi = ratios.min(), ratios.max()
        if hi < 1:
            return [hi, 1]
        if lo > 1:
            return [1, 1 / lo]
        return [1, 1]

    shapes = [unit_shape(aspect[batch_index == b]) for b in range(num_batches)]

    # Scale to imgsz, add padding (in stride units), round up to a whole number of strides
    self.batch_shapes = np.ceil(np.array(shapes) * self.imgsz / self.stride + self.pad).astype(int) * self.stride
    self.batch = batch_index  # batch index of image
build_transforms(self, hyp=None): + """Users can custom augmentations here + like: + if self.augment: + # Training transforms + return Compose([]) + else: + # Val transforms + return Compose([]) + """ + raise NotImplementedError + + def get_labels(self): + """Users can custom their own format here. + Make sure your output is a list with each element like below: + dict( + im_file=im_file, + shape=shape, # format: (height, width) + cls=cls, + bboxes=bboxes, # xywh + segments=segments, # xy + keypoints=keypoints, # xy + normalized=True, # or False + bbox_format="xyxy", # or xywh, ltwh + ) + """ + raise NotImplementedError diff --git a/ultralytics/data/build.py b/ultralytics/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..a8d9299eee9f514fc00f7f912df347f0a6092e2d --- /dev/null +++ b/ultralytics/data/build.py @@ -0,0 +1,170 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import os +import random +from pathlib import Path + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import dataloader, distributed + +from ultralytics.data.loaders import (LOADERS, LoadImages, LoadPilAndNumpy, LoadScreenshots, LoadStreams, LoadTensor, + SourceTypes, autocast_list) +from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS +from ultralytics.utils import RANK, colorstr +from ultralytics.utils.checks import check_file + +from .dataset import YOLODataset +from .utils import PIN_MEMORY + + +class InfiniteDataLoader(dataloader.DataLoader): + """Dataloader that reuses workers. 
def seed_worker(worker_id):  # noqa
    """Seed NumPy and Python RNGs from torch's seed https://pytorch.org/docs/stable/notes/randomness.html#dataloader."""
    seed32 = torch.initial_seed() & 0xFFFFFFFF  # low 32 bits, identical to `% 2 ** 32`
    for reseed in (np.random.seed, random.seed):
        reseed(seed32)
def check_source(source):
    """
    Check source type and return corresponding flag values.

    Args:
        source (str | int | Path | list | tuple | PIL.Image.Image | np.ndarray | torch.Tensor | loader instance):
            The inference source to classify.

    Returns:
        (tuple): (source, webcam, screenshot, from_img, in_memory, tensor). For str/int/Path inputs `source` is
            returned as a str (or as the result of check_file() for URLs that point to an image/video file);
            lists and tuples are passed through autocast_list().

    Raises:
        TypeError: If the source type is not one of the supported kinds.
    """
    webcam, screenshot, from_img, in_memory, tensor = False, False, False, False, False
    if isinstance(source, (str, int, Path)):  # int for local usb camera
        source = str(source)
        # Compare the extension case-insensitively, like the file loaders do, so that URLs ending in
        # e.g. '.JPG' or '.MP4' are treated as downloadable files rather than live streams
        is_file = Path(source).suffix[1:].lower() in (IMG_FORMATS + VID_FORMATS)
        is_url = source.lower().startswith(('https://', 'http://', 'rtsp://', 'rtmp://'))
        webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
        screenshot = source.lower() == 'screen'
        if is_url and is_file:
            source = check_file(source)  # download
    elif isinstance(source, tuple(LOADERS)):
        in_memory = True
    elif isinstance(source, (list, tuple)):
        source = autocast_list(source)  # convert all list elements to PIL or np arrays
        from_img = True
    elif isinstance(source, (Image.Image, np.ndarray)):
        from_img = True
    elif isinstance(source, torch.Tensor):
        tensor = True
    else:
        raise TypeError('Unsupported image type. For supported types see https://docs.ultralytics.com/modes/predict')

    return source, webcam, screenshot, from_img, in_memory, tensor
+ + Returns: + (list): A list of 91 class IDs where the index represents the 80-index class ID and the value is the + corresponding 91-index class ID. + + """ + return [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, None, 24, 25, None, + None, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, None, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, None, 60, None, None, 61, None, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + None, 73, 74, 75, 76, 77, 78, 79, None] + + +def convert_coco(labels_dir='../coco/annotations/', use_segments=False, use_keypoints=False, cls91to80=True): + """Converts COCO dataset annotations to a format suitable for training YOLOv5 models. + + Args: + labels_dir (str, optional): Path to directory containing COCO dataset annotation files. + use_segments (bool, optional): Whether to include segmentation masks in the output. + use_keypoints (bool, optional): Whether to include keypoint annotations in the output. + cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs. + + Raises: + FileNotFoundError: If the labels_dir path does not exist. + + Example Usage: + convert_coco(labels_dir='../coco/annotations/', use_segments=True, use_keypoints=True, cls91to80=True) + + Output: + Generates output files in the specified output directory. 
+ """ + + save_dir = make_dirs('yolo_labels') # output directory + coco80 = coco91_to_coco80_class() + + # Import json + for json_file in sorted(Path(labels_dir).resolve().glob('*.json')): + fn = Path(save_dir) / 'labels' / json_file.stem.replace('instances_', '') # folder name + fn.mkdir(parents=True, exist_ok=True) + with open(json_file) as f: + data = json.load(f) + + # Create image dict + images = {f'{x["id"]:d}': x for x in data['images']} + # Create image-annotations dict + imgToAnns = defaultdict(list) + for ann in data['annotations']: + imgToAnns[ann['image_id']].append(ann) + + # Write labels file + for img_id, anns in tqdm(imgToAnns.items(), desc=f'Annotations {json_file}'): + img = images[f'{img_id:d}'] + h, w, f = img['height'], img['width'], img['file_name'] + + bboxes = [] + segments = [] + keypoints = [] + for ann in anns: + if ann['iscrowd']: + continue + # The COCO box format is [top left x, top left y, width, height] + box = np.array(ann['bbox'], dtype=np.float64) + box[:2] += box[2:] / 2 # xy top-left corner to center + box[[0, 2]] /= w # normalize x + box[[1, 3]] /= h # normalize y + if box[2] <= 0 or box[3] <= 0: # if w <= 0 and h <= 0 + continue + + cls = coco80[ann['category_id'] - 1] if cls91to80 else ann['category_id'] - 1 # class + box = [cls] + box.tolist() + if box not in bboxes: + bboxes.append(box) + if use_segments and ann.get('segmentation') is not None: + if len(ann['segmentation']) == 0: + segments.append([]) + continue + if isinstance(ann['segmentation'], dict): + ann['segmentation'] = rle2polygon(ann['segmentation']) + if len(ann['segmentation']) > 1: + s = merge_multi_segment(ann['segmentation']) + s = (np.concatenate(s, axis=0) / np.array([w, h])).reshape(-1).tolist() + else: + s = [j for i in ann['segmentation'] for j in i] # all segments concatenated + s = (np.array(s).reshape(-1, 2) / np.array([w, h])).reshape(-1).tolist() + s = [cls] + s + if s not in segments: + segments.append(s) + if use_keypoints and 
def rle2polygon(segmentation):
    """
    Decode a Run-Length Encoding (RLE) mask and return its external contours as simplified polygons.

    Args:
        segmentation (dict, list): RLE mask representation of the object segmentation.

    Returns:
        (list): One flattened coordinate list per external contour.

    Note:
        Requires the 'pycocotools' package to be installed.
    """
    check_requirements('pycocotools')
    from pycocotools import mask as coco_mask

    binary = coco_mask.decode(segmentation)
    binary[binary > 0] = 255  # raise every non-zero pixel to 255 before contour extraction
    outlines, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_KCOS)

    def simplify(outline):
        """Approximate a closed contour with a tolerance of 0.1% of its perimeter."""
        tolerance = 0.001 * cv2.arcLength(outline, True)
        return cv2.approxPolyDP(outline, tolerance, True).flatten().tolist()

    return [simplify(outline) for outline in outlines]
def delete_dsstore(path='../datasets'):
    """
    Delete Apple .DS_Store files in the specified directory and its subdirectories.

    Args:
        path (str | Path, optional): Root directory that is searched recursively.

    Note:
        The list of files that will be removed is printed before deletion.
    """
    from pathlib import Path

    # The real file name is '.DS_Store'; the previous '.DS_store' pattern matched nothing on
    # case-sensitive filesystems. The character class still accepts the old spelling.
    files = [f for f in Path(path).rglob('.DS_[Ss]tore') if f.is_file()]
    print(files)
    for f in files:
        f.unlink()
+ """ + cache_version = '1.0.2' # dataset labels *.cache version, >= 1.0.0 for YOLOv8 + rand_interp_methods = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4] + + def __init__(self, *args, data=None, use_segments=False, use_keypoints=False, **kwargs): + self.use_segments = use_segments + self.use_keypoints = use_keypoints + self.data = data + assert not (self.use_segments and self.use_keypoints), 'Can not use both segments and keypoints.' + super().__init__(*args, **kwargs) + + def cache_labels(self, path=Path('./labels.cache')): + """Cache dataset labels, check images and read shapes. + Args: + path (Path): path where to save the cache file (default: Path('./labels.cache')). + Returns: + (dict): labels. + """ + x = {'labels': []} + nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages + desc = f'{self.prefix}Scanning {path.parent / path.stem}...' + total = len(self.im_files) + nkpt, ndim = self.data.get('kpt_shape', (0, 0)) + if self.use_keypoints and (nkpt <= 0 or ndim not in (2, 3)): + raise ValueError("'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of " + "keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 
def get_labels(self):
    """
    Returns the list of label dicts for YOLO training, loading them from the *.cache file when it is valid.

    The cache is rebuilt with cache_labels() when the file is missing or its version/hash do not match.
    As a side effect self.label_files is set and self.im_files is replaced by the image files found in the cache.

    Returns:
        (list): Label dicts taken from cache['labels'].

    Raises:
        FileNotFoundError: If the cache reports zero found labels.
        ValueError: If every label is empty.
    """
    self.label_files = img2label_paths(self.im_files)
    cache_path = Path(self.label_files[0]).parent.with_suffix('.cache')
    try:
        import gc
        gc.disable()  # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
        try:
            cache, exists = np.load(str(cache_path), allow_pickle=True).item(), True  # load dict
        finally:
            # Always turn the collector back on: np.load() raises on every first run (no cache file yet),
            # which previously left garbage collection disabled for the rest of the process
            gc.enable()
        assert cache['version'] == self.cache_version  # matches current version
        assert cache['hash'] == get_hash(self.label_files + self.im_files)  # identical hash
    except (FileNotFoundError, AssertionError, AttributeError):
        cache, exists = self.cache_labels(cache_path), False  # run cache ops

    # Display cache
    nf, nm, ne, nc, n = cache.pop('results')  # found, missing, empty, corrupt, total
    if exists and LOCAL_RANK in (-1, 0):
        d = f'Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt'
        tqdm(None, desc=self.prefix + d, total=n, initial=n, bar_format=TQDM_BAR_FORMAT)  # display cache results
        if cache['msgs']:
            LOGGER.info('\n'.join(cache['msgs']))  # display warnings
    if nf == 0:  # number of labels found
        raise FileNotFoundError(f'{self.prefix}No labels found in {cache_path}, can not start training. {HELP_URL}')

    # Read cache
    [cache.pop(k) for k in ('hash', 'version', 'msgs')]  # remove items
    labels = cache['labels']
    self.im_files = [lb['im_file'] for lb in labels]  # update im_files

    # Check if the dataset is all boxes or all segments
    lengths = ((len(lb['cls']), len(lb['bboxes']), len(lb['segments'])) for lb in labels)
    len_cls, len_boxes, len_segments = (sum(x) for x in zip(*lengths))
    if len_segments and len_boxes != len_segments:
        LOGGER.warning(
            f'WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = {len_segments}, '
            f'len(boxes) = {len_boxes}. To resolve this only boxes will be used and all segments will be removed. '
            'To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset.')
        for lb in labels:
            lb['segments'] = []
    if len_cls == 0:
        raise ValueError(f'All labels empty in {cache_path}, can not start training without labels. {HELP_URL}')
    return labels
@staticmethod
def collate_fn(batch):
    """Merges per-sample dicts into one batch dict: stacks 'img', concatenates targets, offsets 'batch_idx'."""
    concat_keys = ('masks', 'keypoints', 'bboxes', 'cls')
    names = batch[0].keys()  # key names come from the first sample; values are grouped by position
    columns = list(zip(*(list(sample.values()) for sample in batch)))
    collated = {}
    for pos, name in enumerate(names):
        column = columns[pos]
        if name == 'img':
            column = torch.stack(column, 0)
        elif name in concat_keys:
            column = torch.cat(column, 0)
        collated[name] = column

    per_image = list(collated['batch_idx'])
    for image_i in range(len(per_image)):
        per_image[image_i] += image_i  # add target image index for build_targets()
    collated['batch_idx'] = torch.cat(per_image, 0)
    return collated
def __getitem__(self, i):
    """
    Returns the transformed image and class index for sample i.

    Args:
        i (int): Index into self.samples.

    Returns:
        (dict): {'img': transformed image, 'cls': class index}.
    """
    f, j, fn, im = self.samples[i]  # filename, index, filename.with_suffix('.npy'), image
    if self.cache_ram:
        # Two separate ifs on purpose: with a combined `cache_ram and im is None` test, an already cached
        # image fell through to the final else branch and was re-read from disk on every access
        if im is None:
            im = self.samples[i][3] = cv2.imread(f)
    elif self.cache_disk:
        if not fn.exists():  # create the npy file on first access
            np.save(fn.as_posix(), cv2.imread(f))
        im = np.load(fn)
    else:  # read image
        im = cv2.imread(f)  # BGR
    if self.album_transforms:
        sample = self.album_transforms(image=cv2.cvtColor(im, cv2.COLOR_BGR2RGB))['image']
    else:
        sample = self.torch_transforms(im)
    return {'img': sample, 'cls': j}
def __init__(self, sources='file.streams', imgsz=640, vid_stride=1):
    """
    Open every stream, read its first frame and start one daemon reader thread per stream.

    Args:
        sources (str): A single stream source, or the path of a text file listing one source per
            whitespace-separated entry.
        imgsz (int): Stored as self.imgsz.
        vid_stride (int): Video frame-rate stride; the reader thread keeps every vid_stride-th frame.

    Raises:
        NotImplementedError: If source 0 (local webcam) is requested in Colab or Kaggle.
        ConnectionError: If a stream cannot be opened or its first frame cannot be read.
    """
    torch.backends.cudnn.benchmark = True  # faster for fixed-size inference
    self.mode = 'stream'
    self.imgsz = imgsz
    self.vid_stride = vid_stride  # video frame-rate stride
    sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
    n = len(sources)
    self.sources = [ops.clean_str(x) for x in sources]  # clean source names for later
    # One independent frame buffer per stream. `[[]] * n` would alias a single list n times, so every
    # stream would append to and pop from the same buffer.
    self.imgs = [[] for _ in range(n)]
    self.fps, self.frames, self.threads, self.shape = [0] * n, [0] * n, [None] * n, [None] * n
    for i, s in enumerate(sources):  # index, source
        # Start thread to read frames from video stream
        st = f'{i + 1}/{n}: {s}... '
        if urlparse(s).hostname in ('www.youtube.com', 'youtube.com', 'youtu.be'):  # if source is YouTube video
            # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/Zgi9g1ksQHc'
            s = get_best_youtube_url(s)
        s = int(s) if s.isnumeric() else s  # i.e. s = '0' local webcam; int() instead of eval() on user input
        if s == 0 and (is_colab() or is_kaggle()):
            raise NotImplementedError("'source=0' webcam not supported in Colab and Kaggle notebooks. "
                                      "Try running 'source=0' in a local environment.")
        cap = cv2.VideoCapture(s)
        if not cap.isOpened():
            raise ConnectionError(f'{st}Failed to open {s}')
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)  # warning: may return 0 or nan
        self.frames[i] = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float('inf')  # infinite stream fallback
        self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30  # 30 FPS fallback

        success, im = cap.read()  # guarantee first frame
        if not success or im is None:
            raise ConnectionError(f'{st}Failed to read images from {s}')
        self.imgs[i].append(im)
        self.shape[i] = im.shape
        self.threads[i] = Thread(target=self.update, args=([i, cap, s]), daemon=True)
        LOGGER.info(f'{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)')
        self.threads[i].start()
    LOGGER.info('')  # newline

    # Batch size equals the number of streams
    self.bs = self.__len__()
until the buffer is empty + + def __iter__(self): + """Iterates through YOLO image feed and re-opens unresponsive streams.""" + self.count = -1 + return self + + def __next__(self): + """Returns source paths, transformed and original images for processing.""" + self.count += 1 + + # Wait until a frame is available in each buffer + while not all(self.imgs): + if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord('q'): # q to quit + cv2.destroyAllWindows() + raise StopIteration + time.sleep(1 / min(self.fps)) + + # Get and remove the next frame from imgs buffer + return self.sources, [x.pop(0) for x in self.imgs], None, '' + + def __len__(self): + """Return the length of the sources object.""" + return len(self.sources) # 1E12 frames = 32 streams at 30 FPS for 30 years + + +class LoadScreenshots: + """YOLOv8 screenshot dataloader, i.e. `yolo predict source=screen`.""" + + def __init__(self, source, imgsz=640): + """source = [screen_number left top width height] (pixels).""" + check_requirements('mss') + import mss # noqa + + source, *params = source.split() + self.screen, left, top, width, height = 0, None, None, None, None # default to full screen 0 + if len(params) == 1: + self.screen = int(params[0]) + elif len(params) == 4: + left, top, width, height = (int(x) for x in params) + elif len(params) == 5: + self.screen, left, top, width, height = (int(x) for x in params) + self.imgsz = imgsz + self.mode = 'stream' + self.frame = 0 + self.sct = mss.mss() + self.bs = 1 + + # Parse monitor shape + monitor = self.sct.monitors[self.screen] + self.top = monitor['top'] if top is None else (monitor['top'] + top) + self.left = monitor['left'] if left is None else (monitor['left'] + left) + self.width = width or monitor['width'] + self.height = height or monitor['height'] + self.monitor = {'left': self.left, 'top': self.top, 'width': self.width, 'height': self.height} + + def __iter__(self): + """Returns an iterator of the object.""" + return self + + def 
class LoadImages:
    """YOLOv8 image/video dataloader, i.e. `yolo predict source=image.jpg/vid.mp4`."""

    def __init__(self, path, imgsz=640, vid_stride=1):
        """
        Collect image and video files from `path` and open the first video, if any.

        Args:
            path (str | Path | list | tuple): A file, a directory, a glob pattern, a list/tuple of those, or a
                `*.txt` file holding one such entry per line.
            imgsz (int): Stored on the instance; not otherwise used in this class.
            vid_stride (int): Number of frames grabbed per returned video frame.

        Raises:
            FileNotFoundError: If an entry does not exist or no supported image/video file is found.
        """
        parent = None
        if isinstance(path, str) and Path(path).suffix == '.txt':  # *.txt file with img/vid/dir on each line
            parent = Path(path).parent
            # One entry per line: splitting on every whitespace would cut paths that contain spaces.
            # Blank lines are dropped so they cannot resolve to the current working directory.
            path = [line.strip() for line in Path(path).read_text().splitlines() if line.strip()]
        files = []
        p = path  # keeps `p` bound for the error message below when there is nothing to iterate
        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
            a = str(Path(p).absolute())  # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
            if '*' in a:
                files.extend(sorted(glob.glob(a, recursive=True)))  # glob
            elif os.path.isdir(a):
                files.extend(sorted(glob.glob(os.path.join(a, '*.*'))))  # dir
            elif os.path.isfile(a):
                files.append(a)  # files (absolute or relative to CWD)
            elif parent and (parent / p).is_file():
                files.append(str((parent / p).absolute()))  # files (relative to *.txt file parent)
            else:
                raise FileNotFoundError(f'{p} does not exist')

        images = [x for x in files if x.split('.')[-1].lower() in IMG_FORMATS]
        videos = [x for x in files if x.split('.')[-1].lower() in VID_FORMATS]
        ni, nv = len(images), len(videos)

        self.imgsz = imgsz
        self.files = images + videos  # all images first, then all videos
        self.nf = ni + nv  # number of files
        self.video_flag = [False] * ni + [True] * nv
        self.mode = 'image'
        self.vid_stride = vid_stride  # video frame-rate stride
        self.bs = 1
        if any(videos):
            self.orientation = None  # rotation degrees
            self._new_video(videos[0])  # new video
        else:
            self.cap = None
        if self.nf == 0:
            raise FileNotFoundError(f'No images or videos found in {p}. '
                                    f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}')

    def __iter__(self):
        """Reset the file counter and return the loader itself as the iterator."""
        self.count = 0
        return self

    def __next__(self):
        """
        Return `([path], [image], capture, description)` for the next image or video frame.

        Raises:
            StopIteration: When every file has been consumed.
            FileNotFoundError: If an image file cannot be decoded.
        """
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        if self.video_flag[self.count]:
            # Read video
            self.mode = 'video'
            for _ in range(self.vid_stride):
                self.cap.grab()
            success, im0 = self.cap.retrieve()
            while not success:  # current video exhausted, move on to the next file
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                path = self.files[self.count]
                self._new_video(path)
                success, im0 = self.cap.read()

            self.frame += 1
            # im0 = self._cv2_rotate(im0)  # for use if cv2 autorotation is False
            s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '

        else:
            # Read image
            self.count += 1
            im0 = cv2.imread(path)  # BGR
            if im0 is None:
                raise FileNotFoundError(f'Image Not Found {path}')
            s = f'image {self.count}/{self.nf} {path}: '

        return [path], [im0], self.cap, s

    def _new_video(self, path):
        """Open `path` as the current video and reset the frame counters."""
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
        if hasattr(cv2, 'CAP_PROP_ORIENTATION_META'):  # cv2<4.6.0 compatibility
            self.orientation = int(self.cap.get(cv2.CAP_PROP_ORIENTATION_META))  # rotation degrees
            # Disable auto-orientation due to known issues in https://github.com/ultralytics/yolov5/issues/8493
            # self.cap.set(cv2.CAP_PROP_ORIENTATION_AUTO, 0)

    def _cv2_rotate(self, im):
        """Rotate a cv2 video manually."""
        # NOTE(review): the orientation -> rotation mapping below looks unusual (0 rotates by 90 degrees);
        # the only call site is commented out in `__next__`, confirm the mapping before enabling it.
        if self.orientation == 0:
            return cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)
        elif self.orientation == 180:
            return cv2.rotate(im, cv2.ROTATE_90_COUNTERCLOCKWISE)
        elif self.orientation == 90:
            return cv2.rotate(im, cv2.ROTATE_180)
        return im

    def __len__(self):
        """Returns the number of files in the object."""
        return self.nf  # number of files
def get_best_youtube_url(url, use_pafy=True):
    """
    Retrieves the URL of the best quality MP4 video stream from a given YouTube video.

    This function uses the pafy or yt_dlp library to extract the video info from YouTube. With yt_dlp it returns the
    best listed MP4 format that has a video codec but no audio codec.

    Args:
        url (str): The URL of the YouTube video.
        use_pafy (bool): Use the pafy package, default=True, otherwise use yt_dlp package.

    Returns:
        (str): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
    """
    if use_pafy:
        check_requirements(('pafy', 'youtube_dl==2020.12.2'))
        import pafy  # noqa
        return pafy.new(url).getbest(preftype='mp4').url

    check_requirements('yt-dlp')
    import yt_dlp
    with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
        info_dict = ydl.extract_info(url, download=False)  # extract info
    # yt-dlp documents `formats` as ordered from worst to best, so walk it backwards to really get the best
    # match. `or []` covers a missing or None 'formats' entry instead of failing on iteration.
    for f in reversed(info_dict.get('formats') or []):
        # .get(): entries lacking one of these keys are skipped rather than raising KeyError
        if f.get('vcodec', 'none') != 'none' and f.get('acodec') == 'none' and f.get('ext') == 'mp4':
            return f.get('url')
    return None
+ +python - < 9) & (shape[1] > 9), f'image size {shape} <10 pixels' + assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}' + if im.format.lower() in ('jpg', 'jpeg'): + with open(im_file, 'rb') as f: + f.seek(-2, 2) + if f.read() != b'\xff\xd9': # corrupt JPEG + ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100) + msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved' + + # Verify labels + if os.path.isfile(lb_file): + nf = 1 # label found + with open(lb_file) as f: + lb = [x.split() for x in f.read().strip().splitlines() if len(x)] + if any(len(x) > 6 for x in lb) and (not keypoint): # is segment + classes = np.array([x[0] for x in lb], dtype=np.float32) + segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb] # (cls, xy1...) + lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh) + lb = np.array(lb, dtype=np.float32) + nl = len(lb) + if nl: + if keypoint: + assert lb.shape[1] == (5 + nkpt * ndim), f'labels require {(5 + nkpt * ndim)} columns each' + assert (lb[:, 5::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + assert (lb[:, 6::ndim] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + else: + assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected' + assert (lb[:, 1:] <= 1).all(), \ + f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}' + assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}' + # All labels + max_cls = int(lb[:, 0].max()) # max label count + assert max_cls <= num_cls, \ + f'Label class {max_cls} exceeds dataset class count {num_cls}. 
' \ + f'Possible class labels are 0-{num_cls - 1}' + _, i = np.unique(lb, axis=0, return_index=True) + if len(i) < nl: # duplicate row check + lb = lb[i] # remove duplicates + if segments: + segments = [segments[x] for x in i] + msg = f'{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed' + else: + ne = 1 # label empty + lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros( + (0, 5), dtype=np.float32) + else: + nm = 1 # label missing + lb = np.zeros((0, (5 + nkpt * ndim)), dtype=np.float32) if keypoint else np.zeros((0, 5), dtype=np.float32) + if keypoint: + keypoints = lb[:, 5:].reshape(-1, nkpt, ndim) + if ndim == 2: + kpt_mask = np.ones(keypoints.shape[:2], dtype=np.float32) + kpt_mask = np.where(keypoints[..., 0] < 0, 0.0, kpt_mask) + kpt_mask = np.where(keypoints[..., 1] < 0, 0.0, kpt_mask) + keypoints = np.concatenate([keypoints, kpt_mask[..., None]], axis=-1) # (nl, nkpt, 3) + lb = lb[:, :5] + return im_file, lb, shape, segments, keypoints, nm, nf, ne, nc, msg + except Exception as e: + nc = 1 + msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}' + return [None, None, None, None, None, nm, nf, ne, nc, msg] + + +def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1): + """ + Args: + imgsz (tuple): The image size. + polygons (list[np.ndarray]): [N, M], N is the number of polygons, M is the number of points(Be divided by 2). + color (int): color + downsample_ratio (int): downsample ratio + """ + mask = np.zeros(imgsz, dtype=np.uint8) + polygons = np.asarray(polygons) + polygons = polygons.astype(np.int32) + shape = polygons.shape + polygons = polygons.reshape(shape[0], -1, 2) + cv2.fillPoly(mask, polygons, color=color) + nh, nw = (imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio) + # NOTE: fillPoly firstly then resize is trying the keep the same way + # of loss calculation when mask-ratio=1. 
def polygons2masks_overlap(imgsz, segments, downsample_ratio=1):
    """
    Rasterize all segments into one label mask in which smaller instances are painted over larger ones.

    Args:
        imgsz (tuple): The image size (height, width).
        segments (list[np.ndarray]): One polygon per instance; each is flattened before rasterization.
        downsample_ratio (int): The returned mask has shape (imgsz[0] // ratio, imgsz[1] // ratio).

    Returns:
        (tuple): `masks`, where 0 is background and k is the k-th instance in sorted order (uint8, or int32 when
            there are more than 255 segments), and `index`, the `np.argsort(-areas)` order used for the labels.
    """
    masks = np.zeros((imgsz[0] // downsample_ratio, imgsz[1] // downsample_ratio),
                     dtype=np.int32 if len(segments) > 255 else np.uint8)
    areas = []
    ms = []
    for segment in segments:
        mask = polygon2mask(imgsz, [segment.reshape(-1)], downsample_ratio=downsample_ratio, color=1)
        ms.append(mask)
        areas.append(mask.sum())
    areas = np.asarray(areas)
    index = np.argsort(-areas)
    ms = np.array(ms)[index]
    for i in range(len(segments)):
        # Direct assignment gives the same result as the former "add label, then clip to i + 1", but cannot wrap
        # around in uint8 on overlaps and never multiplies a uint8 mask by a label above 255.
        masks[ms[i] > 0] = i + 1
    return masks, index
'names' not in data and 'nc' not in data: + raise SyntaxError(emojis(f"{dataset} key missing ❌.\n either 'names' or 'nc' are required in all data YAMLs.")) + if 'names' in data and 'nc' in data and len(data['names']) != data['nc']: + raise SyntaxError(emojis(f"{dataset} 'names' length {len(data['names'])} and 'nc: {data['nc']}' must match.")) + if 'names' not in data: + data['names'] = [f'class_{i}' for i in range(data['nc'])] + else: + data['nc'] = len(data['names']) + + data['names'] = check_class_names(data['names']) + + # Resolve paths + path = Path(extract_dir or data.get('path') or Path(data.get('yaml_file', '')).parent) # dataset root + + if not path.is_absolute(): + path = (DATASETS_DIR / path).resolve() + data['path'] = path # download scripts + for k in 'train', 'val', 'test': + if data.get(k): # prepend path + if isinstance(data[k], str): + x = (path / data[k]).resolve() + if not x.exists() and data[k].startswith('../'): + x = (path / data[k][3:]).resolve() + data[k] = str(x) + else: + data[k] = [str((path / x).resolve()) for x in data[k]] + + # Parse yaml + train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download')) + if val: + val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])] # val path + if not all(x.exists() for x in val): + name = clean_url(dataset) # dataset name with URL auth stripped + m = f"\nDataset '{name}' images not found ⚠️, missing path '{[x for x in val if not x.exists()][0]}'" + if s and autodownload: + LOGGER.warning(m) + else: + m += f"\nNote dataset download directory is '{DATASETS_DIR}'. 
def check_cls_dataset(dataset: str, split=''):
    """
    Checks a classification dataset such as Imagenet.

    This function accepts a `dataset` name and attempts to retrieve the corresponding dataset information.
    If the dataset is not found locally, it attempts to download the dataset from the internet and save it locally.

    Args:
        dataset (str): The name of the dataset, or the path of an existing dataset directory.
        split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.

    Returns:
        (dict): A dictionary containing the following keys:
            - 'train' (Path): The directory path containing the training set of the dataset.
            - 'val' (Path): The validation directory, falling back to the test directory (None if both are missing).
            - 'test' (Path): The test directory, falling back to the validation directory (None if both are missing).
            - 'nc' (int): The number of classes in the dataset.
            - 'names' (dict): A dictionary of class names in the dataset.

    Raises:
        FileNotFoundError: If the 'train' directory does not exist when its class folders are listed.
        subprocess.CalledProcessError: If the ImageNet download script exits with a non-zero status.
    """

    dataset = Path(dataset)
    data_dir = (dataset if dataset.is_dir() else (DATASETS_DIR / dataset)).resolve()
    if not data_dir.is_dir():
        LOGGER.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
        t = time.time()
        if str(dataset) == 'imagenet':
            # Argument list without a shell: a script path that contains spaces stays one argument
            subprocess.run(['bash', str(ROOT / 'data/scripts/get_imagenet.sh')], check=True)
        else:
            url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip'
            download(url, dir=data_dir.parent)
        s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
        LOGGER.info(s)
    train_set = data_dir / 'train'
    val_set = data_dir / 'val' if (data_dir / 'val').exists() else None  # data/test or data/val
    test_set = data_dir / 'test' if (data_dir / 'test').exists() else None  # data/val or data/test
    if split == 'val' and not val_set:
        LOGGER.warning("WARNING ⚠️ Dataset 'split=val' not found, using 'split=test' instead.")
    elif split == 'test' and not test_set:
        LOGGER.warning("WARNING ⚠️ Dataset 'split=test' not found, using 'split=val' instead.")

    # Every sub-directory of train/ is one class; list the directory once and derive count and names from it
    class_dirs = sorted(x.name for x in train_set.iterdir() if x.is_dir())
    nc = len(class_dirs)  # number of classes
    names = dict(enumerate(class_dirs))  # class index -> class name
    return {'train': train_set, 'val': val_set or test_set, 'test': test_set or val_set, 'nc': nc, 'names': names}
+ + Usage + from ultralytics.data.utils import HUBDatasetStats + stats = HUBDatasetStats('/Users/glennjocher/Downloads/coco8.zip', task='detect') # detect dataset + stats = HUBDatasetStats('/Users/glennjocher/Downloads/coco8-seg.zip', task='segment') # segment dataset + stats = HUBDatasetStats('/Users/glennjocher/Downloads/coco8-pose.zip', task='pose') # pose dataset + stats.get_json(save=False) + stats.process_images() + """ + + def __init__(self, path='coco128.yaml', task='detect', autodownload=False): + """Initialize class.""" + LOGGER.info(f'Starting HUB dataset checks for {path}....') + zipped, data_dir, yaml_path = self._unzip(Path(path)) + try: + # data = yaml_load(check_yaml(yaml_path)) # data dict + data = check_det_dataset(yaml_path, autodownload) # data dict + if zipped: + data['path'] = data_dir + except Exception as e: + raise Exception('error/HUB/dataset_stats/yaml_load') from e + + self.hub_dir = Path(str(data['path']) + '-hub') + self.im_dir = self.hub_dir / 'images' + self.im_dir.mkdir(parents=True, exist_ok=True) # makes /images + self.stats = {'nc': len(data['names']), 'names': list(data['names'].values())} # statistics dictionary + self.data = data + self.task = task # detect, segment, pose, classify + + @staticmethod + def _find_yaml(dir): + """Return data.yaml file.""" + files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml')) # try root level first and then recursive + assert files, f'No *.yaml file found in {dir}' + if len(files) > 1: + files = [f for f in files if f.stem == dir.stem] # prefer *.yaml files that match dir name + assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed' + assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}' + return files[0] + + def _unzip(self, path): + """Unzip data.zip.""" + if not str(path).endswith('.zip'): # path is data.yaml + return False, None, path + unzip_dir = unzip_file(path, path=path.parent) + assert unzip_dir.is_dir(), 
f'Error unzipping {path}, {unzip_dir} not found. ' \ + f'path/to/abc.zip MUST unzip to path/to/abc/' + return True, str(unzip_dir), self._find_yaml(unzip_dir) # zipped, data_dir, yaml_path + + def _hub_ops(self, f): + """Saves a compressed image for HUB previews.""" + compress_one_image(f, self.im_dir / Path(f).name) # save to dataset-hub + + def get_json(self, save=False, verbose=False): + """Return dataset JSON for Ultralytics HUB.""" + from ultralytics.data import YOLODataset # ClassificationDataset + + def _round(labels): + """Update labels to integer class and 4 decimal place floats.""" + if self.task == 'detect': + coordinates = labels['bboxes'] + elif self.task == 'segment': + coordinates = [x.flatten() for x in labels['segments']] + elif self.task == 'pose': + n = labels['keypoints'].shape[0] + coordinates = np.concatenate((labels['bboxes'], labels['keypoints'].reshape(n, -1)), 1) + else: + raise ValueError('Undefined dataset task.') + zipped = zip(labels['cls'], coordinates) + return [[int(c), *(round(float(x), 4) for x in points)] for c, points in zipped] + + for split in 'train', 'val', 'test': + if self.data.get(split) is None: + self.stats[split] = None # i.e. 
def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
    """
    Compresses a single image file to reduced size while preserving its aspect ratio and quality using either the
    Python Imaging Library (PIL) or OpenCV library. If the input image is smaller than the maximum dimension, it will
    not be resized.

    PIL is tried first; on any PIL failure the image is re-read, resized and written with OpenCV instead.

    Args:
        f (str | Path): The path to the input image file.
        f_new (str | Path, optional): The path to the output image file. If not specified, the input file will be
            overwritten.
        max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
        quality (int, optional): The JPEG quality passed to PIL. Default is 50. The OpenCV fallback does not use it.

    Usage:
        from pathlib import Path
        from ultralytics.data.utils import compress_one_image
        for f in Path('/Users/glennjocher/Downloads/dataset').rglob('*.jpg'):
            compress_one_image(f)
    """
    try:  # use PIL
        with Image.open(f) as src:  # closes the source file handle even when resize/save raises
            r = max_dim / max(src.height, src.width)  # ratio
            im = src.resize((int(src.width * r), int(src.height * r))) if r < 1.0 else src  # shrink if too large
            im.save(f_new or f, 'JPEG', quality=quality, optimize=True)  # save
    except Exception as e:  # use OpenCV
        LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
        # str() mirrors the imwrite call below; pathlib.Path inputs (see Usage) may not be accepted by every
        # OpenCV build — TODO confirm the minimum supported version
        im = cv2.imread(str(f))
        im_height, im_width = im.shape[:2]
        r = max_dim / max(im_height, im_width)  # ratio
        if r < 1.0:  # image too large
            im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
        cv2.imwrite(str(f_new or f), im)
def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False):
    """
    Autosplit a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.

    The txt files are written next to the images directory (in `path.parent`). Existing autosplit files are removed
    first, and a split file is only created when at least one image is assigned to it.

    Args:
        path (Path, optional): Path to images directory. Defaults to DATASETS_DIR / 'coco128/images'.
        weights (list | tuple, optional): Train, validation, and test split fractions. Defaults to (0.9, 0.1, 0.0).
        annotated_only (bool, optional): If True, only images with an associated txt file are used. Defaults to False.

    Usage:
        from ultralytics.data.utils import autosplit
        autosplit()
    """

    path = Path(path)  # images dir
    files = sorted(x for x in path.rglob('*.*') if x.suffix[1:].lower() in IMG_FORMATS)  # image files only
    n = len(files)  # number of files
    random.seed(0)  # for reproducibility
    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split

    txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt']  # 3 txt files
    for x in txt:
        if (path.parent / x).exists():
            (path.parent / x).unlink()  # remove existing

    LOGGER.info(f'Autosplitting images from {path}' + ', using *.txt labeled images only' * annotated_only)
    entries = [[], [], []]  # lines collected per split, in file order
    for i, img in tqdm(zip(indices, files), total=n):
        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():  # check label
            entries[i].append(f'./{img.relative_to(path.parent).as_posix()}' + '\n')  # add image to split

    # Write each split once instead of reopening its file for every image
    for name, lines in zip(txt, entries):
        if lines:
            with open(path.parent / name, 'a') as f:
                f.writelines(lines)
0000000000000000000000000000000000000000..f5180be5395d2473e82b97160d642f6a5a6e967b Binary files /dev/null and b/ultralytics/engine/__pycache__/exporter.cpython-310.pyc differ diff --git a/ultralytics/engine/__pycache__/exporter.cpython-39.pyc b/ultralytics/engine/__pycache__/exporter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d26572a4fec925005d07a93fb7cc321076b4551 Binary files /dev/null and b/ultralytics/engine/__pycache__/exporter.cpython-39.pyc differ diff --git a/ultralytics/engine/__pycache__/model.cpython-310.pyc b/ultralytics/engine/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99725db1ea2c293bb7d1ca33194ba92519ba0b72 Binary files /dev/null and b/ultralytics/engine/__pycache__/model.cpython-310.pyc differ diff --git a/ultralytics/engine/__pycache__/model.cpython-39.pyc b/ultralytics/engine/__pycache__/model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b89f206db960189108787dd7154ea6b75c62172 Binary files /dev/null and b/ultralytics/engine/__pycache__/model.cpython-39.pyc differ diff --git a/ultralytics/engine/__pycache__/predictor.cpython-310.pyc b/ultralytics/engine/__pycache__/predictor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abe563ffa9e502fe58e837286a58c7cfc6d69fb4 Binary files /dev/null and b/ultralytics/engine/__pycache__/predictor.cpython-310.pyc differ diff --git a/ultralytics/engine/__pycache__/predictor.cpython-39.pyc b/ultralytics/engine/__pycache__/predictor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b506d99f164645226f2953c361176ab0cbd36538 Binary files /dev/null and b/ultralytics/engine/__pycache__/predictor.cpython-39.pyc differ diff --git a/ultralytics/engine/__pycache__/results.cpython-310.pyc b/ultralytics/engine/__pycache__/results.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..34bb3dd6b031aa3e5aaffbd4de741290ab3376a4 Binary files /dev/null and b/ultralytics/engine/__pycache__/results.cpython-310.pyc differ diff --git a/ultralytics/engine/__pycache__/results.cpython-39.pyc b/ultralytics/engine/__pycache__/results.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcc0d0680df062b220c144c70cfa2e796338d818 Binary files /dev/null and b/ultralytics/engine/__pycache__/results.cpython-39.pyc differ diff --git a/ultralytics/engine/__pycache__/trainer.cpython-310.pyc b/ultralytics/engine/__pycache__/trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a76c7fc7447f8e7764496c1248e4dc2bc1e1084f Binary files /dev/null and b/ultralytics/engine/__pycache__/trainer.cpython-310.pyc differ diff --git a/ultralytics/engine/__pycache__/trainer.cpython-39.pyc b/ultralytics/engine/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f83ca05cf2c8bffe32ab0eb48ebc5db8d0f1c38 Binary files /dev/null and b/ultralytics/engine/__pycache__/trainer.cpython-39.pyc differ diff --git a/ultralytics/engine/__pycache__/validator.cpython-310.pyc b/ultralytics/engine/__pycache__/validator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccce9db764ed3fefdfbde215ea36e40b39135c80 Binary files /dev/null and b/ultralytics/engine/__pycache__/validator.cpython-310.pyc differ diff --git a/ultralytics/engine/__pycache__/validator.cpython-39.pyc b/ultralytics/engine/__pycache__/validator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64d26b99e8843d61653961b2a7d626f56b7f3b9c Binary files /dev/null and b/ultralytics/engine/__pycache__/validator.cpython-39.pyc differ diff --git a/ultralytics/engine/exporter.py b/ultralytics/engine/exporter.py new file mode 100644 index 0000000000000000000000000000000000000000..dd5b091e34ad6d048eaed314493fbe6f9ec0c579 --- 
/dev/null +++ b/ultralytics/engine/exporter.py @@ -0,0 +1,969 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Export a YOLOv8 PyTorch model to other formats. TensorFlow exports authored by https://github.com/zldrobit + +Format | `format=argument` | Model +--- | --- | --- +PyTorch | - | yolov8n.pt +TorchScript | `torchscript` | yolov8n.torchscript +ONNX | `onnx` | yolov8n.onnx +OpenVINO | `openvino` | yolov8n_openvino_model/ +TensorRT | `engine` | yolov8n.engine +CoreML | `coreml` | yolov8n.mlmodel +TensorFlow SavedModel | `saved_model` | yolov8n_saved_model/ +TensorFlow GraphDef | `pb` | yolov8n.pb +TensorFlow Lite | `tflite` | yolov8n.tflite +TensorFlow Edge TPU | `edgetpu` | yolov8n_edgetpu.tflite +TensorFlow.js | `tfjs` | yolov8n_web_model/ +PaddlePaddle | `paddle` | yolov8n_paddle_model/ +ncnn | `ncnn` | yolov8n_ncnn_model/ + +Requirements: + $ pip install "ultralytics[export]" + +Python: + from ultralytics import YOLO + model = YOLO('yolov8n.pt') + results = model.export(format='onnx') + +CLI: + $ yolo mode=export model=yolov8n.pt format=onnx + +Inference: + $ yolo predict model=yolov8n.pt # PyTorch + yolov8n.torchscript # TorchScript + yolov8n.onnx # ONNX Runtime or OpenCV DNN with dnn=True + yolov8n_openvino_model # OpenVINO + yolov8n.engine # TensorRT + yolov8n.mlmodel # CoreML (macOS-only) + yolov8n_saved_model # TensorFlow SavedModel + yolov8n.pb # TensorFlow GraphDef + yolov8n.tflite # TensorFlow Lite + yolov8n_edgetpu.tflite # TensorFlow Edge TPU + yolov8n_paddle_model # PaddlePaddle + +TensorFlow.js: + $ cd .. 
&& git clone https://github.com/zldrobit/tfjs-yolov5-example.git && cd tfjs-yolov5-example + $ npm install + $ ln -s ../../yolov5/yolov8n_web_model public/yolov8n_web_model + $ npm start +""" +import json +import os +import shutil +import subprocess +import time +import warnings +from copy import deepcopy +from datetime import datetime +from pathlib import Path + +import torch + +from ultralytics.cfg import get_cfg +from ultralytics.nn.autobackend import check_class_names +from ultralytics.nn.modules import C2f, Detect, RTDETRDecoder +from ultralytics.nn.tasks import DetectionModel, SegmentationModel +from ultralytics.utils import (ARM64, DEFAULT_CFG, LINUX, LOGGER, MACOS, ROOT, WINDOWS, __version__, callbacks, + colorstr, get_default_args, yaml_save) +from ultralytics.utils.checks import check_imgsz, check_requirements, check_version +from ultralytics.utils.downloads import attempt_download_asset, get_github_assets +from ultralytics.utils.files import file_size, spaces_in_path +from ultralytics.utils.ops import Profile +from ultralytics.utils.torch_utils import get_latest_opset, select_device, smart_inference_mode + + +def export_formats(): + """YOLOv8 export formats.""" + import pandas + x = [ + ['PyTorch', '-', '.pt', True, True], + ['TorchScript', 'torchscript', '.torchscript', True, True], + ['ONNX', 'onnx', '.onnx', True, True], + ['OpenVINO', 'openvino', '_openvino_model', True, False], + ['TensorRT', 'engine', '.engine', False, True], + ['CoreML', 'coreml', '.mlmodel', True, False], + ['TensorFlow SavedModel', 'saved_model', '_saved_model', True, True], + ['TensorFlow GraphDef', 'pb', '.pb', True, True], + ['TensorFlow Lite', 'tflite', '.tflite', True, False], + ['TensorFlow Edge TPU', 'edgetpu', '_edgetpu.tflite', True, False], + ['TensorFlow.js', 'tfjs', '_web_model', True, False], + ['PaddlePaddle', 'paddle', '_paddle_model', True, True], + ['ncnn', 'ncnn', '_ncnn_model', True, True], ] + return pandas.DataFrame(x, columns=['Format', 'Argument', 
def try_export(inner_func):
    """
    YOLOv8 export decorator, i.e. @try_export.

    Wraps an export function so that its run is timed with `Profile` and the outcome is logged: on success the
    elapsed time, output path and file size are reported; on failure the elapsed time and the error are reported and
    the original exception is re-raised.

    Args:
        inner_func (callable): Export function returning a `(file, model)` tuple. It must declare a `prefix`
            parameter with a default value, which is used as the log prefix.

    Returns:
        (callable): Wrapper with the same call signature that returns the `(file, model)` tuple of `inner_func`.
    """
    import functools  # local import so this block does not depend on the file-level import list

    inner_args = get_default_args(inner_func)

    @functools.wraps(inner_func)  # keep the wrapped function's __name__/__doc__ instead of 'outer_func'
    def outer_func(*args, **kwargs):
        prefix = inner_args['prefix']
        try:
            with Profile() as dt:
                f, model = inner_func(*args, **kwargs)
            LOGGER.info(f"{prefix} export success ✅ {dt.t:.1f}s, saved as '{f}' ({file_size(f):.1f} MB)")
            return f, model
        except Exception as e:
            LOGGER.info(f'{prefix} export failure ❌ {dt.t:.1f}s: {e}')
            raise  # bare raise re-raises the active exception with its traceback unchanged

    return outer_func
+ """ + self.args = get_cfg(cfg, overrides) + self.callbacks = _callbacks or callbacks.get_default_callbacks() + callbacks.add_integration_callbacks(self) + + @smart_inference_mode() + def __call__(self, model=None): + """Returns list of exported files/dirs after running callbacks.""" + self.run_callbacks('on_export_start') + t = time.time() + format = self.args.format.lower() # to lowercase + if format in ('tensorrt', 'trt'): # engine aliases + format = 'engine' + fmts = tuple(export_formats()['Argument'][1:]) # available export formats + flags = [x == format for x in fmts] + if sum(flags) != 1: + raise ValueError(f"Invalid export format='{format}'. Valid formats are {fmts}") + jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, ncnn = flags # export booleans + + # Load PyTorch model + self.device = select_device('cpu' if self.args.device is None else self.args.device) + + # Checks + model.names = check_class_names(model.names) + if self.args.half and onnx and self.device.type == 'cpu': + LOGGER.warning('WARNING ⚠️ half=True only compatible with GPU export, i.e. use device=0') + self.args.half = False + assert not self.args.dynamic, 'half=True not compatible with dynamic=True, i.e. use only one.' + self.imgsz = check_imgsz(self.args.imgsz, stride=model.stride, min_dim=2) # check image size + if self.args.optimize: + assert not ncnn, "optimize=True not compatible with format='ncnn', i.e. use optimize=False" + assert self.device.type == 'cpu', "optimize=True not compatible with cuda devices, i.e. use device='cpu'" + if edgetpu and not LINUX: + raise SystemError('Edge TPU export only supported on Linux. 
See https://coral.ai/docs/edgetpu/compiler/') + + # Input + im = torch.zeros(self.args.batch, 3, *self.imgsz).to(self.device) + file = Path( + getattr(model, 'pt_path', None) or getattr(model, 'yaml_file', None) or model.yaml.get('yaml_file', '')) + if file.suffix in ('.yaml', '.yml'): + file = Path(file.name) + + # Update model + model = deepcopy(model).to(self.device) + for p in model.parameters(): + p.requires_grad = False + model.eval() + model.float() + model = model.fuse() + for k, m in model.named_modules(): + if isinstance(m, (Detect, RTDETRDecoder)): # Segment and Pose use Detect base class + m.dynamic = self.args.dynamic + m.export = True + m.format = self.args.format + elif isinstance(m, C2f) and not any((saved_model, pb, tflite, edgetpu, tfjs)): + # EdgeTPU does not support FlexSplitV while split provides cleaner ONNX graph + m.forward = m.forward_split + + y = None + for _ in range(2): + y = model(im) # dry runs + if self.args.half and (engine or onnx) and self.device.type != 'cpu': + im, model = im.half(), model.half() # to FP16 + + # Filter warnings + warnings.filterwarnings('ignore', category=torch.jit.TracerWarning) # suppress TracerWarning + warnings.filterwarnings('ignore', category=UserWarning) # suppress shape prim::Constant missing ONNX warning + warnings.filterwarnings('ignore', category=DeprecationWarning) # suppress CoreML np.bool deprecation warning + + # Assign + self.im = im + self.model = model + self.file = file + self.output_shape = tuple(y.shape) if isinstance(y, torch.Tensor) else \ + tuple(tuple(x.shape if isinstance(x, torch.Tensor) else []) for x in y) + self.pretty_name = Path(self.model.yaml.get('yaml_file', self.file)).stem.replace('yolo', 'YOLO') + trained_on = f'trained on {Path(self.args.data).name}' if self.args.data else '(untrained)' + description = f'Ultralytics {self.pretty_name} model {trained_on}' + self.metadata = { + 'description': description, + 'author': 'Ultralytics', + 'license': 'AGPL-3.0 
https://ultralytics.com/license', + 'date': datetime.now().isoformat(), + 'version': __version__, + 'stride': int(max(model.stride)), + 'task': model.task, + 'batch': self.args.batch, + 'imgsz': self.imgsz, + 'names': model.names} # model metadata + if model.task == 'pose': + self.metadata['kpt_shape'] = model.model[-1].kpt_shape + + LOGGER.info(f"\n{colorstr('PyTorch:')} starting from '{file}' with input shape {tuple(im.shape)} BCHW and " + f'output shape(s) {self.output_shape} ({file_size(file):.1f} MB)') + + # Exports + f = [''] * len(fmts) # exported filenames + if jit or ncnn: # TorchScript + f[0], _ = self.export_torchscript() + if engine: # TensorRT required before ONNX + f[1], _ = self.export_engine() + if onnx or xml: # OpenVINO requires ONNX + f[2], _ = self.export_onnx() + if xml: # OpenVINO + f[3], _ = self.export_openvino() + if coreml: # CoreML + f[4], _ = self.export_coreml() + if any((saved_model, pb, tflite, edgetpu, tfjs)): # TensorFlow formats + self.args.int8 |= edgetpu + f[5], s_model = self.export_saved_model() + if pb or tfjs: # pb prerequisite to tfjs + f[6], _ = self.export_pb(s_model) + if tflite: + f[7], _ = self.export_tflite(s_model, nms=False, agnostic_nms=self.args.agnostic_nms) + if edgetpu: + f[8], _ = self.export_edgetpu(tflite_model=Path(f[5]) / f'{self.file.stem}_full_integer_quant.tflite') + if tfjs: + f[9], _ = self.export_tfjs() + if paddle: # PaddlePaddle + f[10], _ = self.export_paddle() + if ncnn: # ncnn + f[11], _ = self.export_ncnn() + + # Finish + f = [str(x) for x in f if x] # filter out '' and None + if any(f): + f = str(Path(f[-1])) + square = self.imgsz[0] == self.imgsz[1] + s = '' if square else f"WARNING ⚠️ non-PyTorch val requires square images, 'imgsz={self.imgsz}' will not " \ + f"work. Use export 'imgsz={max(self.imgsz)}' if val is required." 
@try_export
def export_torchscript(self, prefix=colorstr('TorchScript:')):
    """
    YOLOv8 TorchScript model export.

    Traces `self.model` with the example input `self.im` and saves the result next to the source file with a
    `.torchscript` suffix. The export metadata is embedded as a JSON string under the extra file `config.txt`.
    When `self.args.optimize` is set, the traced module is passed through `optimize_for_mobile` and saved for the
    lite interpreter instead.

    Args:
        prefix (str): Log prefix; also read by the `try_export` decorator.

    Returns:
        (tuple): `(Path, None)` - path of the written TorchScript file; no model object is returned.
    """
    LOGGER.info(f'\n{prefix} starting export with torch {torch.__version__}...')
    out_path = self.file.with_suffix('.torchscript')

    traced = torch.jit.trace(self.model, self.im, strict=False)
    embedded = {'config.txt': json.dumps(self.metadata)}  # torch._C.ExtraFilesMap()

    if not self.args.optimize:
        traced.save(str(out_path), _extra_files=embedded)
        return out_path, None

    # Mobile path, see https://pytorch.org/tutorials/recipes/mobile_interpreter.html
    LOGGER.info(f'{prefix} optimizing for mobile...')
    from torch.utils.mobile_optimizer import optimize_for_mobile
    mobile_module = optimize_for_mobile(traced)
    mobile_module._save_for_lite_interpreter(str(out_path), _extra_files=embedded)
    return out_path, None
@try_export
def export_openvino(self, prefix=colorstr('OpenVINO:')):
    """
    YOLOv8 OpenVINO export.

    Converts the ONNX file that sits next to the source model (`<stem>.onnx`) with OpenVINO's model optimizer,
    attaches runtime info describing the model, and serializes it as `<stem>.xml` inside a `<stem>_openvino_model`
    directory together with a `metadata.yaml` file.

    Args:
        prefix (str): Log prefix; also read by the `try_export` decorator.

    Returns:
        (tuple): `(str, None)` - output directory path (with a trailing path separator); no model object is returned.
    """
    check_requirements('openvino-dev>=2023.0')  # requires openvino-dev: https://pypi.org/project/openvino-dev/
    import openvino.runtime as ov  # noqa
    from openvino.tools import mo  # noqa

    LOGGER.info(f'\n{prefix} starting export with openvino {ov.__version__}...')
    # Build the output dir from parent + stem. The previous str.replace(suffix, ...) replaced EVERY occurrence of the
    # suffix text in the path, so e.g. 'runs.pt/best.pt' also had its directory component rewritten.
    f = f"{self.file.parent / (self.file.stem + '_openvino_model')}{os.sep}"
    f_onnx = self.file.with_suffix('.onnx')
    f_ov = str(Path(f) / self.file.with_suffix('.xml').name)

    ov_model = mo.convert_model(f_onnx,
                                model_name=self.pretty_name,
                                framework='onnx',
                                compress_to_fp16=self.args.half)  # export

    # Set RT info
    ov_model.set_rt_info('YOLOv8', ['model_info', 'model_type'])
    ov_model.set_rt_info(True, ['model_info', 'reverse_input_channels'])
    ov_model.set_rt_info(114, ['model_info', 'pad_value'])
    ov_model.set_rt_info([255.0], ['model_info', 'scale_values'])
    ov_model.set_rt_info(self.args.iou, ['model_info', 'iou_threshold'])
    # Class labels ordered by class key, with spaces replaced by underscores
    labels = [name.replace(' ', '_') for _, name in sorted(self.model.names.items())]
    ov_model.set_rt_info(labels, ['model_info', 'labels'])
    if self.model.task != 'classify':
        ov_model.set_rt_info('fit_to_window_letterbox', ['model_info', 'resize_type'])

    ov.serialize(ov_model, f_ov)  # save
    yaml_save(Path(f) / 'metadata.yaml', self.metadata)  # add metadata.yaml
    return f, None
+ """ + check_requirements('git+https://github.com/Tencent/ncnn.git' if ARM64 else 'ncnn') # requires ncnn + import ncnn # noqa + + LOGGER.info(f'\n{prefix} starting export with ncnn {ncnn.__version__}...') + f = Path(str(self.file).replace(self.file.suffix, f'_ncnn_model{os.sep}')) + f_ts = self.file.with_suffix('.torchscript') + + pnnx_filename = 'pnnx.exe' if WINDOWS else 'pnnx' + if Path(pnnx_filename).is_file(): + pnnx = pnnx_filename + elif (ROOT / pnnx_filename).is_file(): + pnnx = ROOT / pnnx_filename + else: + LOGGER.warning( + f'{prefix} WARNING ⚠️ PNNX not found. Attempting to download binary file from ' + 'https://github.com/pnnx/pnnx/.\nNote PNNX Binary file must be placed in current working directory ' + f'or in {ROOT}. See PNNX repo for full installation instructions.') + _, assets = get_github_assets(repo='pnnx/pnnx', retry=True) + asset = [x for x in assets if ('macos' if MACOS else 'ubuntu' if LINUX else 'windows') in x][0] + attempt_download_asset(asset, repo='pnnx/pnnx', release='latest') + unzip_dir = Path(asset).with_suffix('') + pnnx = ROOT / pnnx_filename # new location + (unzip_dir / pnnx_filename).rename(pnnx) # move binary to ROOT + shutil.rmtree(unzip_dir) # delete unzip dir + Path(asset).unlink() # delete zip + pnnx.chmod(0o777) # set read, write, and execute permissions for everyone + + use_ncnn = True + ncnn_args = [ + f'ncnnparam={f / "model.ncnn.param"}', + f'ncnnbin={f / "model.ncnn.bin"}', + f'ncnnpy={f / "model_ncnn.py"}', ] if use_ncnn else [] + + use_pnnx = False + pnnx_args = [ + f'pnnxparam={f / "model.pnnx.param"}', + f'pnnxbin={f / "model.pnnx.bin"}', + f'pnnxpy={f / "model_pnnx.py"}', + f'pnnxonnx={f / "model.pnnx.onnx"}', ] if use_pnnx else [] + + cmd = [ + str(pnnx), + str(f_ts), + *ncnn_args, + *pnnx_args, + f'fp16={int(self.args.half)}', + f'device={self.device.type}', + f'inputshape="{[self.args.batch, 3, *self.imgsz]}"', ] + f.mkdir(exist_ok=True) # make ncnn_model directory + LOGGER.info(f"{prefix} running '{' 
@try_export
def export_coreml(self, prefix=colorstr('CoreML:')):
    """
    YOLOv8 CoreML export.

    Traces the model to TorchScript, converts it with coremltools using an image input scaled by 1/255, optionally
    quantizes the weights (8-bit k-means LUT when `args.int8`, 16-bit linear when `args.half`), optionally wraps a
    detection model in the NMS pipeline built by `_pipeline_coreml` (when `args.nms`), copies the export metadata
    onto the CoreML model and saves it next to the source file with a `.mlmodel` suffix.

    Args:
        prefix (str): Log prefix; also read by the `try_export` decorator.

    Returns:
        (tuple): `(Path, ct_model)` - path of the saved file and the converted coremltools model.
    """
    check_requirements('coremltools>=6.0,<=6.2')
    import coremltools as ct  # noqa

    LOGGER.info(f'\n{prefix} starting export with coremltools {ct.__version__}...')
    f = self.file.with_suffix('.mlmodel')

    bias = [0.0, 0.0, 0.0]
    scale = 1 / 255
    classifier_config = None
    if self.model.task == 'classify':
        classifier_config = ct.ClassifierConfig(list(self.model.names.values())) if self.args.nms else None
        model = self.model
    elif self.model.task == 'detect':
        model = iOSDetectModel(self.model, self.im) if self.args.nms else self.model
    else:
        # TODO CoreML Segment and Pose model pipelining
        model = self.model

    ts = torch.jit.trace(model.eval(), self.im, strict=False)  # TorchScript model
    ct_model = ct.convert(ts,
                          inputs=[ct.ImageType('image', shape=self.im.shape, scale=scale, bias=bias)],
                          classifier_config=classifier_config)
    bits, mode = (8, 'kmeans_lut') if self.args.int8 else (16, 'linear') if self.args.half else (32, None)
    if bits < 32:
        if 'kmeans' in mode:
            check_requirements('scikit-learn')  # scikit-learn package required for k-means quantization
        ct_model = ct.models.neural_network.quantization_utils.quantize_weights(ct_model, bits, mode)
    if self.args.nms and self.model.task == 'detect':
        ct_model = self._pipeline_coreml(ct_model)

    # Work on a shallow copy: the pops below split the dict into dedicated CoreML fields and "everything else", and
    # must not strip 'description'/'author'/'license'/'version' from self.metadata, which outlives this call.
    m = dict(self.metadata)
    ct_model.short_description = m.pop('description')
    ct_model.author = m.pop('author')
    ct_model.license = m.pop('license')
    ct_model.version = m.pop('version')
    ct_model.user_defined_metadata.update({k: str(v) for k, v in m.items()})
    ct_model.save(str(f))
    return f, ct_model
@try_export + def export_engine(self, prefix=colorstr('TensorRT:')): + """YOLOv8 TensorRT export https://developer.nvidia.com/tensorrt.""" + assert self.im.device.type != 'cpu', "export running on CPU but must be on GPU, i.e. use 'device=0'" + try: + import tensorrt as trt # noqa + except ImportError: + if LINUX: + check_requirements('nvidia-tensorrt', cmds='-U --index-url https://pypi.ngc.nvidia.com') + import tensorrt as trt # noqa + + check_version(trt.__version__, '7.0.0', hard=True) # require tensorrt>=7.0.0 + self.args.simplify = True + f_onnx, _ = self.export_onnx() + + LOGGER.info(f'\n{prefix} starting export with TensorRT {trt.__version__}...') + assert Path(f_onnx).exists(), f'failed to export ONNX file: {f_onnx}' + f = self.file.with_suffix('.engine') # TensorRT engine file + logger = trt.Logger(trt.Logger.INFO) + if self.args.verbose: + logger.min_severity = trt.Logger.Severity.VERBOSE + + builder = trt.Builder(logger) + config = builder.create_builder_config() + config.max_workspace_size = self.args.workspace * 1 << 30 + # config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace << 30) # fix TRT 8.4 deprecation notice + + flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + network = builder.create_network(flag) + parser = trt.OnnxParser(network, logger) + if not parser.parse_from_file(f_onnx): + raise RuntimeError(f'failed to load ONNX file: {f_onnx}') + + inputs = [network.get_input(i) for i in range(network.num_inputs)] + outputs = [network.get_output(i) for i in range(network.num_outputs)] + for inp in inputs: + LOGGER.info(f'{prefix} input "{inp.name}" with shape{inp.shape} {inp.dtype}') + for out in outputs: + LOGGER.info(f'{prefix} output "{out.name}" with shape{out.shape} {out.dtype}') + + if self.args.dynamic: + shape = self.im.shape + if shape[0] <= 1: + LOGGER.warning(f'{prefix} WARNING ⚠️ --dynamic model requires maximum --batch-size argument') + profile = builder.create_optimization_profile() + for inp in 
@try_export
def export_saved_model(self, prefix=colorstr('TensorFlow SavedModel:')):
    """
    YOLOv8 TensorFlow SavedModel export.

    Exports the model to ONNX (forcing `args.simplify = True`), converts that file to a TensorFlow SavedModel
    directory `<stem>_saved_model` with the `onnx2tf` command-line tool, writes `metadata.yaml` into the directory,
    attaches metadata to the generated `*.tflite` files and finally loads the SavedModel.

    When `args.int8` is set, integer quantization is requested from onnx2tf. If `args.data` is also set, up to 100
    images of the dataset's 'val' split are saved to a temporary .npy file and passed as calibration data.

    Args:
        prefix (str): Log prefix; also read by the `try_export` decorator.

    Returns:
        (tuple): `(str, model)` - SavedModel directory path and the object returned by `tf.saved_model.load`.
    """
    try:
        import tensorflow as tf  # noqa
    except ImportError:
        cuda = torch.cuda.is_available()
        check_requirements(f"tensorflow{'-macos' if MACOS else '-aarch64' if ARM64 else '' if cuda else '-cpu'}")
        import tensorflow as tf  # noqa
    check_requirements(('onnx', 'onnx2tf>=1.9.1', 'sng4onnx>=1.0.1', 'onnxsim>=0.4.17', 'onnx_graphsurgeon>=0.3.26',
                        'tflite_support', 'onnxruntime-gpu' if torch.cuda.is_available() else 'onnxruntime'),
                       cmds='--extra-index-url https://pypi.ngc.nvidia.com')

    LOGGER.info(f'\n{prefix} starting export with tensorflow {tf.__version__}...')
    # NOTE: export_tflite derives the same directory with the same expression; keep the two in sync.
    f = Path(str(self.file).replace(self.file.suffix, '_saved_model'))
    if f.is_dir():
        shutil.rmtree(f)  # delete output folder (shutil is imported at file level)

    # Export to ONNX
    self.args.simplify = True
    f_onnx, _ = self.export_onnx()

    # Export to TF
    tmp_file = f / 'tmp_tflite_int8_calibration_images.npy'  # int8 calibration images file
    if self.args.int8:
        if self.args.data:
            import numpy as np

            from ultralytics.data.dataset import YOLODataset
            from ultralytics.data.utils import check_det_dataset

            # Generate calibration data for integer quantization
            LOGGER.info(f"{prefix} collecting INT8 calibration images from 'data={self.args.data}'")
            dataset = YOLODataset(check_det_dataset(self.args.data)['val'], imgsz=self.imgsz[0], augment=False)
            images = []
            n_images = 100  # maximum number of images
            for n, batch in enumerate(dataset):
                if n >= n_images:
                    break
                im = batch['img'].permute(1, 2, 0)[None]  # add a leading batch dim after moving channels last
                images.append(im)
            f.mkdir()  # output dir must exist before the calibration file is written into it
            images = torch.cat(images, 0).float()
            np.save(str(tmp_file), images.numpy())  # BHWC
            int8 = f'-oiqt -qt per-tensor -cind images "{tmp_file}" "[[[[0, 0, 0]]]]" "[[[[255, 255, 255]]]]"'
        else:
            int8 = '-oiqt -qt per-tensor'
    else:
        int8 = ''

    cmd = f'onnx2tf -i "{f_onnx}" -o "{f}" -nuo --non_verbose {int8}'.strip()
    LOGGER.info(f"{prefix} running '{cmd}'")
    subprocess.run(cmd, shell=True)  # NOTE(review): return code is not checked; a failed conversion surfaces later
    yaml_save(f / 'metadata.yaml', self.metadata)  # add metadata.yaml

    # Remove/rename TFLite models
    if self.args.int8:
        tmp_file.unlink(missing_ok=True)
        for file in f.rglob('*_dynamic_range_quant.tflite'):
            file.rename(file.with_name(file.stem.replace('_dynamic_range_quant', '_int8') + file.suffix))
        for file in f.rglob('*_integer_quant_with_int16_act.tflite'):
            file.unlink()  # delete extra fp16 activation TFLite files

    # Add TFLite metadata. The check and the unlink must target the individual file; they previously referenced `f`,
    # the output directory, so the condition never matched a model file and unlink would have hit the directory.
    for file in f.rglob('*.tflite'):
        if 'quant_with_int16_act.tflite' in str(file):
            file.unlink()
        else:
            self._add_tflite_metadata(file)

    # Load saved_model
    keras_model = tf.saved_model.load(f, tags=None, options=None)

    return str(f), keras_model
@try_export
def export_tflite(self, keras_model, nms, agnostic_nms, prefix=colorstr('TensorFlow Lite:')):
    """
    YOLOv8 TensorFlow Lite export.

    No conversion happens here: the method only works out which `.tflite` file inside the `<stem>_saved_model`
    directory corresponds to the requested precision and returns its path.

    Args:
        keras_model: Not used by this method.
        nms: Not used by this method.
        agnostic_nms: Not used by this method.
        prefix (str): Log prefix; also read by the `try_export` decorator.

    Returns:
        (tuple): `(str, None)` - path of the `_int8`, `_float16` or `_float32` TFLite file, chosen from
            `args.int8` and `args.half` in that order of precedence.
    """
    import tensorflow as tf  # noqa

    LOGGER.info(f'\n{prefix} starting export with tensorflow {tf.__version__}...')
    model_dir = Path(str(self.file).replace(self.file.suffix, '_saved_model'))
    precision = 'int8' if self.args.int8 else 'float16' if self.args.half else 'float32'
    tflite_path = model_dir / f'{self.file.stem}_{precision}.tflite'
    return str(tflite_path), None
@try_export
def export_tfjs(self, prefix=colorstr('TensorFlow.js:')):
    """
    YOLOv8 TensorFlow.js export.

    Reads the frozen GraphDef `<stem>.pb` that sits next to the source model, determines its output node names with
    `gd_outputs`, and runs the `tensorflowjs_converter` command-line tool to write a `<stem>_web_model` directory.
    A `metadata.yaml` file is added to that directory.

    Args:
        prefix (str): Log prefix; also read by the `try_export` decorator.

    Returns:
        (tuple): `(str, None)` - output directory path; no model object is returned.
    """
    check_requirements('tensorflowjs')
    import tensorflow as tf
    import tensorflowjs as tfjs  # noqa

    LOGGER.info(f'\n{prefix} starting export with tensorflowjs {tfjs.__version__}...')
    # Build the output dir from parent + stem. The previous str.replace(suffix, ...) replaced EVERY occurrence of the
    # suffix text in the path, so a directory component containing it was rewritten as well.
    f = str(self.file.parent / f'{self.file.stem}_web_model')  # js dir
    f_pb = str(self.file.with_suffix('.pb'))  # *.pb path

    gd = tf.Graph().as_graph_def()  # TF GraphDef
    with open(f_pb, 'rb') as file:
        gd.ParseFromString(file.read())
    outputs = ','.join(gd_outputs(gd))
    LOGGER.info(f'\n{prefix} output node names: {outputs}')

    with spaces_in_path(f_pb) as fpb_, spaces_in_path(f) as f_:  # exporter can not handle spaces in path
        cmd = f'tensorflowjs_converter --input_format=tf_frozen_model --output_node_names={outputs} "{fpb_}" "{f_}"'
        LOGGER.info(f"{prefix} running '{cmd}'")
        subprocess.run(cmd, shell=True)  # NOTE(review): return code is not checked

    if ' ' in f:
        LOGGER.warning(f"{prefix} WARNING ⚠️ your model may not work correctly with spaces in path '{f}'.")

    yaml_save(Path(f) / 'metadata.yaml', self.metadata)  # add metadata.yaml
    return f, None
def _pipeline_coreml(self, model, prefix=colorstr('CoreML Pipeline:')):
    """
    YOLOv8 CoreML pipeline.

    Fills in the two output shapes of the given CoreML model spec, builds a non-maximum-suppression model
    whose inputs are those two outputs, and chains both into one pipeline model with inputs ``image``,
    ``iouThreshold`` and ``confidenceThreshold`` and outputs ``confidence`` and ``coordinates``.

    Args:
        model: CoreML model; must provide ``get_spec()`` and, on macOS, ``predict()``.
        prefix (str): Log prefix.

    Returns:
        The ``ct.models.MLModel`` built from the pipeline spec.

    Raises:
        AssertionError: If the number of class names in ``self.metadata['names']`` differs from the class
            dimension of the first output.
    """
    import coremltools as ct  # noqa

    LOGGER.info(f'{prefix} starting pipeline with coremltools {ct.__version__}...')
    batch_size, ch, h, w = list(self.im.shape)  # BCHW (only h and w are used below)

    # Output shapes
    spec = model.get_spec()
    out0, out1 = iter(spec.description.output)  # exactly two outputs are expected, unpacking fails otherwise
    if MACOS:
        # Run one prediction on a blank image and read the real output shapes from the result
        from PIL import Image
        img = Image.new('RGB', (w, h))  # img(192 width, 320 height)
        # img = torch.zeros((*opt.img_size, 3)).numpy()  # img size(320,192,3) iDetection
        out = model.predict({'image': img})
        out0_shape = out[out0.name].shape
        out1_shape = out[out1.name].shape
    else:  # linux and windows can not run model.predict(), get sizes from pytorch output y
        out0_shape = self.output_shape[2], self.output_shape[1] - 4  # (3780, 80)
        out1_shape = self.output_shape[2], 4  # (3780, 4)

    # Checks
    names = self.metadata['names']
    nx, ny = spec.description.input[0].type.imageType.width, spec.description.input[0].type.imageType.height
    na, nc = out0_shape  # na is not used below; nc sizes the NMS confidence output
    # na, nc = out0.type.multiArrayType.shape  # number anchors, classes
    assert len(names) == nc, f'{len(names)} names found for nc={nc}'  # check

    # Define output shapes (missing)
    out0.type.multiArrayType.shape[:] = out0_shape  # (3780, 80)
    out1.type.multiArrayType.shape[:] = out1_shape  # (3780, 4)
    # spec.neuralNetwork.preprocessing[0].featureName = '0'

    # Flexible input shapes
    # from coremltools.models.neural_network import flexible_shape_utils
    # s = [] # shapes
    # s.append(flexible_shape_utils.NeuralNetworkImageSize(320, 192))
    # s.append(flexible_shape_utils.NeuralNetworkImageSize(640, 384))  # (height, width)
    # flexible_shape_utils.add_enumerated_image_sizes(spec, feature_name='image', sizes=s)
    # r = flexible_shape_utils.NeuralNetworkImageSizeRange()  # shape ranges
    # r.add_height_range((192, 640))
    # r.add_width_range((192, 640))
    # flexible_shape_utils.update_image_size_range(spec, feature_name='image', size_range=r)

    # Print
    # print(spec.description)

    # Model from spec (rebuilt so the shapes written above are part of the model)
    model = ct.models.MLModel(spec)

    # 3. Create NMS protobuf
    nms_spec = ct.proto.Model_pb2.Model()
    nms_spec.specificationVersion = 5
    for i in range(2):
        # Copy each decoder output description into both the NMS input and the NMS output
        decoder_output = model._spec.description.output[i].SerializeToString()
        nms_spec.description.input.add()
        nms_spec.description.input[i].ParseFromString(decoder_output)
        nms_spec.description.output.add()
        nms_spec.description.output[i].ParseFromString(decoder_output)

    nms_spec.description.output[0].name = 'confidence'
    nms_spec.description.output[1].name = 'coordinates'

    output_sizes = [nc, 4]
    for i in range(2):
        # First dimension gets bounds 0..-1, second dimension is fixed to nc (confidence) or 4 (coordinates)
        ma_type = nms_spec.description.output[i].type.multiArrayType
        ma_type.shapeRange.sizeRanges.add()
        ma_type.shapeRange.sizeRanges[0].lowerBound = 0
        ma_type.shapeRange.sizeRanges[0].upperBound = -1
        ma_type.shapeRange.sizeRanges.add()
        ma_type.shapeRange.sizeRanges[1].lowerBound = output_sizes[i]
        ma_type.shapeRange.sizeRanges[1].upperBound = output_sizes[i]
        del ma_type.shape[:]  # drop the fixed shape copied from the decoder output

    nms = nms_spec.nonMaximumSuppression
    nms.confidenceInputFeatureName = out0.name  # 1x507x80
    nms.coordinatesInputFeatureName = out1.name  # 1x507x4
    nms.confidenceOutputFeatureName = 'confidence'
    nms.coordinatesOutputFeatureName = 'coordinates'
    nms.iouThresholdInputFeatureName = 'iouThreshold'
    nms.confidenceThresholdInputFeatureName = 'confidenceThreshold'
    nms.iouThreshold = 0.45  # default value, also written to the pipeline metadata below
    nms.confidenceThreshold = 0.25  # default value, also written to the pipeline metadata below
    nms.pickTop.perClass = True
    nms.stringClassLabels.vector.extend(names.values())
    nms_model = ct.models.MLModel(nms_spec)

    # 4. Pipeline models together
    pipeline = ct.models.pipeline.Pipeline(input_features=[('image', ct.models.datatypes.Array(3, ny, nx)),
                                                           ('iouThreshold', ct.models.datatypes.Double()),
                                                           ('confidenceThreshold', ct.models.datatypes.Double())],
                                           output_features=['confidence', 'coordinates'])
    pipeline.add_model(model)
    pipeline.add_model(nms_model)

    # Correct datatypes (overwrite the pipeline descriptions with the ones from the member models)
    pipeline.spec.description.input[0].ParseFromString(model._spec.description.input[0].SerializeToString())
    pipeline.spec.description.output[0].ParseFromString(nms_model._spec.description.output[0].SerializeToString())
    pipeline.spec.description.output[1].ParseFromString(nms_model._spec.description.output[1].SerializeToString())

    # Update metadata
    pipeline.spec.specificationVersion = 5
    pipeline.spec.description.metadata.userDefined.update({
        'IoU threshold': str(nms.iouThreshold),
        'Confidence threshold': str(nms.confidenceThreshold)})

    # Build the final model from the pipeline spec and describe its inputs and outputs
    model = ct.models.MLModel(pipeline.spec)
    model.input_description['image'] = 'Input image'
    model.input_description['iouThreshold'] = f'(optional) IOU threshold override (default: {nms.iouThreshold})'
    model.input_description['confidenceThreshold'] = \
        f'(optional) Confidence threshold override (default: {nms.confidenceThreshold})'
    model.output_description['confidence'] = 'Boxes × Class confidence (see user-defined metadata "classes")'
    model.output_description['coordinates'] = 'Boxes × [x, y, width, height] (relative to image size)'
    LOGGER.info(f'{prefix} pipeline success')
    return model
class iOSDetectModel(torch.nn.Module):
    """Wrap an Ultralytics YOLO model for iOS export."""

    def __init__(self, model, im):
        """
        Store the wrapped model and precompute the factor applied to box values.

        Args:
            model: Detection model; must be callable and expose ``names``.
            im: Example input whose ``shape`` unpacks to (batch, channel, height, width).
        """
        super().__init__()
        _, _, height, width = im.shape
        self.model = model
        self.nc = len(model.names)  # number of classes
        # Square inputs need a single scalar; otherwise one factor per box value (w, h, w, h) is broadcast
        self.normalize = (1.0 / width if width == height else torch.tensor(
            [1.0 / width, 1.0 / height, 1.0 / width, 1.0 / height]))

    def forward(self, x):
        """
        Run the wrapped model and return class scores with size-normalized boxes.

        Args:
            x: Input passed unchanged to the wrapped model.

        Returns:
            (tuple): ``(cls, xywh * normalize)``, taken from the first element of the wrapped model output
                after transposing its first two dimensions and splitting dimension 1 into 4 box values and
                ``nc`` class values.
        """
        preds = self.model(x)[0].transpose(0, 1)
        boxes, scores = preds.split((4, self.nc), 1)
        return scores, boxes * self.normalize  # confidence (3780, 80), coordinates (3780, 4)
(DEFAULT_CFG, DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, RANK, ROOT, callbacks, + is_git_dir, yaml_load) +from ultralytics.utils.checks import check_file, check_imgsz, check_pip_update_available, check_yaml +from ultralytics.utils.downloads import GITHUB_ASSET_STEMS +from ultralytics.utils.torch_utils import smart_inference_mode + + +class Model: + """ + A base model class to unify apis for all the models. + + Args: + model (str, Path): Path to the model file to load or create. + task (Any, optional): Task type for the YOLO model. Defaults to None. + + Attributes: + predictor (Any): The predictor object. + model (Any): The model object. + trainer (Any): The trainer object. + task (str): The type of model task. + ckpt (Any): The checkpoint object if the model loaded from *.pt file. + cfg (str): The model configuration if loaded from *.yaml file. + ckpt_path (str): The checkpoint file path. + overrides (dict): Overrides for the trainer object. + metrics (Any): The data for metrics. + + Methods: + __call__(source=None, stream=False, **kwargs): + Alias for the predict method. + _new(cfg:str, verbose:bool=True) -> None: + Initializes a new model and infers the task type from the model definitions. + _load(weights:str, task:str='') -> None: + Initializes a new model and infers the task type from the model head. + _check_is_pytorch_model() -> None: + Raises TypeError if the model is not a PyTorch model. + reset() -> None: + Resets the model modules. + info(verbose:bool=False) -> None: + Logs the model info. + fuse() -> None: + Fuses the model for faster inference. + predict(source=None, stream=False, **kwargs) -> List[ultralytics.engine.results.Results]: + Performs prediction using the YOLO model. + + Returns: + list(ultralytics.engine.results.Results): The prediction results. + """ + + def __init__(self, model: Union[str, Path] = 'yolov8n.pt', task=None) -> None: + """ + Initializes the YOLO model. 
@staticmethod
def is_hub_model(model):
    """
    Check if the provided model string identifies a HUB model.

    Args:
        model (str): Model name, path or URL.

    Returns:
        (bool): True when ``model`` starts with ``{HUB_WEB_ROOT}/models/``, or splits on '_' into parts of
            length 42 and 20 (APIKEY_MODELID), or is a 20-character string that contains none of './\\' and
            does not exist as a path (MODELID).
    """
    # i.e. https://hub.ultralytics.com/models/MODEL_ID
    if model.startswith(f'{HUB_WEB_ROOT}/models/'):
        return True
    # APIKEY_MODELID
    part_lengths = [len(part) for part in model.split('_')]
    if part_lengths == [42, 20]:
        return True
    # MODELID
    id_shaped = len(model) == 20 and not any(ch in model for ch in './\\')
    return id_shaped and not Path(model).exists()
def _load(self, weights: str, task=None):
    """
    Load a model from ``weights`` and set the task, checkpoint path and overrides.

    Args:
        weights (str): model checkpoint to be loaded
        task (str | None): model task; only consulted when ``weights`` is not a '.pt' file
    """
    if Path(weights).suffix != '.pt':
        # Other formats: keep the checked file path as the "model" and take or guess the task
        weights = check_file(weights)
        self.model = weights
        self.ckpt = None
        self.task = task or guess_model_task(weights)
        self.ckpt_path = weights
    else:
        loaded, ckpt = attempt_load_one_weight(weights)
        self.model, self.ckpt = loaded, ckpt
        self.task = loaded.args['task']
        # overrides and model.args deliberately share ONE dict, so the keys set below reach both
        kept_args = self._reset_ckpt_args(loaded.args)
        self.overrides = kept_args
        loaded.args = kept_args
        self.ckpt_path = loaded.pt_path
    self.overrides.update(model=weights, task=self.task)
@smart_inference_mode()
def predict(self, source=None, stream=False, predictor=None, **kwargs):
    """
    Perform prediction using the YOLO model.

    Args:
        source (str | int | PIL | np.ndarray): The source of the image to make predictions on.
            Accepts all source types accepted by the YOLO model. When None, a default source is used and a
            warning is logged.
        stream (bool): Whether to stream the predictions or not. Defaults to False.
        predictor (BasePredictor): Customized predictor; only used when no predictor has been created yet.
        **kwargs : Additional keyword arguments passed to the predictor.
            Check the 'configuration' section in the documentation for all available options.
            ``prompts`` is removed from kwargs and handed to the predictor's ``set_prompts`` when available.

    Returns:
        (List[ultralytics.engine.results.Results]): The prediction results. When invoked from the CLI the
            return value of the predictor's ``predict_cli`` is returned instead.

    Raises:
        AssertionError: If the resolved ``mode`` is neither 'track' nor 'predict'.
    """
    if source is None:
        source = ROOT / 'assets' if is_git_dir() else 'https://ultralytics.com/images/bus.jpg'
        LOGGER.warning(f"WARNING ⚠️ 'source' is missing. Using 'source={source}'.")
    is_cli = sys.argv[0].endswith(('yolo', 'ultralytics')) and any(
        x in sys.argv for x in ('predict', 'track', 'mode=predict', 'mode=track'))
    # Check prompts for SAM/FastSAM
    prompts = kwargs.pop('prompts', None)
    overrides = self.overrides.copy()
    overrides['conf'] = 0.25
    overrides.update(kwargs)  # prefer kwargs
    overrides['mode'] = kwargs.get('mode', 'predict')
    assert overrides['mode'] in ['track', 'predict']
    if not is_cli:
        overrides['save'] = kwargs.get('save', False)  # do not save by default if called in Python
    if not self.predictor:
        self.task = overrides.get('task') or self.task
        predictor = predictor or self.smart_load('predictor')
        self.predictor = predictor(overrides=overrides, _callbacks=self.callbacks)
        self.predictor.setup_model(model=self.model, verbose=is_cli)
    else:  # only update args if predictor is already setup
        self.predictor.args = get_cfg(self.predictor.args, overrides)
        if 'project' in overrides or 'name' in overrides:
            self.predictor.save_dir = self.predictor.get_save_dir()
    # Set prompts for SAM/FastSAM. Forward them only when the caller supplied some: the previous guard
    # tested the built-in `len` function object, which is always truthy, so set_prompts(None) ran every call.
    if prompts is not None and hasattr(self.predictor, 'set_prompts'):
        self.predictor.set_prompts(prompts)
    return self.predictor.predict_cli(source=source) if is_cli else self.predictor(source=source, stream=stream)
@smart_inference_mode()
def val(self, data=None, validator=None, **kwargs):
    """
    Validate a model on a given dataset.

    Args:
        data (str): The dataset to validate on. Accepts all formats accepted by yolo
        validator (BaseValidator): Customized validator.
        **kwargs : Any other args accepted by the validators. To see all args check 'configuration' section in docs

    Returns:
        The ``metrics`` attribute of the validator after it has run; also stored on ``self.metrics``.
    """
    # Precedence: model overrides < rect=True default < caller kwargs < forced mode
    merged = {**self.overrides, 'rect': True, **kwargs, 'mode': 'val'}
    args = get_cfg(cfg=DEFAULT_CFG, overrides=merged)
    args.data = data or args.data
    if 'task' not in merged:
        args.task = self.task
    else:
        self.task = args.task
    validator_cls = validator or self.smart_load('validator')
    model_is_path = isinstance(self.model, (str, Path))
    if not model_is_path and args.imgsz == DEFAULT_CFG.imgsz:
        args.imgsz = self.model.args['imgsz']  # use trained imgsz unless custom value is passed
    args.imgsz = check_imgsz(args.imgsz, max_dim=1)

    runner = validator_cls(args=args, _callbacks=self.callbacks)
    runner(model=self.model)
    self.metrics = runner.metrics
    return runner.metrics
def export(self, **kwargs):
    """
    Export model.

    Args:
        **kwargs : Any other args accepted by the predictors. To see all args check 'configuration' section in docs

    Returns:
        Whatever the ``Exporter`` instance returns when called with the model.
    """
    self._check_is_pytorch_model()
    export_cfg = {**self.overrides, **kwargs, 'mode': 'export'}
    if export_cfg.get('imgsz') is None:
        export_cfg['imgsz'] = self.model.args['imgsz']  # use trained imgsz unless custom value is passed
    # batch defaults to 1 and data to None (avoid int8 calibration with coco.yaml) unless the caller set them
    for key, fallback in (('batch', 1), ('data', None)):
        if key not in kwargs:
            export_cfg[key] = fallback
    args = get_cfg(cfg=DEFAULT_CFG, overrides=export_cfg)
    args.task = self.task
    exporter = Exporter(overrides=args, _callbacks=self.callbacks)
    return exporter(model=self.model)
def to(self, device):
    """
    Sends the model to the given device.

    Args:
        device (str): device

    Returns:
        self, so calls can be chained like the sibling ``reset_weights`` and ``load`` methods.

    Raises:
        TypeError: Propagated from ``_check_is_pytorch_model`` when the model is not a PyTorch model.
    """
    self._check_is_pytorch_model()
    self.model.to(device)
    return self
def smart_load(self, key):
    """
    Load model/trainer/validator/predictor.

    Args:
        key (str): Entry to fetch from ``self.task_map[self.task]``.

    Returns:
        The object stored in the task map for the current task under ``key``.

    Raises:
        NotImplementedError: If the lookup fails for any reason; the original error is attached as
            ``__cause__`` so the real reason stays visible in the traceback.
    """
    try:
        return self.task_map[self.task][key]
    except Exception as e:
        name = self.__class__.__name__
        mode = inspect.stack()[1][3]  # name of the function that called smart_load
        raise NotImplementedError(
            f'WARNING ⚠️ `{name}` model does not support `{mode}` mode for `{self.task}` task yet.') from e
+ + Returns: + task_map (dict): The map of model task to mode classes. + """ + raise NotImplementedError('Please provide task map for your model!') diff --git a/ultralytics/engine/predictor.py b/ultralytics/engine/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..75ab8fdf4d340b679167d780746a4232bef45d8f --- /dev/null +++ b/ultralytics/engine/predictor.py @@ -0,0 +1,359 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Run prediction on images, videos, directories, globs, YouTube, webcam, streams, etc. + +Usage - sources: + $ yolo mode=predict model=yolov8n.pt source=0 # webcam + img.jpg # image + vid.mp4 # video + screen # screenshot + path/ # directory + list.txt # list of images + list.streams # list of streams + 'path/*.jpg' # glob + 'https://youtu.be/Zgi9g1ksQHc' # YouTube + 'rtsp://example.com/media.mp4' # RTSP, RTMP, HTTP stream + +Usage - formats: + $ yolo mode=predict model=yolov8n.pt # PyTorch + yolov8n.torchscript # TorchScript + yolov8n.onnx # ONNX Runtime or OpenCV DNN with dnn=True + yolov8n_openvino_model # OpenVINO + yolov8n.engine # TensorRT + yolov8n.mlmodel # CoreML (macOS-only) + yolov8n_saved_model # TensorFlow SavedModel + yolov8n.pb # TensorFlow GraphDef + yolov8n.tflite # TensorFlow Lite + yolov8n_edgetpu.tflite # TensorFlow Edge TPU + yolov8n_paddle_model # PaddlePaddle +""" +import platform +from pathlib import Path + +import cv2 +import numpy as np +import torch + +from ultralytics.cfg import get_cfg +from ultralytics.data import load_inference_source +from ultralytics.data.augment import LetterBox, classify_transforms +from ultralytics.nn.autobackend import AutoBackend +from ultralytics.utils import DEFAULT_CFG, LOGGER, MACOS, SETTINGS, WINDOWS, callbacks, colorstr, ops +from ultralytics.utils.checks import check_imgsz, check_imshow +from ultralytics.utils.files import increment_path +from ultralytics.utils.torch_utils import select_device, smart_inference_mode + +STREAM_WARNING = """ + WARNING ⚠️ 
stream/video/webcam/dir predict source will accumulate results in RAM unless `stream=True` is passed, + causing potential out-of-memory errors for large sources or long-running streams/videos. + + Usage: + results = model(source=..., stream=True) # generator of Results objects + for r in results: + boxes = r.boxes # Boxes object for bbox outputs + masks = r.masks # Masks object for segment masks outputs + probs = r.probs # Class probabilities for classification outputs +""" + +inference_Time=0 +class BasePredictor: + """ + BasePredictor + + A base class for creating predictors. + + Attributes: + args (SimpleNamespace): Configuration for the predictor. + save_dir (Path): Directory to save results. + done_warmup (bool): Whether the predictor has finished setup. + model (nn.Module): Model used for prediction. + data (dict): Data configuration. + device (torch.device): Device used for prediction. + dataset (Dataset): Dataset used for prediction. + vid_path (str): Path to video file. + vid_writer (cv2.VideoWriter): Video writer for saving video output. + data_path (str): Path to data. + """ + + def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None): + """ + Initializes the BasePredictor class. + + Args: + cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG. + overrides (dict, optional): Configuration overrides. Defaults to None. 
def preprocess(self, im):
    """Prepares input image before inference.

    Non-tensor input is passed through ``self.pre_transform``, stacked, has its last axis reversed
    (BGR to RGB), is reordered from BHWC to BCHW and is scaled by 1/255. Tensor input is only moved to
    ``self.device`` and cast; it is NOT rescaled.

    Args:
        im (torch.Tensor | List(np.ndarray)): BCHW for tensor, [(HWC) x B] for list.

    Returns:
        (torch.Tensor): Tensor on ``self.device``, fp16 when ``self.model.fp16`` is set, fp32 otherwise.
    """
    is_tensor = isinstance(im, torch.Tensor)
    if not is_tensor:
        stacked = np.stack(self.pre_transform(im))
        # BGR to RGB, BHWC to BCHW, (n, 3, h, w); made contiguous for torch.from_numpy
        stacked = np.ascontiguousarray(stacked[..., ::-1].transpose((0, 3, 1, 2)))
        im = torch.from_numpy(stacked)

    tensor = im.to(self.device)
    if self.model.fp16:
        tensor = tensor.half()  # uint8 to fp16
    else:
        tensor = tensor.float()  # uint8 to fp32
    if is_tensor:
        return tensor
    tensor /= 255  # 0 - 255 to 0.0 - 1.0
    return tensor
def write_results(self, idx, results, batch):
    """Build the log string for one result and run the optional plot/save steps for it.

    Args:
        idx (int): Index of the image inside the current batch.
        results (list): Per-image result objects. ``results[idx]`` must provide ``verbose()`` and, depending on
            the enabled options, ``plot()``, ``save_txt()`` and ``save_crop()``.
        batch (tuple): ``(path, im, im0)``; only the first two entries are used here.

    Returns:
        (str): An optional ``'{idx}: '`` prefix, then ``'HxW '`` taken from ``im.shape[2:]``, then
            ``result.verbose()``.

    Side effects:
        Sets ``self.data_path`` and ``self.txt_path``; sets ``self.plotted_img`` when ``args.save`` or
        ``args.show`` is enabled.
    """
    path, im, _ = batch
    if len(im.shape) == 3:
        im = im[None]  # expand for batch dim

    src = self.source_type
    if src.webcam or src.from_img or src.tensor:  # batch_size >= 1
        prefix = f'{idx}: '
        frame = self.dataset.count
    else:
        prefix = ''
        frame = getattr(self.dataset, 'frame', 0)

    # Non-image sources get a per-frame suffix so each frame has its own label file
    self.data_path = path
    txt_suffix = '' if self.dataset.mode == 'image' else f'_{frame}'
    self.txt_path = str(self.save_dir / 'labels' / path.stem) + txt_suffix

    size_text = '%gx%g ' % im.shape[2:]  # print string
    result = results[idx]
    log_string = prefix + size_text + result.verbose()

    if self.args.save or self.args.show:  # Add bbox to image
        plot_args = dict(line_width=self.args.line_width,
                         boxes=self.args.boxes,
                         conf=self.args.show_conf,
                         labels=self.args.show_labels)
        if not self.args.retina_masks:
            plot_args['im_gpu'] = im[idx]
        self.plotted_img = result.plot(**plot_args)

    # Write
    if self.args.save_txt:
        result.save_txt(f'{self.txt_path}.txt', save_conf=self.args.save_conf)
    if self.args.save_crop:
        crop_name = self.data_path.stem + ('' if self.dataset.mode == 'image' else f'_{frame}')
        result.save_crop(save_dir=self.save_dir / 'crops', file_name=crop_name)

    return log_string
@smart_inference_mode()
def stream_inference(self, source=None, model=None, *args, **kwargs):
    """Run inference over a source batch by batch, yielding each result as it is produced.

    Args:
        source: Inference source handed to ``setup_source``; falls back to ``self.args.source`` when None.
        model: Model handed to ``setup_model`` when no model has been set up yet.
        *args, **kwargs: Forwarded unchanged to ``self.inference``.

    Yields:
        The entries of ``self.results`` for every batch, in order.

    Side effects:
        Creates the save/labels directory when saving is enabled, warms the model up once, fires the
        ``on_predict_*`` callbacks, optionally shows/saves plotted images, and releases every open video
        writer once the source is exhausted.
    """
    if self.args.verbose:
        LOGGER.info('')

    # Setup model
    if not self.model:
        self.setup_model(model)

    # Setup source every time predict is called
    self.setup_source(source if source is not None else self.args.source)

    # Check if save_dir/ label file exists
    if self.args.save or self.args.save_txt:
        (self.save_dir / 'labels' if self.args.save_txt else self.save_dir).mkdir(parents=True, exist_ok=True)

    # Warmup model
    if not self.done_warmup:
        self.model.warmup(imgsz=(1 if self.model.pt or self.model.triton else self.dataset.bs, 3, *self.imgsz))
        self.done_warmup = True

    self.seen, self.windows, self.batch, profilers = 0, [], None, (ops.Profile(), ops.Profile(), ops.Profile())
    self.run_callbacks('on_predict_start')
    for batch in self.dataset:
        self.run_callbacks('on_predict_batch_start')
        self.batch = batch
        path, im0s, vid_cap, s = batch

        # Preprocess
        with profilers[0]:
            im = self.preprocess(im0s)

        # Inference
        with profilers[1]:
            preds = self.inference(im, *args, **kwargs)

        # Postprocess
        with profilers[2]:
            self.results = self.postprocess(preds, im, im0s)
        self.run_callbacks('on_predict_postprocess_end')

        # Visualize, save, write results
        n = len(im0s)
        for i in range(n):
            self.seen += 1
            # Batch timings are split evenly across the images of the batch
            self.results[i].speed = {
                'preprocess': profilers[0].dt * 1E3 / n,
                'inference': profilers[1].dt * 1E3 / n,
                'postprocess': profilers[2].dt * 1E3 / n}
            p, im0 = path[i], None if self.source_type.tensor else im0s[i].copy()
            p = Path(p)

            if self.args.verbose or self.args.save or self.args.save_txt or self.args.show:
                s += self.write_results(i, self.results, (p, im, im0))
            if self.args.save or self.args.save_txt:
                self.results[i].save_dir = str(self.save_dir)
            if self.args.show and self.plotted_img is not None:
                self.show(p)
            if self.args.save and self.plotted_img is not None:
                self.save_preds(vid_cap, i, str(self.save_dir / p.name))

        self.run_callbacks('on_predict_batch_end')
        yield from self.results

        # Print time (inference-only)
        if self.args.verbose:
            LOGGER.info(f'{s}{profilers[1].dt * 1E3:.1f}ms')

    # Release assets: setup_source allocates one writer slot per stream, so close every open writer,
    # not only the last slot (the others were previously left unreleased)
    for writer in self.vid_writer:
        if isinstance(writer, cv2.VideoWriter):
            writer.release()

    # Print results
    if self.args.verbose and self.seen:
        t = tuple(x.t / self.seen * 1E3 for x in profilers)  # speeds per image
        LOGGER.info(f'Speed: %.1fms preprocess, %.1fms inference, %.1fms postprocess per image at shape '
                    f'{(1, 3, *im.shape[2:])}' % t)
    if self.args.save or self.args.save_txt or self.args.save_crop:
        nl = len(list(self.save_dir.glob('labels/*.txt')))  # number of labels
        s = f"\n{nl} label{'s' * (nl > 1)} saved to {self.save_dir / 'labels'}" if self.args.save_txt else ''
        LOGGER.info(f"Results saved to {colorstr('bold', self.save_dir)}{s}")

    self.run_callbacks('on_predict_end')
class BaseTensor(SimpleClass):
    """
    Wrapper pairing prediction data with the shape of the image it was computed for.

    The conversion helpers hand back a wrapper of the same class around the converted data.
    """

    def __init__(self, data, orig_shape) -> None:
        """Store the prediction data together with the original image shape.

        Args:
            data (torch.Tensor | np.ndarray): Predictions, such as bboxes, masks and keypoints.
            orig_shape (tuple): Original shape of image.
        """
        assert isinstance(data, (torch.Tensor, np.ndarray))
        self.data, self.orig_shape = data, orig_shape

    @property
    def shape(self):
        """Shape of the wrapped data."""
        return self.data.shape

    def cpu(self):
        """Return this object when it already wraps a numpy array, else a new wrapper around `data.cpu()`."""
        if isinstance(self.data, np.ndarray):
            return self
        return self.__class__(self.data.cpu(), self.orig_shape)

    def numpy(self):
        """Return this object when it already wraps a numpy array, else a new wrapper around `data.numpy()`."""
        if isinstance(self.data, np.ndarray):
            return self
        return self.__class__(self.data.numpy(), self.orig_shape)

    def cuda(self):
        """Return a new wrapper whose data is a tensor moved with `.cuda()`."""
        on_gpu = torch.as_tensor(self.data).cuda()
        return self.__class__(on_gpu, self.orig_shape)

    def to(self, *args, **kwargs):
        """Return a new wrapper whose data is a tensor converted with `.to(*args, **kwargs)`."""
        converted = torch.as_tensor(self.data).to(*args, **kwargs)
        return self.__class__(converted, self.orig_shape)

    def __len__(self):  # override len(results)
        """Length of the wrapped data along its first dimension."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a wrapper of the same class around `data[idx]`, keeping the original shape."""
        selected = self.data[idx]
        return self.__class__(selected, self.orig_shape)
def __len__(self):
    """Return the number of detections in the Results object.

    The count is the length of the first populated attribute listed by ``self.keys``. When no attribute is
    populated this returns 0; the previous implementation fell off the end of its loop and returned None,
    which makes ``len(results)`` raise ``TypeError``.
    """
    populated = self.keys
    if not populated:
        return 0
    return len(getattr(self, populated[0]))
masks is not None: + self.masks = Masks(masks, self.orig_shape) + if probs is not None: + self.probs = probs + + def cpu(self): + """Return a copy of the Results object with all tensors on CPU memory.""" + r = self.new() + for k in self.keys: + setattr(r, k, getattr(self, k).cpu()) + return r + + def numpy(self): + """Return a copy of the Results object with all tensors as numpy arrays.""" + r = self.new() + for k in self.keys: + setattr(r, k, getattr(self, k).numpy()) + return r + + def cuda(self): + """Return a copy of the Results object with all tensors on GPU memory.""" + r = self.new() + for k in self.keys: + setattr(r, k, getattr(self, k).cuda()) + return r + + def to(self, *args, **kwargs): + """Return a copy of the Results object with tensors on the specified device and dtype.""" + r = self.new() + for k in self.keys: + setattr(r, k, getattr(self, k).to(*args, **kwargs)) + return r + + def new(self): + """Return a new Results object with the same image, path, and names.""" + return Results(orig_img=self.orig_img, path=self.path, names=self.names) + + @property + def keys(self): + """Return a list of non-empty attribute names.""" + return [k for k in self._keys if getattr(self, k) is not None] + + def plot( + self, + conf=True, + line_width=None, + font_size=None, + font='Arial.ttf', + pil=False, + img=None, + im_gpu=None, + kpt_radius=5, + kpt_line=True, + labels=True, + boxes=True, + masks=True, + probs=True, + **kwargs # deprecated args TODO: remove support in 8.2 + ): + """ + Plots the detection results on an input RGB image. Accepts a numpy array (cv2) or a PIL Image. + + Args: + conf (bool): Whether to plot the detection confidence score. + line_width (float, optional): The line width of the bounding boxes. If None, it is scaled to the image size. + font_size (float, optional): The font size of the text. If None, it is scaled to the image size. + font (str): The font to use for the text. + pil (bool): Whether to return the image as a PIL Image. 
+ img (numpy.ndarray): Plot to another image. if not, plot to original image. + im_gpu (torch.Tensor): Normalized image in gpu with shape (1, 3, 640, 640), for faster mask plotting. + kpt_radius (int, optional): Radius of the drawn keypoints. Default is 5. + kpt_line (bool): Whether to draw lines connecting keypoints. + labels (bool): Whether to plot the label of bounding boxes. + boxes (bool): Whether to plot the bounding boxes. + masks (bool): Whether to plot the masks. + probs (bool): Whether to plot classification probability + + Returns: + (numpy.ndarray): A numpy array of the annotated image. + + Example: + ```python + from PIL import Image + from ultralytics import YOLO + + model = YOLO('yolov8n.pt') + results = model('bus.jpg') # results list + for r in results: + im_array = r.plot() # plot a BGR numpy array of predictions + im = Image.fromarray(im[..., ::-1]) # RGB PIL image + im.show() # show image + im.save('results.jpg') # save image + ``` + """ + if img is None and isinstance(self.orig_img, torch.Tensor): + img = np.ascontiguousarray(self.orig_img[0].permute(1, 2, 0).cpu().detach().numpy()) * 255 + + # Deprecation warn TODO: remove in 8.2 + if 'show_conf' in kwargs: + deprecation_warn('show_conf', 'conf') + conf = kwargs['show_conf'] + assert type(conf) == bool, '`show_conf` should be of boolean type, i.e, show_conf=True/False' + + if 'line_thickness' in kwargs: + deprecation_warn('line_thickness', 'line_width') + line_width = kwargs['line_thickness'] + assert type(line_width) == int, '`line_width` should be of int type, i.e, line_width=3' + + names = self.names + pred_boxes, show_boxes = self.boxes, boxes + pred_masks, show_masks = self.masks, masks + pred_probs, show_probs = self.probs, probs + annotator = Annotator( + deepcopy(self.orig_img if img is None else img), + line_width, + font_size, + font, + pil or (pred_probs is not None and show_probs), # Classify tasks default to pil=True + example=names) + + # Plot Segment results + if pred_masks and 
def save_txt(self, txt_file, save_conf=False):
    """
    Save predictions into txt file.

    For classification results one ``'<prob> <name>'`` line is written per entry of ``probs.top5``. Otherwise
    one line is written per box: the class index, then the normalized xywh box (replaced by the normalized
    mask polygon when masks are present), then keypoints when present, then the confidence when
    ``save_conf`` is set, then the track id when the box has one. Lines are appended to ``txt_file``; nothing
    is written when there is nothing to save.

    Args:
        txt_file (str): txt file path.
        save_conf (bool): save confidence score or not.
    """
    boxes = self.boxes
    masks = self.masks
    probs = self.probs
    kpts = self.keypoints
    texts = []
    if probs is not None:
        # Classify
        texts.extend(f'{probs.data[j]:.2f} {self.names[j]}' for j in probs.top5)
    elif boxes:
        # Detect/segment/pose
        for j, d in enumerate(boxes):
            c, conf = int(d.cls), float(d.conf)
            track_id = None if d.id is None else int(d.id.item())
            line = (c, *d.xywhn.view(-1))
            if masks:
                seg = masks[j].xyn[0].copy().reshape(-1)  # reversed mask.xyn, (n,2) to (n*2)
                line = (c, *seg)
            if kpts is not None:
                # Append per-keypoint confidence as a third column only when the keypoints carry one
                kpt = torch.cat((kpts[j].xyn, kpts[j].conf[..., None]), 2) if kpts[j].has_visible else kpts[j].xyn
                line += (*kpt.reshape(-1).tolist(), )
            line += (conf, ) * save_conf + (() if track_id is None else (track_id, ))
            texts.append(('%g ' * len(line)).rstrip() % line)

    if texts:
        with open(txt_file, 'a') as f:
            f.writelines(text + '\n' for text in texts)
def tojson(self, normalize=False):
    """Convert the detections to a JSON string.

    Each detection becomes ``{'name', 'class', 'confidence', 'box'}`` plus ``'track_id'`` when the boxes carry
    track ids, ``'segments'`` when masks are present and ``'keypoints'`` when keypoints are present.

    Confidence and class are read from the last two columns of each box row, which is where they sit both
    without tracking (xyxy, conf, cls) and with tracking (xyxy, track_id, conf, cls). Reading fixed columns
    4 and 5 reported the track id as the confidence and the confidence as the class for tracked boxes.

    Args:
        normalize (bool): Divide x by the image width and y by the image height taken from
            ``self.orig_shape`` (height, width).

    Returns:
        (str | None): JSON text (``'[]'`` when there are no boxes), or None for classification results,
            which are not supported.
    """
    if self.probs is not None:
        LOGGER.warning('Warning: Classify task do not support `tojson` yet.')
        return

    import json

    # Create list of detection dictionaries
    results = []
    if self.boxes is None:
        return json.dumps(results, indent=2)

    data = self.boxes.data.cpu().tolist()
    h, w = self.orig_shape if normalize else (1, 1)
    for i, row in enumerate(data):
        box = {'x1': row[0] / w, 'y1': row[1] / h, 'x2': row[2] / w, 'y2': row[3] / h}
        conf = row[-2]
        class_id = int(row[-1])
        name = self.names[class_id]
        result = {'name': name, 'class': class_id, 'confidence': conf, 'box': box}
        if self.boxes.is_track:
            result['track_id'] = int(row[-3])
        if self.masks:
            x, y = self.masks.xy[i][:, 0], self.masks.xy[i][:, 1]  # numpy array
            result['segments'] = {'x': (x / w).tolist(), 'y': (y / h).tolist()}
        if self.keypoints is not None:
            x, y, visible = self.keypoints[i].data[0].cpu().unbind(dim=1)  # torch Tensor
            result['keypoints'] = {'x': (x / w).tolist(), 'y': (y / h).tolist(), 'visible': visible.tolist()}
        results.append(result)

    # Convert detections to JSON
    return json.dumps(results, indent=2)
+ conf (torch.Tensor | numpy.ndarray): The confidence values of the boxes. + cls (torch.Tensor | numpy.ndarray): The class values of the boxes. + id (torch.Tensor | numpy.ndarray): The track IDs of the boxes (if available). + xywh (torch.Tensor | numpy.ndarray): The boxes in xywh format. + xyxyn (torch.Tensor | numpy.ndarray): The boxes in xyxy format normalized by original image size. + xywhn (torch.Tensor | numpy.ndarray): The boxes in xywh format normalized by original image size. + data (torch.Tensor): The raw bboxes tensor (alias for `boxes`). + + Methods: + cpu(): Move the object to CPU memory. + numpy(): Convert the object to a numpy array. + cuda(): Move the object to CUDA memory. + to(*args, **kwargs): Move the object to the specified device. + """ + + def __init__(self, boxes, orig_shape) -> None: + """Initialize the Boxes class.""" + if boxes.ndim == 1: + boxes = boxes[None, :] + n = boxes.shape[-1] + assert n in (6, 7), f'expected `n` in [6, 7], but got {n}' # xyxy, (track_id), conf, cls + super().__init__(boxes, orig_shape) + self.is_track = n == 7 + self.orig_shape = orig_shape + + @property + def xyxy(self): + """Return the boxes in xyxy format.""" + return self.data[:, :4] + + @property + def conf(self): + """Return the confidence values of the boxes.""" + return self.data[:, -2] + + @property + def cls(self): + """Return the class values of the boxes.""" + return self.data[:, -1] + + @property + def id(self): + """Return the track IDs of the boxes (if available).""" + return self.data[:, -3] if self.is_track else None + + @property + @lru_cache(maxsize=2) # maxsize 1 should suffice + def xywh(self): + """Return the boxes in xywh format.""" + return ops.xyxy2xywh(self.xyxy) + + @property + @lru_cache(maxsize=2) + def xyxyn(self): + """Return the boxes in xyxy format normalized by original image size.""" + xyxy = self.xyxy.clone() if isinstance(self.xyxy, torch.Tensor) else np.copy(self.xyxy) + xyxy[..., [0, 2]] /= self.orig_shape[1] + xyxy[..., [1, 
3]] /= self.orig_shape[0] + return xyxy + + @property + @lru_cache(maxsize=2) + def xywhn(self): + """Return the boxes in xywh format normalized by original image size.""" + xywh = ops.xyxy2xywh(self.xyxy) + xywh[..., [0, 2]] /= self.orig_shape[1] + xywh[..., [1, 3]] /= self.orig_shape[0] + return xywh + + @property + def boxes(self): + """Return the raw bboxes tensor (deprecated).""" + LOGGER.warning("WARNING ⚠️ 'Boxes.boxes' is deprecated. Use 'Boxes.data' instead.") + return self.data + + +class Masks(BaseTensor): + """ + A class for storing and manipulating detection masks. + + Attributes: + segments (list): Deprecated property for segments (normalized). + xy (list): A list of segments in pixel coordinates. + xyn (list): A list of normalized segments. + + Methods: + cpu(): Returns the masks tensor on CPU memory. + numpy(): Returns the masks tensor as a numpy array. + cuda(): Returns the masks tensor on GPU memory. + to(device, dtype): Returns the masks tensor with the specified device and dtype. + """ + + def __init__(self, masks, orig_shape) -> None: + """Initialize the Masks class with the given masks tensor and original image shape.""" + if masks.ndim == 2: + masks = masks[None, :] + super().__init__(masks, orig_shape) + + @property + @lru_cache(maxsize=1) + def segments(self): + """Return segments (normalized). Deprecated; use xyn property instead.""" + LOGGER.warning( + "WARNING ⚠️ 'Masks.segments' is deprecated. Use 'Masks.xyn' for segments (normalized) and 'Masks.xy' for segments (pixels) instead." 
+ ) + return self.xyn + + @property + @lru_cache(maxsize=1) + def xyn(self): + """Return normalized segments.""" + return [ + ops.scale_coords(self.data.shape[1:], x, self.orig_shape, normalize=True) + for x in ops.masks2segments(self.data)] + + @property + @lru_cache(maxsize=1) + def xy(self): + """Return segments in pixel coordinates.""" + return [ + ops.scale_coords(self.data.shape[1:], x, self.orig_shape, normalize=False) + for x in ops.masks2segments(self.data)] + + @property + def masks(self): + """Return the raw masks tensor. Deprecated; use data attribute instead.""" + LOGGER.warning("WARNING ⚠️ 'Masks.masks' is deprecated. Use 'Masks.data' instead.") + return self.data + + +class Keypoints(BaseTensor): + """ + A class for storing and manipulating detection keypoints. + + Attributes: + xy (torch.Tensor): A collection of keypoints containing x, y coordinates for each detection. + xyn (torch.Tensor): A normalized version of xy with coordinates in the range [0, 1]. + conf (torch.Tensor): Confidence values associated with keypoints if available, otherwise None. + + Methods: + cpu(): Returns a copy of the keypoints tensor on CPU memory. + numpy(): Returns a copy of the keypoints tensor as a numpy array. + cuda(): Returns a copy of the keypoints tensor on GPU memory. + to(device, dtype): Returns a copy of the keypoints tensor with the specified device and dtype. 
+ """ + + def __init__(self, keypoints, orig_shape) -> None: + """Initializes the Keypoints object with detection keypoints and original image size.""" + if keypoints.ndim == 2: + keypoints = keypoints[None, :] + super().__init__(keypoints, orig_shape) + self.has_visible = self.data.shape[-1] == 3 + + @property + @lru_cache(maxsize=1) + def xy(self): + """Returns x, y coordinates of keypoints.""" + return self.data[..., :2] + + @property + @lru_cache(maxsize=1) + def xyn(self): + """Returns normalized x, y coordinates of keypoints.""" + xy = self.xy.clone() if isinstance(self.xy, torch.Tensor) else np.copy(self.xy) + xy[..., 0] /= self.orig_shape[1] + xy[..., 1] /= self.orig_shape[0] + return xy + + @property + @lru_cache(maxsize=1) + def conf(self): + """Returns confidence values of keypoints if available, else None.""" + return self.data[..., 2] if self.has_visible else None + + +class Probs(BaseTensor): + """ + A class for storing and manipulating classification predictions. + + Attributes: + top1 (int): Index of the top 1 class. + top5 (list[int]): Indices of the top 5 classes. + top1conf (torch.Tensor): Confidence of the top 1 class. + top5conf (torch.Tensor): Confidences of the top 5 classes. + + Methods: + cpu(): Returns a copy of the probs tensor on CPU memory. + numpy(): Returns a copy of the probs tensor as a numpy array. + cuda(): Returns a copy of the probs tensor on GPU memory. + to(): Returns a copy of the probs tensor with the specified device and dtype. + """ + + def __init__(self, probs, orig_shape=None) -> None: + super().__init__(probs, orig_shape) + + @property + @lru_cache(maxsize=1) + def top1(self): + """Return the index of top 1.""" + return int(self.data.argmax()) + + @property + @lru_cache(maxsize=1) + def top5(self): + """Return the indices of top 5.""" + return (-self.data).argsort(0)[:5].tolist() # this way works with both torch and numpy. 
class BaseTrainer:
    """
    BaseTrainer

    A base class for creating trainers.

    Attributes:
        args (SimpleNamespace): Configuration for the trainer.
        check_resume (method): Method to check if training should be resumed from a saved checkpoint.
        validator (BaseValidator): Validator instance.
        model (nn.Module): Model instance.
        callbacks (defaultdict): Dictionary of callbacks.
        save_dir (Path): Directory to save results.
        wdir (Path): Directory to save weights.
        last (Path): Path to last checkpoint.
        best (Path): Path to best checkpoint.
        save_period (int): Save checkpoint every x epochs (disabled if < 1).
        batch_size (int): Batch size for training.
        epochs (int): Number of epochs to train for.
        start_epoch (int): Starting epoch for training.
        device (torch.device): Device to use for training.
        amp (bool): Flag to enable AMP (Automatic Mixed Precision).
        scaler (amp.GradScaler): Gradient scaler for AMP.
        data (dict): Dataset description returned by check_cls_dataset() / check_det_dataset().
        trainset: Value of data['train'] (presumably a path or list of paths; confirm against the dataset checkers).
        testset: Value of data['val'], falling back to data['test'].
        ema (ModelEMA): EMA (Exponential Moving Average) of the model.
        lf (callable): Learning-rate factor as a function of the epoch, passed to LambdaLR.
        scheduler (torch.optim.lr_scheduler._LRScheduler): Learning rate scheduler.
        best_fitness (float): The best fitness value achieved.
        fitness (float): Current fitness value.
        loss (float): Current loss value.
        tloss (float): Running mean of the loss items over the current epoch.
        loss_names (list): List of loss names.
        csv (Path): Path to results CSV file.
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """
        Initializes the BaseTrainer class.

        Args:
            cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
            overrides (dict, optional): Configuration overrides. Defaults to None.
            _callbacks (dict, optional): Callbacks to use instead of the default callbacks. Defaults to None.

        Raises:
            RuntimeError: If the dataset referenced by the configuration cannot be checked/loaded.
        """
        self.args = get_cfg(cfg, overrides)
        self.device = select_device(self.args.device, self.args.batch)
        self.check_resume()  # may replace self.args with the arguments stored in the checkpoint
        self.validator = None
        self.model = None
        self.metrics = None
        self.plots = {}
        init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)

        # Dirs
        project = self.args.project or Path(SETTINGS['runs_dir']) / self.args.task
        name = self.args.name or f'{self.args.mode}'
        if hasattr(self.args, 'save_dir'):
            self.save_dir = Path(self.args.save_dir)
        else:
            self.save_dir = Path(
                increment_path(Path(project) / name, exist_ok=self.args.exist_ok if RANK in (-1, 0) else True))
        self.wdir = self.save_dir / 'weights'  # weights dir
        if RANK in (-1, 0):
            self.wdir.mkdir(parents=True, exist_ok=True)  # make dir
            self.args.save_dir = str(self.save_dir)
            yaml_save(self.save_dir / 'args.yaml', vars(self.args))  # save run args
        self.last, self.best = self.wdir / 'last.pt', self.wdir / 'best.pt'  # checkpoint paths
        self.save_period = self.args.save_period

        self.batch_size = self.args.batch
        self.epochs = self.args.epochs
        self.start_epoch = 0
        if RANK == -1:
            print_args(vars(self.args))

        # Device
        if self.device.type == 'cpu':
            self.args.workers = 0  # faster CPU training as time dominated by inference, not dataloading

        # Model and Dataset
        self.model = self.args.model
        try:
            if self.args.task == 'classify':
                self.data = check_cls_dataset(self.args.data)
            elif self.args.data.split('.')[-1] in ('yaml', 'yml') or self.args.task in ('detect', 'segment'):
                self.data = check_det_dataset(self.args.data)
                if 'yaml_file' in self.data:
                    self.args.data = self.data['yaml_file']  # for validating 'yolo train data=url.zip' usage
        except Exception as e:
            raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e

        self.trainset, self.testset = self.get_dataset(self.data)
        self.ema = None

        # Optimization utils init
        self.lf = None
        self.scheduler = None

        # Epoch level metrics
        self.best_fitness = None
        self.fitness = None
        self.loss = None
        self.tloss = None
        self.loss_names = ['Loss']
        self.csv = self.save_dir / 'results.csv'
        self.plot_idx = [0, 1, 2]

        # Callbacks
        self.callbacks = _callbacks or callbacks.get_default_callbacks()
        if RANK in (-1, 0):
            callbacks.add_integration_callbacks(self)

    def add_callback(self, event: str, callback):
        """
        Appends the given callback.
        """
        self.callbacks[event].append(callback)

    def set_callback(self, event: str, callback):
        """
        Overrides the existing callbacks with the given callback.
        """
        self.callbacks[event] = [callback]

    def run_callbacks(self, event: str):
        """Run all existing callbacks associated with a particular event."""
        for callback in self.callbacks.get(event, []):
            callback(self)

    def train(self):
        """
        Train the model, launching a DDP subprocess when more than one CUDA device is requested.

        The world size is derived from self.args.device; a single process trains directly through _do_train().
        """
        # Allow device='', device=None on Multi-GPU systems to default to device=0
        if isinstance(self.args.device, int) or self.args.device:  # i.e. device=0 or device=[0,1,2,3]
            world_size = torch.cuda.device_count()
        elif torch.cuda.is_available():  # i.e. device=None or device=''
            world_size = 1  # default to device 0
        else:  # i.e. device='cpu' or 'mps'
            world_size = 0

        # Run subprocess if DDP training, else train normally
        if world_size > 1 and 'LOCAL_RANK' not in os.environ:
            # Argument checks
            if self.args.rect:
                LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with Multi-GPU training, setting rect=False")
                self.args.rect = False
            # Command
            cmd, file = generate_ddp_command(world_size, self)
            try:
                LOGGER.info(f'DDP command: {cmd}')
                subprocess.run(cmd, check=True)
            finally:
                # Always clean up, whether the subprocess succeeded or raised (errors propagate unchanged)
                ddp_cleanup(self, str(file))
        else:
            self._do_train(world_size)

    def _setup_ddp(self, world_size):
        """Initializes and sets the DistributedDataParallel parameters for training."""
        torch.cuda.set_device(RANK)
        self.device = torch.device('cuda', RANK)
        LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
        os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
        dist.init_process_group(
            'nccl' if dist.is_nccl_available() else 'gloo',
            timeout=timedelta(seconds=10800),  # 3 hours
            rank=RANK,
            world_size=world_size)

    def _setup_train(self, world_size):
        """
        Builds dataloaders and optimizer on correct rank process.

        Args:
            world_size (int): Number of training processes/devices.

        Raises:
            SyntaxError: If batch=-1 (AutoBatch) is requested outside Single-GPU training.
        """
        # Model
        self.run_callbacks('on_pretrain_routine_start')
        ckpt = self.setup_model()
        self.model = self.model.to(self.device)
        self.set_model_attributes()
        # Check AMP
        self.amp = torch.tensor(self.args.amp).to(self.device)  # True or False
        if self.amp and RANK in (-1, 0):  # Single-GPU and DDP
            callbacks_backup = callbacks.default_callbacks.copy()  # backup callbacks as check_amp() resets them
            self.amp = torch.tensor(check_amp(self.model), device=self.device)
            callbacks.default_callbacks = callbacks_backup  # restore callbacks
        if RANK > -1 and world_size > 1:  # DDP
            dist.broadcast(self.amp, src=0)  # broadcast the tensor from rank 0 to all other ranks (returns None)
        self.amp = bool(self.amp)  # as boolean
        self.scaler = amp.GradScaler(enabled=self.amp)
        if world_size > 1:
            self.model = DDP(self.model, device_ids=[RANK])
        # Check imgsz
        gs = max(int(self.model.stride.max() if hasattr(self.model, 'stride') else 32), 32)  # grid size (max stride)
        self.args.imgsz = check_imgsz(self.args.imgsz, stride=gs, floor=gs, max_dim=1)
        # Batch size
        if self.batch_size == -1:
            if RANK == -1:  # single-GPU only, estimate best batch size
                self.args.batch = self.batch_size = check_train_batch_size(self.model, self.args.imgsz, self.amp)
            else:
                # The exception was previously constructed but never raised, so training continued with
                # batch_size=-1 and failed later with an unrelated error.
                raise SyntaxError('batch=-1 to use AutoBatch is only available in Single-GPU training. '
                                  'Please pass a valid batch size value for Multi-GPU DDP training, i.e. batch=16')

        # Dataloaders
        batch_size = self.batch_size // max(world_size, 1)
        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode='train')
        if RANK in (-1, 0):
            self.test_loader = self.get_dataloader(self.testset, batch_size=batch_size * 2, rank=-1, mode='val')
            self.validator = self.get_validator()
            metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix='val')
            self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))  # TODO: init metrics for plot_results()?
            self.ema = ModelEMA(self.model)
            if self.args.plots:
                self.plot_training_labels()

        # Optimizer
        self.accumulate = max(round(self.args.nbs / self.batch_size), 1)  # accumulate loss before optimizing
        weight_decay = self.args.weight_decay * self.batch_size * self.accumulate / self.args.nbs  # scale weight_decay
        iterations = math.ceil(len(self.train_loader.dataset) / max(self.batch_size, self.args.nbs)) * self.epochs
        self.optimizer = self.build_optimizer(model=self.model,
                                              name=self.args.optimizer,
                                              lr=self.args.lr0,
                                              momentum=self.args.momentum,
                                              decay=weight_decay,
                                              iterations=iterations)
        # Scheduler
        if self.args.cos_lr:
            self.lf = one_cycle(1, self.args.lrf, self.epochs)  # cosine 1->hyp['lrf']
        else:
            self.lf = lambda x: (1 - x / self.epochs) * (1.0 - self.args.lrf) + self.args.lrf  # linear
        self.scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=self.lf)
        self.stopper, self.stop = EarlyStopping(patience=self.args.patience), False
        self.resume_training(ckpt)
        self.scheduler.last_epoch = self.start_epoch - 1  # do not move
        self.run_callbacks('on_pretrain_routine_end')

    def _do_train(self, world_size=1):
        """
        Run the training loop: set up (DDP and) training, iterate over epochs, validate, checkpoint and
        finally evaluate and plot if specified by arguments.

        Args:
            world_size (int): Number of training processes/devices. Defaults to 1.
        """
        if world_size > 1:
            self._setup_ddp(world_size)

        self._setup_train(world_size)

        self.epoch_time = None
        self.epoch_time_start = time.time()
        self.train_time_start = time.time()
        nb = len(self.train_loader)  # number of batches
        nw = max(round(self.args.warmup_epochs *
                       nb), 100) if self.args.warmup_epochs > 0 else -1  # number of warmup iterations
        last_opt_step = -1
        self.run_callbacks('on_train_start')
        LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
                    f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
                    f"Logging results to {colorstr('bold', self.save_dir)}\n"
                    f'Starting training for {self.epochs} epochs...')
        if self.args.close_mosaic:
            base_idx = (self.epochs - self.args.close_mosaic) * nb
            self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
        epoch = self.epochs  # predefine for resume fully trained model edge cases
        for epoch in range(self.start_epoch, self.epochs):
            self.epoch = epoch
            self.run_callbacks('on_train_epoch_start')
            self.model.train()
            if RANK != -1:
                self.train_loader.sampler.set_epoch(epoch)
            pbar = enumerate(self.train_loader)
            # Update dataloader attributes (optional)
            if epoch == (self.epochs - self.args.close_mosaic):
                LOGGER.info('Closing dataloader mosaic')
                if hasattr(self.train_loader.dataset, 'mosaic'):
                    self.train_loader.dataset.mosaic = False
                if hasattr(self.train_loader.dataset, 'close_mosaic'):
                    self.train_loader.dataset.close_mosaic(hyp=self.args)
                self.train_loader.reset()

            if RANK in (-1, 0):
                LOGGER.info(self.progress_string())
                pbar = tqdm(enumerate(self.train_loader), total=nb, bar_format=TQDM_BAR_FORMAT)
            self.tloss = None
            self.optimizer.zero_grad()
            for i, batch in pbar:
                self.run_callbacks('on_train_batch_start')
                # Warmup
                ni = i + nb * epoch  # number of batches processed since the start of training
                if ni <= nw:
                    xi = [0, nw]  # x interp
                    self.accumulate = max(1, np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round())
                    for j, x in enumerate(self.optimizer.param_groups):
                        # Bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                        x['lr'] = np.interp(
                            ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x['initial_lr'] * self.lf(epoch)])
                        if 'momentum' in x:
                            x['momentum'] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])

                # Forward
                with torch.cuda.amp.autocast(self.amp):
                    batch = self.preprocess_batch(batch)
                    self.loss, self.loss_items = self.model(batch)
                    if RANK != -1:
                        self.loss *= world_size
                    # Running mean of the loss items over the batches seen so far this epoch
                    self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
                        else self.loss_items

                # Backward
                self.scaler.scale(self.loss).backward()

                # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
                if ni - last_opt_step >= self.accumulate:
                    self.optimizer_step()
                    last_opt_step = ni

                # Log
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
                losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
                if RANK in (-1, 0):
                    pbar.set_description(
                        ('%11s' * 2 + '%11.4g' * (2 + loss_len)) %
                        (f'{epoch + 1}/{self.epochs}', mem, *losses, batch['cls'].shape[0], batch['img'].shape[-1]))
                    self.run_callbacks('on_batch_end')
                    if self.args.plots and ni in self.plot_idx:
                        self.plot_training_samples(batch, ni)

                self.run_callbacks('on_train_batch_end')

            self.lr = {f'lr/pg{ir}': x['lr'] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers

            self.scheduler.step()
            self.run_callbacks('on_train_epoch_end')

            if RANK in (-1, 0):

                # Validation
                self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
                final_epoch = (epoch + 1 == self.epochs) or self.stopper.possible_stop

                if self.args.val or final_epoch:
                    self.metrics, self.fitness = self.validate()
                self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})
                self.stop = self.stopper(epoch + 1, self.fitness)

                # Save model
                if self.args.save or (epoch + 1 == self.epochs):
                    self.save_model()
                    self.run_callbacks('on_model_save')

            tnow = time.time()
            self.epoch_time = tnow - self.epoch_time_start
            self.epoch_time_start = tnow
            self.run_callbacks('on_fit_epoch_end')
            torch.cuda.empty_cache()  # clears GPU vRAM at end of epoch, can help with out of memory errors

            # Early Stopping
            if RANK != -1:  # if DDP training
                broadcast_list = [self.stop if RANK == 0 else None]
                dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
                if RANK != 0:
                    self.stop = broadcast_list[0]
            if self.stop:
                break  # must break all DDP ranks

        if RANK in (-1, 0):
            # Do final val with best.pt
            LOGGER.info(f'\n{epoch - self.start_epoch + 1} epochs completed in '
                        f'{(time.time() - self.train_time_start) / 3600:.3f} hours.')
            self.final_eval()
            if self.args.plots:
                self.plot_metrics()
            self.run_callbacks('on_train_end')
        torch.cuda.empty_cache()
        self.run_callbacks('teardown')

    def save_model(self):
        """
        Save model checkpoints based on various conditions.

        Always writes the 'last' checkpoint; also writes 'best' when the current fitness equals the best
        fitness, and a per-epoch checkpoint when save_period is enabled and divides the current epoch.
        """
        ckpt = {
            'epoch': self.epoch,
            'best_fitness': self.best_fitness,
            'model': deepcopy(de_parallel(self.model)).half(),
            'ema': deepcopy(self.ema.ema).half(),
            'updates': self.ema.updates,
            'optimizer': self.optimizer.state_dict(),
            'train_args': vars(self.args),  # save as dict
            'date': datetime.now().isoformat(),
            'version': __version__}

        # Use dill (if exists) to serialize the lambda functions where pickle does not do this
        try:
            import dill as pickle
        except ImportError:
            import pickle

        # Save last, best and delete
        torch.save(ckpt, self.last, pickle_module=pickle)
        if self.best_fitness == self.fitness:
            torch.save(ckpt, self.best, pickle_module=pickle)
        if (self.epoch > 0) and (self.save_period > 0) and (self.epoch % self.save_period == 0):
            torch.save(ckpt, self.wdir / f'epoch{self.epoch}.pt', pickle_module=pickle)
        del ckpt

    @staticmethod
    def get_dataset(data):
        """
        Get train and val entries from the data dict.

        Args:
            data (dict): Dataset description; must contain a 'train' key.

        Returns:
            (tuple): (data['train'], data['val'] or data['test']); the second item is None if neither exists.

        Raises:
            KeyError: If 'train' is missing from data.
        """
        return data['train'], data.get('val') or data.get('test')

    def setup_model(self):
        """
        load/create/download model for any task.

        Returns:
            The loaded checkpoint when self.model refers to a '.pt' file, otherwise None (including the case
            where self.model is already a torch.nn.Module).
        """
        if isinstance(self.model, torch.nn.Module):  # if model is loaded beforehand. No setup needed
            return

        model, weights = self.model, None
        ckpt = None
        if str(model).endswith('.pt'):
            weights, ckpt = attempt_load_one_weight(model)
            cfg = ckpt['model'].yaml
        else:
            cfg = model
        self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1)  # calls Model(cfg, weights)
        return ckpt

    def optimizer_step(self):
        """Perform a single step of the training optimizer with gradient clipping and EMA update."""
        self.scaler.unscale_(self.optimizer)  # unscale gradients
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)  # clip gradients
        self.scaler.step(self.optimizer)
        self.scaler.update()
        self.optimizer.zero_grad()
        if self.ema:
            self.ema.update(self.model)

    def preprocess_batch(self, batch):
        """
        Allows custom preprocessing model inputs and ground truths depending on task type.
        """
        return batch

    def validate(self):
        """
        Runs validation on test set using self.validator. The returned dict is expected to contain "fitness" key.

        Returns:
            (tuple): (metrics, fitness), where 'fitness' has been removed from metrics.
        """
        metrics = self.validator(self)
        fitness = metrics.pop('fitness', -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
        # NOTE: a best_fitness of 0.0 is falsy and is therefore also replaced by the current fitness
        if not self.best_fitness or self.best_fitness < fitness:
            self.best_fitness = fitness
        return metrics, fitness

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Get model and raise NotImplementedError for loading cfg files."""
        raise NotImplementedError("This task trainer doesn't support loading cfg files")

    def get_validator(self):
        """Raises NotImplementedError; task trainers must override this to return their validator."""
        raise NotImplementedError('get_validator function not implemented in trainer')

    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
        """
        Returns dataloader derived from torch.data.Dataloader. Must be implemented by task trainers.
        """
        raise NotImplementedError('get_dataloader function not implemented in trainer')

    def build_dataset(self, img_path, mode='train', batch=None):
        """Build dataset. Must be implemented by task trainers."""
        raise NotImplementedError('build_dataset function not implemented in trainer')

    def label_loss_items(self, loss_items=None, prefix='train'):
        """
        Returns a loss dict with labelled training loss items tensor, or the list of loss keys when
        loss_items is None.
        """
        # Not needed for classification but necessary for segmentation & detection
        return {'loss': loss_items} if loss_items is not None else ['loss']

    def set_model_attributes(self):
        """
        To set or update model parameters before training.
        """
        self.model.names = self.data['names']

    def build_targets(self, preds, targets):
        """Builds target tensors for training YOLO model."""
        pass

    def progress_string(self):
        """Returns a string describing training progress."""
        return ''

    # TODO: may need to put these following functions into callback
    def plot_training_samples(self, batch, ni):
        """Plots training samples during YOLOv5 training."""
        pass

    def plot_training_labels(self):
        """Plots training labels for YOLO model."""
        pass

    def save_metrics(self, metrics):
        """
        Saves training metrics to a CSV file.

        A header row is written first when the CSV file does not exist yet; one row is appended per call.
        """
        keys, vals = list(metrics.keys()), list(metrics.values())
        n = len(metrics) + 1  # number of cols
        s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n')  # header
        with open(self.csv, 'a') as f:
            f.write(s + ('%23.5g,' * n % tuple([self.epoch] + vals)).rstrip(',') + '\n')

    def plot_metrics(self):
        """Plot and display metrics visually."""
        pass

    def on_plot(self, name, data=None):
        """Registers plots (e.g. to be consumed in callbacks)"""
        self.plots[name] = {'data': data, 'timestamp': time.time()}

    def final_eval(self):
        """
        Performs final evaluation: strips the optimizer from the last and best checkpoints (when they exist)
        and validates the best checkpoint.
        """
        for f in self.last, self.best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is self.best:
                    LOGGER.info(f'\nValidating {f}...')
                    self.metrics = self.validator(model=f)
                    self.metrics.pop('fitness', None)
                    self.run_callbacks('on_fit_epoch_end')

    def check_resume(self):
        """
        Check if resume checkpoint exists and update arguments accordingly.

        Raises:
            FileNotFoundError: If resuming is requested but anything in the checkpoint lookup/loading fails.
        """
        resume = self.args.resume
        if resume:
            try:
                exists = isinstance(resume, (str, Path)) and Path(resume).exists()
                last = Path(check_file(resume) if exists else get_latest_run())

                # Check that resume data YAML exists, otherwise strip to force re-download of dataset
                ckpt_args = attempt_load_weights(last).args
                if not Path(ckpt_args['data']).exists():
                    ckpt_args['data'] = self.args.data

                self.args = get_cfg(ckpt_args)
                self.args.model, resume = str(last), True  # reinstate
            except Exception as e:
                raise FileNotFoundError('Resume checkpoint not found. Please pass a valid checkpoint to resume from, '
                                        "i.e. 'yolo train resume model=path/to/last.pt'") from e
        self.resume = resume

    def resume_training(self, ckpt):
        """
        Resume YOLO training from given epoch and best fitness.

        Args:
            ckpt (dict | None): Loaded checkpoint; nothing is done when it is None.
        """
        if ckpt is None:
            return
        best_fitness = 0.0
        start_epoch = ckpt['epoch'] + 1
        if ckpt['optimizer'] is not None:
            self.optimizer.load_state_dict(ckpt['optimizer'])  # optimizer
            best_fitness = ckpt['best_fitness']
        if self.ema and ckpt.get('ema'):
            self.ema.ema.load_state_dict(ckpt['ema'].float().state_dict())  # EMA
            self.ema.updates = ckpt['updates']
        if self.resume:
            assert start_epoch > 0, \
                f'{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n' \
                f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
            LOGGER.info(
                f'Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs')
        if self.epochs < start_epoch:
            LOGGER.info(
                f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs.")
            self.epochs += ckpt['epoch']  # finetune additional epochs
        self.best_fitness = best_fitness
        self.start_epoch = start_epoch
        if start_epoch > (self.epochs - self.args.close_mosaic):
            LOGGER.info('Closing dataloader mosaic')
            if hasattr(self.train_loader.dataset, 'mosaic'):
                self.train_loader.dataset.mosaic = False
            if hasattr(self.train_loader.dataset, 'close_mosaic'):
                self.train_loader.dataset.close_mosaic(hyp=self.args)

    def build_optimizer(self, model, name='auto', lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
        """
        Constructs an optimizer for the given model, based on the specified optimizer name, learning rate,
        momentum, weight decay, and number of iterations.

        Args:
            model (torch.nn.Module): The model for which to build an optimizer.
            name (str, optional): The name of the optimizer to use. If 'auto', the optimizer is selected
                based on the number of iterations. Default: 'auto'.
            lr (float, optional): The learning rate for the optimizer. Default: 0.001.
            momentum (float, optional): The momentum factor for the optimizer. Default: 0.9.
            decay (float, optional): The weight decay for the optimizer. Default: 1e-5.
            iterations (float, optional): The number of iterations, which determines the optimizer if
                name is 'auto'. Default: 1e5.

        Returns:
            (torch.optim.Optimizer): The constructed optimizer.

        Raises:
            NotImplementedError: If name is not one of the supported optimizers.
        """

        g = [], [], []  # optimizer parameter groups
        bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k)  # normalization layers, i.e. BatchNorm2d()
        if name == 'auto':
            nc = getattr(model, 'nc', 10)  # number of classes
            lr_fit = round(0.002 * 5 / (4 + nc), 6)  # lr0 fit equation to 6 decimal places
            name, lr, momentum = ('SGD', 0.01, 0.9) if iterations > 10000 else ('AdamW', lr_fit, 0.9)
            self.args.warmup_bias_lr = 0.0  # no higher than 0.01 for Adam

        for module_name, module in model.named_modules():
            for param_name, param in module.named_parameters(recurse=False):
                fullname = f'{module_name}.{param_name}' if module_name else param_name
                if 'bias' in fullname:  # bias (no decay)
                    g[2].append(param)
                elif isinstance(module, bn):  # weight (no decay)
                    g[1].append(param)
                else:  # weight (with decay)
                    g[0].append(param)

        if name in ('Adam', 'Adamax', 'AdamW', 'NAdam', 'RAdam'):
            optimizer = getattr(optim, name, optim.Adam)(g[2], lr=lr, betas=(momentum, 0.999), weight_decay=0.0)
        elif name == 'RMSProp':
            optimizer = optim.RMSprop(g[2], lr=lr, momentum=momentum)
        elif name == 'SGD':
            optimizer = optim.SGD(g[2], lr=lr, momentum=momentum, nesterov=True)
        else:
            # Message lists every name accepted above and keeps the two sentences separated
            raise NotImplementedError(
                f"Optimizer '{name}' not found in list of available optimizers "
                f'[Adam, Adamax, AdamW, NAdam, RAdam, RMSProp, SGD, auto]. '
                'To request support for additional optimizers please visit https://github.com/ultralytics/ultralytics.')

        optimizer.add_param_group({'params': g[0], 'weight_decay': decay})  # add g0 with weight_decay
        optimizer.add_param_group({'params': g[1], 'weight_decay': 0.0})  # add g1 (BatchNorm2d weights)
        LOGGER.info(
            f"{colorstr('optimizer:')} {type(optimizer).__name__}(lr={lr}, momentum={momentum}) with parameter groups "
            f'{len(g[1])} weight(decay=0.0), {len(g[0])} weight(decay={decay}), {len(g[2])} bias(decay=0.0)')
        return optimizer
ultralytics.utils.torch_utils import de_parallel, select_device, smart_inference_mode + + +class BaseValidator: + """ + BaseValidator + + A base class for creating validators. + + Attributes: + dataloader (DataLoader): Dataloader to use for validation. + pbar (tqdm): Progress bar to update during validation. + args (SimpleNamespace): Configuration for the validator. + model (nn.Module): Model to validate. + data (dict): Data dictionary. + device (torch.device): Device to use for validation. + batch_i (int): Current batch index. + training (bool): Whether the model is in training mode. + speed (float): Batch processing speed in seconds. + jdict (dict): Dictionary to store validation results. + save_dir (Path): Directory to save results. + """ + + def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None): + """ + Initializes a BaseValidator instance. + + Args: + dataloader (torch.utils.data.DataLoader): Dataloader to be used for validation. + save_dir (Path): Directory to save results. + pbar (tqdm.tqdm): Progress bar for displaying progress. + args (SimpleNamespace): Configuration for the validator. 
+ """ + self.dataloader = dataloader + self.pbar = pbar + self.args = args or get_cfg(DEFAULT_CFG) + self.model = None + self.data = None + self.device = None + self.batch_i = None + self.training = True + self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0} + self.jdict = None + + project = self.args.project or Path(SETTINGS['runs_dir']) / self.args.task + name = self.args.name or f'{self.args.mode}' + self.save_dir = save_dir or increment_path(Path(project) / name, + exist_ok=self.args.exist_ok if RANK in (-1, 0) else True) + (self.save_dir / 'labels' if self.args.save_txt else self.save_dir).mkdir(parents=True, exist_ok=True) + + if self.args.conf is None: + self.args.conf = 0.001 # default conf=0.001 + + self.plots = {} + self.callbacks = _callbacks or callbacks.get_default_callbacks() + + @smart_inference_mode() + def __call__(self, trainer=None, model=None): + """ + Supports validation of a pre-trained model if passed or a model being trained + if trainer is passed (trainer gets priority). 
+ """ + self.training = trainer is not None + augment = self.args.augment and (not self.training) + if self.training: + self.device = trainer.device + self.data = trainer.data + model = trainer.ema.ema or trainer.model + self.args.half = self.device.type != 'cpu' # force FP16 val during training + model = model.half() if self.args.half else model.float() + self.model = model + self.loss = torch.zeros_like(trainer.loss_items, device=trainer.device) + self.args.plots = trainer.stopper.possible_stop or (trainer.epoch == trainer.epochs - 1) + model.eval() + else: + callbacks.add_integration_callbacks(self) + self.run_callbacks('on_val_start') + assert model is not None, 'Either trainer or model is needed for validation' + model = AutoBackend(model, + device=select_device(self.args.device, self.args.batch), + dnn=self.args.dnn, + data=self.args.data, + fp16=self.args.half) + self.model = model + self.device = model.device # update device + self.args.half = model.fp16 # update half + stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine + imgsz = check_imgsz(self.args.imgsz, stride=stride) + if engine: + self.args.batch = model.batch_size + elif not pt and not jit: + self.args.batch = 1 # export.py models default to batch-size 1 + LOGGER.info(f'Forcing batch=1 square inference (1,3,{imgsz},{imgsz}) for non-PyTorch models') + + if isinstance(self.args.data, str) and self.args.data.split('.')[-1] in ('yaml', 'yml'): + self.data = check_det_dataset(self.args.data) + elif self.args.task == 'classify': + self.data = check_cls_dataset(self.args.data, split=self.args.split) + else: + raise FileNotFoundError(emojis(f"Dataset '{self.args.data}' for task={self.args.task} not found ❌")) + + if self.device.type == 'cpu': + self.args.workers = 0 # faster CPU val as time dominated by inference, not dataloading + if not pt: + self.args.rect = False + self.dataloader = self.dataloader or self.get_dataloader(self.data.get(self.args.split), self.args.batch) + + 
model.eval() + model.warmup(imgsz=(1 if pt else self.args.batch, 3, imgsz, imgsz)) # warmup + + dt = Profile(), Profile(), Profile(), Profile() + n_batches = len(self.dataloader) + desc = self.get_desc() + # NOTE: keeping `not self.training` in tqdm will eliminate pbar after segmentation evaluation during training, + # which may affect classification task since this arg is in yolov5/classify/val.py. + # bar = tqdm(self.dataloader, desc, n_batches, not self.training, bar_format=TQDM_BAR_FORMAT) + bar = tqdm(self.dataloader, desc, n_batches, bar_format=TQDM_BAR_FORMAT) + self.init_metrics(de_parallel(model)) + self.jdict = [] # empty before each val + for batch_i, batch in enumerate(bar): + self.run_callbacks('on_val_batch_start') + self.batch_i = batch_i + # Preprocess + with dt[0]: + batch = self.preprocess(batch) + + # Inference + with dt[1]: + preds = model(batch['img'], augment=augment) + + # Loss + with dt[2]: + if self.training: + self.loss += model.loss(batch, preds)[1] + + # Postprocess + with dt[3]: + preds = self.postprocess(preds) + + self.update_metrics(preds, batch) + if self.args.plots and batch_i < 3: + self.plot_val_samples(batch, batch_i) + self.plot_predictions(batch, preds, batch_i) + + self.run_callbacks('on_val_batch_end') + stats = self.get_stats() + self.check_stats(stats) + self.speed = dict(zip(self.speed.keys(), (x.t / len(self.dataloader.dataset) * 1E3 for x in dt))) + self.finalize_metrics() + self.print_results() + self.run_callbacks('on_val_end') + if self.training: + model.float() + results = {**stats, **trainer.label_loss_items(self.loss.cpu() / len(self.dataloader), prefix='val')} + return {k: round(float(v), 5) for k, v in results.items()} # return results as 5 decimal place floats + else: + LOGGER.info('Speed: %.1fms preprocess, %.1fms inference, %.1fms loss, %.1fms postprocess per image' % + tuple(self.speed.values())) + if self.args.save_json and self.jdict: + with open(str(self.save_dir / 'predictions.json'), 'w') as f: + 
LOGGER.info(f'Saving {f.name}...') + json.dump(self.jdict, f) # flatten and save + stats = self.eval_json(stats) # update stats + if self.args.plots or self.args.save_json: + LOGGER.info(f"Results saved to {colorstr('bold', self.save_dir)}") + return stats + + def add_callback(self, event: str, callback): + """Appends the given callback.""" + self.callbacks[event].append(callback) + + def run_callbacks(self, event: str): + """Runs all callbacks associated with a specified event.""" + for callback in self.callbacks.get(event, []): + callback(self) + + def get_dataloader(self, dataset_path, batch_size): + """Get data loader from dataset path and batch size.""" + raise NotImplementedError('get_dataloader function not implemented for this validator') + + def build_dataset(self, img_path): + """Build dataset""" + raise NotImplementedError('build_dataset function not implemented in validator') + + def preprocess(self, batch): + """Preprocesses an input batch.""" + return batch + + def postprocess(self, preds): + """Describes and summarizes the purpose of 'postprocess()' but no details mentioned.""" + return preds + + def init_metrics(self, model): + """Initialize performance metrics for the YOLO model.""" + pass + + def update_metrics(self, preds, batch): + """Updates metrics based on predictions and batch.""" + pass + + def finalize_metrics(self, *args, **kwargs): + """Finalizes and returns all metrics.""" + pass + + def get_stats(self): + """Returns statistics about the model's performance.""" + return {} + + def check_stats(self, stats): + """Checks statistics.""" + pass + + def print_results(self): + """Prints the results of the model's predictions.""" + pass + + def get_desc(self): + """Get description of the YOLO model.""" + pass + + @property + def metric_keys(self): + """Returns the metric keys used in YOLO training/validation.""" + return [] + + def on_plot(self, name, data=None): + """Registers plots (e.g. 
def eval_json(self, stats):
    """
    Evaluate saved JSON predictions and return the (possibly updated) statistics.

    The base implementation performs no evaluation and hands `stats` back unchanged. The validation
    entry point rebinds its stats to this method's return value before returning them, so returning
    None here would throw away the metrics that were already computed. Subclasses that run a JSON-based
    evaluation should override this method and return the updated stats.

    Args:
        stats (dict): Statistics gathered so far.

    Returns:
        (dict): The same `stats` object, unmodified.
    """
    return stats
def export_model(model_id='', format='torchscript'):
    """
    Ask Ultralytics HUB to start exporting a model to a single format.

    Args:
        model_id (str, optional): Identifier of the HUB model to export.
        format (str, optional): Target export format; must be one of the values from `export_fmts_hub()`.

    Raises:
        AssertionError: If `format` is not supported, or the HUB API does not answer with status 200.
    """
    supported = export_fmts_hub()
    assert format in supported, f"Unsupported export format '{format}', valid formats are {supported}"

    endpoint = f'{HUB_API_ROOT}/v1/models/{model_id}/export'
    payload = {'format': format}
    auth_headers = {'x-api-key': Auth().api_key}
    resp = requests.post(endpoint, json=payload, headers=auth_headers)

    assert resp.status_code == 200, f'{PREFIX}{format} export failure {resp.status_code} {resp.reason}'
    LOGGER.info(f'{PREFIX}{format} export started ✅')
def check_dataset(path='', task='detect'):
    """
    Error-check a HUB dataset Zip file before it is uploaded to the HUB.

    Args:
        path (str, optional): Path to data.zip (with data.yaml inside data.zip). Defaults to ''.
        task (str, optional): Dataset task. Options are 'detect', 'segment', 'pose', 'classify'. Defaults to 'detect'.

    Example:
        ```python
        from ultralytics.hub import check_dataset

        check_dataset('path/to/coco8.zip', task='detect')  # detect dataset
        check_dataset('path/to/coco8-seg.zip', task='segment')  # segment dataset
        check_dataset('path/to/coco8-pose.zip', task='pose')  # pose dataset
        ```
    """
    dataset_stats = HUBDatasetStats(path=path, task=task)
    dataset_stats.get_json()  # result is discarded; the success message below is only reached if this does not raise
    LOGGER.info(f'Checks completed correctly ✅. Upload this dataset to {HUB_WEB_ROOT}/datasets/.')
class Auth:
    """
    Holds the credentials used to talk to the Ultralytics HUB API and knows how to obtain them.

    Credentials are either an API key (sent as an 'x-api-key' header) or an id token fetched from browser
    cookies (sent as a Bearer 'authorization' header). Unset credentials are represented by a falsy value.
    """
    # Class-level defaults; instances overwrite these once a credential is obtained.
    id_token = api_key = model_key = False

    def __init__(self, api_key='', verbose=False):
        """
        Initialize the Auth class with an optional API key.

        Args:
            api_key (str, optional): May be an API key or a combination API key and model ID, i.e. key_id
            verbose (bool, optional): Whether to log the outcome of the authentication attempt.
        """
        # Split the input API key in case it contains a combined key_model and keep only the API key part
        api_key = api_key.split('_')[0]

        # Set API key attribute as value passed or SETTINGS API key if none passed
        self.api_key = api_key or SETTINGS.get('api_key', '')

        # If an API key is provided
        if self.api_key:
            # If the provided API key matches the API key in the SETTINGS
            if self.api_key == SETTINGS.get('api_key'):
                # Log that the user is already logged in
                if verbose:
                    LOGGER.info(f'{PREFIX}Authenticated ✅')
                return
            else:
                # Attempt to authenticate with the provided API key
                success = self.authenticate()
        # If the API key is not provided and the environment is a Google Colab notebook
        elif is_colab():
            # Attempt to authenticate using browser cookies
            success = self.auth_with_cookies()
        else:
            # Request an API key
            success = self.request_api_key()

        # Update SETTINGS with the new API key after successful authentication
        if success:
            SETTINGS.update({'api_key': self.api_key})
            # Log that the new login was successful
            if verbose:
                LOGGER.info(f'{PREFIX}New authentication successful ✅')
        elif verbose:
            LOGGER.info(f'{PREFIX}Retrieve API key from {API_KEY_URL}')

    def request_api_key(self, max_attempts=3):
        """
        Prompt the user to input their API key, retrying until one authenticates.

        Args:
            max_attempts (int, optional): Maximum number of prompts before giving up.

        Returns:
            (bool): True as soon as an entered key authenticates successfully.

        Raises:
            ConnectionError: If none of the `max_attempts` entered keys authenticates.
        """
        import getpass
        for attempts in range(max_attempts):
            LOGGER.info(f'{PREFIX}Login. Attempt {attempts + 1} of {max_attempts}')
            input_key = getpass.getpass(f'Enter API key from {API_KEY_URL} ')
            self.api_key = input_key.split('_')[0]  # remove model id if present
            if self.authenticate():
                return True
        raise ConnectionError(emojis(f'{PREFIX}Failed to authenticate ❌'))

    def authenticate(self) -> bool:
        """
        Attempt to authenticate with the server using either id_token or API key.

        On failure both `id_token` and `api_key` are reset to False and a warning is logged.

        Returns:
            bool: True if authentication is successful, False otherwise.
        """
        # NOTE(review): only the builtin ConnectionError is handled here. Exceptions raised by `requests`
        # itself (network failure, non-JSON body) are presumably not subclasses of it and would propagate
        # to the caller - confirm whether that is intended.
        try:
            header = self.get_auth_header()
            if header:
                r = requests.post(f'{HUB_API_ROOT}/v1/auth', headers=header)
                if not r.json().get('success', False):
                    raise ConnectionError('Unable to authenticate.')
                return True
            raise ConnectionError('User has not authenticated locally.')
        except ConnectionError:
            self.id_token = self.api_key = False  # reset invalid
            LOGGER.warning(f'{PREFIX}Invalid API key ⚠️')
            return False

    def auth_with_cookies(self) -> bool:
        """
        Attempt to fetch authentication via cookies and set id_token.
        User must be logged in to HUB and running in a supported browser.

        Returns:
            bool: True if authentication is successful, False otherwise.
        """
        if not is_colab():
            return False  # Currently only works with Colab
        try:
            authn = request_with_credentials(f'{HUB_API_ROOT}/v1/auth/auto')
            if authn.get('success', False):
                self.id_token = authn.get('data', {}).get('idToken', None)
                # Only report success when the fetched token is actually accepted by the server; otherwise
                # the caller would persist a reset (False) API key and log a successful login.
                if self.authenticate():
                    return True
            raise ConnectionError('Unable to fetch browser authentication details.')
        except ConnectionError:
            self.id_token = False  # reset invalid
            return False

    def get_auth_header(self):
        """
        Get the authentication header for making API requests.

        The id token takes precedence over the API key when both are set.

        Returns:
            (dict): The authentication header if id_token or API key is set, None otherwise.
        """
        if self.id_token:
            return {'authorization': f'Bearer {self.id_token}'}
        elif self.api_key:
            return {'x-api-key': self.api_key}
        else:
            return None

    def get_state(self) -> bool:
        """
        Get the authentication state.

        Returns:
            bool: True if either id_token or API key is set, False otherwise.
        """
        # Coerce to a real bool so the credential string itself is never handed back to the caller.
        return bool(self.id_token or self.api_key)

    def set_api_key(self, key: str):
        """
        Set the API key for authentication.

        Args:
            key (str): The API key string.
        """
        self.api_key = key
def _register_signal_handlers(self):
    """Install `self._handle_signal` as the handler for SIGTERM and then SIGINT."""
    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, self._handle_signal)
def upload_metrics(self):
    """
    Post a snapshot of the queued metrics to the model's HUB API URL.

    A copy of `self.metrics_queue` is sent; this method does not clear or modify the queue itself.
    """
    queued_metrics = self.metrics_queue.copy()
    smart_request('post',
                  self.api_url,
                  json={
                      'metrics': queued_metrics,
                      'type': 'metrics'},
                  headers=self.auth_header,
                  code=2)
@threaded
def _start_heartbeat(self):
    """
    Begin a threaded heartbeat loop to report the agent's status to Ultralytics HUB.

    While `self.alive` is truthy, a heartbeat is posted and the agent id from the reply is stored in
    `self.agent_id`; the loop then sleeps for `self.rate_limits['heartbeat']` seconds. A heartbeat that
    yields no usable reply leaves the previously stored agent id untouched instead of ending the loop.
    """
    while self.alive:
        r = smart_request('post',
                          f'{HUB_API_ROOT}/v1/agent/heartbeat/models/{self.model_id}',
                          json={
                              'agent': AGENT_NAME,
                              'agentId': self.agent_id},
                          headers=self.auth_header,
                          retry=0,
                          code=5,
                          thread=False)  # already in a thread
        # smart_request may hand back None when no request completed, and the reply body is not
        # guaranteed to be a JSON object. An unguarded r.json() would raise inside this thread and
        # silently stop all further heartbeats.
        if r is not None:
            try:
                self.agent_id = r.json().get('data', {}).get('agentId', None)
            except (AttributeError, ValueError):
                pass  # unusable reply: keep the current agent id and try again on the next beat
        sleep(self.rate_limits['heartbeat'])
+import random +import sys +import threading +import time +from pathlib import Path + +import requests +from tqdm import tqdm + +from ultralytics.utils import (ENVIRONMENT, LOGGER, ONLINE, RANK, SETTINGS, TESTS_RUNNING, TQDM_BAR_FORMAT, TryExcept, + __version__, colorstr, get_git_origin_url, is_colab, is_git_dir, is_pip_package) +from ultralytics.utils.downloads import GITHUB_ASSET_NAMES + +PREFIX = colorstr('Ultralytics HUB: ') +HELP_MSG = 'If this issue persists please visit https://github.com/ultralytics/hub/issues for assistance.' +HUB_API_ROOT = os.environ.get('ULTRALYTICS_HUB_API', 'https://api.ultralytics.com') +HUB_WEB_ROOT = os.environ.get('ULTRALYTICS_HUB_WEB', 'https://hub.ultralytics.com') + + +def request_with_credentials(url: str) -> any: + """ + Make an AJAX request with cookies attached in a Google Colab environment. + + Args: + url (str): The URL to make the request to. + + Returns: + (any): The response data from the AJAX request. + + Raises: + OSError: If the function is not run in a Google Colab environment. + """ + if not is_colab(): + raise OSError('request_with_credentials() must run in a Colab environment') + from google.colab import output # noqa + from IPython import display # noqa + display.display( + display.Javascript(""" + window._hub_tmp = new Promise((resolve, reject) => { + const timeout = setTimeout(() => reject("Failed authenticating existing browser session"), 5000) + fetch("%s", { + method: 'POST', + credentials: 'include' + }) + .then((response) => resolve(response.json())) + .then((json) => { + clearTimeout(timeout); + }).catch((err) => { + clearTimeout(timeout); + reject(err); + }); + }); + """ % url)) + return output.eval_js('_hub_tmp') + + +def requests_with_progress(method, url, **kwargs): + """ + Make an HTTP request using the specified method and URL, with an optional progress bar. + + Args: + method (str): The HTTP method to use (e.g. 'GET', 'POST'). + url (str): The URL to send the request to. 
def smart_request(method, url, retry=3, timeout=30, thread=True, code=-1, verbose=True, progress=False, **kwargs):
    """
    Makes an HTTP request using the 'requests' library, with exponential backoff retries up to a specified timeout.

    Args:
        method (str): The HTTP method to use for the request. Choices are 'post' and 'get'.
        url (str): The URL to make the request to.
        retry (int, optional): Number of retries to attempt before giving up. Default is 3.
        timeout (int, optional): Timeout in seconds after which the function will give up retrying. Default is 30.
        thread (bool, optional): Whether to execute the request in a separate daemon thread. Default is True.
        code (int, optional): An identifier for the request, used for logging purposes. Default is -1.
        verbose (bool, optional): A flag to determine whether to print out to console or not. Default is True.
        progress (bool, optional): Whether to show a progress bar during the request. Default is False.
        **kwargs (dict): Keyword arguments to be passed to the requests function specified in method.

    Returns:
        (requests.Response): The HTTP response object. If the request is executed in a separate thread, returns None.
    """
    retry_codes = (408, 500)  # retry only these codes

    @TryExcept(verbose=verbose)
    def func(func_method, func_url, **func_kwargs):
        """Make HTTP requests with retries and timeouts, with optional progress tracking."""
        r = None  # response
        t0 = time.time()  # initial time for timer
        for i in range(retry + 1):
            if (time.time() - t0) > timeout:
                break
            r = requests_with_progress(func_method, func_url, **func_kwargs)  # i.e. get(url, data, json, files)
            if r.status_code < 300:  # return codes in the 2xx range are generally considered "good" or "successful"
                break
            try:
                m = r.json().get('message', 'No JSON message.')
            except (AttributeError, ValueError):
                # ValueError: the error body is not valid JSON; AttributeError: the JSON is not an object
                m = 'Unable to read JSON.'
            if i == 0:
                if r.status_code in retry_codes:
                    m += f' Retrying {retry}x for {timeout}s.' if retry else ''
                elif r.status_code == 429:  # rate limit
                    h = r.headers  # response headers
                    # Rate-limit headers may be absent; a missing one must not turn the warning into a KeyError
                    remaining = h.get('X-RateLimit-Remaining', '?')
                    limit = h.get('X-RateLimit-Limit', '?')
                    retry_after = h.get('Retry-After', '?')
                    m = f'Rate limit reached ({remaining}/{limit}). Please retry after {retry_after}s.'
                if verbose:
                    LOGGER.warning(f'{PREFIX}{m} {HELP_MSG} ({r.status_code} #{code})')
                if r.status_code not in retry_codes:
                    return r
            if i < retry:  # no point in waiting after the final attempt
                time.sleep(2 ** i)  # exponential backoff
        return r

    args = method, url
    kwargs['progress'] = progress
    if thread:
        threading.Thread(target=func, args=args, kwargs=kwargs, daemon=True).start()
    else:
        return func(*args, **kwargs)
def __call__(self, cfg):
    """
    Queue an event describing `cfg` and send the queued events once the rate-limit window has elapsed.

    Args:
        cfg (IterableSimpleNamespace): The configuration object containing mode and task information.
    """
    if not self.enabled:
        return  # events are switched off: nothing is queued or sent

    queue_has_room = len(self.events) < 25  # the queue holds at most 25 events; further ones are dropped
    if queue_has_room:
        params = {**self.metadata}
        params['task'] = cfg.task
        params['model'] = cfg.model if cfg.model in GITHUB_ASSET_NAMES else 'custom'
        if cfg.mode == 'export':
            params['format'] = cfg.format
        self.events.append({'name': cfg.mode, 'params': params})

    now = time.time()
    elapsed = now - self.t
    if elapsed < self.rate_limit:
        return  # still inside the rate-limit window; keep the events queued for a later call

    # Window elapsed: send everything queued so far, identified by the client id stored in SETTINGS
    payload = {'client_id': SETTINGS['uuid'], 'events': self.events}
    smart_request('post', self.url, json=payload, retry=0, verbose=False)

    # Start a fresh queue and a new rate-limit window
    self.events = []
    self.t = now
0000000000000000000000000000000000000000..20503c59dc6ac361d2b3fd6840b2a48bbcca8c4e Binary files /dev/null and b/ultralytics/models/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/models/fastsam/__init__.py b/ultralytics/models/fastsam/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3517afcc1f7dc757b0255ef651e8166f4e1e6a27 --- /dev/null +++ b/ultralytics/models/fastsam/__init__.py @@ -0,0 +1,8 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .model import FastSAM +from .predict import FastSAMPredictor +from .prompt import FastSAMPrompt +from .val import FastSAMValidator + +__all__ = 'FastSAMPredictor', 'FastSAM', 'FastSAMPrompt', 'FastSAMValidator' diff --git a/ultralytics/models/fastsam/__pycache__/__init__.cpython-310.pyc b/ultralytics/models/fastsam/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..853e845a51194d47bc48bafd03f7172ed0475b31 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/__init__.cpython-39.pyc b/ultralytics/models/fastsam/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..053c7d527da946fc781eab2fa43196ea1629edc1 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/model.cpython-310.pyc b/ultralytics/models/fastsam/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9536d715746702580cb4710a388ae14e7aacd4bc Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/model.cpython-310.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/model.cpython-39.pyc b/ultralytics/models/fastsam/__pycache__/model.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..4b4f3be80ef52beb041b6104722af2371e17f76c Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/model.cpython-39.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/predict.cpython-310.pyc b/ultralytics/models/fastsam/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fff5aa899e2eb0c810ab42cdde4fe1094aca90b6 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/predict.cpython-310.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/predict.cpython-39.pyc b/ultralytics/models/fastsam/__pycache__/predict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e720b4fcf80cdb3c4931c87d541709e84b7379b4 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/predict.cpython-39.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/prompt.cpython-310.pyc b/ultralytics/models/fastsam/__pycache__/prompt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29996c74f39680b2daf074d6526480c79bd73648 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/prompt.cpython-310.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/prompt.cpython-39.pyc b/ultralytics/models/fastsam/__pycache__/prompt.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..523a488e4d15790c978d957e517add4ebdf5e6db Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/prompt.cpython-39.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/utils.cpython-310.pyc b/ultralytics/models/fastsam/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bfcec0d97b4d24d8caa2053775503cfbd875342 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/utils.cpython-310.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/utils.cpython-39.pyc 
b/ultralytics/models/fastsam/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba80e08274b68d5567a7cdc3881063a8409ddaec Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/utils.cpython-39.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/val.cpython-310.pyc b/ultralytics/models/fastsam/__pycache__/val.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fb219da1c258fc079a6a134fa678a897a2d1770 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/val.cpython-310.pyc differ diff --git a/ultralytics/models/fastsam/__pycache__/val.cpython-39.pyc b/ultralytics/models/fastsam/__pycache__/val.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..334c3ba44415a1237dd55010b77396296a974b35 Binary files /dev/null and b/ultralytics/models/fastsam/__pycache__/val.cpython-39.pyc differ diff --git a/ultralytics/models/fastsam/model.py b/ultralytics/models/fastsam/model.py new file mode 100644 index 0000000000000000000000000000000000000000..1e5825f41a1f5f83d734c4fdd1f3e7a5a79131e9 --- /dev/null +++ b/ultralytics/models/fastsam/model.py @@ -0,0 +1,31 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from pathlib import Path + +from ultralytics.engine.model import Model + +from .predict import FastSAMPredictor +from .val import FastSAMValidator + + +class FastSAM(Model): + """ + FastSAM model interface. + + Usage - Predict: + from ultralytics import FastSAM + + model = FastSAM('last.pt') + results = model.predict('ultralytics/assets/bus.jpg') + """ + + def __init__(self, model='FastSAM-x.pt'): + """Call the __init__ method of the parent class (YOLO) with the updated default model""" + if model == 'FastSAM.pt': + model = 'FastSAM-x.pt' + assert Path(model).suffix not in ('.yaml', '.yml'), 'FastSAM models only support pre-trained models.' 
class FastSAMPredictor(DetectionPredictor):
    """
    Predictor that runs a FastSAM checkpoint as a segmentation model.

    Reuses the detection predictor set-up and overrides post-processing so that each image yields a `Results`
    object carrying boxes and, when there are detections, masks.
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialise the parent predictor, then force the task to 'segment'."""
        super().__init__(cfg, overrides, _callbacks)
        self.args.task = 'segment'

    def postprocess(self, preds, img, orig_imgs):
        """
        Run NMS on the raw predictions and build one `Results` object per image.

        Args:
            preds: Raw model output. `preds[0]` is fed to NMS; `preds[1]` holds the mask prototypes.
            img: Preprocessed input batch; `img.shape[2:]` is used as the network input (height, width).
            orig_imgs: Original image(s), either a list (indexed per image) or a single object used as-is.

        Returns:
            (list): One `Results` per entry of the NMS output.
        """
        # TODO: filter by classes.
        p = ops.non_max_suppression(preds[0],
                                    self.args.conf,
                                    self.args.iou,
                                    agnostic=self.args.agnostic_nms,
                                    max_det=self.args.max_det,
                                    nc=len(self.model.names),
                                    classes=self.args.classes)

        # Template row describing a box that spans the whole network input. It is sized from the column count
        # instead of copied from row 0, so an image without any detection no longer raises IndexError.
        full_box = p[0].new_zeros(p[0].shape[1])
        full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
        full_box = full_box.view(1, -1)

        # Only the first image of the batch is checked for near-full-frame boxes. Those boxes are replaced by the
        # template, keeping their own confidence and mask coefficients.
        critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
        if critical_iou_index.numel() != 0:
            full_box[0][4] = p[0][critical_iou_index][:, 4]
            full_box[0][6:] = p[0][critical_iou_index][:, 6:]
            p[0][critical_iou_index] = full_box

        results = []
        proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        for i, pred in enumerate(p):
            orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
            path = self.batch[0]
            img_path = path[i] if isinstance(path, list) else path
            if not len(pred):  # no detections: emit a Results with empty boxes and no masks
                results.append(Results(orig_img=orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6]))
                continue
            if self.args.retina_masks:
                # Boxes are rescaled first because native-resolution masks are cropped with original-image boxes.
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
                masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
            else:
                # Masks are built at network-input resolution, so boxes are rescaled only afterwards.
                masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
            results.append(
                Results(orig_img=orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks))
        return results
Image.new('RGB', image.size, (255, 255, 255)) + # transparency_mask = np.zeros_like((), dtype=np.uint8) + transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8) + transparency_mask[y1:y2, x1:x2] = 255 + transparency_mask_image = Image.fromarray(transparency_mask, mode='L') + black_image.paste(segmented_image, mask=transparency_mask_image) + return black_image + + @staticmethod + def _format_results(result, filter=0): + annotations = [] + n = len(result.masks.data) + for i in range(n): + mask = result.masks.data[i] == 1.0 + + if torch.sum(mask) < filter: + continue + annotation = { + 'id': i, + 'segmentation': mask.cpu().numpy(), + 'bbox': result.boxes.data[i], + 'score': result.boxes.conf[i]} + annotation['area'] = annotation['segmentation'].sum() + annotations.append(annotation) + return annotations + + @staticmethod + def filter_masks(annotations): # filter the overlap mask + annotations.sort(key=lambda x: x['area'], reverse=True) + to_remove = set() + for i in range(len(annotations)): + a = annotations[i] + for j in range(i + 1, len(annotations)): + b = annotations[j] + if i != j and j not in to_remove and b['area'] < a['area'] and \ + (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8: + to_remove.add(j) + + return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove + + @staticmethod + def _get_bbox_from_mask(mask): + mask = mask.astype(np.uint8) + contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + x1, y1, w, h = cv2.boundingRect(contours[0]) + x2, y2 = x1 + w, y1 + h + if len(contours) > 1: + for b in contours: + x_t, y_t, w_t, h_t = cv2.boundingRect(b) + # 将多个bbox合并成一个 + x1 = min(x1, x_t) + y1 = min(y1, y_t) + x2 = max(x2, x_t + w_t) + y2 = max(y2, y_t + h_t) + h = y2 - y1 + w = x2 - x1 + return [x1, y1, x2, y2] + + def plot(self, + annotations, + output, + bbox=None, + points=None, + point_label=None, + mask_random_color=True, + 
better_quality=True, + retina=False, + withContours=True): + if isinstance(annotations[0], dict): + annotations = [annotation['segmentation'] for annotation in annotations] + result_name = os.path.basename(self.img_path) + image = self.ori_img + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + original_h = image.shape[0] + original_w = image.shape[1] + # for macOS only + # plt.switch_backend('TkAgg') + plt.figure(figsize=(original_w / 100, original_h / 100)) + # Add subplot with no margin. + plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) + plt.margins(0, 0) + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + + plt.imshow(image) + if better_quality: + if isinstance(annotations[0], torch.Tensor): + annotations = np.array(annotations.cpu()) + for i, mask in enumerate(annotations): + mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8)) + annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8)) + if self.device == 'cpu': + annotations = np.array(annotations) + self.fast_show_mask( + annotations, + plt.gca(), + random_color=mask_random_color, + bbox=bbox, + points=points, + pointlabel=point_label, + retinamask=retina, + target_height=original_h, + target_width=original_w, + ) + else: + if isinstance(annotations[0], np.ndarray): + annotations = torch.from_numpy(annotations) + self.fast_show_mask_gpu( + annotations, + plt.gca(), + random_color=mask_random_color, + bbox=bbox, + points=points, + pointlabel=point_label, + retinamask=retina, + target_height=original_h, + target_width=original_w, + ) + if isinstance(annotations, torch.Tensor): + annotations = annotations.cpu().numpy() + if withContours: + contour_all = [] + temp = np.zeros((original_h, original_w, 1)) + for i, mask in enumerate(annotations): + if type(mask) == dict: + mask = mask['segmentation'] + annotation = mask.astype(np.uint8) + if not 
retina: + annotation = cv2.resize( + annotation, + (original_w, original_h), + interpolation=cv2.INTER_NEAREST, + ) + contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contour_all.extend(iter(contours)) + cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2) + color = np.array([0 / 255, 0 / 255, 1.0, 0.8]) + contour_mask = temp / 255 * color.reshape(1, 1, -1) + plt.imshow(contour_mask) + + save_path = output + if not os.path.exists(save_path): + os.makedirs(save_path) + plt.axis('off') + fig = plt.gcf() + plt.draw() + + try: + buf = fig.canvas.tostring_rgb() + except AttributeError: + fig.canvas.draw() + buf = fig.canvas.tostring_rgb() + cols, rows = fig.canvas.get_width_height() + img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3) + cv2.imwrite(os.path.join(save_path, result_name), cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)) + + # CPU post process + def fast_show_mask( + self, + annotation, + ax, + random_color=False, + bbox=None, + points=None, + pointlabel=None, + retinamask=True, + target_height=960, + target_width=960, + ): + msak_sum = annotation.shape[0] + height = annotation.shape[1] + weight = annotation.shape[2] + # 将annotation 按照面积 排序 + areas = np.sum(annotation, axis=(1, 2)) + sorted_indices = np.argsort(areas) + annotation = annotation[sorted_indices] + + index = (annotation != 0).argmax(axis=0) + if random_color: + color = np.random.random((msak_sum, 1, 1, 3)) + else: + color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 1.0]) + transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6 + visual = np.concatenate([color, transparency], axis=-1) + mask_image = np.expand_dims(annotation, -1) * visual + + show = np.zeros((height, weight, 4)) + h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij') + indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) + # 使用向量化索引更新show的值 + show[h_indices, w_indices, :] = mask_image[indices] + 
if bbox is not None: + x1, y1, x2, y2 = bbox + ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1)) + # draw point + if points is not None: + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 1], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 1], + s=20, + c='y', + ) + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 0], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 0], + s=20, + c='m', + ) + + if not retinamask: + show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST) + ax.imshow(show) + + def fast_show_mask_gpu( + self, + annotation, + ax, + random_color=False, + bbox=None, + points=None, + pointlabel=None, + retinamask=True, + target_height=960, + target_width=960, + ): + msak_sum = annotation.shape[0] + height = annotation.shape[1] + weight = annotation.shape[2] + areas = torch.sum(annotation, dim=(1, 2)) + sorted_indices = torch.argsort(areas, descending=False) + annotation = annotation[sorted_indices] + # 找每个位置第一个非零值下标 + index = (annotation != 0).to(torch.long).argmax(dim=0) + if random_color: + color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device) + else: + color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([30 / 255, 144 / 255, 1.0]).to( + annotation.device) + transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6 + visual = torch.cat([color, transparency], dim=-1) + mask_image = torch.unsqueeze(annotation, -1) * visual + # 按index取数,index指每个位置选哪个batch的数,把mask_image转成一个batch的形式 + show = torch.zeros((height, weight, 4)).to(annotation.device) + h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij') + indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) + # 使用向量化索引更新show的值 + show[h_indices, w_indices, :] = mask_image[indices] + show_cpu = show.cpu().numpy() + if bbox is not 
None: + x1, y1, x2, y2 = bbox + ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1)) + # draw point + if points is not None: + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 1], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 1], + s=20, + c='y', + ) + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 0], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 0], + s=20, + c='m', + ) + if not retinamask: + show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST) + ax.imshow(show_cpu) + + # clip + @torch.no_grad() + def retrieve(self, model, preprocess, elements, search_text: str, device) -> int: + preprocessed_images = [preprocess(image).to(device) for image in elements] + tokenized_text = self.clip.tokenize([search_text]).to(device) + stacked_images = torch.stack(preprocessed_images) + image_features = model.encode_image(stacked_images) + text_features = model.encode_text(tokenized_text) + image_features /= image_features.norm(dim=-1, keepdim=True) + text_features /= text_features.norm(dim=-1, keepdim=True) + probs = 100.0 * image_features @ text_features.T + return probs[:, 0].softmax(dim=0) + + def _crop_image(self, format_results): + + image = Image.fromarray(cv2.cvtColor(self.ori_img, cv2.COLOR_BGR2RGB)) + ori_w, ori_h = image.size + annotations = format_results + mask_h, mask_w = annotations[0]['segmentation'].shape + if ori_w != mask_w or ori_h != mask_h: + image = image.resize((mask_w, mask_h)) + cropped_boxes = [] + cropped_images = [] + not_crop = [] + filter_id = [] + # annotations, _ = filter_masks(annotations) + # filter_id = list(_) + for _, mask in enumerate(annotations): + if np.sum(mask['segmentation']) <= 100: + filter_id.append(_) + continue + bbox = self._get_bbox_from_mask(mask['segmentation']) # mask 的 bbox + cropped_boxes.append(self._segment_image(image, bbox)) # 保存裁剪的图片 + 
# cropped_boxes.append(segment_image(image,mask["segmentation"])) + cropped_images.append(bbox) # 保存裁剪的图片的bbox + + return cropped_boxes, cropped_images, not_crop, filter_id, annotations + + def box_prompt(self, bbox): + + assert (bbox[2] != 0 and bbox[3] != 0) + masks = self.results[0].masks.data + target_height = self.ori_img.shape[0] + target_width = self.ori_img.shape[1] + h = masks.shape[1] + w = masks.shape[2] + if h != target_height or w != target_width: + bbox = [ + int(bbox[0] * w / target_width), + int(bbox[1] * h / target_height), + int(bbox[2] * w / target_width), + int(bbox[3] * h / target_height), ] + bbox[0] = max(round(bbox[0]), 0) + bbox[1] = max(round(bbox[1]), 0) + bbox[2] = min(round(bbox[2]), w) + bbox[3] = min(round(bbox[3]), h) + + # IoUs = torch.zeros(len(masks), dtype=torch.float32) + bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0]) + + masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2)) + orig_masks_area = torch.sum(masks, dim=(1, 2)) + + union = bbox_area + orig_masks_area - masks_area + IoUs = masks_area / union + max_iou_index = torch.argmax(IoUs) + + return np.array([masks[max_iou_index].cpu().numpy()]) + + def point_prompt(self, points, pointlabel): # numpy 处理 + + masks = self._format_results(self.results[0], 0) + target_height = self.ori_img.shape[0] + target_width = self.ori_img.shape[1] + h = masks[0]['segmentation'].shape[0] + w = masks[0]['segmentation'].shape[1] + if h != target_height or w != target_width: + points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points] + onemask = np.zeros((h, w)) + for i, annotation in enumerate(masks): + mask = annotation['segmentation'] if type(annotation) == dict else annotation + for i, point in enumerate(points): + if mask[point[1], point[0]] == 1 and pointlabel[i] == 1: + onemask += mask + if mask[point[1], point[0]] == 1 and pointlabel[i] == 0: + onemask -= mask + onemask = onemask >= 1 + return np.array([onemask]) + 
def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
    """
    Snap box edges that lie within `threshold` pixels of the image border onto the border.

    The tensor is modified in place and also returned.

    Args:
        boxes (torch.Tensor): (n, 4) boxes as x1, y1, x2, y2.
        image_shape (tuple): (height, width).
        threshold (int): Pixel distance below which an edge is snapped.

    Returns:
        (torch.Tensor): The same `boxes` tensor after adjustment.
    """
    height, width = image_shape

    # Each mask depends on, and each write touches, only its own column.
    near_left = boxes[:, 0] < threshold
    near_top = boxes[:, 1] < threshold
    near_right = boxes[:, 2] > width - threshold
    near_bottom = boxes[:, 3] > height - threshold

    boxes[near_left, 0] = 0  # x1
    boxes[near_top, 1] = 0  # y1
    boxes[near_right, 2] = width  # x2
    boxes[near_bottom, 3] = height  # y2
    return boxes


def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
    """
    Compute the IoU of one box against a set of boxes.

    `boxes` is first snapped to the image border in place (see `adjust_bboxes_to_image_border`).

    Args:
        box1 (torch.Tensor): (4, ) reference box as x1, y1, x2, y2.
        boxes (torch.Tensor): (n, 4) candidate boxes; modified in place.
        iou_thres (float): Threshold used when indices are returned.
        image_shape (tuple): (height, width) used for the border snapping.
        raw_output (bool): Return the IoU values instead of indices.

    Returns:
        (torch.Tensor | int): With `raw_output`, the (n, ) IoU values, or 0 when there are no boxes;
            otherwise the indices of boxes whose IoU is strictly greater than `iou_thres`.
    """
    boxes = adjust_bboxes_to_image_border(boxes, image_shape)

    # Corners of the intersection rectangle.
    left = torch.max(box1[0], boxes[:, 0])
    top = torch.max(box1[1], boxes[:, 1])
    right = torch.min(box1[2], boxes[:, 2])
    bottom = torch.min(box1[3], boxes[:, 3])

    # Negative extents mean no overlap and are clamped to zero.
    overlap_w = (right - left).clamp(0)
    overlap_h = (bottom - top).clamp(0)
    intersection = overlap_w * overlap_h

    reference_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    candidate_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union = reference_area + candidate_area - intersection

    iou = intersection / union  # shape (n, )
    if raw_output:
        if iou.numel() == 0:
            return 0
        return iou

    above = iou > iou_thres
    return torch.nonzero(above).flatten()
SegmentMetrics.""" + super().__init__(dataloader, save_dir, pbar, args, _callbacks) + self.args.task = 'segment' + self.metrics = SegmentMetrics(save_dir=self.save_dir, on_plot=self.on_plot) + + def preprocess(self, batch): + """Preprocesses batch by converting masks to float and sending to device.""" + batch = super().preprocess(batch) + batch['masks'] = batch['masks'].to(self.device).float() + return batch + + def init_metrics(self, model): + """Initialize metrics and select mask processing function based on save_json flag.""" + super().init_metrics(model) + self.plot_masks = [] + if self.args.save_json: + check_requirements('pycocotools>=2.0.6') + self.process = ops.process_mask_upsample # more accurate + else: + self.process = ops.process_mask # faster + + def get_desc(self): + """Return a formatted description of evaluation metrics.""" + return ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P', + 'R', 'mAP50', 'mAP50-95)') + + def postprocess(self, preds): + """Postprocesses YOLO predictions and returns output detections with proto.""" + p = ops.non_max_suppression(preds[0], + self.args.conf, + self.args.iou, + labels=self.lb, + multi_label=True, + agnostic=self.args.single_cls, + max_det=self.args.max_det, + nc=self.nc) + proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported + return p, proto + + def update_metrics(self, preds, batch): + """Metrics.""" + for si, (pred, proto) in enumerate(zip(preds[0], preds[1])): + idx = batch['batch_idx'] == si + cls = batch['cls'][idx] + bbox = batch['bboxes'][idx] + nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions + shape = batch['ori_shape'][si] + correct_masks = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init + correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init + self.seen += 1 + + if npr == 0: + if nl: + 
self.stats.append((correct_bboxes, correct_masks, *torch.zeros( + (2, 0), device=self.device), cls.squeeze(-1))) + if self.args.plots: + self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1)) + continue + + # Masks + midx = [si] if self.args.overlap_mask else idx + gt_masks = batch['masks'][midx] + pred_masks = self.process(proto, pred[:, 6:], pred[:, :4], shape=batch['img'][si].shape[1:]) + + # Predictions + if self.args.single_cls: + pred[:, 5] = 0 + predn = pred.clone() + ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape, + ratio_pad=batch['ratio_pad'][si]) # native-space pred + + # Evaluate + if nl: + height, width = batch['img'].shape[2:] + tbox = ops.xywh2xyxy(bbox) * torch.tensor( + (width, height, width, height), device=self.device) # target boxes + ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape, + ratio_pad=batch['ratio_pad'][si]) # native-space labels + labelsn = torch.cat((cls, tbox), 1) # native-space labels + correct_bboxes = self._process_batch(predn, labelsn) + # TODO: maybe remove these `self.` arguments as they already are member variable + correct_masks = self._process_batch(predn, + labelsn, + pred_masks, + gt_masks, + overlap=self.args.overlap_mask, + masks=True) + if self.args.plots: + self.confusion_matrix.process_batch(predn, labelsn) + + # Append correct_masks, correct_boxes, pconf, pcls, tcls + self.stats.append((correct_bboxes, correct_masks, pred[:, 4], pred[:, 5], cls.squeeze(-1))) + + pred_masks = torch.as_tensor(pred_masks, dtype=torch.uint8) + if self.args.plots and self.batch_i < 3: + self.plot_masks.append(pred_masks[:15].cpu()) # filter top 15 to plot + + # Save + if self.args.save_json: + pred_masks = ops.scale_image(pred_masks.permute(1, 2, 0).contiguous().cpu().numpy(), + shape, + ratio_pad=batch['ratio_pad'][si]) + self.pred_to_json(predn, batch['im_file'][si], pred_masks) + # if self.args.save_txt: + # save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / 
    def _process_batch(self, detections, labels, pred_masks=None, gt_masks=None, overlap=False, masks=False):
        """
        Build the boolean "correct prediction" matrix for one image.

        IoU is computed between masks when `masks` is True and between boxes otherwise. A detection counts as
        correct at a threshold when it is matched to a label of the same class with IoU at or above that
        threshold.

        Arguments:
            detections (array[N, 6]), x1, y1, x2, y2, conf, class
            labels (array[M, 5]), class, x1, y1, x2, y2
            pred_masks: Predicted masks, used only when `masks` is True.
            gt_masks: Ground-truth masks, used only when `masks` is True.
            overlap (bool): When True, `gt_masks` is expanded into one binary mask per label by comparing it
                with the 1-based label index (presumably a single index-encoded map; confirm against the loader).
            masks (bool): Select mask IoU instead of box IoU.
        Returns:
            correct (bool tensor[N, len(self.iouv)]), one column per IoU threshold in `self.iouv`, on the
                same device as `detections`.
        """
        if masks:
            if overlap:
                nl = len(labels)
                # Label k is matched against the value k + 1 in the repeated ground-truth map.
                index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1
                gt_masks = gt_masks.repeat(nl, 1, 1)  # shape(1,640,640) -> (n,640,640)
                gt_masks = torch.where(gt_masks == index, 1.0, 0.0)
            if gt_masks.shape[1:] != pred_masks.shape[1:]:
                # Resize ground truth to the prediction resolution, then re-binarise at 0.5.
                gt_masks = F.interpolate(gt_masks[None], pred_masks.shape[1:], mode='bilinear', align_corners=False)[0]
                gt_masks = gt_masks.gt_(0.5)
            iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1))
        else:  # boxes
            iou = box_iou(labels[:, 1:], detections[:, :4])

        correct = np.zeros((detections.shape[0], self.iouv.shape[0])).astype(bool)
        # Broadcast (M, 1) label classes against (N,) detection classes -> (M, N) class-agreement matrix.
        correct_class = labels[:, 0:1] == detections[:, 5]
        for i in range(len(self.iouv)):
            x = torch.where((iou >= self.iouv[i]) & correct_class)  # IoU >= threshold and classes match
            if x[0].shape[0]:
                matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]),
                                    1).cpu().numpy()  # [label, detect, iou]
                if x[0].shape[0] > 1:
                    # Sort candidates by descending IoU, then keep one row per detection and one row per label.
                    matches = matches[matches[:, 2].argsort()[::-1]]
                    matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
                    # matches = matches[matches[:, 2].argsort()[::-1]]
                    matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
                correct[matches[:, 1].astype(int), i] = True
        return torch.tensor(correct, dtype=torch.bool, device=detections.device)
bounding box labels.""" + plot_images(batch['img'], + batch['batch_idx'], + batch['cls'].squeeze(-1), + batch['bboxes'], + batch['masks'], + paths=batch['im_file'], + fname=self.save_dir / f'val_batch{ni}_labels.jpg', + names=self.names, + on_plot=self.on_plot) + + def plot_predictions(self, batch, preds, ni): + """Plots batch predictions with masks and bounding boxes.""" + plot_images( + batch['img'], + *output_to_target(preds[0], max_det=15), # not set to self.args.max_det due to slow plotting speed + torch.cat(self.plot_masks, dim=0) if len(self.plot_masks) else self.plot_masks, + paths=batch['im_file'], + fname=self.save_dir / f'val_batch{ni}_pred.jpg', + names=self.names, + on_plot=self.on_plot) # pred + self.plot_masks.clear() + + def pred_to_json(self, predn, filename, pred_masks): + """Save one JSON result.""" + # Example result = {"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236} + from pycocotools.mask import encode # noqa + + def single_encode(x): + """Encode predicted masks as RLE and append results to jdict.""" + rle = encode(np.asarray(x[:, :, None], order='F', dtype='uint8'))[0] + rle['counts'] = rle['counts'].decode('utf-8') + return rle + + stem = Path(filename).stem + image_id = int(stem) if stem.isnumeric() else stem + box = ops.xyxy2xywh(predn[:, :4]) # xywh + box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner + pred_masks = np.transpose(pred_masks, (2, 0, 1)) + with ThreadPool(NUM_THREADS) as pool: + rles = pool.map(single_encode, pred_masks) + for i, (p, b) in enumerate(zip(predn.tolist(), box.tolist())): + self.jdict.append({ + 'image_id': image_id, + 'category_id': self.class_map[int(p[5])], + 'bbox': [round(x, 3) for x in b], + 'score': round(p[4], 5), + 'segmentation': rles[i]}) + + def eval_json(self, stats): + """Return COCO-style object detection evaluation metrics.""" + if self.args.save_json and self.is_coco and len(self.jdict): + anno_json = self.data['path'] / 
'annotations/instances_val2017.json' # annotations + pred_json = self.save_dir / 'predictions.json' # predictions + LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...') + try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb + check_requirements('pycocotools>=2.0.6') + from pycocotools.coco import COCO # noqa + from pycocotools.cocoeval import COCOeval # noqa + + for x in anno_json, pred_json: + assert x.is_file(), f'{x} file not found' + anno = COCO(str(anno_json)) # init annotations api + pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path) + for i, eval in enumerate([COCOeval(anno, pred, 'bbox'), COCOeval(anno, pred, 'segm')]): + if self.is_coco: + eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # im to eval + eval.evaluate() + eval.accumulate() + eval.summarize() + idx = i * 4 + 2 + stats[self.metrics.keys[idx + 1]], stats[ + self.metrics.keys[idx]] = eval.stats[:2] # update mAP50-95 and mAP50 + except Exception as e: + LOGGER.warning(f'pycocotools unable to run: {e}') + return stats diff --git a/ultralytics/models/nas/__init__.py b/ultralytics/models/nas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ab0f837dc73733f8ed894806d545b4d1d14fbbb8 --- /dev/null +++ b/ultralytics/models/nas/__init__.py @@ -0,0 +1,7 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .model import NAS +from .predict import NASPredictor +from .val import NASValidator + +__all__ = 'NASPredictor', 'NASValidator', 'NAS' diff --git a/ultralytics/models/nas/__pycache__/__init__.cpython-310.pyc b/ultralytics/models/nas/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..358f9a16a47b44b4345bce062e94646c22eb814c Binary files /dev/null and b/ultralytics/models/nas/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/models/nas/__pycache__/__init__.cpython-39.pyc 
b/ultralytics/models/nas/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c08dee1d61ae93b004d220386ba67b80d3786ae8 Binary files /dev/null and b/ultralytics/models/nas/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/models/nas/__pycache__/model.cpython-310.pyc b/ultralytics/models/nas/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dedb176ffdf2afa232933fbb49a527d2cd4732d0 Binary files /dev/null and b/ultralytics/models/nas/__pycache__/model.cpython-310.pyc differ diff --git a/ultralytics/models/nas/__pycache__/model.cpython-39.pyc b/ultralytics/models/nas/__pycache__/model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ed9f93df9a875eb583c87f2cf1f8ed92c65bba8 Binary files /dev/null and b/ultralytics/models/nas/__pycache__/model.cpython-39.pyc differ diff --git a/ultralytics/models/nas/__pycache__/predict.cpython-310.pyc b/ultralytics/models/nas/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea4f75492de5e954e3c91eef0d23c11f74702bd8 Binary files /dev/null and b/ultralytics/models/nas/__pycache__/predict.cpython-310.pyc differ diff --git a/ultralytics/models/nas/__pycache__/predict.cpython-39.pyc b/ultralytics/models/nas/__pycache__/predict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f61f91d519a8db446967b47d331224420576502d Binary files /dev/null and b/ultralytics/models/nas/__pycache__/predict.cpython-39.pyc differ diff --git a/ultralytics/models/nas/__pycache__/val.cpython-310.pyc b/ultralytics/models/nas/__pycache__/val.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..079dc793f587c2976c979d690eb09dbb3b706645 Binary files /dev/null and b/ultralytics/models/nas/__pycache__/val.cpython-310.pyc differ diff --git a/ultralytics/models/nas/__pycache__/val.cpython-39.pyc 
class NAS(Model):
    """YOLO-NAS detection model wrapper; only pre-trained weights are accepted, not YAML configs."""

    def __init__(self, model='yolo_nas_s.pt') -> None:
        """Reject YAML model definitions, then initialise the parent with the 'detect' task."""
        is_config = Path(model).suffix in ('.yaml', '.yml')
        assert not is_config, 'YOLO-NAS models only support pre-trained models.'
        super().__init__(model, task='detect')

    @smart_inference_mode()
    def _load(self, weights: str, task: str):
        """
        Load a NAS model and attach the attributes the rest of the package reads from it.

        Args:
            weights (str): A '.pt' file loaded with `torch.load`, or a suffix-less name resolved through
                `super_gradients` with COCO pre-trained weights.
            task (str): Not used by this method.
        """
        # Load or create new NAS model
        import super_gradients
        extension = Path(weights).suffix
        if extension == '.pt':
            # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
            self.model = torch.load(weights)
        elif extension == '':
            self.model = super_gradients.training.models.get(weights, pretrained_weights='coco')

        # Standardize model
        net = self.model
        net.fuse = lambda verbose=True: self.model
        net.stride = torch.tensor([32])
        net.names = dict(enumerate(net._class_names))
        net.is_fused = lambda: False  # for info()
        net.yaml = {}  # for info()
        net.pt_path = weights  # for export()
        net.task = 'detect'  # for export()

    def info(self, detailed=False, verbose=True):
        """
        Logs model info.

        Args:
            detailed (bool): Show detailed information about model.
            verbose (bool): Controls verbosity.
        """
        return model_info(self.model, detailed=detailed, verbose=verbose, imgsz=640)

    @property
    def task_map(self):
        """Map the 'detect' task to its predictor and validator classes."""
        detect_components = {'predictor': NASPredictor, 'validator': NASValidator}
        return {'detect': detect_components}
class NASValidator(DetectionValidator):
    """Validator for NAS models that reshapes raw predictions before running NMS."""

    def postprocess(self, preds_in):
        """Convert the raw boxes to xywh, join them with the class scores, and run non-maximum suppression."""
        raw_boxes = preds_in[0][0]
        class_scores = preds_in[0][1]
        # Concatenate along the last axis, then swap the last two axes for NMS.
        merged = torch.cat((xyxy2xywh(raw_boxes), class_scores), -1)
        candidates = merged.permute(0, 2, 1)
        return ops.non_max_suppression(
            candidates,
            self.args.conf,
            self.args.iou,
            labels=self.lb,
            multi_label=False,
            agnostic=self.args.single_cls,
            max_det=self.args.max_det,
            max_time_img=0.5,
        )
0000000000000000000000000000000000000000..d94d9acd9138106d80438edf14e067a7b73f637a Binary files /dev/null and b/ultralytics/models/rtdetr/__pycache__/model.cpython-310.pyc differ diff --git a/ultralytics/models/rtdetr/__pycache__/model.cpython-39.pyc b/ultralytics/models/rtdetr/__pycache__/model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54a53909e2777900c0d1fd246c641c37d6ab525f Binary files /dev/null and b/ultralytics/models/rtdetr/__pycache__/model.cpython-39.pyc differ diff --git a/ultralytics/models/rtdetr/__pycache__/predict.cpython-310.pyc b/ultralytics/models/rtdetr/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa5dba2463251694d12fc7c28f78a3eb87b00386 Binary files /dev/null and b/ultralytics/models/rtdetr/__pycache__/predict.cpython-310.pyc differ diff --git a/ultralytics/models/rtdetr/__pycache__/predict.cpython-39.pyc b/ultralytics/models/rtdetr/__pycache__/predict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5463206e52dc4747c0edea628ecdb2ddcdf96f5 Binary files /dev/null and b/ultralytics/models/rtdetr/__pycache__/predict.cpython-39.pyc differ diff --git a/ultralytics/models/rtdetr/__pycache__/train.cpython-310.pyc b/ultralytics/models/rtdetr/__pycache__/train.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15980227c14d56b9c0ac91e95027f3040c2792a1 Binary files /dev/null and b/ultralytics/models/rtdetr/__pycache__/train.cpython-310.pyc differ diff --git a/ultralytics/models/rtdetr/__pycache__/train.cpython-39.pyc b/ultralytics/models/rtdetr/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5447fc1b0d414ab336d28863584a85942b7bf435 Binary files /dev/null and b/ultralytics/models/rtdetr/__pycache__/train.cpython-39.pyc differ diff --git a/ultralytics/models/rtdetr/__pycache__/val.cpython-310.pyc 
class RTDETR(Model):
    """
    RTDETR model interface.
    """

    def __init__(self, model='rtdetr-l.pt') -> None:
        """
        Initialize the RT-DETR model for the 'detect' task.

        Args:
            model (str | os.PathLike): Weights or config file; its extension must be 'pt', 'yaml' or 'yml'.

        Raises:
            NotImplementedError: If `model` is non-empty and has any other extension.
        """
        # str() lets path-like objects through the extension check; `model` itself is forwarded untouched.
        if model and str(model).rsplit('.', 1)[-1] not in ('pt', 'yaml', 'yml'):
            raise NotImplementedError('RT-DETR only supports creating from *.pt file or *.yaml file.')
        super().__init__(model=model, task='detect')

    @property
    def task_map(self):
        """Map the 'detect' task to its predictor, validator, trainer and model classes."""
        return {
            'detect': {
                'predictor': RTDETRPredictor,
                'validator': RTDETRValidator,
                'trainer': RTDETRTrainer,
                'model': RTDETRDetectionModel}}
class RTDETRTrainer(DetectionTrainer):
    """Trainer that wires RT-DETR's model, dataset and validator into the detection training loop."""

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Return an RTDETRDetectionModel built from `cfg`, loading `weights` when they are given."""
        model = RTDETRDetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
        if weights:
            model.load(weights)
        return model

    def build_dataset(self, img_path, mode='val', batch=None):
        """Build RTDETR Dataset

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        return RTDETRDataset(
            img_path=img_path,
            imgsz=self.args.imgsz,
            batch_size=batch,
            augment=mode == 'train',  # augment only in train mode
            hyp=self.args,
            rect=False,  # no rect
            cache=self.args.cache or None,
            prefix=colorstr(f'{mode}: '),
            data=self.data)

    def get_validator(self):
        """Set the RT-DETR loss names and return an RTDETRValidator for the test loader."""
        self.loss_names = 'giou_loss', 'cls_loss', 'l1_loss'
        return RTDETRValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))

    def preprocess_batch(self, batch):
        """Delegate batch preprocessing to the parent trainer and return the result."""
        # The per-image ground-truth lists that used to be built here were never read or
        # returned, so that dead work (one boolean index per image) has been removed.
        return super().preprocess_batch(batch)
numpy as np +import torch + +from ultralytics.data import YOLODataset +from ultralytics.data.augment import Compose, Format, v8_transforms +from ultralytics.models.yolo.detect import DetectionValidator +from ultralytics.utils import colorstr, ops + +__all__ = 'RTDETRValidator', # tuple or list + + +# TODO: Temporarily, RT-DETR does not need padding. +class RTDETRDataset(YOLODataset): + + def __init__(self, *args, data=None, **kwargs): + super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs) + + # NOTE: add stretch version load_image for rtdetr mosaic + def load_image(self, i): + """Loads 1 image from dataset index 'i', returns (im, resized hw).""" + im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i] + if im is None: # not cached in RAM + if fn.exists(): # load npy + im = np.load(fn) + else: # read image + im = cv2.imread(f) # BGR + if im is None: + raise FileNotFoundError(f'Image Not Found {f}') + h0, w0 = im.shape[:2] # orig hw + im = cv2.resize(im, (self.imgsz, self.imgsz), interpolation=cv2.INTER_LINEAR) + + # Add to buffer if training with augmentations + if self.augment: + self.ims[i], self.im_hw0[i], self.im_hw[i] = im, (h0, w0), im.shape[:2] # im, hw_original, hw_resized + self.buffer.append(i) + if len(self.buffer) >= self.max_buffer_length: + j = self.buffer.pop(0) + self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None + + return im, (h0, w0), im.shape[:2] + + return self.ims[i], self.im_hw0[i], self.im_hw[i] + + def build_transforms(self, hyp=None): + """Temporarily, only for evaluation.""" + if self.augment: + hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0 + hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0 + transforms = v8_transforms(self, self.imgsz, hyp, stretch=True) + else: + # transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), auto=False, scaleFill=True)]) + transforms = Compose([]) + transforms.append( + Format(bbox_format='xywh', + 
class RTDETRValidator(DetectionValidator):
    """Validator for RT-DETR: builds the RT-DETR dataset, formats predictions and accumulates metrics."""

    def build_dataset(self, img_path, mode='val', batch=None):
        """Build an RTDETRDataset without augmentation or rectangular batching.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        return RTDETRDataset(
            img_path=img_path,
            imgsz=self.args.imgsz,
            batch_size=batch,
            augment=False,  # no augmentation
            hyp=self.args,
            rect=False,  # no rect
            cache=self.args.cache or None,
            prefix=colorstr(f'{mode}: '),
            data=self.data)

    def postprocess(self, preds):
        """
        Turn raw predictions into one confidence-sorted tensor per image.

        No non-maximum suppression and no confidence threshold are applied here. Each output row is
        (4 box values converted by `ops.xywh2xyxy` and scaled by `imgsz`, best score, class index).

        Args:
            preds: Sequence whose first item is a 3-D tensor; the last axis holds 4 box values
                followed by the per-class scores.

        Returns:
            (list[torch.Tensor]): One (num_boxes, 6) tensor per image in the batch.
        """
        bs, _, nd = preds[0].shape
        bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
        # `split` returns views, so scale out of place to leave the caller's tensor untouched.
        bboxes = bboxes * self.args.imgsz
        outputs = [torch.zeros((0, 6), device=bboxes.device)] * bs
        for i, bbox in enumerate(bboxes):
            bbox = ops.xywh2xyxy(bbox)
            score, cls = scores[i].max(-1)  # best score and its class index per box
            # Do not need threshold for evaluation as only got 300 boxes here.
            pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1)
            # sort by confidence to correctly get internal metrics.
            pred = pred[score.argsort(descending=True)]
            outputs[i] = pred

        return outputs

    def update_metrics(self, preds, batch):
        """
        Accumulate per-image statistics, and optionally confusion-matrix, JSON and TXT outputs.

        Args:
            preds (list[torch.Tensor]): Per-image predictions as produced by `postprocess`.
            batch (dict): Batch dict; reads 'batch_idx', 'cls', 'bboxes', 'ori_shape' and 'im_file'.
        """
        for si, pred in enumerate(preds):
            idx = batch['batch_idx'] == si
            cls = batch['cls'][idx]
            bbox = batch['bboxes'][idx]
            nl, npr = cls.shape[0], pred.shape[0]  # number of labels, predictions
            shape = batch['ori_shape'][si]
            correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
            self.seen += 1

            if npr == 0:
                if nl:
                    self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
                    if self.args.plots:
                        self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
                continue

            # Predictions
            if self.args.single_cls:
                pred[:, 5] = 0
            predn = pred.clone()
            predn[..., [0, 2]] *= shape[1] / self.args.imgsz  # native-space pred
            predn[..., [1, 3]] *= shape[0] / self.args.imgsz  # native-space pred

            # Evaluate
            if nl:
                tbox = ops.xywh2xyxy(bbox)  # target boxes
                tbox[..., [0, 2]] *= shape[1]  # native-space labels
                tbox[..., [1, 3]] *= shape[0]  # native-space labels
                labelsn = torch.cat((cls, tbox), 1)  # native-space labels
                # NOTE: To get correct metrics, the inputs of `_process_batch` should always be float32 type.
                correct_bboxes = self._process_batch(predn.float(), labelsn)
                # TODO: maybe remove these `self.` arguments as they already are member variable
                if self.args.plots:
                    self.confusion_matrix.process_batch(predn, labelsn)
            self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1)))  # (conf, pcls, tcls)

            # Save
            if self.args.save_json:
                self.pred_to_json(predn, batch['im_file'][si])
            if self.args.save_txt:
                file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
                self.save_one_txt(predn, self.args.save_conf, shape, file)
b/ultralytics/models/sam/__pycache__/amg.cpython-310.pyc differ diff --git a/ultralytics/models/sam/__pycache__/amg.cpython-39.pyc b/ultralytics/models/sam/__pycache__/amg.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123fde48f8739b75f3980d87fcb7b6edc40874d6 Binary files /dev/null and b/ultralytics/models/sam/__pycache__/amg.cpython-39.pyc differ diff --git a/ultralytics/models/sam/__pycache__/build.cpython-310.pyc b/ultralytics/models/sam/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fcd34212d8fc756e97e3d3d9e2012b39e2ca1c1 Binary files /dev/null and b/ultralytics/models/sam/__pycache__/build.cpython-310.pyc differ diff --git a/ultralytics/models/sam/__pycache__/build.cpython-39.pyc b/ultralytics/models/sam/__pycache__/build.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..579eb5b839d1149be377ee4442e47d9064045c7f Binary files /dev/null and b/ultralytics/models/sam/__pycache__/build.cpython-39.pyc differ diff --git a/ultralytics/models/sam/__pycache__/model.cpython-310.pyc b/ultralytics/models/sam/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..687a10fc0bcceff5e890733356f5607176fd47fd Binary files /dev/null and b/ultralytics/models/sam/__pycache__/model.cpython-310.pyc differ diff --git a/ultralytics/models/sam/__pycache__/model.cpython-39.pyc b/ultralytics/models/sam/__pycache__/model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b7be692c7a4ee9261836e1300d3c7a553e3681e Binary files /dev/null and b/ultralytics/models/sam/__pycache__/model.cpython-39.pyc differ diff --git a/ultralytics/models/sam/__pycache__/predict.cpython-310.pyc b/ultralytics/models/sam/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8f5535fb12438fab57aabb62f31835eeaf8faac Binary files /dev/null and 
class MaskData:
    """
    Keyed container for batched mask data held as lists, numpy arrays or torch tensors.
    Every entry can be filtered with one mask/index tensor or extended from another MaskData.
    """

    @staticmethod
    def _check_supported(value: Any) -> None:
        """Assert that `value` is one of the container types MaskData can hold."""
        assert isinstance(
            value, (list, np.ndarray, torch.Tensor)), 'MaskData only supports list, numpy arrays, and torch tensors.'

    def __init__(self, **kwargs) -> None:
        """Store the keyword entries after checking the type of every value."""
        for value in kwargs.values():
            self._check_supported(value)
        self._stats = dict(**kwargs)

    def __setitem__(self, key: str, item: Any) -> None:
        """Store `item` under `key` after checking its type."""
        self._check_supported(item)
        self._stats[key] = item

    def __delitem__(self, key: str) -> None:
        """Remove the entry stored under `key`."""
        del self._stats[key]

    def __getitem__(self, key: str) -> Any:
        """Return the entry stored under `key`."""
        return self._stats[key]

    def items(self) -> ItemsView[str, Any]:
        """Return a view over the stored (key, value) pairs."""
        return self._stats.items()

    def filter(self, keep: torch.Tensor) -> None:
        """Index every stored entry with `keep`, a boolean mask or a tensor of indices."""
        for key, value in list(self._stats.items()):
            if value is None:
                continue  # nothing to index; the entry stays None
            if isinstance(value, torch.Tensor):
                kept = value[torch.as_tensor(keep, device=value.device)]
            elif isinstance(value, np.ndarray):
                kept = value[keep.detach().cpu().numpy()]
            elif isinstance(value, list):
                if keep.dtype == torch.bool:
                    kept = [item for pos, item in enumerate(value) if keep[pos]]
                else:
                    kept = [value[pos] for pos in keep]
            else:
                raise TypeError(f'MaskData key {key} has an unsupported type {type(value)}.')
            self._stats[key] = kept

    def cat(self, new_stats: 'MaskData') -> None:
        """Append the entries of `new_stats`; keys that are missing or None here receive a deep copy."""
        for key, incoming in new_stats.items():
            current = self._stats.get(key)
            if current is None:
                self._stats[key] = deepcopy(incoming)
            elif isinstance(incoming, torch.Tensor):
                self._stats[key] = torch.cat([current, incoming], dim=0)
            elif isinstance(incoming, np.ndarray):
                self._stats[key] = np.concatenate([current, incoming], axis=0)
            elif isinstance(incoming, list):
                self._stats[key] = current + deepcopy(incoming)
            else:
                raise TypeError(f'MaskData key {key} has an unsupported type {type(incoming)}.')

    def to_numpy(self) -> None:
        """Replace every stored torch tensor with its detached CPU numpy array."""
        for key, value in self._stats.items():
            if isinstance(value, torch.Tensor):
                self._stats[key] = value.detach().cpu().numpy()
def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
    """
    Yield consecutive slices of length `batch_size` taken in lockstep from every input.

    Each yielded item is a list with one slice per positional input; the last batch may be
    shorter. All inputs must have the same length, and at least one input is required.
    """
    assert args and all(len(a) == len(args[0]) for a in args), 'Batched iteration must have same-size inputs.'
    whole, leftover = divmod(len(args[0]), batch_size)
    n_batches = whole + (1 if leftover != 0 else 0)  # one extra batch for the remainder
    start = 0
    for _ in range(n_batches):
        stop = start + batch_size
        yield [seq[start:stop] for seq in args]
        start = stop
def generate_crop_boxes(im_size: Tuple[int, ...], n_layers: int,
                        overlap_ratio: float) -> Tuple[List[List[int]], List[int]]:
    """
    Generate crop boxes in [x0, y0, x1, y1] form together with the layer index of each box.

    Layer 0 is the full image. Layer i (1-based) is a grid of 2**i by 2**i overlapping crops,
    each clipped to the image bounds.

    Args:
        im_size: Image size as (height, width).
        n_layers: Number of crop layers generated in addition to the full image.
        overlap_ratio: Scales the overlap between neighbouring crops relative to the shorter image side.

    Returns:
        The list of boxes and a parallel list with the layer index of each box.
    """
    height, width = im_size
    shorter = min(height, width)

    def _side_length(full_len, pieces, shared):
        """Return the crop side needed for `pieces` crops overlapping by `shared` to span `full_len`."""
        return int(math.ceil((shared * (pieces - 1) + full_len) / pieces))

    # The first entry is always the whole image.
    boxes = [[0, 0, width, height]]
    layers = [0]

    for layer in range(1, n_layers + 1):
        per_side = 2 ** layer
        shared = int(overlap_ratio * shorter * (2 / per_side))

        box_w = _side_length(width, per_side, shared)
        box_h = _side_length(height, per_side, shared)
        step_x = box_w - shared
        step_y = box_h - shared

        # x varies slowest and y fastest; boxes are XYXY.
        for col in range(per_side):
            left = int(step_x * col)
            for row in range(per_side):
                top = int(step_y * row)
                boxes.append([left, top, min(left + box_w, width), min(top + box_h, height)])
                layers.append(layer)

    return boxes, layers
def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
    """
    Compute XYXY bounding boxes around masks, giving [0, 0, 0, 0] for an empty mask.

    The last two axes of `masks` are height and width; any leading axes are preserved, so an
    input of shape C1xC2x...xHxW yields an output of shape C1xC2x...x4.
    """
    # The max reductions below raise an error on empty inputs, so return early in that case.
    if masks.numel() == 0:
        return torch.zeros(*masks.shape[:-2], 4, device=masks.device)

    orig_shape = masks.shape
    height, width = orig_shape[-2:]
    has_batch_dims = len(orig_shape) > 2
    # Work on a CxHxW view regardless of how many leading axes there were.
    flat = masks.flatten(0, -3) if has_batch_dims else masks.unsqueeze(0)

    def _extent(occupied, size):
        """Return the (lowest, highest) occupied coordinate along an axis of length `size`."""
        coords = occupied * torch.arange(size, device=occupied.device)[None, :]
        highest = coords.max(dim=-1).values
        # Push unoccupied positions past the end so they never win the minimum.
        lowest = (coords + size * (~occupied)).min(dim=-1).values
        return lowest, highest

    top, bottom = _extent(flat.max(dim=-1).values, height)
    left, right = _extent(flat.max(dim=-2).values, width)

    # For an empty mask the high edge lands before the low edge; zero those boxes out.
    is_empty = (right < left) | (bottom < top)
    boxes = torch.stack([left, top, right, bottom], dim=-1) * (~is_empty).unsqueeze(-1)

    return boxes.reshape(*orig_shape[:-2], 4) if has_batch_dims else boxes[0]
def _build_sam(encoder_embed_dim,
               encoder_depth,
               encoder_num_heads,
               encoder_global_attn_indexes,
               checkpoint=None,
               mobile_sam=False):
    """
    Assemble a SAM model from an image encoder, a prompt encoder and a mask decoder.

    Args:
        encoder_embed_dim: Embedding width(s) forwarded to the image encoder (`embed_dims` for TinyViT,
            `embed_dim` for ImageEncoderViT).
        encoder_depth: Depth value(s) forwarded to the image encoder.
        encoder_num_heads: Attention head count(s) forwarded to the image encoder.
        encoder_global_attn_indexes: Forwarded to ImageEncoderViT as `global_attn_indexes`; not used when
            `mobile_sam` is True.
        checkpoint (str, optional): Weights file. When given it is passed through `attempt_download_asset`,
            loaded into the model, and the model is switched to eval mode. When None the model is returned
            with freshly initialised weights and its mode untouched.
        mobile_sam (bool): Use the TinyViT image encoder instead of ImageEncoderViT.

    Returns:
        (Sam): The assembled model.
    """
    prompt_embed_dim = 256
    image_size = 1024
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size

    if mobile_sam:
        image_encoder = TinyViT(
            img_size=image_size,
            in_chans=3,
            num_classes=1000,
            embed_dims=encoder_embed_dim,
            depths=encoder_depth,
            num_heads=encoder_num_heads,
            window_sizes=[7, 7, 14, 7],
            mlp_ratio=4.0,
            drop_rate=0.0,
            drop_path_rate=0.0,
            use_checkpoint=False,
            mbconv_expand_ratio=4.0,
            local_conv_size=3,
            layer_lr_decay=0.8,
        )
    else:
        image_encoder = ImageEncoderViT(
            depth=encoder_depth,
            embed_dim=encoder_embed_dim,
            img_size=image_size,
            mlp_ratio=4,
            norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
            num_heads=encoder_num_heads,
            patch_size=vit_patch_size,
            qkv_bias=True,
            use_rel_pos=True,
            global_attn_indexes=encoder_global_attn_indexes,
            window_size=14,
            out_chans=prompt_embed_dim,
        )

    sam = Sam(
        image_encoder=image_encoder,
        prompt_encoder=PromptEncoder(
            embed_dim=prompt_embed_dim,
            image_embedding_size=(image_embedding_size, image_embedding_size),
            input_image_size=(image_size, image_size),
            mask_in_chans=16,
        ),
        mask_decoder=MaskDecoder(
            num_multimask_outputs=3,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
        ),
        pixel_mean=[123.675, 116.28, 103.53],
        pixel_std=[58.395, 57.12, 57.375],
    )

    if checkpoint is not None:
        checkpoint = attempt_download_asset(checkpoint)
        with open(checkpoint, 'rb') as f:
            # Deserialize onto the CPU so a checkpoint saved from an accelerator still loads on a host
            # without one; load_state_dict copies the values into the model's own parameters afterwards.
            # NOTE(review): torch.load unpickles the file - only point this at trusted checkpoints.
            state_dict = torch.load(f, map_location='cpu')
        sam.load_state_dict(state_dict)
        sam.eval()
    return sam
class SAM(Model):
    """
    SAM (Segment Anything Model) interface built on top of the generic `Model` wrapper.
    """

    def __init__(self, model='sam_b.pt') -> None:
        """
        Initialise the wrapper with a pre-trained checkpoint.

        Args:
            model (str): Checkpoint file name; must end in '.pt' or '.pth'.

        Raises:
            NotImplementedError: If a non-empty `model` does not have one of the accepted suffixes.
        """
        is_checkpoint = model.endswith(('.pt', '.pth')) if model else True
        if not is_checkpoint:
            # Should raise AssertionError instead?
            raise NotImplementedError('Segment anything prediction requires pre-trained checkpoint')
        super().__init__(model=model, task='segment')

    def _load(self, weights: str, task=None):
        """Build the SAM network for `weights` and store it on `self.model`; `task` is not used here."""
        self.model = build_sam(weights)

    def predict(self, source, stream=False, bboxes=None, points=None, labels=None, **kwargs):
        """
        Run segmentation on `source`, optionally guided by box and/or point prompts.

        The fixed settings below overwrite any same-named entries the caller supplied in `kwargs`.
        """
        kwargs.update(conf=0.25, task='segment', mode='predict', imgsz=1024)
        prompts = {'bboxes': bboxes, 'points': points, 'labels': labels}
        return super().predict(source, stream, prompts=prompts, **kwargs)

    def __call__(self, source=None, stream=False, bboxes=None, points=None, labels=None, **kwargs):
        """Alias for `predict`: forwards every argument unchanged and returns its result."""
        return self.predict(source, stream, bboxes, points, labels, **kwargs)

    def info(self, detailed=False, verbose=True):
        """
        Report information about the wrapped model via `model_info`.

        Args:
            detailed (bool): Show detailed information about model.
            verbose (bool): Controls verbosity.
        """
        return model_info(self.model, detailed=detailed, verbose=verbose)

    @property
    def task_map(self):
        """Mapping from the supported task name to the predictor class used for it."""
        return {'segment': {'predictor': Predictor}}
a/ultralytics/models/sam/modules/__pycache__/encoders.cpython-310.pyc b/ultralytics/models/sam/modules/__pycache__/encoders.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b15604c81dbc54a965472ebadd012f695a2a9719 Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/encoders.cpython-310.pyc differ diff --git a/ultralytics/models/sam/modules/__pycache__/encoders.cpython-39.pyc b/ultralytics/models/sam/modules/__pycache__/encoders.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d9cee6bc057ee18937a24046a30f3d270c954b8 Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/encoders.cpython-39.pyc differ diff --git a/ultralytics/models/sam/modules/__pycache__/sam.cpython-310.pyc b/ultralytics/models/sam/modules/__pycache__/sam.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c3f3a82b24637fa1efc5618ab467cefc203472d Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/sam.cpython-310.pyc differ diff --git a/ultralytics/models/sam/modules/__pycache__/sam.cpython-39.pyc b/ultralytics/models/sam/modules/__pycache__/sam.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f4dd5d3c90e133b87fea72e759aea33d7bafc98 Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/sam.cpython-39.pyc differ diff --git a/ultralytics/models/sam/modules/__pycache__/tiny_encoder.cpython-310.pyc b/ultralytics/models/sam/modules/__pycache__/tiny_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d69834c58ca1bb0a6583ba62ef0c76c3c3771ac Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/tiny_encoder.cpython-310.pyc differ diff --git a/ultralytics/models/sam/modules/__pycache__/tiny_encoder.cpython-39.pyc b/ultralytics/models/sam/modules/__pycache__/tiny_encoder.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..882ef8da195bd993b4bb4049430ecadbfcab2dd7 Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/tiny_encoder.cpython-39.pyc differ diff --git a/ultralytics/models/sam/modules/__pycache__/transformer.cpython-310.pyc b/ultralytics/models/sam/modules/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0cc017a713433315e570abc623265eabde7d9d8 Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/transformer.cpython-310.pyc differ diff --git a/ultralytics/models/sam/modules/__pycache__/transformer.cpython-39.pyc b/ultralytics/models/sam/modules/__pycache__/transformer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4a82d7d36d1b55b1e518a9712e0f38e6e4969b9 Binary files /dev/null and b/ultralytics/models/sam/modules/__pycache__/transformer.cpython-39.pyc differ diff --git a/ultralytics/models/sam/modules/decoders.py b/ultralytics/models/sam/modules/decoders.py new file mode 100644 index 0000000000000000000000000000000000000000..8ae3e59012af1aa462167648c7e9abb0dc4635c9 --- /dev/null +++ b/ultralytics/models/sam/modules/decoders.py @@ -0,0 +1,159 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from typing import List, Tuple, Type + +import torch +from torch import nn +from torch.nn import functional as F + +from ultralytics.nn.modules import LayerNorm2d + + +class MaskDecoder(nn.Module): + + def __init__( + self, + *, + transformer_dim: int, + transformer: nn.Module, + num_multimask_outputs: int = 3, + activation: Type[nn.Module] = nn.GELU, + iou_head_depth: int = 3, + iou_head_hidden_dim: int = 256, + ) -> None: + """ + Predicts masks given an image and prompt embeddings, using a transformer architecture. 
+ + Arguments: + transformer_dim (int): the channel dimension of the transformer module + transformer (nn.Module): the transformer used to predict masks + num_multimask_outputs (int): the number of masks to predict when disambiguating masks + activation (nn.Module): the type of activation to use when upscaling masks + iou_head_depth (int): the depth of the MLP used to predict mask quality + iou_head_hidden_dim (int): the hidden dimension of the MLP used to predict mask quality + """ + super().__init__() + self.transformer_dim = transformer_dim + self.transformer = transformer + + self.num_multimask_outputs = num_multimask_outputs + + self.iou_token = nn.Embedding(1, transformer_dim) + self.num_mask_tokens = num_multimask_outputs + 1 + self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) + + self.output_upscaling = nn.Sequential( + nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2), + LayerNorm2d(transformer_dim // 4), + activation(), + nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2), + activation(), + ) + self.output_hypernetworks_mlps = nn.ModuleList([ + MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]) + + self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth) + + def forward( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. 
+ + Arguments: + image_embeddings (torch.Tensor): the embeddings from the image encoder + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs + multimask_output (bool): Whether to return multiple masks or a single mask. + + Returns: + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality + """ + masks, iou_pred = self.predict_masks( + image_embeddings=image_embeddings, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + dense_prompt_embeddings=dense_prompt_embeddings, + ) + + # Select the correct mask or masks for output + mask_slice = slice(1, None) if multimask_output else slice(0, 1) + masks = masks[:, mask_slice, :, :] + iou_pred = iou_pred[:, mask_slice] + + # Prepare output + return masks, iou_pred + + def predict_masks( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predicts masks. 
class MLP(nn.Module):
    """
    Stack of linear layers with ReLU between them and an optional sigmoid on the output.

    Lightly adapted from
    https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        sigmoid_output: bool = False,
    ) -> None:
        """
        Args:
            input_dim (int): Width of the input features.
            hidden_dim (int): Width of every intermediate layer.
            output_dim (int): Width of the final layer's output.
            num_layers (int): Total number of linear layers.
            sigmoid_output (bool): If True, apply a sigmoid to the final output.
        """
        super().__init__()
        self.num_layers = num_layers
        # Layer widths from input to output, e.g. [in, hidden, hidden, out] for three layers
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:]))
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        """Run `x` through the layers, applying ReLU after every layer except the last."""
        last_idx = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last_idx:
                x = F.relu(x)
        return torch.sigmoid(x) if self.sigmoid_output else x
+ use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + global_attn_indexes (list): Indexes for blocks using global attention. + """ + super().__init__() + self.img_size = img_size + + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) + + self.pos_embed: Optional[nn.Parameter] = None + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. + self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)) + + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size if i not in global_attn_indexes else 0, + input_size=(img_size // patch_size, img_size // patch_size), + ) + self.blocks.append(block) + + self.neck = nn.Sequential( + nn.Conv2d( + embed_dim, + out_chans, + kernel_size=1, + bias=False, + ), + LayerNorm2d(out_chans), + nn.Conv2d( + out_chans, + out_chans, + kernel_size=3, + padding=1, + bias=False, + ), + LayerNorm2d(out_chans), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + self.pos_embed + + for blk in self.blocks: + x = blk(x) + + x = self.neck(x.permute(0, 3, 1, 2)) + + return x + + +class PromptEncoder(nn.Module): + + def __init__( + self, + embed_dim: int, + image_embedding_size: Tuple[int, int], + input_image_size: Tuple[int, int], + mask_in_chans: int, + activation: Type[nn.Module] = nn.GELU, + ) -> None: + """ + 
Encodes prompts for input to SAM's mask decoder. + + Arguments: + embed_dim (int): The prompts' embedding dimension + image_embedding_size (tuple(int, int)): The spatial size of the + image embedding, as (H, W). + input_image_size (int): The padded size of the image as input + to the image encoder, as (H, W). + mask_in_chans (int): The number of hidden channels used for + encoding input masks. + activation (nn.Module): The activation to use when encoding + input masks. + """ + super().__init__() + self.embed_dim = embed_dim + self.input_image_size = input_image_size + self.image_embedding_size = image_embedding_size + self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) + + self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners + point_embeddings = [nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)] + self.point_embeddings = nn.ModuleList(point_embeddings) + self.not_a_point_embed = nn.Embedding(1, embed_dim) + + self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1]) + self.mask_downscaling = nn.Sequential( + nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans // 4), + activation(), + nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans), + activation(), + nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1), + ) + self.no_mask_embed = nn.Embedding(1, embed_dim) + + def get_dense_pe(self) -> torch.Tensor: + """ + Returns the positional encoding used to encode point prompts, + applied to a dense set of points the shape of the image encoding. 
+ + Returns: + torch.Tensor: Positional encoding with shape + 1x(embed_dim)x(embedding_h)x(embedding_w) + """ + return self.pe_layer(self.image_embedding_size).unsqueeze(0) + + def _embed_points( + self, + points: torch.Tensor, + labels: torch.Tensor, + pad: bool, + ) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device) + padding_label = -torch.ones((labels.shape[0], 1), device=labels.device) + points = torch.cat([points, padding_point], dim=1) + labels = torch.cat([labels, padding_label], dim=1) + point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size) + point_embedding[labels == -1] = 0.0 + point_embedding[labels == -1] += self.not_a_point_embed.weight + point_embedding[labels == 0] += self.point_embeddings[0].weight + point_embedding[labels == 1] += self.point_embeddings[1].weight + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes = boxes + 0.5 # Shift to center of pixel + coords = boxes.reshape(-1, 2, 2) + corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size) + corner_embedding[:, 0, :] += self.point_embeddings[2].weight + corner_embedding[:, 1, :] += self.point_embeddings[3].weight + return corner_embedding + + def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor: + """Embeds mask inputs.""" + return self.mask_downscaling(masks) + + def _get_batch_size( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> int: + """ + Gets the batch size of the output given the batch size of the input prompts. 
class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        """
        Args:
            num_pos_feats (int): Number of random frequencies; the encoding has twice this many channels
                (a sine and a cosine per frequency).
            scale (float, optional): Multiplier for the random frequency matrix. None or a non-positive
                value falls back to 1.0.
        """
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        # Registered as a buffer so it follows the module across devices and is saved with its state
        self.register_buffer(
            'positional_encoding_gaussian_matrix',
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        # outputs d_1 x ... x d_n x C shape
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """
        Generate positional encoding for a grid of the specified size.

        Args:
            size (Tuple[int, int]): Grid size as (h, w).

        Returns:
            (torch.Tensor): Encoding laid out as C x H x W, sampled at the centre of every grid cell.
        """
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        # Cumulative sums give 1..n along each axis; subtracting 0.5 moves to cell centres
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return pe.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
        """
        Positionally encode points that are not normalized to [0,1].

        Args:
            coords_input (torch.Tensor): B x N x 2 points as (x, y) in pixel units. The tensor is not modified.
            image_size (Tuple[int, int]): Image size as (h, w) used to normalise the points.

        Returns:
            (torch.Tensor): B x N x C encoding.
        """
        # Work on a private copy. Integer inputs must become float *before* the in-place division below,
        # otherwise the normalised values would be truncated back to integers when written into the copy.
        if coords_input.is_floating_point():
            coords = coords_input.clone()
        else:
            coords = coords_input.to(torch.float)
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C
class Attention(nn.Module):
    """Multi-head self-attention over a (B, H, W, C) token grid, optionally with relative position embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): Accepted for interface compatibility; not read in this class (the
                relative position tables are always created as zeros).
            input_size (tuple(int, int), None): Input resolution for calculating the relative
                positional parameter size. Required when `use_rel_pos` is True.
        """
        super().__init__()
        per_head = dim // num_heads
        self.num_heads = num_heads
        self.scale = per_head ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if not self.use_rel_pos:
            return
        assert (input_size is not None), 'Input size must be provided if using relative positional encoding.'
        # One embedding row per possible relative offset along each axis: 2 * size - 1 of them
        size_h, size_w = input_size[0], input_size[1]
        self.rel_pos_h = nn.Parameter(torch.zeros(2 * size_h - 1, per_head))
        self.rel_pos_w = nn.Parameter(torch.zeros(2 * size_w - 1, per_head))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention across all H * W positions of `x` and return a tensor of the same layout."""
        batch, height, width, _ = x.shape
        heads = self.num_heads
        n_tokens = height * width

        # Project once, then split into (3, B, nHead, H * W, C)
        packed = self.qkv(x).reshape(batch, n_tokens, 3, heads, -1).permute(2, 0, 3, 1, 4)
        # Fold the heads into the batch axis: each of q, k, v is (B * nHead, H * W, C)
        q, k, v = packed.reshape(3, batch * heads, n_tokens, -1).unbind(0)

        scores = (q * self.scale) @ k.transpose(-2, -1)
        if self.use_rel_pos:
            scores = add_decomposed_rel_pos(scores, q, self.rel_pos_h, self.rel_pos_w, (height, width),
                                            (height, width))
        weights = scores.softmax(dim=-1)

        # Unfold the heads again and concatenate them along the channel axis
        mixed = (weights @ v).view(batch, heads, height, width, -1)
        mixed = mixed.permute(0, 2, 3, 1, 4).reshape(batch, height, width, -1)
        return self.proj(mixed)
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Look up relative positional embeddings for every (query, key) position pair along one axis.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Tensor indexed as [query position, key position, C], gathered from `rel_pos` (linearly resampled
        to length 2 * max(q_size, k_size) - 1 first when L differs from that length).
    """
    target_len = int(2 * max(q_size, k_size) - 1)

    table = rel_pos
    if rel_pos.shape[0] != target_len:
        # F.interpolate resamples the last axis, so go (L, C) -> (1, C, L), resize, then back to (L', C).
        as_signal = rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1)
        table = F.interpolate(as_signal, size=target_len, mode='linear')
        table = table.reshape(-1, target_len).permute(1, 0)

    # When q and k differ in length, stretch the shorter axis so both span the same coordinate range.
    q_stretch = max(k_size / q_size, 1.0)
    k_stretch = max(q_size / k_size, 1.0)
    q_coords = torch.arange(q_size)[:, None] * q_stretch
    k_coords = torch.arange(k_size)[None, :] * k_stretch
    # Shift so the smallest offset maps to row 0 of the table.
    lookup = (q_coords - k_coords) + (k_size - 1) * k_stretch

    return table[lookup.long()]
class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding: a single strided convolution followed by a channels-last permute.
    """

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()

        conv_args = dict(kernel_size=kernel_size, stride=stride, padding=padding)
        self.proj = nn.Conv2d(in_chans, embed_dim, **conv_args)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project `x` with the convolution and return it rearranged from B C H W to B H W C."""
        return self.proj(x).permute(0, 2, 3, 1)
class Sam(nn.Module):
    """Bundle an image encoder, a prompt encoder and a mask decoder into one promptable mask predictor."""

    # Logit cut-off used in `forward` to turn upscaled mask logits into boolean masks.
    mask_threshold: float = 0.0
    # Not read anywhere in this class; presumably the channel order expected from callers - confirm.
    image_format: str = 'RGB'

    def __init__(self,
                 image_encoder: ImageEncoderViT,
                 prompt_encoder: PromptEncoder,
                 mask_decoder: MaskDecoder,
                 pixel_mean: List[float] = None,
                 pixel_std: List[float] = None) -> None:
        """
        SAM predicts object masks from an image and input prompts.

        Arguments:
          image_encoder (ImageEncoderViT): The backbone used to encode the
            image into image embeddings that allow for efficient mask prediction.
          prompt_encoder (PromptEncoder): Encodes various types of input prompts.
          mask_decoder (MaskDecoder): Predicts masks from the image embeddings
            and encoded prompts.
          pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
            Defaults to [123.675, 116.28, 103.53] when None.
          pixel_std (list(float)): Std values for normalizing pixels in the input image.
            Defaults to [58.395, 57.12, 57.375] when None.
        """
        super().__init__()
        mean_values = [123.675, 116.28, 103.53] if pixel_mean is None else pixel_mean
        std_values = [58.395, 57.12, 57.375] if pixel_std is None else pixel_std

        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder
        # Non-persistent buffers: they follow the module across devices but are kept out of the state dict.
        # Shaped (C, 1, 1) so they broadcast over a CxHxW image.
        self.register_buffer('pixel_mean', torch.Tensor(mean_values).view(-1, 1, 1), persistent=False)
        self.register_buffer('pixel_std', torch.Tensor(std_values).view(-1, 1, 1), persistent=False)

    @property
    def device(self) -> Any:
        """Device of the `pixel_mean` buffer, used as the device of the model."""
        return self.pixel_mean.device

    @torch.no_grad()
    def forward(
        self,
        batched_input: List[Dict[str, Any]],
        multimask_output: bool,
    ) -> List[Dict[str, torch.Tensor]]:
        """
        Predicts masks end-to-end from provided images and prompts.
        If prompts are not known in advance, using SamPredictor is
        recommended over calling the model directly.

        Arguments:
          batched_input (list(dict)): One dictionary per input image. Prompt keys
            may be left out when that prompt is absent. Keys read here:
              'image': The image as a torch tensor in 3xHxW format,
                already transformed for input to the model.
              'original_size': (tuple(int, int)) The original size of
                the image before transformation, as (H, W).
              'point_coords': (torch.Tensor) Batched point prompts for
                this image, with shape BxNx2. Already transformed to the
                input frame of the model.
              'point_labels': (torch.Tensor) Batched labels for point prompts,
                with shape BxN. Read only when 'point_coords' is present.
              'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
                Already transformed to the input frame of the model.
              'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
                in the form Bx1xHxW.
          multimask_output (bool): Whether the model should predict multiple
            disambiguating masks, or return a single mask.

        Returns:
          (list(dict)): One dictionary per input image with the keys:
              'masks': (torch.Tensor) Batched binary mask predictions,
                with shape BxCxHxW, where B is the number of input prompts,
                C is determined by multimask_output, and (H, W) is the
                original size of the image.
              'iou_predictions': (torch.Tensor) The model's predictions
                of mask quality, in shape BxC.
              'low_res_logits': (torch.Tensor) Low resolution logits with
                shape BxCxHxW, where H=W=256. Can be passed as mask input
                to subsequent iterations of prediction.
        """
        # Normalise and pad every image, then encode the whole batch in one pass.
        stacked = torch.stack([self.preprocess(record['image']) for record in batched_input], dim=0)
        embeddings = self.image_encoder(stacked)

        results = []
        for record, embedding in zip(batched_input, embeddings):
            has_points = 'point_coords' in record
            points = (record['point_coords'], record['point_labels']) if has_points else None

            sparse, dense = self.prompt_encoder(
                points=points,
                boxes=record.get('boxes'),
                masks=record.get('mask_inputs'),
            )
            low_res, iou = self.mask_decoder(
                image_embeddings=embedding.unsqueeze(0),
                image_pe=self.prompt_encoder.get_dense_pe(),
                sparse_prompt_embeddings=sparse,
                dense_prompt_embeddings=dense,
                multimask_output=multimask_output,
            )
            upscaled = self.postprocess_masks(
                low_res,
                input_size=record['image'].shape[-2:],
                original_size=record['original_size'],
            )
            results.append({
                'masks': upscaled > self.mask_threshold,
                'iou_predictions': iou,
                'low_res_logits': low_res, })
        return results

    def postprocess_masks(
        self,
        masks: torch.Tensor,
        input_size: Tuple[int, ...],
        original_size: Tuple[int, ...],
    ) -> torch.Tensor:
        """
        Remove padding and upscale masks to the original image size.

        Arguments:
          masks (torch.Tensor): Batched masks from the mask_decoder,
            in BxCxHxW format.
          input_size (tuple(int, int)): The size of the image input to the
            model, in (H, W) format. Used to remove padding.
          original_size (tuple(int, int)): The original size of the image
            before resizing for input to the model, in (H, W) format.

        Returns:
          (torch.Tensor): Batched masks in BxCxHxW format, where (H, W)
            is given by original_size.
        """
        side = self.image_encoder.img_size
        # Step 1: bring the masks up to the padded square the encoder worked on.
        at_encoder_res = F.interpolate(masks, (side, side), mode='bilinear', align_corners=False)
        # Step 2: crop away the bottom/right padding that `preprocess` added.
        unpadded = at_encoder_res[..., :input_size[0], :input_size[1]]
        # Step 3: resize to the size of the original image.
        return F.interpolate(unpadded, original_size, mode='bilinear', align_corners=False)

    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize pixel values and pad to a square input."""
        normalized = (x - self.pixel_mean) / self.pixel_std

        height, width = normalized.shape[-2:]
        side = self.image_encoder.img_size
        # F.pad pairs run from the last axis backwards: (left, right, top, bottom); pad right and bottom only.
        return F.pad(normalized, (0, side - width, 0, side - height))
class PatchEmbed(nn.Module):
    """Convolutional stem: two stride-2 Conv2d_BN layers with an activation in between."""

    def __init__(self, in_chans, embed_dim, resolution, activation):
        """
        Args:
            in_chans (int): Number of input image channels.
            embed_dim (int): Number of output channels; the first convolution produces half of it.
            resolution (int | tuple): Input image size, normalised to a pair with `to_2tuple`.
            activation: Zero-argument callable returning the activation module placed between the convolutions.
        """
        super().__init__()
        size: Tuple[int, int] = to_2tuple(resolution)
        # Each axis is divided by 4, matching the two stride-2 convolutions below.
        grid = (size[0] // 4, size[1] // 4)
        self.patches_resolution = grid
        self.num_patches = grid[0] * grid[1]
        self.in_chans = in_chans
        self.embed_dim = embed_dim

        half = embed_dim // 2
        stem = [
            Conv2d_BN(in_chans, half, 3, 2, 1),
            activation(),
            Conv2d_BN(half, embed_dim, 3, 2, 1),
        ]
        self.seq = nn.Sequential(*stem)

    def forward(self, x):
        """Run `x` through the stem and return the result."""
        return self.seq(x)
class PatchMerging(nn.Module):
    """Transition between stages: 1x1 conv, depthwise 3x3 conv (usually stride 2), 1x1 conv, then flatten."""

    def __init__(self, input_resolution, dim, out_dim, activation):
        """
        Args:
            input_resolution (tuple): (H, W) used to un-flatten a 3-D input in `forward`.
            dim (int): Number of input channels.
            out_dim (int): Number of output channels.
            activation: Zero-argument callable returning the activation module.
        """
        super().__init__()

        self.input_resolution = input_resolution
        self.dim = dim
        self.out_dim = out_dim
        self.act = activation()
        self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
        # These three output widths keep the spatial resolution (stride 1); every other width halves it.
        depthwise_stride = 1 if out_dim in (320, 448, 576) else 2
        self.conv2 = Conv2d_BN(out_dim, out_dim, 3, depthwise_stride, 1, groups=out_dim)
        self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)

    def forward(self, x):
        """Merge patches of `x` (either 3-D tokens or a 4-D feature map) and return (B, H' * W', out_dim) tokens."""
        if x.ndim == 3:
            height, width = self.input_resolution
            batch = len(x)
            # Tokens -> feature map: (B, H * W, C) -> (B, C, H, W)
            x = x.view(batch, height, width, -1).permute(0, 3, 1, 2)

        x = self.act(self.conv1(x))
        x = self.act(self.conv2(x))
        x = self.conv3(x)
        # Feature map -> tokens: (B, C, H', W') -> (B, H' * W', C)
        return x.flatten(2).transpose(1, 2)
class Attention(torch.nn.Module):
    """
    Multi-head attention over a fixed-size window with a learned bias per absolute position offset.

    The bias table `attention_biases` has one column per distinct (|dy|, |dx|) offset inside the window;
    `attention_bias_idxs` maps every (query, key) position pair to its column. In eval mode the gathered
    (num_heads, N, N) bias is cached on the module as `ab`.
    """

    def __init__(
            self,
            dim,
            key_dim,
            num_heads=8,
            attn_ratio=4,
            resolution=(14, 14),
    ):
        """
        Args:
            dim (int): Number of input and output channels.
            key_dim (int): Per-head width of queries and keys.
            num_heads (int): Number of attention heads.
            attn_ratio (float): Per-head value width is `int(attn_ratio * key_dim)`.
            resolution (tuple[int, int]): Window size (h, w); inputs to `forward` must have h * w tokens
                for the bias to broadcast.
        """
        super().__init__()
        # (h, w)
        assert isinstance(resolution, tuple) and len(resolution) == 2
        self.num_heads = num_heads
        self.scale = key_dim ** -0.5
        self.key_dim = key_dim
        self.nh_kd = nh_kd = key_dim * num_heads
        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * num_heads
        self.attn_ratio = attn_ratio
        h = self.dh + nh_kd * 2

        self.norm = nn.LayerNorm(dim)
        self.qkv = nn.Linear(dim, h)
        self.proj = nn.Linear(self.dh, dim)

        points = list(itertools.product(range(resolution[0]), range(resolution[1])))
        N = len(points)
        # Assign a column index to each distinct absolute offset, in order of first appearance.
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                idxs.append(attention_offsets.setdefault(offset, len(attention_offsets)))
        self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer('attention_bias_idxs', torch.LongTensor(idxs).view(N, N), persistent=False)

    @torch.no_grad()
    def train(self, mode=True):
        """
        Switch training mode and keep the cached bias `ab` in sync with it.

        Entering training mode drops the cache (the bias is gathered from the live parameter instead);
        entering eval mode gathers and caches it. Returns `self`, as `nn.Module.train` does, so that
        `module.train()` and `module.eval()` can be chained or assigned.
        """
        super().train(mode)
        if mode:
            if hasattr(self, 'ab'):
                del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]
        return self

    def _eval_bias(self):
        """Return the cached eval-mode bias on the parameter's device, building the cache if it is missing."""
        cached = getattr(self, 'ab', None)
        if cached is None:
            # Reached when eval mode was entered without going through `train(False)`.
            with torch.no_grad():
                cached = self.attention_biases[:, self.attention_bias_idxs]
        self.ab = cached.to(self.attention_biases.device)
        return self.ab

    def forward(self, x):  # x (B,N,C)
        """Apply windowed attention to `x` of shape (B, N, C) and return a tensor of shape (B, N, dim)."""
        B, N, _ = x.shape

        # Normalization
        x = self.norm(x)

        qkv = self.qkv(x)
        # (B, N, num_heads, d)
        q, k, v = qkv.view(B, N, self.num_heads, -1).split([self.key_dim, self.key_dim, self.d], dim=3)
        # (B, num_heads, N, d)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        # Training gathers from the live parameter so gradients reach it; eval uses the cache.
        # The cache is only touched in eval mode, so a fresh or re-trained module no longer fails here.
        if self.training:
            bias = self.attention_biases[:, self.attention_bias_idxs]
        else:
            bias = self._eval_bias()

        attn = (q @ k.transpose(-2, -1)) * self.scale + bias
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
        x = self.proj(x)
        return x
class BasicLayer(nn.Module):
    """ One TinyViT stage: `depth` TinyViTBlock modules followed by an optional downsample module.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | list[float], optional): Stochastic depth rate; a list is indexed per block.
            Default: 0.0
        downsample (nn.Module | None, optional): Downsample layer class built at the end of the layer.
            Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
        local_conv_size (int): the kernel size of the depthwise convolution between attention and MLP. Default: 3
        activation (torch.nn): the activation function. Default: nn.GELU
        out_dim (int | optional): the output dimension of the layer. Default: None
    """

    def __init__(
        self,
        dim,
        input_resolution,
        depth,
        num_heads,
        window_size,
        mlp_ratio=4.,
        drop=0.,
        drop_path=0.,
        downsample=None,
        use_checkpoint=False,
        local_conv_size=3,
        activation=nn.GELU,
        out_dim=None,
    ):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Assemble the blocks; a list-valued drop_path supplies one rate per block.
        per_block_rates = isinstance(drop_path, list)
        stage_blocks = []
        for index in range(depth):
            stage_blocks.append(
                TinyViTBlock(
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    window_size=window_size,
                    mlp_ratio=mlp_ratio,
                    drop=drop,
                    drop_path=drop_path[index] if per_block_rates else drop_path,
                    local_conv_size=local_conv_size,
                    activation=activation,
                ))
        self.blocks = nn.ModuleList(stage_blocks)

        # Optional patch merging layer closing the stage.
        if downsample is None:
            self.downsample = None
        else:
            self.downsample = downsample(input_resolution, dim=dim, out_dim=out_dim, activation=activation)

    def forward(self, x):
        """Run `x` through every block (checkpointed if enabled), then through the downsample module if any."""
        for block in self.blocks:
            x = checkpoint.checkpoint(block, x) if self.use_checkpoint else block(x)
        return x if self.downsample is None else self.downsample(x)

    def extra_repr(self) -> str:
        """Return the summary string shown inside this module's repr."""
        return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}'
depth=depths[i_layer], + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + out_dim=embed_dims[min(i_layer + 1, + len(embed_dims) - 1)], + activation=activation, + ) + if i_layer == 0: + layer = ConvLayer( + conv_expand_ratio=mbconv_expand_ratio, + **kwargs, + ) + else: + layer = BasicLayer(num_heads=num_heads[i_layer], + window_size=window_sizes[i_layer], + mlp_ratio=self.mlp_ratio, + drop=drop_rate, + local_conv_size=local_conv_size, + **kwargs) + self.layers.append(layer) + + # Classifier head + self.norm_head = nn.LayerNorm(embed_dims[-1]) + self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + + # init weights + self.apply(self._init_weights) + self.set_layer_lr_decay(layer_lr_decay) + self.neck = nn.Sequential( + nn.Conv2d( + embed_dims[-1], + 256, + kernel_size=1, + bias=False, + ), + LayerNorm2d(256), + nn.Conv2d( + 256, + 256, + kernel_size=3, + padding=1, + bias=False, + ), + LayerNorm2d(256), + ) + + def set_layer_lr_decay(self, layer_lr_decay): + decay_rate = layer_lr_decay + + # layers -> blocks (depth) + depth = sum(self.depths) + lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)] + + def _set_lr_scale(m, scale): + for p in m.parameters(): + p.lr_scale = scale + + self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0])) + i = 0 + for layer in self.layers: + for block in layer.blocks: + block.apply(lambda x: _set_lr_scale(x, lr_scales[i])) + i += 1 + if layer.downsample is not None: + layer.downsample.apply(lambda x: _set_lr_scale(x, lr_scales[i - 1])) + assert i == depth + for m in [self.norm_head, self.head]: + m.apply(lambda x: _set_lr_scale(x, lr_scales[-1])) + + for k, p in self.named_parameters(): + p.param_name = k + + def _check_lr_scale(m): + for p in m.parameters(): + assert hasattr(p, 'lr_scale'), p.param_name + + self.apply(_check_lr_scale) + + def 
class TwoWayTransformer(nn.Module):
    """Stack of TwoWayAttentionBlock layers followed by a final token-to-image attention and LayerNorm."""

    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        A transformer decoder that attends to an input image using
        queries whose positional embedding is supplied.

        Args:
          depth (int): number of layers in the transformer
          embedding_dim (int): the channel dimension for the input embeddings
          num_heads (int): the number of heads for multihead attention. Must
            divide embedding_dim
          mlp_dim (int): the channel dimension internal to the MLP block
          activation (nn.Module): the activation to use in the MLP block
          attention_downsample_rate (int): forwarded as `downsample_rate` /
            `attention_downsample_rate` to the attention layers built here
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim

        # Only the first block is told to skip the positional encoding in its self-attention.
        self.layers = nn.ModuleList([
            TwoWayAttentionBlock(
                embedding_dim=embedding_dim,
                num_heads=num_heads,
                mlp_dim=mlp_dim,
                activation=activation,
                attention_downsample_rate=attention_downsample_rate,
                skip_first_layer_pe=(index == 0),
            ) for index in range(depth)])

        self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # Unpacking keeps the requirement that image_embedding is 4-D (B, C, H, W); the values are not used.
        _, _, _, _ = image_embedding.shape

        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        image_tokens = image_embedding.flatten(2).permute(0, 2, 1)
        image_tokens_pe = image_pe.flatten(2).permute(0, 2, 1)

        queries, keys = point_embedding, image_tokens
        for block in self.layers:
            queries, keys = block(
                queries=queries,
                keys=keys,
                query_pe=point_embedding,
                key_pe=image_tokens_pe,
            )

        # Final attention from the point tokens to the image tokens, added residually and normalised.
        attended = self.final_attn_token_to_image(q=queries + point_embedding, k=keys + image_tokens_pe, v=keys)
        queries = self.norm_final_attn(queries + attended)

        return queries, keys
def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor) -> Tuple[Tensor, Tensor]:
    """
    Apply the four sub-layers of the block and return the updated (queries, keys).

    Order: self-attention on the queries, queries -> keys cross attention, MLP on
    the queries, keys -> queries cross attention. Each sub-layer is followed by
    its own layer norm.
    """
    # (1) Self attention. With skip_first_layer_pe the attention output replaces
    #     the queries outright (no positional encoding, no residual).
    if not self.skip_first_layer_pe:
        tokens_with_pe = queries + query_pe
        queries = queries + self.self_attn(q=tokens_with_pe, k=tokens_with_pe, v=queries)
    else:
        queries = self.self_attn(q=queries, k=queries, v=queries)
    queries = self.norm1(queries)

    # (2) Cross attention: query tokens attend to the key (image) tokens.
    token_q = queries + query_pe
    image_k = keys + key_pe
    queries = self.norm2(queries + self.cross_attn_token_to_image(q=token_q, k=image_k, v=keys))

    # (3) MLP on the queries, with residual.
    queries = self.norm3(queries + self.mlp(queries))

    # (4) Cross attention in the other direction: the key tokens act as the
    #     attention queries and the (updated) query tokens as keys/values.
    token_q = queries + query_pe
    image_k = keys + key_pe
    keys = self.norm4(keys + self.cross_attn_image_to_token(q=image_k, k=token_q, v=queries))

    return queries, keys
class Attention(nn.Module):
    """
    An attention layer that allows for downscaling the size of the embedding
    after projection to queries, keys, and values.

    Inputs are projected from `embedding_dim` down to
    `internal_dim = embedding_dim // downsample_rate`, attention is computed per
    head in that smaller space, and the result is projected back to
    `embedding_dim`.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        """
        Args:
            embedding_dim (int): channel dimension of the inputs and of the output.
            num_heads (int): number of attention heads; must divide
                `embedding_dim // downsample_rate`.
            downsample_rate (int): factor by which the channel dimension is
                reduced inside the attention computation.

        Raises:
            AssertionError: if `num_heads` does not divide the internal dimension.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        # The heads split the *internal* (downsampled) dimension, so that is the
        # quantity the message has to report, not embedding_dim.
        assert self.internal_dim % num_heads == 0, (
            f'num_heads ({num_heads}) must divide the internal dimension '
            f'embedding_dim // downsample_rate ({self.internal_dim}).')

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        """Reshape B x N_tokens x C into B x N_heads x N_tokens x C_per_head."""
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

    def _recombine_heads(self, x: Tensor) -> Tensor:
        """Inverse of `_separate_heads`: merge the heads back into B x N_tokens x C."""
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        """
        Compute scaled dot-product multi-head attention.

        Args:
            q (Tensor): queries, B x N_q x embedding_dim.
            k (Tensor): keys, B x N_k x embedding_dim.
            v (Tensor): values, B x N_k x embedding_dim.

        Returns:
            Tensor: attention output, B x N_q x embedding_dim.
        """
        # Input projections (embedding_dim -> internal_dim)
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Attention weights: B x N_heads x N_q x N_k, scaled by sqrt(C_per_head)
        c_per_head = q.shape[-1]
        attn = q @ k.permute(0, 1, 3, 2)
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Weighted sum of values, merge heads, project back to embedding_dim
        out = attn @ v
        out = self._recombine_heads(out)
        return self.out_proj(out)
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
    """
    Initialise the SAM predictor.

    Args:
        cfg: base configuration forwarded to the parent predictor.
        overrides (dict, None): configuration overrides. `task`, `mode` and
            `imgsz` are always forced to 'segment', 'predict' and 1024.
        _callbacks: callbacks forwarded to the parent predictor.
    """
    # Work on a copy so the dict owned by the caller is never modified.
    overrides = {} if overrides is None else dict(overrides)
    overrides.update(task='segment', mode='predict', imgsz=1024)
    super().__init__(cfg, overrides, _callbacks)
    # SAM needs retina_masks=True, or the results would be a mess.
    self.args.retina_masks = True
    # State filled in by set_image
    self.im = None
    self.features = None
    # State filled in by set_prompts
    self.prompts = {}
    # True while a "segment everything" pass is running
    self.segment_all = False
def inference(self, im, bboxes=None, points=None, labels=None, masks=None, multimask_output=False, *args, **kwargs):
    """
    Predict masks for the given input prompts, using the currently set image.

    Prompts stored beforehand with `set_prompts` take precedence over the
    arguments of this call and are consumed (removed from `self.prompts`) here.
    When no box, point or mask prompt is available the whole image is segmented
    with `generate`; otherwise `prompt_inference` is used.

    Args:
        im (torch.Tensor): The preprocessed image, (N, C, H, W).
        bboxes (np.ndarray | List, None): (N, 4), in XYXY format.
        points (np.ndarray | List, None): (N, 2), Each point is in (X,Y) in pixels.
        labels (np.ndarray | List, None): (N, ), labels for the point prompts.
            1 indicates a foreground point and 0 indicates a background point.
        masks (np.ndarray, None): A low resolution mask input to the model, typically
            coming from a previous prediction iteration. Has form (N, H, W), where
            for SAM, H=W=256.
        multimask_output (bool): If true, the model will return three masks per prompt.
        *args, **kwargs: forwarded to `generate` when no prompt is given.

    Returns:
        Whatever `generate` (no prompts) or `prompt_inference` (prompts given) returns.
    """
    # Stored prompts override the call arguments. `labels` has to be taken from
    # the stored prompts as well, otherwise labels supplied together with the
    # points through `set_prompts` would be ignored.
    bboxes = self.prompts.pop('bboxes', bboxes)
    points = self.prompts.pop('points', points)
    labels = self.prompts.pop('labels', labels)
    masks = self.prompts.pop('masks', masks)

    if all(i is None for i in [bboxes, points, masks]):
        return self.generate(im, *args, **kwargs)
    return self.prompt_inference(im, bboxes, points, labels, masks, multimask_output)
+ + Returns: + (np.ndarray): The output masks in CxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (np.ndarray): An array of length C containing the model's + predictions for the quality of each mask. + (np.ndarray): An array of shape CxHxW, where C is the number + of masks and H=W=256. These low resolution logits can be passed to + a subsequent iteration as mask input. + """ + features = self.model.image_encoder(im) if self.features is None else self.features + + src_shape, dst_shape = self.batch[1][0].shape[:2], im.shape[2:] + r = 1.0 if self.segment_all else min(dst_shape[0] / src_shape[0], dst_shape[1] / src_shape[1]) + # Transform input prompts + if points is not None: + points = torch.as_tensor(points, dtype=torch.float32, device=self.device) + points = points[None] if points.ndim == 1 else points + # Assuming labels are all positive if users don't pass labels. + if labels is None: + labels = np.ones(points.shape[0]) + labels = torch.as_tensor(labels, dtype=torch.int32, device=self.device) + points *= r + # (N, 2) --> (N, 1, 2), (N, ) --> (N, 1) + points, labels = points[:, None, :], labels[:, None] + if bboxes is not None: + bboxes = torch.as_tensor(bboxes, dtype=torch.float32, device=self.device) + bboxes = bboxes[None] if bboxes.ndim == 1 else bboxes + bboxes *= r + if masks is not None: + masks = torch.as_tensor(masks, dtype=torch.float32, device=self.device) + masks = masks[:, None, :, :] + + points = (points, labels) if points is not None else None + # Embed prompts + sparse_embeddings, dense_embeddings = self.model.prompt_encoder( + points=points, + boxes=bboxes, + masks=masks, + ) + + # Predict masks + pred_masks, pred_scores = self.model.mask_decoder( + image_embeddings=features, + image_pe=self.model.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + ) + + # (N, d, H, W) --> (N*d, H, W), (N, d) --> (N*d, 
def generate(self,
             im,
             crop_n_layers=0,
             crop_overlap_ratio=512 / 1500,
             crop_downscale_factor=1,
             point_grids=None,
             points_stride=32,
             points_batch_size=64,
             conf_thres=0.88,
             stability_score_thresh=0.95,
             stability_score_offset=0.95,
             crop_nms_thresh=0.7):
    """Segment the whole image.

    The image is split into crop regions; each crop is resized to the input
    size and prompted with a grid of points. The predicted masks are filtered
    by score, stability and proximity to the crop edge, de-duplicated with NMS
    inside each crop and finally across crops.

    Args:
        im (torch.Tensor): The preprocessed image, (N, C, H, W).
        crop_n_layers (int): If >0, mask prediction will be run again on
            crops of the image. Sets the number of layers to run, where each
            layer has 2**i_layer number of image crops.
        crop_overlap_ratio (float): Sets the degree to which crops overlap.
            In the first crop layer, crops will overlap by this fraction of
            the image length. Later layers with more crops scale down this overlap.
        crop_downscale_factor (int): The number of points-per-side
            sampled in layer n is scaled down by crop_downscale_factor**n.
        point_grids (list(np.ndarray), None): A list over explicit grids
            of points used for sampling, normalized to [0,1]. The nth grid in the
            list is used in the nth crop layer. When given, `points_stride` and
            `crop_downscale_factor` are not used to build grids.
        points_stride (int, None): Passed to `build_all_layer_point_grids` as
            the number of points to be sampled along one side of the image
            (so the total number of points is points_stride**2) - TODO confirm
            against that helper. Only used when `point_grids` is None.
        points_batch_size (int): Sets the number of points run simultaneously
            by the model. Higher numbers may be faster but use more GPU memory.
        conf_thres (float): A filtering threshold in [0,1], using the
            model's predicted mask quality.
        stability_score_thresh (float): A filtering threshold in [0,1], using
            the stability of the mask under changes to the cutoff used to binarize
            the model's mask predictions.
        stability_score_offset (float): The amount to shift the cutoff when
            calculating the stability score.
        crop_nms_thresh (float): The box IoU cutoff used by non-maximal
            suppression to filter duplicate masks between different crops.

    Returns:
        (torch.Tensor): the kept masks, concatenated over all crops.
        (torch.Tensor): the predicted quality score of each kept mask.
        (torch.Tensor): the bounding box of each kept mask.
    """
    # Mark the predictor as running in segment-everything mode; this flag is
    # read by prompt_inference (prompt scaling) and postprocess (boxes output).
    self.segment_all = True
    ih, iw = im.shape[2:]
    crop_regions, layer_idxs = generate_crop_boxes((ih, iw), crop_n_layers, crop_overlap_ratio)
    if point_grids is None:
        point_grids = build_all_layer_point_grids(
            points_stride,
            crop_n_layers,
            crop_downscale_factor,
        )
    pred_masks, pred_scores, pred_bboxes, region_areas = [], [], [], []
    for crop_region, layer_idx in zip(crop_regions, layer_idxs):
        x1, y1, x2, y2 = crop_region
        w, h = x2 - x1, y2 - y1
        area = torch.tensor(w * h, device=im.device)
        points_scale = np.array([[w, h]])  # w, h
        # Crop image and interpolate to input size
        crop_im = F.interpolate(im[..., y1:y2, x1:x2], (ih, iw), mode='bilinear', align_corners=False)
        # (num_points, 2), grid points scaled from [0,1] to the crop size
        points_for_image = point_grids[layer_idx] * points_scale
        crop_masks, crop_scores, crop_bboxes = [], [], []
        for (points, ) in batch_iterator(points_batch_size, points_for_image):
            pred_mask, pred_score = self.prompt_inference(crop_im, points=points, multimask_output=True)
            # Interpolate predicted masks to input size
            pred_mask = F.interpolate(pred_mask[None], (h, w), mode='bilinear', align_corners=False)[0]
            # Filter 1: predicted mask quality
            idx = pred_score > conf_thres
            pred_mask, pred_score = pred_mask[idx], pred_score[idx]

            # Filter 2: stability of the mask around the binarization threshold
            stability_score = calculate_stability_score(pred_mask, self.model.mask_threshold,
                                                        stability_score_offset)
            idx = stability_score > stability_score_thresh
            pred_mask, pred_score = pred_mask[idx], pred_score[idx]
            # Bool type is much more memory-efficient.
            pred_mask = pred_mask > self.model.mask_threshold
            # (N, 4)
            pred_bbox = batched_mask_to_box(pred_mask).float()
            # Filter 3: drop masks whose box touches the crop edge
            keep_mask = ~is_box_near_crop_edge(pred_bbox, crop_region, [0, 0, iw, ih])
            if not torch.all(keep_mask):
                pred_bbox = pred_bbox[keep_mask]
                pred_mask = pred_mask[keep_mask]
                pred_score = pred_score[keep_mask]

            crop_masks.append(pred_mask)
            crop_bboxes.append(pred_bbox)
            crop_scores.append(pred_score)

        # Do nms within this crop (threshold comes from self.args.iou)
        crop_masks = torch.cat(crop_masks)
        crop_bboxes = torch.cat(crop_bboxes)
        crop_scores = torch.cat(crop_scores)
        keep = torchvision.ops.nms(crop_bboxes, crop_scores, self.args.iou)  # NMS
        # Map the surviving boxes and masks back to full-image coordinates
        crop_bboxes = uncrop_boxes_xyxy(crop_bboxes[keep], crop_region)
        crop_masks = uncrop_masks(crop_masks[keep], crop_region, ih, iw)
        crop_scores = crop_scores[keep]

        pred_masks.append(crop_masks)
        pred_bboxes.append(crop_bboxes)
        pred_scores.append(crop_scores)
        # Remember the area of the crop each mask came from
        region_areas.append(area.expand(len(crop_masks)))

    pred_masks = torch.cat(pred_masks)
    pred_bboxes = torch.cat(pred_bboxes)
    pred_scores = torch.cat(pred_scores)
    region_areas = torch.cat(region_areas)

    # Remove duplicate masks between crops
    if len(crop_regions) > 1:
        # Score by inverse crop area, so NMS keeps the mask from the smaller crop
        scores = 1 / region_areas
        keep = torchvision.ops.nms(pred_bboxes, scores, crop_nms_thresh)
        pred_masks = pred_masks[keep]
        pred_bboxes = pred_bboxes[keep]
        pred_scores = pred_scores[keep]

    return pred_masks, pred_scores, pred_bboxes
def postprocess(self, preds, img, orig_imgs):
    """
    Turn raw predictions into a one-element list of `Results`.

    Args:
        preds (tuple): (pred_masks, pred_scores) and, in segment-all mode, a
            third entry with the predicted boxes.
        img (torch.Tensor): the preprocessed network input; only its spatial
            size (`img.shape[2:]`) is used, to rescale the boxes.
        orig_imgs (list | np.ndarray): the original image, or a list whose
            first entry is the original image.

    Returns:
        (list): a single `Results` object holding the boolean masks rescaled to
        the original image and, in segment-all mode, boxes with score and class
        columns appended.
    """
    pred_masks, pred_scores = preds[:2]
    pred_bboxes = preds[2] if self.segment_all else None

    # Every mask is its own "class", named after its index.
    num_masks = len(pred_masks)
    names = {idx: str(idx) for idx in range(num_masks)}

    # Only one image is processed, so index 0 is used wherever a list is given.
    orig_img = orig_imgs[0] if isinstance(orig_imgs, list) else orig_imgs

    if pred_bboxes is not None:
        scaled_boxes = ops.scale_boxes(img.shape[2:], pred_bboxes.float(), orig_img.shape, padding=False)
        cls = torch.arange(num_masks, dtype=torch.int32, device=pred_masks.device)
        # Box columns followed by the score and the per-mask class id
        pred_bboxes = torch.cat([scaled_boxes, pred_scores[:, None], cls[:, None]], dim=-1)

    masks = ops.scale_masks(pred_masks[None].float(), orig_img.shape[:2], padding=False)[0]
    masks = masks > self.model.mask_threshold  # to bool

    path = self.batch[0]
    img_path = path[0] if isinstance(path, list) else path
    result = Results(orig_img=orig_img, path=img_path, names=names, masks=masks, boxes=pred_bboxes)

    # Reset segment-all mode.
    self.segment_all = False
    return [result]
@staticmethod
def remove_small_regions(masks, min_area=0, nms_thresh=0.7):
    """
    Removes small disconnected regions and holes in masks, then reruns
    box NMS to remove any new duplicates. Requires open-cv as a dependency.

    The input tensor is updated in place for masks that were changed and kept.

    Args:
        masks (torch.Tensor): Masks, (N, H, W).
        min_area (int): Minimum area threshold.
        nms_thresh (float): NMS threshold.

    Returns:
        (torch.Tensor): the masks that survive NMS.
    """
    if len(masks) == 0:
        return masks

    cleaned = []
    untouched = []  # 1.0 when a mask needed no clean-up, 0.0 otherwise
    for mask in masks:
        mask_np = mask.cpu().numpy()
        # NOTE: the bare name below resolves to the module-level helper of the
        # same name, not to this static method.
        mask_np, holes_changed = remove_small_regions(mask_np, min_area, mode='holes')
        mask_np, islands_changed = remove_small_regions(mask_np, min_area, mode='islands')
        cleaned.append(torch.as_tensor(mask_np).unsqueeze(0))
        # Unchanged masks get the higher score so NMS prefers them.
        untouched.append(float(not holes_changed and not islands_changed))

    # Recalculate boxes and remove any new duplicates
    new_masks = torch.cat(cleaned, dim=0)
    boxes = batched_mask_to_box(new_masks)
    keep = torchvision.ops.nms(boxes.float(), torch.as_tensor(untouched), nms_thresh)

    # Only write back the masks that were actually modified
    for i in keep:
        if untouched[i] == 0.0:
            masks[i] = new_masks[i]

    return masks[keep]
b/ultralytics/models/utils/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..3ca9886480a8983d0f12bb0d67ca60e113024681 --- /dev/null +++ b/ultralytics/models/utils/loss.py @@ -0,0 +1,295 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ultralytics.utils.loss import FocalLoss, VarifocalLoss +from ultralytics.utils.metrics import bbox_iou + +from .ops import HungarianMatcher + + +class DETRLoss(nn.Module): + + def __init__(self, + nc=80, + loss_gain=None, + aux_loss=True, + use_fl=True, + use_vfl=False, + use_uni_match=False, + uni_match_ind=0): + """ + DETR loss function. + + Args: + nc (int): The number of classes. + loss_gain (dict): The coefficient of loss. + aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used. + use_vfl (bool): Use VarifocalLoss or not. + use_uni_match (bool): Whether to use a fixed layer to assign labels for auxiliary branch. + uni_match_ind (int): The fixed indices of a layer. 
+ """ + super().__init__() + + if loss_gain is None: + loss_gain = {'class': 1, 'bbox': 5, 'giou': 2, 'no_object': 0.1, 'mask': 1, 'dice': 1} + self.nc = nc + self.matcher = HungarianMatcher(cost_gain={'class': 2, 'bbox': 5, 'giou': 2}) + self.loss_gain = loss_gain + self.aux_loss = aux_loss + self.fl = FocalLoss() if use_fl else None + self.vfl = VarifocalLoss() if use_vfl else None + + self.use_uni_match = use_uni_match + self.uni_match_ind = uni_match_ind + self.device = None + + def _get_loss_class(self, pred_scores, targets, gt_scores, num_gts, postfix=''): + # logits: [b, query, num_classes], gt_class: list[[n, 1]] + name_class = f'loss_class{postfix}' + bs, nq = pred_scores.shape[:2] + # one_hot = F.one_hot(targets, self.nc + 1)[..., :-1] # (bs, num_queries, num_classes) + one_hot = torch.zeros((bs, nq, self.nc + 1), dtype=torch.int64, device=targets.device) + one_hot.scatter_(2, targets.unsqueeze(-1), 1) + one_hot = one_hot[..., :-1] + gt_scores = gt_scores.view(bs, nq, 1) * one_hot + + if self.fl: + if num_gts and self.vfl: + loss_cls = self.vfl(pred_scores, gt_scores, one_hot) + else: + loss_cls = self.fl(pred_scores, one_hot.float()) + loss_cls /= max(num_gts, 1) / nq + else: + loss_cls = nn.BCEWithLogitsLoss(reduction='none')(pred_scores, gt_scores).mean(1).sum() # YOLO CLS loss + + return {name_class: loss_cls.squeeze() * self.loss_gain['class']} + + def _get_loss_bbox(self, pred_bboxes, gt_bboxes, postfix=''): + # boxes: [b, query, 4], gt_bbox: list[[n, 4]] + name_bbox = f'loss_bbox{postfix}' + name_giou = f'loss_giou{postfix}' + + loss = {} + if len(gt_bboxes) == 0: + loss[name_bbox] = torch.tensor(0., device=self.device) + loss[name_giou] = torch.tensor(0., device=self.device) + return loss + + loss[name_bbox] = self.loss_gain['bbox'] * F.l1_loss(pred_bboxes, gt_bboxes, reduction='sum') / len(gt_bboxes) + loss[name_giou] = 1.0 - bbox_iou(pred_bboxes, gt_bboxes, xywh=True, GIoU=True) + loss[name_giou] = loss[name_giou].sum() / len(gt_bboxes) + 
loss[name_giou] = self.loss_gain['giou'] * loss[name_giou] + loss = {k: v.squeeze() for k, v in loss.items()} + return loss + + def _get_loss_mask(self, masks, gt_mask, match_indices, postfix=''): + # masks: [b, query, h, w], gt_mask: list[[n, H, W]] + name_mask = f'loss_mask{postfix}' + name_dice = f'loss_dice{postfix}' + + loss = {} + if sum(len(a) for a in gt_mask) == 0: + loss[name_mask] = torch.tensor(0., device=self.device) + loss[name_dice] = torch.tensor(0., device=self.device) + return loss + + num_gts = len(gt_mask) + src_masks, target_masks = self._get_assigned_bboxes(masks, gt_mask, match_indices) + src_masks = F.interpolate(src_masks.unsqueeze(0), size=target_masks.shape[-2:], mode='bilinear')[0] + # TODO: torch does not have `sigmoid_focal_loss`, but it's not urgent since we don't use mask branch for now. + loss[name_mask] = self.loss_gain['mask'] * F.sigmoid_focal_loss(src_masks, target_masks, + torch.tensor([num_gts], dtype=torch.float32)) + loss[name_dice] = self.loss_gain['dice'] * self._dice_loss(src_masks, target_masks, num_gts) + return loss + + def _dice_loss(self, inputs, targets, num_gts): + inputs = F.sigmoid(inputs) + inputs = inputs.flatten(1) + targets = targets.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_gts + + def _get_loss_aux(self, + pred_bboxes, + pred_scores, + gt_bboxes, + gt_cls, + gt_groups, + match_indices=None, + postfix='', + masks=None, + gt_mask=None): + """Get auxiliary losses""" + # NOTE: loss class, bbox, giou, mask, dice + loss = torch.zeros(5 if masks is not None else 3, device=pred_bboxes.device) + if match_indices is None and self.use_uni_match: + match_indices = self.matcher(pred_bboxes[self.uni_match_ind], + pred_scores[self.uni_match_ind], + gt_bboxes, + gt_cls, + gt_groups, + masks=masks[self.uni_match_ind] if masks is not None else None, + gt_mask=gt_mask) + for i, (aux_bboxes, 
aux_scores) in enumerate(zip(pred_bboxes, pred_scores)): + aux_masks = masks[i] if masks is not None else None + loss_ = self._get_loss(aux_bboxes, + aux_scores, + gt_bboxes, + gt_cls, + gt_groups, + masks=aux_masks, + gt_mask=gt_mask, + postfix=postfix, + match_indices=match_indices) + loss[0] += loss_[f'loss_class{postfix}'] + loss[1] += loss_[f'loss_bbox{postfix}'] + loss[2] += loss_[f'loss_giou{postfix}'] + # if masks is not None and gt_mask is not None: + # loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, postfix) + # loss[3] += loss_[f'loss_mask{postfix}'] + # loss[4] += loss_[f'loss_dice{postfix}'] + + loss = { + f'loss_class_aux{postfix}': loss[0], + f'loss_bbox_aux{postfix}': loss[1], + f'loss_giou_aux{postfix}': loss[2]} + # if masks is not None and gt_mask is not None: + # loss[f'loss_mask_aux{postfix}'] = loss[3] + # loss[f'loss_dice_aux{postfix}'] = loss[4] + return loss + + def _get_index(self, match_indices): + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)]) + src_idx = torch.cat([src for (src, _) in match_indices]) + dst_idx = torch.cat([dst for (_, dst) in match_indices]) + return (batch_idx, src_idx), dst_idx + + def _get_assigned_bboxes(self, pred_bboxes, gt_bboxes, match_indices): + pred_assigned = torch.cat([ + t[I] if len(I) > 0 else torch.zeros(0, t.shape[-1], device=self.device) + for t, (I, _) in zip(pred_bboxes, match_indices)]) + gt_assigned = torch.cat([ + t[J] if len(J) > 0 else torch.zeros(0, t.shape[-1], device=self.device) + for t, (_, J) in zip(gt_bboxes, match_indices)]) + return pred_assigned, gt_assigned + + def _get_loss(self, + pred_bboxes, + pred_scores, + gt_bboxes, + gt_cls, + gt_groups, + masks=None, + gt_mask=None, + postfix='', + match_indices=None): + """Get losses""" + if match_indices is None: + match_indices = self.matcher(pred_bboxes, + pred_scores, + gt_bboxes, + gt_cls, + gt_groups, + masks=masks, + gt_mask=gt_mask) + + idx, gt_idx = 
self._get_index(match_indices) + pred_bboxes, gt_bboxes = pred_bboxes[idx], gt_bboxes[gt_idx] + + bs, nq = pred_scores.shape[:2] + targets = torch.full((bs, nq), self.nc, device=pred_scores.device, dtype=gt_cls.dtype) + targets[idx] = gt_cls[gt_idx] + + gt_scores = torch.zeros([bs, nq], device=pred_scores.device) + if len(gt_bboxes): + gt_scores[idx] = bbox_iou(pred_bboxes.detach(), gt_bboxes, xywh=True).squeeze(-1) + + loss = {} + loss.update(self._get_loss_class(pred_scores, targets, gt_scores, len(gt_bboxes), postfix)) + loss.update(self._get_loss_bbox(pred_bboxes, gt_bboxes, postfix)) + # if masks is not None and gt_mask is not None: + # loss.update(self._get_loss_mask(masks, gt_mask, match_indices, postfix)) + return loss + + def forward(self, pred_bboxes, pred_scores, batch, postfix='', **kwargs): + """ + Args: + pred_bboxes (torch.Tensor): [l, b, query, 4] + pred_scores (torch.Tensor): [l, b, query, num_classes] + batch (dict): A dict includes: + gt_cls (torch.Tensor) with shape [num_gts, ], + gt_bboxes (torch.Tensor): [num_gts, 4], + gt_groups (List(int)): a list of batch size length includes the number of gts of each image. + postfix (str): postfix of loss name. 
@staticmethod
def get_dn_match_indices(dn_pos_idx, dn_num_group, gt_groups):
    """Get the match indices for denoising.

    Args:
        dn_pos_idx (List[torch.Tensor]): A list includes positive indices of denoising.
        dn_num_group (int): The number of groups of denoising.
        gt_groups (List(int)): a list of batch size length includes the number of gts of each image.

    Returns:
        dn_match_indices (List(tuple)): Matched indices, one
            (dn_pos_idx[i], gt_idx) pair per image. gt_idx holds the batch-wide
            ground-truth indices of that image repeated dn_num_group times;
            images without ground truths get a pair of empty long tensors.

    Raises:
        AssertionError: if dn_pos_idx[i] and the repeated gt indices differ in length.
    """
    dn_match_indices = []
    # Offset of the first ground truth of every image inside the flattened batch
    idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
    for i, num_gt in enumerate(gt_groups):
        if num_gt > 0:
            gt_idx = torch.arange(end=num_gt, dtype=torch.long) + idx_groups[i]
            gt_idx = gt_idx.repeat(dn_num_group)
            # The message is parenthesised so both halves belong to the assert;
            # split over two bare lines, the second half is a discarded expression.
            assert len(dn_pos_idx[i]) == len(gt_idx), (
                'Expected the same length, '
                f'but got {len(dn_pos_idx[i])} and {len(gt_idx)} respectively.')
            dn_match_indices.append((dn_pos_idx[i], gt_idx))
        else:
            dn_match_indices.append((torch.zeros([0], dtype=torch.long), torch.zeros([0], dtype=torch.long)))
    return dn_match_indices
def __init__(self, cost_gain=None, use_fl=True, with_mask=False, num_sample_points=12544, alpha=0.25, gamma=2.0):
    """
    Store the matching-cost configuration.

    Args:
        cost_gain (dict, optional): Cost coefficients keyed by 'class', 'bbox', 'giou', 'mask', 'dice'.
            Defaults to ``{'class': 1, 'bbox': 5, 'giou': 2, 'mask': 1, 'dice': 1}`` when None.
        use_fl (bool): Use the focal-loss style classification cost (sigmoid scores) instead of softmax.
        with_mask (bool): Add mask and dice costs to the cost matrix.
        num_sample_points (int): Number of random points sampled per mask for the mask cost.
        alpha (float): The alpha factor of the focal classification cost.
        gamma (float): The gamma factor of the focal classification cost.
    """
    super().__init__()
    if cost_gain is None:
        cost_gain = {'class': 1, 'bbox': 5, 'giou': 2, 'mask': 1, 'dice': 1}
    self.cost_gain = cost_gain
    self.use_fl = use_fl
    self.with_mask = with_mask
    self.num_sample_points = num_sample_points
    self.alpha = alpha
    self.gamma = gamma

def forward(self, pred_bboxes, pred_scores, gt_bboxes, gt_cls, gt_groups, masks=None, gt_mask=None):
    """
    Forward pass for HungarianMatcher. This function computes costs based on prediction and ground truth
    (classification cost, L1 cost between boxes and GIoU cost between boxes) and finds the optimal matching
    between predictions and ground truth based on these costs.

    Args:
        pred_bboxes (Tensor): Predicted bounding boxes with shape [batch_size, num_queries, 4].
        pred_scores (Tensor): Predicted scores with shape [batch_size, num_queries, num_classes].
        gt_bboxes (torch.Tensor): Ground truth bounding boxes with shape [num_gts, 4].
        gt_cls (torch.Tensor): Ground truth classes with shape [num_gts, ].
        gt_groups (List[int]): List of length equal to batch size, containing the number of ground truths for
            each image.
        masks (Tensor, optional): Predicted masks with shape [batch_size, num_queries, height, width].
            Defaults to None.
        gt_mask (List[Tensor], optional): List of ground truth masks, each with shape [num_masks, Height, Width].
            Defaults to None.

    Returns:
        (List[Tuple[Tensor, Tensor]]): A list of size batch_size, each element is a tuple (index_i, index_j), where:
            - index_i is the tensor of indices of the selected predictions (in order)
            - index_j is the tensor of indices of the corresponding selected ground truth targets (in order),
              offset into the batch-concatenated ground truth tensors
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """

    bs, nq, nc = pred_scores.shape

    # No ground truth anywhere in the batch: nothing to match.
    if sum(gt_groups) == 0:
        return [(torch.tensor([], dtype=torch.long), torch.tensor([], dtype=torch.long)) for _ in range(bs)]

    # We flatten to compute the cost matrices in a batch
    # [batch_size * num_queries, num_classes]
    pred_scores = pred_scores.detach().view(-1, nc)
    pred_scores = F.sigmoid(pred_scores) if self.use_fl else F.softmax(pred_scores, dim=-1)
    # [batch_size * num_queries, 4]
    pred_bboxes = pred_bboxes.detach().view(-1, 4)

    # Compute the classification cost: keep only the score column of each gt's class
    pred_scores = pred_scores[:, gt_cls]
    if self.use_fl:
        neg_cost_class = (1 - self.alpha) * (pred_scores ** self.gamma) * (-(1 - pred_scores + 1e-8).log())
        pos_cost_class = self.alpha * ((1 - pred_scores) ** self.gamma) * (-(pred_scores + 1e-8).log())
        cost_class = pos_cost_class - neg_cost_class
    else:
        cost_class = -pred_scores

    # Compute the L1 cost between boxes
    cost_bbox = (pred_bboxes.unsqueeze(1) - gt_bboxes.unsqueeze(0)).abs().sum(-1)  # (bs*num_queries, num_gt)

    # Compute the GIoU cost between boxes, (bs*num_queries, num_gt)
    cost_giou = 1.0 - bbox_iou(pred_bboxes.unsqueeze(1), gt_bboxes.unsqueeze(0), xywh=True, GIoU=True).squeeze(-1)

    # Final cost matrix
    C = self.cost_gain['class'] * cost_class + \
        self.cost_gain['bbox'] * cost_bbox + \
        self.cost_gain['giou'] * cost_giou
    # Compute the mask cost and dice cost
    if self.with_mask:
        C += self._cost_mask(bs, gt_groups, masks, gt_mask)

    # linear_sum_assignment runs on CPU; image i only competes for its own gt columns (c[i]).
    C = C.view(bs, nq, -1).cpu()
    indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(gt_groups, -1))]
    # Start offset of each image's gts within the concatenated gt tensors.
    gt_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
    # (idx for queries, idx for gt)
    return [(torch.tensor(i, dtype=torch.long), torch.tensor(j, dtype=torch.long) + gt_groups[k])
            for k, (i, j) in enumerate(indices)]

def _cost_mask(self, bs, num_gts, masks=None, gt_mask=None):
    """
    Compute the weighted mask (binary cross entropy) and dice cost between predicted and gt masks.

    Args:
        bs (int): Batch size.
        num_gts (List[int]): Number of ground truths per image.
        masks (Tensor): Predicted mask logits; required.
        gt_mask (List[Tensor]): Ground truth masks, concatenated along dim 0 here; required.

    Returns:
        (Tensor): ``cost_gain['mask'] * cost_mask + cost_gain['dice'] * cost_dice``.

    Raises:
        AssertionError: If `masks` or `gt_mask` is None.
    """
    assert masks is not None and gt_mask is not None, 'Make sure the input has `mask` and `gt_mask`'
    # all masks share the same set of points for efficient matching
    # Sample on the masks' device: grid_sample needs input and grid on the same device.
    # NOTE(review): assumes the gt masks live on the same device as `masks` - confirm with callers.
    sample_points = torch.rand([bs, 1, self.num_sample_points, 2], device=masks.device)
    sample_points = 2.0 * sample_points - 1.0  # map [0, 1) to grid_sample's [-1, 1) coordinates

    out_mask = F.grid_sample(masks.detach(), sample_points, align_corners=False).squeeze(-2)
    out_mask = out_mask.flatten(0, 1)

    tgt_mask = torch.cat(gt_mask).unsqueeze(1)
    # Repeat each image's sample grid once per gt of that image so points line up with tgt_mask.
    sample_points = torch.cat([a.repeat(b, 1, 1, 1) for a, b in zip(sample_points, num_gts) if b > 0])
    # Chained single-dim squeezes are equivalent to squeeze([1, 2]) and also work on torch < 2.0.
    tgt_mask = F.grid_sample(tgt_mask, sample_points, align_corners=False).squeeze(2).squeeze(1)

    with torch.cuda.amp.autocast(False):
        # binary cross entropy cost
        pos_cost_mask = F.binary_cross_entropy_with_logits(out_mask, torch.ones_like(out_mask), reduction='none')
        neg_cost_mask = F.binary_cross_entropy_with_logits(out_mask, torch.zeros_like(out_mask), reduction='none')
        cost_mask = torch.matmul(pos_cost_mask, tgt_mask.T) + torch.matmul(neg_cost_mask, 1 - tgt_mask.T)
        cost_mask /= self.num_sample_points

        # dice cost
        out_mask = F.sigmoid(out_mask)
        numerator = 2 * torch.matmul(out_mask, tgt_mask.T)
        denominator = out_mask.sum(-1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
        cost_dice = 1 - (numerator + 1) / (denominator + 1)

        C = self.cost_gain['mask'] * cost_mask + self.cost_gain['dice'] * cost_dice
    return C
This function creates a contrastive denoising training group with + positive and negative samples from the ground truths (gt). It applies noise to the class labels and bounding + box coordinates, and returns the modified labels, bounding boxes, attention mask and meta information. + + Args: + batch (dict): A dict that includes 'gt_cls' (torch.Tensor with shape [num_gts, ]), 'gt_bboxes' + (torch.Tensor with shape [num_gts, 4]), 'gt_groups' (List(int)) which is a list of batch size length + indicating the number of gts of each image. + num_classes (int): Number of classes. + num_queries (int): Number of queries. + class_embed (torch.Tensor): Embedding weights to map class labels to embedding space. + num_dn (int, optional): Number of denoising. Defaults to 100. + cls_noise_ratio (float, optional): Noise ratio for class labels. Defaults to 0.5. + box_noise_scale (float, optional): Noise scale for bounding box coordinates. Defaults to 1.0. + training (bool, optional): If it's in training mode. Defaults to False. + + Returns: + (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Dict]]): The modified class embeddings, + bounding boxes, attention mask and meta information for denoising. If not in training mode or 'num_dn' + is less than or equal to 0, the function returns None for all elements in the tuple. + """ + + if (not training) or num_dn <= 0: + return None, None, None, None + gt_groups = batch['gt_groups'] + total_num = sum(gt_groups) + max_nums = max(gt_groups) + if max_nums == 0: + return None, None, None, None + + num_group = num_dn // max_nums + num_group = 1 if num_group == 0 else num_group + # pad gt to max_num of a batch + bs = len(gt_groups) + gt_cls = batch['cls'] # (bs*num, ) + gt_bbox = batch['bboxes'] # bs*num, 4 + b_idx = batch['batch_idx'] + + # each group has positive and negative queries. 
+ dn_cls = gt_cls.repeat(2 * num_group) # (2*num_group*bs*num, ) + dn_bbox = gt_bbox.repeat(2 * num_group, 1) # 2*num_group*bs*num, 4 + dn_b_idx = b_idx.repeat(2 * num_group).view(-1) # (2*num_group*bs*num, ) + + # positive and negative mask + # (bs*num*num_group, ), the second total_num*num_group part as negative samples + neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num + + if cls_noise_ratio > 0: + # half of bbox prob + mask = torch.rand(dn_cls.shape) < (cls_noise_ratio * 0.5) + idx = torch.nonzero(mask).squeeze(-1) + # randomly put a new one here + new_label = torch.randint_like(idx, 0, num_classes, dtype=dn_cls.dtype, device=dn_cls.device) + dn_cls[idx] = new_label + + if box_noise_scale > 0: + known_bbox = xywh2xyxy(dn_bbox) + + diff = (dn_bbox[..., 2:] * 0.5).repeat(1, 2) * box_noise_scale # 2*num_group*bs*num, 4 + + rand_sign = torch.randint_like(dn_bbox, 0, 2) * 2.0 - 1.0 + rand_part = torch.rand_like(dn_bbox) + rand_part[neg_idx] += 1.0 + rand_part *= rand_sign + known_bbox += rand_part * diff + known_bbox.clip_(min=0.0, max=1.0) + dn_bbox = xyxy2xywh(known_bbox) + dn_bbox = inverse_sigmoid(dn_bbox) + + # total denoising queries + num_dn = int(max_nums * 2 * num_group) + # class_embed = torch.cat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)]) + dn_cls_embed = class_embed[dn_cls] # bs*num * 2 * num_group, 256 + padding_cls = torch.zeros(bs, num_dn, dn_cls_embed.shape[-1], device=gt_cls.device) + padding_bbox = torch.zeros(bs, num_dn, 4, device=gt_bbox.device) + + map_indices = torch.cat([torch.tensor(range(num), dtype=torch.long) for num in gt_groups]) + pos_idx = torch.stack([map_indices + max_nums * i for i in range(num_group)], dim=0) + + map_indices = torch.cat([map_indices + max_nums * i for i in range(2 * num_group)]) + padding_cls[(dn_b_idx, map_indices)] = dn_cls_embed + padding_bbox[(dn_b_idx, map_indices)] = dn_bbox + + tgt_size = num_dn + 
def inverse_sigmoid(x, eps=1e-6):
    """
    Approximate the logit (inverse sigmoid) of `x`.

    The input is first limited to [0, 1]; `eps` is added to the denominator and to the ratio so
    that inputs of exactly 0 or 1 still give finite results.
    """
    bounded = x.clamp(min=0., max=1.)
    odds = bounded / (1 - bounded + eps)
    return torch.log(odds + eps)
0000000000000000000000000000000000000000..ff1f1bdd75229ecf7fa828cb3fdc694f80083c92 --- /dev/null +++ b/ultralytics/models/yolo/classify/__init__.py @@ -0,0 +1,7 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from ultralytics.models.yolo.classify.predict import ClassificationPredictor, predict +from ultralytics.models.yolo.classify.train import ClassificationTrainer, train +from ultralytics.models.yolo.classify.val import ClassificationValidator, val + +__all__ = 'ClassificationPredictor', 'predict', 'ClassificationTrainer', 'train', 'ClassificationValidator', 'val' diff --git a/ultralytics/models/yolo/classify/__pycache__/__init__.cpython-310.pyc b/ultralytics/models/yolo/classify/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfd35bcc793192f9f2d77b64e26ff9545b272c34 Binary files /dev/null and b/ultralytics/models/yolo/classify/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/classify/__pycache__/__init__.cpython-39.pyc b/ultralytics/models/yolo/classify/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8ef1e79e8baa69e3b69099cfb8d08cf54e312e6 Binary files /dev/null and b/ultralytics/models/yolo/classify/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/classify/__pycache__/predict.cpython-310.pyc b/ultralytics/models/yolo/classify/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e64ede018752ec14f138c0ee080e2184059bd0b Binary files /dev/null and b/ultralytics/models/yolo/classify/__pycache__/predict.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/classify/__pycache__/predict.cpython-39.pyc b/ultralytics/models/yolo/classify/__pycache__/predict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0516d934a89a01b4fa7cf7c3c02cca3e096dd2a Binary files /dev/null and 
b/ultralytics/models/yolo/classify/__pycache__/predict.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/classify/__pycache__/train.cpython-310.pyc b/ultralytics/models/yolo/classify/__pycache__/train.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..682e0abb91c0050c5fb9f73d2a5b412797001200 Binary files /dev/null and b/ultralytics/models/yolo/classify/__pycache__/train.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/classify/__pycache__/train.cpython-39.pyc b/ultralytics/models/yolo/classify/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7a0b1b693251f16a9d60f9e57c4018368b306d3 Binary files /dev/null and b/ultralytics/models/yolo/classify/__pycache__/train.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/classify/__pycache__/val.cpython-310.pyc b/ultralytics/models/yolo/classify/__pycache__/val.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83946ccdc8b5e3c1280ecdb66827c48977cdd03d Binary files /dev/null and b/ultralytics/models/yolo/classify/__pycache__/val.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/classify/__pycache__/val.cpython-39.pyc b/ultralytics/models/yolo/classify/__pycache__/val.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1768bd402a3319f22cafaebc515ee16d4999d108 Binary files /dev/null and b/ultralytics/models/yolo/classify/__pycache__/val.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/classify/predict.py b/ultralytics/models/yolo/classify/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..a7827fc251953bc5e06e4dc10355ad51791dfabb --- /dev/null +++ b/ultralytics/models/yolo/classify/predict.py @@ -0,0 +1,51 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import torch + +from ultralytics.engine.predictor import BasePredictor +from ultralytics.engine.results import Results +from ultralytics.utils import 
class ClassificationPredictor(BasePredictor):
    """Predictor that runs classification models and wraps their outputs in Results objects."""

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialise the base predictor, then pin the task to 'classify'."""
        super().__init__(cfg, overrides, _callbacks)
        self.args.task = 'classify'

    def preprocess(self, img):
        """Turn the input into a tensor on the model's device, cast to fp16 or fp32."""
        if isinstance(img, torch.Tensor):
            tensor = img
        else:
            # Non-tensor input: transform each image and stack into a batch.
            tensor = torch.stack([self.transforms(im) for im in img], dim=0)
        if not isinstance(tensor, torch.Tensor):
            tensor = torch.from_numpy(tensor)
        tensor = tensor.to(self.model.device)
        if self.model.fp16:
            return tensor.half()
        return tensor.float()  # uint8 to fp16/32

    def postprocess(self, preds, img, orig_imgs):
        """Build one Results object per prediction, carrying its original image and source path."""
        images_are_listed = isinstance(orig_imgs, list)
        outputs = []
        for idx, probs in enumerate(preds):
            original = orig_imgs[idx] if images_are_listed else orig_imgs
            source = self.batch[0]
            if isinstance(source, list):
                source = source[idx]
            outputs.append(Results(orig_img=original, path=source, names=self.model.names, probs=probs))
        return outputs


def predict(cfg=DEFAULT_CFG, use_python=False):
    """Run classification prediction, through the YOLO Python API or the predictor CLI path."""
    model = cfg.model or 'yolov8n-cls.pt'  # or "resnet18"
    # Source priority: explicit cfg.source, then the bundled assets directory, then a demo URL.
    if cfg.source is not None:
        source = cfg.source
    elif (ROOT / 'assets').exists():
        source = ROOT / 'assets'
    else:
        source = 'https://ultralytics.com/images/bus.jpg'

    args = dict(model=model, source=source)
    if not use_python:
        ClassificationPredictor(overrides=args).predict_cli()
    else:
        from ultralytics import YOLO
        YOLO(model)(**args)


if __name__ == '__main__':
    predict()
class ClassificationTrainer(BaseTrainer):
    """Trainer for image-classification models, built on BaseTrainer."""

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """
        Initialize a ClassificationTrainer object with optional configuration overrides and callbacks.

        The task is forced to 'classify' and 'imgsz' defaults to 224 when not given. Note that a
        caller-supplied `overrides` dict is updated in place.
        """
        if overrides is None:
            overrides = {}
        overrides['task'] = 'classify'
        if overrides.get('imgsz') is None:
            overrides['imgsz'] = 224
        super().__init__(cfg, overrides, _callbacks)

    def set_model_attributes(self):
        """Set the YOLO model's class names from the loaded dataset."""
        self.model.names = self.data['names']

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Returns a modified PyTorch model configured for training YOLO."""
        model = ClassificationModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
        if weights:
            model.load(weights)

        for m in model.modules():
            # Training from scratch: re-initialise every layer that supports it.
            if not self.args.pretrained and hasattr(m, 'reset_parameters'):
                m.reset_parameters()
            if isinstance(m, torch.nn.Dropout) and self.args.dropout:
                m.p = self.args.dropout  # set dropout
        for p in model.parameters():
            p.requires_grad = True  # for training
        return model

    def setup_model(self):
        """
        Load, create or download the model for classification training.

        `self.model` may already be an nn.Module (nothing to do), a '.pt' weights path, a
        '.yaml'/'.yml' config path, or the name of a torchvision model.

        Raises:
            FileNotFoundError: If `self.model` matches none of the supported forms.
        """
        # Classification models require special handling

        if isinstance(self.model, torch.nn.Module):  # if model is loaded beforehand. No setup needed
            return

        model = str(self.model)
        # Load a YOLO model locally, from torchvision, or from Ultralytics assets
        if model.endswith('.pt'):
            self.model, _ = attempt_load_one_weight(model, device='cpu')
            for p in self.model.parameters():
                p.requires_grad = True  # for training
        elif model.split('.')[-1] in ('yaml', 'yml'):
            self.model = self.get_model(cfg=model)
        elif model in torchvision.models.__dict__:
            self.model = torchvision.models.__dict__[model](weights='IMAGENET1K_V1' if self.args.pretrained else None)
        else:
            # The exception used to be constructed but never raised, so an unknown model name
            # fell through to reshape_outputs() with self.model still unresolved.
            raise FileNotFoundError(f'ERROR: model={model} not found locally or online. Please check model name.')
        ClassificationModel.reshape_outputs(self.model, self.data['nc'])

        return  # dont return ckpt. Classification doesn't support resume

    def build_dataset(self, img_path, mode='train', batch=None):
        """Build a ClassificationDataset rooted at `img_path`; augmentation is on only for mode='train'."""
        return ClassificationDataset(root=img_path, args=self.args, augment=mode == 'train')

    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
        """Returns PyTorch DataLoader with transforms to preprocess images for inference."""
        with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
            dataset = self.build_dataset(dataset_path, mode)

        loader = build_dataloader(dataset, batch_size, self.args.workers, rank=rank)
        # Attach inference transforms
        if mode != 'train':
            if is_parallel(self.model):
                self.model.module.transforms = loader.dataset.torch_transforms
            else:
                self.model.transforms = loader.dataset.torch_transforms
        return loader

    def preprocess_batch(self, batch):
        """Moves the batch's images and classes to the training device."""
        batch['img'] = batch['img'].to(self.device)
        batch['cls'] = batch['cls'].to(self.device)
        return batch

    def progress_string(self):
        """Returns a formatted string showing training progress."""
        return ('\n' + '%11s' * (4 + len(self.loss_names))) % \
            ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')

    def get_validator(self):
        """Returns an instance of ClassificationValidator for validation (also sets `self.loss_names`)."""
        self.loss_names = ['loss']
        return yolo.classify.ClassificationValidator(self.test_loader, self.save_dir)

    def label_loss_items(self, loss_items=None, prefix='train'):
        """
        Returns a loss dict with labelled training loss items tensor.

        When `loss_items` is None, only the list of '<prefix>/<loss name>' keys is returned.
        """
        # Not needed for classification but necessary for segmentation & detection
        keys = [f'{prefix}/{x}' for x in self.loss_names]
        if loss_items is None:
            return keys
        loss_items = [round(float(loss_items), 5)]
        return dict(zip(keys, loss_items))

    def resume_training(self, ckpt):
        """No-op: resuming from a checkpoint is not implemented for classification."""
        pass

    def plot_metrics(self):
        """Plots metrics from a CSV file."""
        plot_results(file=self.csv, classify=True, on_plot=self.on_plot)  # save results.png

    def final_eval(self):
        """Strip optimizer state from the saved last/best checkpoints and log the save directory."""
        for f in self.last, self.best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                # TODO: validate best.pt after training completes
                # if f is self.best:
                #     LOGGER.info(f'\nValidating {f}...')
                #     self.validator.args.save_json = True
                #     self.metrics = self.validator(model=f)
                #     self.metrics.pop('fitness', None)
                #     self.run_callbacks('on_fit_epoch_end')
        LOGGER.info(f"Results saved to {colorstr('bold', self.save_dir)}")

    def plot_training_samples(self, batch, ni):
        """Plots training samples with their annotations."""
        plot_images(
            images=batch['img'],
            batch_idx=torch.arange(len(batch['img'])),
            cls=batch['cls'].view(-1),  # warning: use .view(), not .squeeze() for Classify models
            fname=self.save_dir / f'train_batch{ni}.jpg',
            on_plot=self.on_plot)
class ClassificationValidator(BaseValidator):
    """Validator for classification models: top-1/top-5 accuracy plus a confusion matrix."""

    def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
        """Initialise the base validator, pin the task to 'classify' and create the metrics holder."""
        super().__init__(dataloader, save_dir, pbar, args, _callbacks)
        self.args.task = 'classify'
        self.metrics = ClassifyMetrics()

    def get_desc(self):
        """Return the header row for the printed results table."""
        header_format = '%22s' + '%11s' * 2
        return header_format % ('classes', 'top1_acc', 'top5_acc')

    def init_metrics(self, model):
        """Take class names from `model` and reset the confusion matrix and prediction/target buffers."""
        self.names = model.names
        self.nc = len(model.names)
        self.confusion_matrix = ConfusionMatrix(nc=self.nc, task='classify')
        self.pred = []
        self.targets = []

    def preprocess(self, batch):
        """Move images and labels to the validation device; images are cast to fp16 or fp32."""
        batch['img'] = batch['img'].to(self.device, non_blocking=True)
        if self.args.half:
            batch['img'] = batch['img'].half()
        else:
            batch['img'] = batch['img'].float()
        batch['cls'] = batch['cls'].to(self.device)
        return batch

    def update_metrics(self, preds, batch):
        """Record the top-k predicted class indices (k = min(#classes, 5)) and the batch targets."""
        top_k = min(len(self.model.names), 5)
        ranked = preds.argsort(1, descending=True)
        self.pred.append(ranked[:, :top_k])
        self.targets.append(batch['cls'])

    def finalize_metrics(self, *args, **kwargs):
        """Fill the confusion matrix, optionally plot it, and attach it and the speed to the metrics."""
        self.confusion_matrix.process_cls_preds(self.pred, self.targets)
        if self.args.plots:
            # One normalized and one raw-count plot.
            for normalize in (True, False):
                self.confusion_matrix.plot(save_dir=self.save_dir,
                                           names=self.names.values(),
                                           normalize=normalize,
                                           on_plot=self.on_plot)
        self.metrics.speed = self.speed
        self.metrics.confusion_matrix = self.confusion_matrix

    def get_stats(self):
        """Process the collected targets and predictions and return the metrics results dict."""
        self.metrics.process(self.targets, self.pred)
        return self.metrics.results_dict

    def build_dataset(self, img_path):
        """Build a non-augmented ClassificationDataset rooted at `img_path`."""
        return ClassificationDataset(root=img_path, args=self.args, augment=False)

    def get_dataloader(self, dataset_path, batch_size):
        """Build the dataset at `dataset_path` and wrap it in a dataloader (rank=-1)."""
        return build_dataloader(self.build_dataset(dataset_path), batch_size, self.args.workers, rank=-1)

    def print_results(self):
        """Log the overall top-1 and top-5 accuracy."""
        row_format = '%22s' + '%11.3g' * len(self.metrics.keys)  # print format
        LOGGER.info(row_format % ('all', self.metrics.top1, self.metrics.top5))

    def plot_val_samples(self, batch, ni):
        """Save a grid of validation images annotated with their ground-truth classes."""
        images = batch['img']
        plot_images(
            images=images,
            batch_idx=torch.arange(len(images)),
            cls=batch['cls'].view(-1),  # warning: use .view(), not .squeeze() for Classify models
            fname=self.save_dir / f'val_batch{ni}_labels.jpg',
            names=self.names,
            on_plot=self.on_plot)

    def plot_predictions(self, batch, preds, ni):
        """Save a grid of validation images annotated with their predicted (argmax) classes."""
        images = batch['img']
        plot_images(images,
                    batch_idx=torch.arange(len(images)),
                    cls=torch.argmax(preds, dim=1),
                    fname=self.save_dir / f'val_batch{ni}_pred.jpg',
                    names=self.names,
                    on_plot=self.on_plot)  # pred


def val(cfg=DEFAULT_CFG, use_python=False):
    """Validate a classification model, through the YOLO Python API or the validator directly."""
    model = cfg.model or 'yolov8n-cls.pt'  # or "resnet18"
    data = cfg.data or 'mnist160'

    args = dict(model=model, data=data)
    if not use_python:
        validator = ClassificationValidator(args=args)
        validator(model=args['model'])
    else:
        from ultralytics import YOLO
        YOLO(model).val(**args)


if __name__ == '__main__':
    val()
a/ultralytics/models/yolo/detect/__pycache__/predict.cpython-310.pyc b/ultralytics/models/yolo/detect/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12a46c57a53d74b8b328edceb9f7f37d7fd3d70f Binary files /dev/null and b/ultralytics/models/yolo/detect/__pycache__/predict.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/detect/__pycache__/predict.cpython-39.pyc b/ultralytics/models/yolo/detect/__pycache__/predict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28ff9b8ca7ad260235b6cdc1615c819509931cf5 Binary files /dev/null and b/ultralytics/models/yolo/detect/__pycache__/predict.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/detect/__pycache__/train.cpython-310.pyc b/ultralytics/models/yolo/detect/__pycache__/train.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85ffc6754a54c88a76e86663d27917816ba826bc Binary files /dev/null and b/ultralytics/models/yolo/detect/__pycache__/train.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/detect/__pycache__/train.cpython-39.pyc b/ultralytics/models/yolo/detect/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c25e74ca59a53d0d16efb30230ad9f45363d93f4 Binary files /dev/null and b/ultralytics/models/yolo/detect/__pycache__/train.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/detect/__pycache__/val.cpython-310.pyc b/ultralytics/models/yolo/detect/__pycache__/val.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8aca54fd56646520758a89464e6850be49858db Binary files /dev/null and b/ultralytics/models/yolo/detect/__pycache__/val.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/detect/__pycache__/val.cpython-39.pyc b/ultralytics/models/yolo/detect/__pycache__/val.cpython-39.pyc new file mode 100644 index 
# [patch metadata, verbatim] 0000000000000000000000000000000000000000..30fa1a9f7d92f684586a3c0431d1fa73cde260d3 Binary files /dev/null and b/ultralytics/models/yolo/detect/__pycache__/val.cpython-39.pyc differ
# [patch metadata, verbatim] diff --git a/ultralytics/models/yolo/detect/predict.py b/ultralytics/models/yolo/detect/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..11642fa64aed36ab6ddcb317a5ac7e9d8affaded --- /dev/null +++ b/ultralytics/models/yolo/detect/predict.py @@ -0,0 +1,48 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

import torch

from ultralytics.engine.predictor import BasePredictor
from ultralytics.engine.results import Results
from ultralytics.utils import DEFAULT_CFG, ROOT, ops


class DetectionPredictor(BasePredictor):
    """Predictor that turns raw detection-head output into ``Results`` objects."""

    def postprocess(self, preds, img, orig_imgs):
        """
        Run NMS on the raw predictions and wrap each image's detections in a ``Results``.

        Args:
            preds: Raw model output handed to ``ops.non_max_suppression``.
            img: The preprocessed input batch; only ``img.shape[2:]`` is read here.
            orig_imgs: The original image(s), either a list (one entry per prediction) or a single object.

        Returns:
            (list): One ``Results`` per entry produced by NMS.
        """
        cfg = self.args
        detections = ops.non_max_suppression(preds,
                                             cfg.conf,
                                             cfg.iou,
                                             agnostic=cfg.agnostic_nms,
                                             max_det=cfg.max_det,
                                             classes=cfg.classes)

        per_image_originals = isinstance(orig_imgs, list)
        # Boxes are only mapped back to source resolution when the originals are not already a tensor.
        rescale = not isinstance(orig_imgs, torch.Tensor)

        out = []
        for idx, det in enumerate(detections):
            if per_image_originals:
                source_img = orig_imgs[idx]
            else:
                source_img = orig_imgs
            if rescale:
                det[:, :4] = ops.scale_boxes(img.shape[2:], det[:, :4], source_img.shape)
            batch_paths = self.batch[0]
            if isinstance(batch_paths, list):
                source_path = batch_paths[idx]
            else:
                source_path = batch_paths
            out.append(Results(orig_img=source_img, path=source_path, names=self.model.names, boxes=det))
        return out


def predict(cfg=DEFAULT_CFG, use_python=False):
    """
    Run detection inference from a config object.

    Args:
        cfg: Config exposing ``model`` and ``source``; a falsy ``model`` falls back to 'yolov8n.pt'.
        use_python (bool): If True, go through the ``YOLO`` Python API; otherwise drive
            ``DetectionPredictor.predict_cli`` directly.
    """
    weights = cfg.model or 'yolov8n.pt'

    # Source precedence: explicit cfg.source, then the bundled assets directory, then a remote sample image.
    source = cfg.source
    if source is None:
        assets_dir = ROOT / 'assets'
        if assets_dir.exists():
            source = assets_dir
        else:
            source = 'https://ultralytics.com/images/bus.jpg'

    args = dict(model=weights, source=source)
    if not use_python:
        DetectionPredictor(overrides=args).predict_cli()
        return

    from ultralytics import YOLO
    YOLO(weights)(**args)


if __name__ == '__main__':
    predict()
# [patch metadata, verbatim] diff
# [patch metadata, verbatim] --git a/ultralytics/models/yolo/detect/train.py b/ultralytics/models/yolo/detect/train.py new file mode 100644 index 0000000000000000000000000000000000000000..1eb29329f643503604b9a6ff63de9386949623c8 --- /dev/null +++ b/ultralytics/models/yolo/detect/train.py @@ -0,0 +1,124 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from copy import copy

import numpy as np

from ultralytics.data import build_dataloader, build_yolo_dataset
from ultralytics.engine.trainer import BaseTrainer
from ultralytics.models import yolo
from ultralytics.nn.tasks import DetectionModel
from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK
from ultralytics.utils.plotting import plot_images, plot_labels, plot_results
from ultralytics.utils.torch_utils import de_parallel, torch_distributed_zero_first


# BaseTrainer python usage
class DetectionTrainer(BaseTrainer):
    """Trainer specialisation for the detection task."""

    def build_dataset(self, img_path, mode='train', batch=None):
        """
        Build YOLO Dataset.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        # Grid size: the model's largest stride when a model is attached, never less than 32.
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == 'val', stride=gs)

    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
        """
        Construct and return dataloader.

        Shuffling is enabled for `train` mode only and is switched off (with a warning) when the dataset
        reports `rect=True`. Non-train mode uses twice `self.args.workers`.
        """
        assert mode in ['train', 'val']
        with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
            dataset = self.build_dataset(dataset_path, mode, batch_size)
        shuffle = mode == 'train'
        if getattr(dataset, 'rect', False) and shuffle:
            LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with DataLoader shuffle, setting shuffle=False")
            shuffle = False
        workers = self.args.workers if mode == 'train' else self.args.workers * 2
        return build_dataloader(dataset, batch_size, workers, shuffle, rank)  # return dataloader

    def preprocess_batch(self, batch):
        """Preprocesses a batch of images by scaling and converting to float."""
        batch['img'] = batch['img'].to(self.device, non_blocking=True).float() / 255
        return batch

    def set_model_attributes(self):
        """Attach the class count, class names and hyperparameters from the trainer to the model."""
        # nl = de_parallel(self.model).model[-1].nl  # number of detection layers (to scale hyps)
        # self.args.box *= 3 / nl  # scale to layers
        # self.args.cls *= self.data["nc"] / 80 * 3 / nl  # scale to classes and layers
        # self.args.cls *= (self.args.imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
        self.model.nc = self.data['nc']  # attach number of classes to model
        self.model.names = self.data['names']  # attach class names to model
        self.model.args = self.args  # attach hyperparameters to model
        # TODO: self.model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Return a YOLO detection model, loading `weights` into it when they are given."""
        model = DetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
        if weights:
            model.load(weights)
        return model

    def get_validator(self):
        """Returns a DetectionValidator for YOLO model validation (also sets `self.loss_names`)."""
        self.loss_names = 'box_loss', 'cls_loss', 'dfl_loss'
        return yolo.detect.DetectionValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))

    def label_loss_items(self, loss_items=None, prefix='train'):
        """
        Returns a loss dict with labelled training loss items tensor.

        With `loss_items=None` only the list of '<prefix>/<loss name>' keys is returned.
        """
        # Not needed for classification but necessary for segmentation & detection
        keys = [f'{prefix}/{x}' for x in self.loss_names]
        if loss_items is not None:
            loss_items = [round(float(x), 5) for x in loss_items]  # convert tensors to 5 decimal place floats
            return dict(zip(keys, loss_items))
        else:
            return keys

    def progress_string(self):
        """Returns a formatted string of training progress with epoch, GPU memory, loss, instances and size."""
        return ('\n' + '%11s' *
                (4 + len(self.loss_names))) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')

    def plot_training_samples(self, batch, ni):
        """Plots training samples with their annotations and saves them as train_batch<ni>.jpg."""
        plot_images(images=batch['img'],
                    batch_idx=batch['batch_idx'],
                    cls=batch['cls'].squeeze(-1),
                    bboxes=batch['bboxes'],
                    paths=batch['im_file'],
                    fname=self.save_dir / f'train_batch{ni}.jpg',
                    on_plot=self.on_plot)

    def plot_metrics(self):
        """Plots metrics from a CSV file."""
        plot_results(file=self.csv, on_plot=self.on_plot)  # save results.png

    def plot_training_labels(self):
        """Create a labeled training plot of the YOLO model."""
        boxes = np.concatenate([lb['bboxes'] for lb in self.train_loader.dataset.labels], 0)
        cls = np.concatenate([lb['cls'] for lb in self.train_loader.dataset.labels], 0)
        plot_labels(boxes, cls.squeeze(), names=self.data['names'], save_dir=self.save_dir, on_plot=self.on_plot)


def train(cfg=DEFAULT_CFG, use_python=False):
    """
    Train and optimize YOLO model given training data and device.

    Args:
        cfg: Config exposing `model`, `data` and `device`; falsy `model`/`data` fall back to
            'yolov8n.pt' / 'coco128.yaml', and a `None` device becomes ''.
        use_python (bool): If True, train through the `YOLO` Python API; otherwise use `DetectionTrainer`.
    """
    model = cfg.model or 'yolov8n.pt'
    data = cfg.data or 'coco128.yaml'  # or yolo.ClassificationDataset("mnist")
    device = cfg.device if cfg.device is not None else ''

    args = dict(model=model, data=data, device=device)
    if use_python:
        from ultralytics import YOLO
        YOLO(model).train(**args)
    else:
        trainer = DetectionTrainer(overrides=args)
        trainer.train()


if __name__ == '__main__':
    train()

# [patch metadata, verbatim] diff --git a/ultralytics/models/yolo/detect/val.py b/ultralytics/models/yolo/detect/val.py new file mode 100644 index 0000000000000000000000000000000000000000..b9910c015ceccc319760810407fa67dffc42b703 --- /dev/null +++ b/ultralytics/models/yolo/detect/val.py @@ -0,0 +1,276 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

import os
from pathlib import Path

import numpy as np
import torch

from ultralytics.data import build_dataloader, build_yolo_dataset
from ultralytics.engine.validator import BaseValidator
from ultralytics.utils import DEFAULT_CFG, LOGGER, ops
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.metrics import ConfusionMatrix, DetMetrics, box_iou
from ultralytics.utils.plotting import output_to_target, plot_images
from ultralytics.utils.torch_utils import de_parallel


class DetectionValidator(BaseValidator):
    """Validator specialisation for the detection task."""

    def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
        """Initialize detection model with necessary variables and settings."""
        super().__init__(dataloader, save_dir, pbar, args, _callbacks)
        self.args.task = 'detect'
        self.is_coco = False
        self.class_map = None
        self.metrics = DetMetrics(save_dir=self.save_dir, on_plot=self.on_plot)
        self.iouv = torch.linspace(0.5, 0.95, 10)  # iou vector for mAP@0.5:0.95
        self.niou = self.iouv.numel()

    def preprocess(self, batch):
        """
        Preprocesses batch of images for YOLO validation.

        Moves the images and the 'batch_idx'/'cls'/'bboxes' entries to `self.device`, scales the images
        by 1/255 (half precision when `self.args.half` is set) and, when `self.args.save_hybrid` is set,
        stores the per-image [cls, bboxes] labels in `self.lb`.
        """
        batch['img'] = batch['img'].to(self.device, non_blocking=True)
        batch['img'] = (batch['img'].half() if self.args.half else batch['img'].float()) / 255
        for k in ['batch_idx', 'cls', 'bboxes']:
            batch[k] = batch[k].to(self.device)

        nb = len(batch['img'])
        self.lb = [torch.cat([batch['cls'], batch['bboxes']], dim=-1)[batch['batch_idx'] == i]
                   for i in range(nb)] if self.args.save_hybrid else []  # for autolabelling

        return batch

    def init_metrics(self, model):
        """Initialize evaluation metrics for YOLO (names, class count, confusion matrix, accumulators)."""
        val = self.data.get(self.args.split, '')  # validation path
        self.is_coco = isinstance(val, str) and 'coco' in val and val.endswith(f'{os.sep}val2017.txt')  # is COCO
        self.class_map = ops.coco80_to_coco91_class() if self.is_coco else list(range(1000))
        self.args.save_json |= self.is_coco and not self.training  # run on final val if training COCO
        self.names = model.names
        self.nc = len(model.names)
        self.metrics.names = self.names
        self.metrics.plot = self.args.plots
        self.confusion_matrix = ConfusionMatrix(nc=self.nc)
        self.seen = 0
        self.jdict = []
        self.stats = []

    def get_desc(self):
        """Return a formatted string summarizing class metrics of YOLO model."""
        return ('%22s' + '%11s' * 6) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)')

    def postprocess(self, preds):
        """Apply Non-maximum suppression to prediction outputs."""
        return ops.non_max_suppression(preds,
                                       self.args.conf,
                                       self.args.iou,
                                       labels=self.lb,
                                       multi_label=True,
                                       agnostic=self.args.single_cls,
                                       max_det=self.args.max_det)

    def update_metrics(self, preds, batch):
        """
        Accumulate per-image statistics for one batch.

        For every image this appends a (correct, conf, pred_cls, target_cls) tuple to `self.stats`,
        optionally feeds the confusion matrix, and optionally saves the predictions as JSON / txt.
        Images with neither predictions nor labels contribute nothing beyond `self.seen`.
        """
        for si, pred in enumerate(preds):
            idx = batch['batch_idx'] == si
            cls = batch['cls'][idx]
            bbox = batch['bboxes'][idx]
            nl, npr = cls.shape[0], pred.shape[0]  # number of labels, predictions
            shape = batch['ori_shape'][si]
            correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device)  # init
            self.seen += 1

            if npr == 0:
                if nl:
                    self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1)))
                    if self.args.plots:
                        self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
                continue

            # Predictions
            if self.args.single_cls:
                pred[:, 5] = 0
            predn = pred.clone()
            ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape,
                            ratio_pad=batch['ratio_pad'][si])  # native-space pred

            # Evaluate
            if nl:
                height, width = batch['img'].shape[2:]
                tbox = ops.xywh2xyxy(bbox) * torch.tensor(
                    (width, height, width, height), device=self.device)  # target boxes
                ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape,
                                ratio_pad=batch['ratio_pad'][si])  # native-space labels
                labelsn = torch.cat((cls, tbox), 1)  # native-space labels
                correct_bboxes = self._process_batch(predn, labelsn)
                # TODO: maybe remove these `self.` arguments as they already are member variable
                if self.args.plots:
                    self.confusion_matrix.process_batch(predn, labelsn)
            self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1)))  # (conf, pcls, tcls)

            # Save
            if self.args.save_json:
                self.pred_to_json(predn, batch['im_file'][si])
            if self.args.save_txt:
                file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
                self.save_one_txt(predn, self.args.save_conf, shape, file)

    def finalize_metrics(self, *args, **kwargs):
        """Set final values for metrics speed and confusion matrix."""
        self.metrics.speed = self.speed
        self.metrics.confusion_matrix = self.confusion_matrix

    def get_stats(self):
        """
        Returns metrics statistics and results dictionary.

        Also sets `self.nt_per_class` (number of targets per class). When no statistics were accumulated
        at all, `self.nt_per_class` is all zeros instead of the lookup failing on an empty list.
        """
        stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*self.stats)]  # to numpy
        if len(stats) and stats[0].any():
            self.metrics.process(*stats)
        # zip(*[]) yields nothing, so `stats` is empty when self.stats is empty; guard the [-1] lookup.
        target_cls = stats[-1].astype(int) if stats else np.zeros(0, dtype=int)
        self.nt_per_class = np.bincount(target_cls, minlength=self.nc)  # number of targets per class
        return self.metrics.results_dict

    def print_results(self):
        """Prints training/validation set metrics per class."""
        pf = '%22s' + '%11i' * 2 + '%11.3g' * len(self.metrics.keys)  # print format
        LOGGER.info(pf % ('all', self.seen, self.nt_per_class.sum(), *self.metrics.mean_results()))
        if self.nt_per_class.sum() == 0:
            LOGGER.warning(
                f'WARNING ⚠️ no labels found in {self.args.task} set, can not compute metrics without labels')

        # Print results per class
        if self.args.verbose and not self.training and self.nc > 1 and len(self.stats):
            for i, c in enumerate(self.metrics.ap_class_index):
                LOGGER.info(pf % (self.names[c], self.seen, self.nt_per_class[c], *self.metrics.class_result(i)))

        if self.args.plots:
            for normalize in True, False:
                self.confusion_matrix.plot(save_dir=self.save_dir,
                                           names=self.names.values(),
                                           normalize=normalize,
                                           on_plot=self.on_plot)

    def _process_batch(self, detections, labels):
        """
        Return correct prediction matrix
        Arguments:
            detections (array[N, 6]), x1, y1, x2, y2, conf, class
            labels (array[M, 5]), class, x1, y1, x2, y2
        Returns:
            correct (array[N, 10]), for 10 IoU levels
        """
        iou = box_iou(labels[:, 1:], detections[:, :4])
        correct = np.zeros((detections.shape[0], self.iouv.shape[0])).astype(bool)
        correct_class = labels[:, 0:1] == detections[:, 5]
        for i in range(len(self.iouv)):
            x = torch.where((iou >= self.iouv[i]) & correct_class)  # IoU > threshold and classes match
            if x[0].shape[0]:
                matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]),
                                    1).cpu().numpy()  # [label, detect, iou]
                if x[0].shape[0] > 1:
                    # Sort by IoU (descending), then keep one row per detection and one row per label.
                    matches = matches[matches[:, 2].argsort()[::-1]]
                    matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
                    # matches = matches[matches[:, 2].argsort()[::-1]]
                    matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
                correct[matches[:, 1].astype(int), i] = True
        return torch.tensor(correct, dtype=torch.bool, device=detections.device)

    def build_dataset(self, img_path, mode='val', batch=None):
        """Build YOLO Dataset

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        gs = max(int(de_parallel(self.model).stride if self.model else 0), 32)
        return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, stride=gs)

    def get_dataloader(self, dataset_path, batch_size):
        """Construct and return dataloader."""
        dataset = self.build_dataset(dataset_path, batch=batch_size, mode='val')
        return build_dataloader(dataset, batch_size, self.args.workers, shuffle=False, rank=-1)  # return dataloader

    def plot_val_samples(self, batch, ni):
        """Plot validation image samples with their labels and save them as val_batch<ni>_labels.jpg."""
        plot_images(batch['img'],
                    batch['batch_idx'],
                    batch['cls'].squeeze(-1),
                    batch['bboxes'],
                    paths=batch['im_file'],
                    fname=self.save_dir / f'val_batch{ni}_labels.jpg',
                    names=self.names,
                    on_plot=self.on_plot)

    def plot_predictions(self, batch, preds, ni):
        """Plots predicted bounding boxes on input images and saves the result."""
        plot_images(batch['img'],
                    *output_to_target(preds, max_det=self.args.max_det),
                    paths=batch['im_file'],
                    fname=self.save_dir / f'val_batch{ni}_pred.jpg',
                    names=self.names,
                    on_plot=self.on_plot)  # pred

    def save_one_txt(self, predn, save_conf, shape, file):
        """
        Save YOLO detections to a txt file in normalized coordinates in a specific format.

        Each detection is appended as one line `cls x y w h [conf]`; `conf` is written only when
        `save_conf` is truthy. Nothing is written (and no file is created) when `predn` is empty.
        """
        gn = torch.tensor(shape)[[1, 0, 1, 0]]  # normalization gain whwh
        rows = predn.tolist()
        if not rows:
            return  # keep the original behaviour of not creating an empty label file
        # Open once for the whole image instead of once per detection.
        with open(file, 'a') as f:
            for *xyxy, conf, cls in rows:
                xywh = (ops.xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                f.write(('%g ' * len(line)).rstrip() % line + '\n')

    def pred_to_json(self, predn, filename):
        """Serialize YOLO predictions to COCO json format (appended to `self.jdict`)."""
        stem = Path(filename).stem
        image_id = int(stem) if stem.isnumeric() else stem
        box = ops.xyxy2xywh(predn[:, :4])  # xywh
        box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner
        for p, b in zip(predn.tolist(), box.tolist()):
            self.jdict.append({
                'image_id': image_id,
                'category_id': self.class_map[int(p[5])],
                'bbox': [round(x, 3) for x in b],
                'score': round(p[4], 5)})

    def eval_json(self, stats):
        """
        Evaluates YOLO output in JSON format and returns performance statistics.

        Runs only when JSON saving is enabled, the dataset was detected as COCO and predictions were
        collected. Any failure inside the pycocotools evaluation is logged as a warning and `stats`
        is returned unchanged.
        """
        if self.args.save_json and self.is_coco and len(self.jdict):
            anno_json = self.data['path'] / 'annotations/instances_val2017.json'  # annotations
            pred_json = self.save_dir / 'predictions.json'  # predictions
            LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...')
            try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
                check_requirements('pycocotools>=2.0.6')
                from pycocotools.coco import COCO  # noqa
                from pycocotools.cocoeval import COCOeval  # noqa

                for x in anno_json, pred_json:
                    assert x.is_file(), f'{x} file not found'
                anno = COCO(str(anno_json))  # init annotations api
                pred = anno.loadRes(str(pred_json))  # init predictions api (must pass string, not Path)
                coco_eval = COCOeval(anno, pred, 'bbox')  # named so the builtin `eval` is not shadowed
                # self.is_coco is already guaranteed by the enclosing condition.
                coco_eval.params.imgIds = [int(Path(x).stem)
                                           for x in self.dataloader.dataset.im_files]  # images to eval
                coco_eval.evaluate()
                coco_eval.accumulate()
                coco_eval.summarize()
                stats[self.metrics.keys[-1]], stats[self.metrics.keys[-2]] = coco_eval.stats[:2]  # mAP50-95, mAP50
            except Exception as e:
                LOGGER.warning(f'pycocotools unable to run: {e}')
        return stats


def val(cfg=DEFAULT_CFG, use_python=False):
    """
    Validate trained YOLO model on validation dataset.

    Args:
        cfg: Config exposing `model` and `data`; falsy values fall back to 'yolov8n.pt' / 'coco128.yaml'.
        use_python (bool): If True, validate through the `YOLO` Python API; otherwise use `DetectionValidator`.
    """
    model = cfg.model or 'yolov8n.pt'
    data = cfg.data or 'coco128.yaml'

    args = dict(model=model, data=data)
    if use_python:
        from ultralytics import YOLO
        YOLO(model).val(**args)
    else:
        validator = DetectionValidator(args=args)
        validator(model=args['model'])


if __name__ == '__main__':
    val()
# [patch metadata, verbatim] diff --git a/ultralytics/models/yolo/model.py b/ultralytics/models/yolo/model.py new file mode 100644 index 0000000000000000000000000000000000000000..414b35a9b0ac11581708df9363e7698d3eaceec3
# [patch metadata, verbatim] --- /dev/null +++ b/ultralytics/models/yolo/model.py @@ -0,0 +1,36 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from ultralytics.engine.model import Model
from ultralytics.models import yolo  # noqa
from ultralytics.nn.tasks import ClassificationModel, DetectionModel, PoseModel, SegmentationModel


class YOLO(Model):
    """
    YOLO (You Only Look Once) object detection model.
    """

    @property
    def task_map(self):
        """
        Return the lookup table from task name to its implementation classes.

        Returns:
            (dict): Keys 'classify', 'detect', 'segment' and 'pose'; each value is a dict with the
                keys 'model', 'trainer', 'validator' and 'predictor'.
        """
        classify = dict(model=ClassificationModel,
                        trainer=yolo.classify.ClassificationTrainer,
                        validator=yolo.classify.ClassificationValidator,
                        predictor=yolo.classify.ClassificationPredictor)
        detect = dict(model=DetectionModel,
                      trainer=yolo.detect.DetectionTrainer,
                      validator=yolo.detect.DetectionValidator,
                      predictor=yolo.detect.DetectionPredictor)
        segment = dict(model=SegmentationModel,
                       trainer=yolo.segment.SegmentationTrainer,
                       validator=yolo.segment.SegmentationValidator,
                       predictor=yolo.segment.SegmentationPredictor)
        pose = dict(model=PoseModel,
                    trainer=yolo.pose.PoseTrainer,
                    validator=yolo.pose.PoseValidator,
                    predictor=yolo.pose.PosePredictor)
        return dict(classify=classify, detect=detect, segment=segment, pose=pose)


# [patch metadata, verbatim] diff --git a/ultralytics/models/yolo/pose/__init__.py b/ultralytics/models/yolo/pose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad73d1feb003678cd053adb2b0c8f258bcd81fb8 --- /dev/null +++ b/ultralytics/models/yolo/pose/__init__.py @@ -0,0 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from .predict import PosePredictor, predict
from .train import PoseTrainer, train
from .val import PoseValidator, val

__all__ = 'PoseTrainer', 'train', 'PoseValidator', 'val', 'PosePredictor', 'predict'
# [patch metadata, verbatim] diff --git a/ultralytics/models/yolo/pose/__pycache__/__init__.cpython-310.pyc b/ultralytics/models/yolo/pose/__pycache__/__init__.cpython-310.pyc new file mode 100644 index
0000000000000000000000000000000000000000..e8dbde7a257a71add6a95a7909a5b0cfc798cb73 Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/pose/__pycache__/__init__.cpython-39.pyc b/ultralytics/models/yolo/pose/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61def81f1a272cd9fd1ffe8f41897ebf9bff17a8 Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/pose/__pycache__/predict.cpython-310.pyc b/ultralytics/models/yolo/pose/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fe1e6f7e689a179cf66ae9b1eaa4b1fdf9573ef Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/predict.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/pose/__pycache__/predict.cpython-39.pyc b/ultralytics/models/yolo/pose/__pycache__/predict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..483d7eaf60ea5e51ba79f4b3a72f8e2bc90d0fcf Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/predict.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/pose/__pycache__/train.cpython-310.pyc b/ultralytics/models/yolo/pose/__pycache__/train.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7225e7ad0e5c5fb3a6a34637221757ee56465dd1 Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/train.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/pose/__pycache__/train.cpython-39.pyc b/ultralytics/models/yolo/pose/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82452bd4243132c7500332c3cc28722c9dc00414 Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/train.cpython-39.pyc differ diff --git 
a/ultralytics/models/yolo/pose/__pycache__/val.cpython-310.pyc b/ultralytics/models/yolo/pose/__pycache__/val.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1747db6244032be50ed6f16b80c4497bc761e284 Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/val.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/pose/__pycache__/val.cpython-39.pyc b/ultralytics/models/yolo/pose/__pycache__/val.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d83cba4c700596401c9c08327248db0ece9de8d1 Binary files /dev/null and b/ultralytics/models/yolo/pose/__pycache__/val.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/pose/predict.py b/ultralytics/models/yolo/pose/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..e075651413d02d566bd24158167d27d00eb5336e --- /dev/null +++ b/ultralytics/models/yolo/pose/predict.py @@ -0,0 +1,61 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from ultralytics.engine.results import Results +from ultralytics.models.yolo.detect.predict import DetectionPredictor +from ultralytics.utils import DEFAULT_CFG, LOGGER, ROOT, ops + + +class PosePredictor(DetectionPredictor): + + def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None): + super().__init__(cfg, overrides, _callbacks) + self.args.task = 'pose' + if isinstance(self.args.device, str) and self.args.device.lower() == 'mps': + LOGGER.warning("WARNING ⚠️ Apple MPS known Pose bug. Recommend 'device=cpu' for Pose models. 
" + 'See https://github.com/ultralytics/ultralytics/issues/4031.') + + def postprocess(self, preds, img, orig_imgs): + """Return detection results for a given input image or list of images.""" + preds = ops.non_max_suppression(preds, + self.args.conf, + self.args.iou, + agnostic=self.args.agnostic_nms, + max_det=self.args.max_det, + classes=self.args.classes, + nc=len(self.model.names)) + + results = [] + for i, pred in enumerate(preds): + orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs + shape = orig_img.shape + pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], shape).round() + pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:] + pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, shape) + path = self.batch[0] + img_path = path[i] if isinstance(path, list) else path + results.append( + Results(orig_img=orig_img, + path=img_path, + names=self.model.names, + boxes=pred[:, :6], + keypoints=pred_kpts)) + return results + + +def predict(cfg=DEFAULT_CFG, use_python=False): + """Runs YOLO to predict objects in an image or video.""" + model = cfg.model or 'yolov8n-pose.pt' + source = cfg.source if cfg.source is not None else ROOT / 'assets' if (ROOT / 'assets').exists() \ + else 'https://ultralytics.com/images/bus.jpg' + + args = dict(model=model, source=source) + if use_python: + from ultralytics import YOLO + YOLO(model)(**args) + else: + predictor = PosePredictor(overrides=args) + predictor.predict_cli() + + +if __name__ == '__main__': + predict() diff --git a/ultralytics/models/yolo/pose/train.py b/ultralytics/models/yolo/pose/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5d9fab526c890dbab9cdb153daa0e2da0a341d4a --- /dev/null +++ b/ultralytics/models/yolo/pose/train.py @@ -0,0 +1,81 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from copy import copy + +from ultralytics.models import yolo +from ultralytics.nn.tasks import PoseModel +from ultralytics.utils import 
DEFAULT_CFG, LOGGER +from ultralytics.utils.plotting import plot_images, plot_results + + +# BaseTrainer python usage +class PoseTrainer(yolo.detect.DetectionTrainer): + + def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None): + """Initialize a PoseTrainer object with specified configurations and overrides.""" + if overrides is None: + overrides = {} + overrides['task'] = 'pose' + super().__init__(cfg, overrides, _callbacks) + + if isinstance(self.args.device, str) and self.args.device.lower() == 'mps': + LOGGER.warning("WARNING ⚠️ Apple MPS known Pose bug. Recommend 'device=cpu' for Pose models. " + 'See https://github.com/ultralytics/ultralytics/issues/4031.') + + def get_model(self, cfg=None, weights=None, verbose=True): + """Get pose estimation model with specified configuration and weights.""" + model = PoseModel(cfg, ch=3, nc=self.data['nc'], data_kpt_shape=self.data['kpt_shape'], verbose=verbose) + if weights: + model.load(weights) + + return model + + def set_model_attributes(self): + """Sets keypoints shape attribute of PoseModel.""" + super().set_model_attributes() + self.model.kpt_shape = self.data['kpt_shape'] + + def get_validator(self): + """Returns an instance of the PoseValidator class for validation.""" + self.loss_names = 'box_loss', 'pose_loss', 'kobj_loss', 'cls_loss', 'dfl_loss' + return yolo.pose.PoseValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args)) + + def plot_training_samples(self, batch, ni): + """Plot a batch of training samples with annotated class labels, bounding boxes, and keypoints.""" + images = batch['img'] + kpts = batch['keypoints'] + cls = batch['cls'].squeeze(-1) + bboxes = batch['bboxes'] + paths = batch['im_file'] + batch_idx = batch['batch_idx'] + plot_images(images, + batch_idx, + cls, + bboxes, + kpts=kpts, + paths=paths, + fname=self.save_dir / f'train_batch{ni}.jpg', + on_plot=self.on_plot) + + def plot_metrics(self): + """Plots training/val metrics.""" + plot_results(file=self.csv, 
pose=True, on_plot=self.on_plot) # save results.png + + +def train(cfg=DEFAULT_CFG, use_python=False): + """Train the YOLO model on the given data and device.""" + model = cfg.model or 'yolov8n-pose.yaml' + data = cfg.data or 'coco8-pose.yaml' + device = cfg.device if cfg.device is not None else '' + + args = dict(model=model, data=data, device=device) + if use_python: + from ultralytics import YOLO + YOLO(model).train(**args) + else: + trainer = PoseTrainer(overrides=args) + trainer.train() + + +if __name__ == '__main__': + train() diff --git a/ultralytics/models/yolo/pose/val.py b/ultralytics/models/yolo/pose/val.py new file mode 100644 index 0000000000000000000000000000000000000000..e57f0e3034dac7e44b4490badf67a393f12c65e4 --- /dev/null +++ b/ultralytics/models/yolo/pose/val.py @@ -0,0 +1,227 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from pathlib import Path + +import numpy as np +import torch + +from ultralytics.models.yolo.detect import DetectionValidator +from ultralytics.utils import DEFAULT_CFG, LOGGER, ops +from ultralytics.utils.checks import check_requirements +from ultralytics.utils.metrics import OKS_SIGMA, PoseMetrics, box_iou, kpt_iou +from ultralytics.utils.plotting import output_to_target, plot_images + + +class PoseValidator(DetectionValidator): + + def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None): + """Initialize a 'PoseValidator' object with custom parameters and assigned attributes.""" + super().__init__(dataloader, save_dir, pbar, args, _callbacks) + self.args.task = 'pose' + self.metrics = PoseMetrics(save_dir=self.save_dir, on_plot=self.on_plot) + if isinstance(self.args.device, str) and self.args.device.lower() == 'mps': + LOGGER.warning("WARNING ⚠️ Apple MPS known Pose bug. Recommend 'device=cpu' for Pose models. 
" + 'See https://github.com/ultralytics/ultralytics/issues/4031.') + + def preprocess(self, batch): + """Preprocesses the batch by converting the 'keypoints' data into a float and moving it to the device.""" + batch = super().preprocess(batch) + batch['keypoints'] = batch['keypoints'].to(self.device).float() + return batch + + def get_desc(self): + """Returns description of evaluation metrics in string format.""" + return ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Pose(P', + 'R', 'mAP50', 'mAP50-95)') + + def postprocess(self, preds): + """Apply non-maximum suppression and return detections with high confidence scores.""" + return ops.non_max_suppression(preds, + self.args.conf, + self.args.iou, + labels=self.lb, + multi_label=True, + agnostic=self.args.single_cls, + max_det=self.args.max_det, + nc=self.nc) + + def init_metrics(self, model): + """Initiate pose estimation metrics for YOLO model.""" + super().init_metrics(model) + self.kpt_shape = self.data['kpt_shape'] + is_pose = self.kpt_shape == [17, 3] + nkpt = self.kpt_shape[0] + self.sigma = OKS_SIGMA if is_pose else np.ones(nkpt) / nkpt + + def update_metrics(self, preds, batch): + """Metrics.""" + for si, pred in enumerate(preds): + idx = batch['batch_idx'] == si + cls = batch['cls'][idx] + bbox = batch['bboxes'][idx] + kpts = batch['keypoints'][idx] + nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions + nk = kpts.shape[1] # number of keypoints + shape = batch['ori_shape'][si] + correct_kpts = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init + correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init + self.seen += 1 + + if npr == 0: + if nl: + self.stats.append((correct_bboxes, correct_kpts, *torch.zeros( + (2, 0), device=self.device), cls.squeeze(-1))) + if self.args.plots: + self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1)) + continue + + # Predictions + 
def _process_batch(self, detections, labels, pred_kpts=None, gt_kpts=None):
    """
    Return the correct-prediction matrix for one image.

    When both `pred_kpts` and `gt_kpts` are given, labels and detections are compared with `kpt_iou`
    (keypoint similarity); otherwise they are compared with `box_iou` on the box coordinates.

    Arguments:
        detections (array[N, 6]), x1, y1, x2, y2, conf, class
        labels (array[M, 5]), class, x1, y1, x2, y2
        pred_kpts (optional), predicted keypoints, one entry per detection; `update_metrics` passes
            `predn[:, 6:].view(npr, nk, -1)`
        gt_kpts (optional), ground-truth keypoints; `update_metrics` passes the keypoints selected with the
            same index as the labels, i.e. one entry per label (not per detection)
    Returns:
        correct (bool tensor[N, len(self.iouv)]), entry [n, i] is True when detection n was matched to a
            label of the same class at threshold `self.iouv[i]`; created on `detections.device`
    """
    if pred_kpts is not None and gt_kpts is not None:
        # `0.53` is from https://github.com/jin-s13/xtcocoapi/blob/master/xtcocotools/cocoeval.py#L384
        # Per-label area: product of the last two xywh columns (width * height) of the label box, scaled by 0.53
        area = ops.xyxy2xywh(labels[:, 1:])[:, 2:].prod(1) * 0.53
        iou = kpt_iou(gt_kpts, pred_kpts, sigma=self.sigma, area=area)
    else:  # boxes
        iou = box_iou(labels[:, 1:], detections[:, :4])

    # One row per detection, one column per IoU threshold; filled in below
    correct = np.zeros((detections.shape[0], self.iouv.shape[0])).astype(bool)
    # (M, 1) label classes against (N,) detection classes broadcasts to an (M, N) label-by-detection mask
    correct_class = labels[:, 0:1] == detections[:, 5]
    for i in range(len(self.iouv)):
        x = torch.where((iou >= self.iouv[i]) & correct_class)  # IoU >= threshold and classes match
        if x[0].shape[0]:
            matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]),
                                1).cpu().numpy()  # [label, detect, iou]
            if x[0].shape[0] > 1:
                # Sort candidate pairs by IoU, highest first
                matches = matches[matches[:, 2].argsort()[::-1]]
                # Keep one pair per detection: np.unique returns the first (highest-IoU) row of each detection
                matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
                # matches = matches[matches[:, 2].argsort()[::-1]]
                # Keep one pair per label.
                # NOTE(review): the rows are now ordered by detection index rather than IoU (the re-sort above
                # is commented out), so the pair kept for a label is its lowest-indexed detection - confirm
                # this is intended.
                matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
            correct[matches[:, 1].astype(int), i] = True
    return torch.tensor(correct, dtype=torch.bool, device=detections.device)
def eval_json(self, stats):
    """Score the saved predictions with pycocotools and copy the resulting mAP values into `stats`.

    The evaluation only runs when `self.args.save_json` is set, `self.is_coco` is true and `self.jdict`
    is non-empty; otherwise `stats` is returned untouched. Two evaluations are run in order, 'bbox' then
    'keypoints', and the first two summary values of each are stored under the matching
    `self.metrics.keys` entries. Any exception raised while evaluating is logged as a warning instead
    of being propagated.

    Args:
        stats (dict): Metrics dictionary to update in place.

    Returns:
        (dict): The same `stats` object.
    """
    if not (self.args.save_json and self.is_coco and len(self.jdict)):
        return stats

    anno_json = self.data['path'] / 'annotations/person_keypoints_val2017.json'  # ground-truth annotations
    pred_json = self.save_dir / 'predictions.json'  # predictions to score
    LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...')
    try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
        check_requirements('pycocotools>=2.0.6')
        from pycocotools.coco import COCO  # noqa
        from pycocotools.cocoeval import COCOeval  # noqa

        for json_file in (anno_json, pred_json):
            assert json_file.is_file(), f'{json_file} file not found'
        anno = COCO(str(anno_json))  # annotations api
        pred = anno.loadRes(str(pred_json))  # predictions api (must pass string, not Path)
        evaluators = [COCOeval(anno, pred, 'bbox'), COCOeval(anno, pred, 'keypoints')]
        for i, coco_eval in enumerate(evaluators):
            if self.is_coco:
                # Restrict the evaluation to the images in the validation dataloader
                coco_eval.params.imgIds = [int(Path(f).stem) for f in self.dataloader.dataset.im_files]
            coco_eval.evaluate()
            coco_eval.accumulate()
            coco_eval.summarize()
            idx = i * 4 + 2  # position of this evaluation's mAP50 key; mAP50-95 follows it
            map50_95, map50 = coco_eval.stats[:2]
            stats[self.metrics.keys[idx + 1]] = map50_95
            stats[self.metrics.keys[idx]] = map50
    except Exception as e:
        LOGGER.warning(f'pycocotools unable to run: {e}')
    return stats
a/ultralytics/models/yolo/segment/__init__.py b/ultralytics/models/yolo/segment/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d68ceac6f7a33b87cb3c844b9aaa0759532a6edc --- /dev/null +++ b/ultralytics/models/yolo/segment/__init__.py @@ -0,0 +1,7 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .predict import SegmentationPredictor, predict +from .train import SegmentationTrainer, train +from .val import SegmentationValidator, val + +__all__ = 'SegmentationPredictor', 'predict', 'SegmentationTrainer', 'train', 'SegmentationValidator', 'val' diff --git a/ultralytics/models/yolo/segment/__pycache__/__init__.cpython-310.pyc b/ultralytics/models/yolo/segment/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26daa502fbc2c7507d2aabf6bab5fb2caa6676b4 Binary files /dev/null and b/ultralytics/models/yolo/segment/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/segment/__pycache__/__init__.cpython-39.pyc b/ultralytics/models/yolo/segment/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a116533fa089c706d192c3dcca7d72d3ee46f836 Binary files /dev/null and b/ultralytics/models/yolo/segment/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/segment/__pycache__/predict.cpython-310.pyc b/ultralytics/models/yolo/segment/__pycache__/predict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f230a13d1ea46a8f39e6dfe61596dd806a9a10b Binary files /dev/null and b/ultralytics/models/yolo/segment/__pycache__/predict.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/segment/__pycache__/predict.cpython-39.pyc b/ultralytics/models/yolo/segment/__pycache__/predict.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..794498f1a474edcdc7a0d73952d470f5436c15b9 Binary files /dev/null and 
b/ultralytics/models/yolo/segment/__pycache__/predict.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/segment/__pycache__/train.cpython-310.pyc b/ultralytics/models/yolo/segment/__pycache__/train.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe0af72a01542d5219e4f0e2e86a1bb7b7e13e80 Binary files /dev/null and b/ultralytics/models/yolo/segment/__pycache__/train.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/segment/__pycache__/train.cpython-39.pyc b/ultralytics/models/yolo/segment/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac2e7c04439d2c8040ba9803dfb0817add2e4757 Binary files /dev/null and b/ultralytics/models/yolo/segment/__pycache__/train.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/segment/__pycache__/val.cpython-310.pyc b/ultralytics/models/yolo/segment/__pycache__/val.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d842dc8e0b5b7b7b84abcd6eef0026b3515086c Binary files /dev/null and b/ultralytics/models/yolo/segment/__pycache__/val.cpython-310.pyc differ diff --git a/ultralytics/models/yolo/segment/__pycache__/val.cpython-39.pyc b/ultralytics/models/yolo/segment/__pycache__/val.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f8eff6c7148884a881088159ad8165712ab4a0d Binary files /dev/null and b/ultralytics/models/yolo/segment/__pycache__/val.cpython-39.pyc differ diff --git a/ultralytics/models/yolo/segment/predict.py b/ultralytics/models/yolo/segment/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..1007c3f309ba6b7dd56e10c9c01e7f73abbab96c --- /dev/null +++ b/ultralytics/models/yolo/segment/predict.py @@ -0,0 +1,63 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import torch + +from ultralytics.engine.results import Results +from ultralytics.models.yolo.detect.predict import DetectionPredictor +from ultralytics.utils import 
class SegmentationPredictor(DetectionPredictor):
    """Detection predictor variant that also produces instance masks for each detection."""

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialize the parent predictor, then mark the task as 'segment'."""
        super().__init__(cfg, overrides, _callbacks)
        self.args.task = 'segment'

    def postprocess(self, preds, img, orig_imgs):
        """Run NMS on the raw predictions and build one `Results` object per image.

        NMS is called with the configured confidence, IoU, class-agnostic, max-detection and class-filter
        settings. For each image with detections, masks are built from the mask prototypes and columns 6+
        of the detections, and boxes are rescaled to the original image unless `orig_imgs` is a tensor.

        Args:
            preds: Model output; `preds[0]` goes to NMS and `preds[1]` carries the mask prototypes.
            img: Input batch; `img.shape[2:]` is used as the inference image size.
            orig_imgs: Original image(s): a list indexed per image, or a single object shared by all.

        Returns:
            (list): One `Results` per image; images without detections get no masks.
        """
        names = self.model.names
        detections = ops.non_max_suppression(preds[0],
                                             self.args.conf,
                                             self.args.iou,
                                             agnostic=self.args.agnostic_nms,
                                             max_det=self.args.max_det,
                                             nc=len(names),
                                             classes=self.args.classes)
        extra = preds[1]
        proto = extra[-1] if len(extra) == 3 else extra  # second output is len 3 if pt, but only 1 if exported
        per_image_originals = isinstance(orig_imgs, list)
        rescale_boxes = not isinstance(orig_imgs, torch.Tensor)

        results = []
        for i, det in enumerate(detections):
            orig_img = orig_imgs[i] if per_image_originals else orig_imgs
            path = self.batch[0]
            img_path = path[i] if isinstance(path, list) else path

            if len(det) == 0:  # keep an entry with empty boxes
                results.append(Results(orig_img=orig_img, path=img_path, names=names, boxes=det[:, :6]))
                continue

            if self.args.retina_masks:
                # Boxes are moved to original-image coordinates first, then masks are built at that size
                if rescale_boxes:
                    det[:, :4] = ops.scale_boxes(img.shape[2:], det[:, :4], orig_img.shape)
                masks = ops.process_mask_native(proto[i], det[:, 6:], det[:, :4], orig_img.shape[:2])  # HWC
            else:
                # Masks are built at the inference size first, then boxes are rescaled
                masks = ops.process_mask(proto[i], det[:, 6:], det[:, :4], img.shape[2:], upsample=True)  # HWC
                if rescale_boxes:
                    det[:, :4] = ops.scale_boxes(img.shape[2:], det[:, :4], orig_img.shape)

            results.append(Results(orig_img=orig_img, path=img_path, names=names, boxes=det[:, :6], masks=masks))
        return results
class SegmentationTrainer(yolo.detect.DetectionTrainer):
    """Detection trainer variant configured for segmentation models."""

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Set the task to 'segment' and initialize the parent trainer.

        When a dict is passed as `overrides`, its 'task' entry is set on that same dict.
        """
        overrides = {} if overrides is None else overrides
        overrides['task'] = 'segment'
        super().__init__(cfg, overrides, _callbacks)

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Build a SegmentationModel for the dataset's class count and optionally load weights.

        Verbose output is only enabled when `verbose` is true and RANK is -1.
        """
        seg_model = SegmentationModel(cfg, ch=3, nc=self.data['nc'], verbose=(verbose and RANK == -1))
        if weights:
            seg_model.load(weights)
        return seg_model

    def get_validator(self):
        """Set the segmentation loss names and return a SegmentationValidator for the test loader."""
        self.loss_names = ('box_loss', 'seg_loss', 'cls_loss', 'dfl_loss')
        return yolo.segment.SegmentationValidator(
            self.test_loader,
            save_dir=self.save_dir,
            args=copy(self.args),
        )

    def plot_training_samples(self, batch, ni):
        """Save an image grid of a training batch with its classes, boxes and masks.

        The file is written to `train_batch{ni}.jpg` under the save directory.
        """
        images, batch_idx = batch['img'], batch['batch_idx']
        classes = batch['cls'].squeeze(-1)
        boxes, masks = batch['bboxes'], batch['masks']
        plot_images(images,
                    batch_idx,
                    classes,
                    boxes,
                    masks,
                    paths=batch['im_file'],
                    fname=self.save_dir / f'train_batch{ni}.jpg',
                    on_plot=self.on_plot)

    def plot_metrics(self):
        """Plot the training/validation metrics recorded in the results CSV."""
        plot_results(file=self.csv, segment=True, on_plot=self.on_plot)  # save results.png
def get_desc(self):
    """Build the header row shown above the segmentation validation results table.

    The 'Class' column is right-aligned in 22 characters; the ten columns after it (box metrics
    followed by mask metrics) are each right-aligned in 11 characters.

    Returns:
        (str): The formatted header line.
    """
    columns = ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P', 'R', 'mAP50',
               'mAP50-95)')
    widths = (22,) + (11,) * 10
    return ''.join(f'{title:>{width}}' for title, width in zip(columns, widths))
def _process_batch(self, detections, labels, pred_masks=None, gt_masks=None, overlap=False, masks=False):
    """
    Return the correct-prediction matrix for one image.

    With `masks=True` labels and detections are compared with `mask_iou` on flattened masks;
    otherwise they are compared with `box_iou` on the box coordinates.

    Arguments:
        detections (array[N, 6+]), x1, y1, x2, y2, conf, class; only columns 0-3 and 5 are read here
        labels (array[M, 5]), class, x1, y1, x2, y2
        pred_masks (optional), predicted masks, one per detection; only used when `masks` is True
        gt_masks (optional), ground-truth masks; only used when `masks` is True
        overlap (bool), when True `gt_masks` is treated as an index map in which value k + 1 marks the
            pixels of label k, and it is expanded into one binary mask per label
        masks (bool), select mask IoU (True) or box IoU (False)
    Returns:
        correct (bool tensor[N, len(self.iouv)]), entry [n, i] is True when detection n was matched to a
            label of the same class at threshold `self.iouv[i]`; created on `detections.device`
    """
    if masks:
        if overlap:
            nl = len(labels)
            # Label ids 1..nl shaped (nl, 1, 1) so each repeated copy is compared against its own id
            index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1
            gt_masks = gt_masks.repeat(nl, 1, 1)  # shape(1,640,640) -> (n,640,640)
            gt_masks = torch.where(gt_masks == index, 1.0, 0.0)
        if gt_masks.shape[1:] != pred_masks.shape[1:]:
            # Resize ground truth to the prediction resolution, then re-binarize in place at 0.5
            gt_masks = F.interpolate(gt_masks[None], pred_masks.shape[1:], mode='bilinear', align_corners=False)[0]
            gt_masks = gt_masks.gt_(0.5)
        # Flatten every mask to one row before computing the pairwise IoU
        iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1))
    else:  # boxes
        iou = box_iou(labels[:, 1:], detections[:, :4])

    # One row per detection, one column per IoU threshold; filled in below
    correct = np.zeros((detections.shape[0], self.iouv.shape[0])).astype(bool)
    # (M, 1) label classes against (N,) detection classes broadcasts to an (M, N) label-by-detection mask
    correct_class = labels[:, 0:1] == detections[:, 5]
    for i in range(len(self.iouv)):
        x = torch.where((iou >= self.iouv[i]) & correct_class)  # IoU >= threshold and classes match
        if x[0].shape[0]:
            matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]),
                                1).cpu().numpy()  # [label, detect, iou]
            if x[0].shape[0] > 1:
                # Sort candidate pairs by IoU, highest first
                matches = matches[matches[:, 2].argsort()[::-1]]
                # Keep one pair per detection: np.unique returns the first (highest-IoU) row of each detection
                matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
                # matches = matches[matches[:, 2].argsort()[::-1]]
                # Keep one pair per label.
                # NOTE(review): the rows are now ordered by detection index rather than IoU (the re-sort above
                # is commented out), so the pair kept for a label is its lowest-indexed detection - confirm
                # this is intended.
                matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
            correct[matches[:, 1].astype(int), i] = True
    return torch.tensor(correct, dtype=torch.bool, device=detections.device)
def pred_to_json(self, predn, filename, pred_masks):
    """Append one COCO-style result dict per prediction of an image to `self.jdict`.

    Each entry holds the image id, the mapped category id, the box as top-left x, y, width, height
    (rounded to 3 decimals), the score (rounded to 5 decimals) and the RLE-encoded mask.

    Args:
        predn: Predictions with x1, y1, x2, y2 in columns 0-3, score in column 4 and class in column 5.
        filename: Image path; its stem becomes the image id (as an int when it is numeric).
        pred_masks: Mask array with the mask index on the last axis (it is moved to the first axis here).
    """
    # Example result = {"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}
    from pycocotools.mask import encode  # noqa

    def mask_to_rle(mask):
        """Encode a single 2-D mask as RLE and return it with 'counts' decoded to a str."""
        fortran_mask = np.asarray(mask[:, :, None], order='F', dtype='uint8')
        encoded = encode(fortran_mask)[0]
        encoded['counts'] = encoded['counts'].decode('utf-8')
        return encoded

    stem = Path(filename).stem
    if stem.isnumeric():
        image_id = int(stem)
    else:
        image_id = stem

    box = ops.xyxy2xywh(predn[:, :4])  # xywh
    box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner

    pred_masks = np.transpose(pred_masks, (2, 0, 1))
    with ThreadPool(NUM_THREADS) as pool:
        rles = pool.map(mask_to_rle, pred_masks)

    for n, (row, xywh) in enumerate(zip(predn.tolist(), box.tolist())):
        entry = {
            'image_id': image_id,
            'category_id': self.class_map[int(row[5])],
            'bbox': [round(v, 3) for v in xywh],
            'score': round(row[4], 5),
            'segmentation': rles[n]}
        self.jdict.append(entry)
def val(cfg=DEFAULT_CFG, use_python=False):
    """Validate a YOLO segmentation model.

    The model falls back to 'yolov8n-seg.pt' and the dataset to 'coco128-seg.yaml' when the
    corresponding `cfg` fields are empty.

    Args:
        cfg: Configuration object providing `model` and `data`.
        use_python (bool): When True, validate through `YOLO(model).val(...)`; otherwise construct
            and call a SegmentationValidator directly.
    """
    args = {'model': cfg.model or 'yolov8n-seg.pt', 'data': cfg.data or 'coco128-seg.yaml'}

    if not use_python:
        validator = SegmentationValidator(args=args)
        validator(model=args['model'])
        return

    from ultralytics import YOLO
    YOLO(args['model']).val(**args)
'ClassificationModel', + 'BaseModel') diff --git a/ultralytics/nn/__pycache__/__init__.cpython-310.pyc b/ultralytics/nn/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66675c69b28f9b3f22088726ba6f72a66aa06290 Binary files /dev/null and b/ultralytics/nn/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/nn/__pycache__/__init__.cpython-39.pyc b/ultralytics/nn/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cae250fecda6b0e0168d1eef74820dab07a1c55 Binary files /dev/null and b/ultralytics/nn/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/nn/__pycache__/autobackend.cpython-310.pyc b/ultralytics/nn/__pycache__/autobackend.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4324842e92176327a894c79ce1a831ca383b7fd4 Binary files /dev/null and b/ultralytics/nn/__pycache__/autobackend.cpython-310.pyc differ diff --git a/ultralytics/nn/__pycache__/autobackend.cpython-39.pyc b/ultralytics/nn/__pycache__/autobackend.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0ab5826846383c7155d2c5282f79fe586af7275 Binary files /dev/null and b/ultralytics/nn/__pycache__/autobackend.cpython-39.pyc differ diff --git a/ultralytics/nn/__pycache__/tasks.cpython-310.pyc b/ultralytics/nn/__pycache__/tasks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c794e5c1a61cb3b8b14217f442f0f9fb5fec4a6 Binary files /dev/null and b/ultralytics/nn/__pycache__/tasks.cpython-310.pyc differ diff --git a/ultralytics/nn/__pycache__/tasks.cpython-39.pyc b/ultralytics/nn/__pycache__/tasks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7bdd862ba75950d90cd9fbc00b48f2524c6fa7b Binary files /dev/null and b/ultralytics/nn/__pycache__/tasks.cpython-39.pyc differ diff --git a/ultralytics/nn/autobackend.py 
def check_class_names(names):
    """Normalize class names to a `{int index: str name}` dict and validate the indices.

    Lists are converted with `enumerate`. Dict keys are converted to int and values to str
    (e.g. '0' -> 0, True -> 'True'). If the first name looks like an ImageNet class code
    (starts with 'n0', e.g. 'n01440764'), all codes are mapped to human-readable names using
    the 'map' entry of cfg/datasets/ImageNet.yaml. Any other input type is returned unchanged.

    Args:
        names (list | dict | Any): Class names, as a list or as an index -> name mapping.

    Returns:
        (dict | Any): The normalized mapping; an empty list or dict yields an empty dict.

    Raises:
        KeyError: If the indices are not all within 0..n-1 for an n-class mapping.
    """
    if isinstance(names, list):  # names is a list
        names = dict(enumerate(names))  # convert to dict
    if isinstance(names, dict):
        # Convert 1) string keys to int, i.e. '0' to 0, and non-string values to strings, i.e. True to 'True'
        names = {int(k): str(v) for k, v in names.items()}
        if not names:  # nothing to validate; avoids min()/max() failing on an empty mapping
            return names
        n = len(names)
        # n distinct integer keys that all lie in [0, n-1] are exactly 0..n-1, so names[0] exists below
        if min(names) < 0 or max(names) >= n:
            raise KeyError(f'{n}-class dataset requires class indices 0-{n - 1}, but you have invalid class indices '
                           f'{min(names.keys())}-{max(names.keys())} defined in your dataset YAML.')
        if names[0].startswith('n0'):  # imagenet class codes, i.e. 'n01440764'
            code_to_name = yaml_load(ROOT / 'cfg/datasets/ImageNet.yaml')['map']  # human-readable names
            names = {k: code_to_name[v] for k, v in names.items()}
    return names
and device.type != 'cpu' # use CUDA + if cuda and not any([nn_module, pt, jit, engine]): # GPU dataloader formats + device = torch.device('cpu') + cuda = False + + # Download if not local + if not (pt or triton or nn_module): + w = attempt_download_asset(w) + + # Load model + if nn_module: # in-memory PyTorch model + model = weights.to(device) + model = model.fuse(verbose=verbose) if fuse else model + if hasattr(model, 'kpt_shape'): + kpt_shape = model.kpt_shape # pose-only + stride = max(int(model.stride.max()), 32) # model stride + names = model.module.names if hasattr(model, 'module') else model.names # get class names + model.half() if fp16 else model.float() + self.model = model # explicitly assign for to(), cpu(), cuda(), half() + pt = True + elif pt: # PyTorch + from ultralytics.nn.tasks import attempt_load_weights + model = attempt_load_weights(weights if isinstance(weights, list) else w, + device=device, + inplace=True, + fuse=fuse) + if hasattr(model, 'kpt_shape'): + kpt_shape = model.kpt_shape # pose-only + stride = max(int(model.stride.max()), 32) # model stride + names = model.module.names if hasattr(model, 'module') else model.names # get class names + model.half() if fp16 else model.float() + self.model = model # explicitly assign for to(), cpu(), cuda(), half() + elif jit: # TorchScript + LOGGER.info(f'Loading {w} for TorchScript inference...') + extra_files = {'config.txt': ''} # model metadata + model = torch.jit.load(w, _extra_files=extra_files, map_location=device) + model.half() if fp16 else model.float() + if extra_files['config.txt']: # load metadata dict + metadata = json.loads(extra_files['config.txt'], object_hook=lambda x: dict(x.items())) + elif dnn: # ONNX OpenCV DNN + LOGGER.info(f'Loading {w} for ONNX OpenCV DNN inference...') + check_requirements('opencv-python>=4.5.4') + net = cv2.dnn.readNetFromONNX(w) + elif onnx: # ONNX Runtime + LOGGER.info(f'Loading {w} for ONNX Runtime inference...') + check_requirements(('onnx', 
'onnxruntime-gpu' if cuda else 'onnxruntime')) + import onnxruntime + providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider'] + session = onnxruntime.InferenceSession(w, providers=providers) + output_names = [x.name for x in session.get_outputs()] + metadata = session.get_modelmeta().custom_metadata_map # metadata + elif xml: # OpenVINO + LOGGER.info(f'Loading {w} for OpenVINO inference...') + check_requirements('openvino>=2023.0') # requires openvino-dev: https://pypi.org/project/openvino-dev/ + from openvino.runtime import Core, Layout, get_batch # noqa + core = Core() + w = Path(w) + if not w.is_file(): # if not *.xml + w = next(w.glob('*.xml')) # get *.xml file from *_openvino_model dir + ov_model = core.read_model(model=str(w), weights=w.with_suffix('.bin')) + if ov_model.get_parameters()[0].get_layout().empty: + ov_model.get_parameters()[0].set_layout(Layout('NCHW')) + batch_dim = get_batch(ov_model) + if batch_dim.is_static: + batch_size = batch_dim.get_length() + ov_compiled_model = core.compile_model(ov_model, device_name='AUTO') # AUTO selects best available device + metadata = w.parent / 'metadata.yaml' + elif engine: # TensorRT + LOGGER.info(f'Loading {w} for TensorRT inference...') + try: + import tensorrt as trt # noqa https://developer.nvidia.com/nvidia-tensorrt-download + except ImportError: + if LINUX: + check_requirements('nvidia-tensorrt', cmds='-U --index-url https://pypi.ngc.nvidia.com') + import tensorrt as trt # noqa + check_version(trt.__version__, '7.0.0', hard=True) # require tensorrt>=7.0.0 + if device.type == 'cpu': + device = torch.device('cuda:0') + Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) + logger = trt.Logger(trt.Logger.INFO) + # Read file + with open(w, 'rb') as f, trt.Runtime(logger) as runtime: + meta_len = int.from_bytes(f.read(4), byteorder='little') # read metadata length + metadata = json.loads(f.read(meta_len).decode('utf-8')) # read metadata + 
model = runtime.deserialize_cuda_engine(f.read()) # read engine + context = model.create_execution_context() + bindings = OrderedDict() + output_names = [] + fp16 = False # default updated below + dynamic = False + for i in range(model.num_bindings): + name = model.get_binding_name(i) + dtype = trt.nptype(model.get_binding_dtype(i)) + if model.binding_is_input(i): + if -1 in tuple(model.get_binding_shape(i)): # dynamic + dynamic = True + context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2])) + if dtype == np.float16: + fp16 = True + else: # output + output_names.append(name) + shape = tuple(context.get_binding_shape(i)) + im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr())) + binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items()) + batch_size = bindings['images'].shape[0] # if dynamic, this is instead max batch size + elif coreml: # CoreML + LOGGER.info(f'Loading {w} for CoreML inference...') + import coremltools as ct + model = ct.models.MLModel(w) + metadata = dict(model.user_defined_metadata) + elif saved_model: # TF SavedModel + LOGGER.info(f'Loading {w} for TensorFlow SavedModel inference...') + import tensorflow as tf + keras = False # assume TF1 saved_model + model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w) + metadata = Path(w) / 'metadata.yaml' + elif pb: # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt + LOGGER.info(f'Loading {w} for TensorFlow GraphDef inference...') + import tensorflow as tf + + from ultralytics.engine.exporter import gd_outputs + + def wrap_frozen_graph(gd, inputs, outputs): + """Wrap frozen graphs for deployment.""" + x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=''), []) # wrapped + ge = x.graph.as_graph_element + return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs)) + + gd = tf.Graph().as_graph_def() # TF 
GraphDef + with open(w, 'rb') as f: + gd.ParseFromString(f.read()) + frozen_func = wrap_frozen_graph(gd, inputs='x:0', outputs=gd_outputs(gd)) + elif tflite or edgetpu: # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python + try: # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu + from tflite_runtime.interpreter import Interpreter, load_delegate + except ImportError: + import tensorflow as tf + Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate + if edgetpu: # TF Edge TPU https://coral.ai/software/#edgetpu-runtime + LOGGER.info(f'Loading {w} for TensorFlow Lite Edge TPU inference...') + delegate = { + 'Linux': 'libedgetpu.so.1', + 'Darwin': 'libedgetpu.1.dylib', + 'Windows': 'edgetpu.dll'}[platform.system()] + interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)]) + else: # TFLite + LOGGER.info(f'Loading {w} for TensorFlow Lite inference...') + interpreter = Interpreter(model_path=w) # load TFLite model + interpreter.allocate_tensors() # allocate + input_details = interpreter.get_input_details() # inputs + output_details = interpreter.get_output_details() # outputs + # Load metadata + with contextlib.suppress(zipfile.BadZipFile): + with zipfile.ZipFile(w, 'r') as model: + meta_file = model.namelist()[0] + metadata = ast.literal_eval(model.read(meta_file).decode('utf-8')) + elif tfjs: # TF.js + raise NotImplementedError('YOLOv8 TF.js inference is not currently supported.') + elif paddle: # PaddlePaddle + LOGGER.info(f'Loading {w} for PaddlePaddle inference...') + check_requirements('paddlepaddle-gpu' if cuda else 'paddlepaddle') + import paddle.inference as pdi # noqa + w = Path(w) + if not w.is_file(): # if not *.pdmodel + w = next(w.rglob('*.pdmodel')) # get *.pdmodel file from *_paddle_model dir + config = pdi.Config(str(w), str(w.with_suffix('.pdiparams'))) + if cuda: + config.enable_use_gpu(memory_pool_init_size_mb=2048, 
device_id=0) + predictor = pdi.create_predictor(config) + input_handle = predictor.get_input_handle(predictor.get_input_names()[0]) + output_names = predictor.get_output_names() + metadata = w.parents[1] / 'metadata.yaml' + elif ncnn: # ncnn + LOGGER.info(f'Loading {w} for ncnn inference...') + check_requirements('git+https://github.com/Tencent/ncnn.git' if ARM64 else 'ncnn') # requires ncnn + import ncnn as pyncnn + net = pyncnn.Net() + net.opt.use_vulkan_compute = cuda + w = Path(w) + if not w.is_file(): # if not *.param + w = next(w.glob('*.param')) # get *.param file from *_ncnn_model dir + net.load_param(str(w)) + net.load_model(str(w.with_suffix('.bin'))) + metadata = w.parent / 'metadata.yaml' + elif triton: # NVIDIA Triton Inference Server + """TODO + check_requirements('tritonclient[all]') + from utils.triton import TritonRemoteModel + model = TritonRemoteModel(url=w) + nhwc = model.runtime.startswith("tensorflow") + """ + raise NotImplementedError('Triton Inference Server is not currently supported.') + else: + from ultralytics.engine.exporter import export_formats + raise TypeError(f"model='{w}' is not a supported model format. " + 'See https://docs.ultralytics.com/modes/predict for help.' 
+ f'\n\n{export_formats()}') + + # Load external metadata YAML + if isinstance(metadata, (str, Path)) and Path(metadata).exists(): + metadata = yaml_load(metadata) + if metadata: + for k, v in metadata.items(): + if k in ('stride', 'batch'): + metadata[k] = int(v) + elif k in ('imgsz', 'names', 'kpt_shape') and isinstance(v, str): + metadata[k] = eval(v) + stride = metadata['stride'] + task = metadata['task'] + batch = metadata['batch'] + imgsz = metadata['imgsz'] + names = metadata['names'] + kpt_shape = metadata.get('kpt_shape') + elif not (pt or triton or nn_module): + LOGGER.warning(f"WARNING ⚠️ Metadata not found for 'model={weights}'") + + # Check names + if 'names' not in locals(): # names missing + names = self._apply_default_class_names(data) + names = check_class_names(names) + + self.__dict__.update(locals()) # assign all variables to self + + def forward(self, im, augment=False, visualize=False): + """ + Runs inference on the YOLOv8 MultiBackend model. + + Args: + im (torch.Tensor): The image tensor to perform inference on. 
+ augment (bool): whether to perform data augmentation during inference, defaults to False + visualize (bool): whether to visualize the output predictions, defaults to False + + Returns: + (tuple): Tuple containing the raw output tensor, and processed output for visualization (if visualize=True) + """ + b, ch, h, w = im.shape # batch, channel, height, width + if self.fp16 and im.dtype != torch.float16: + im = im.half() # to FP16 + if self.nhwc: + im = im.permute(0, 2, 3, 1) # torch BCHW to numpy BHWC shape(1,320,192,3) + + if self.pt or self.nn_module: # PyTorch + y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im) + elif self.jit: # TorchScript + y = self.model(im) + elif self.dnn: # ONNX OpenCV DNN + im = im.cpu().numpy() # torch to numpy + self.net.setInput(im) + y = self.net.forward() + elif self.onnx: # ONNX Runtime + im = im.cpu().numpy() # torch to numpy + y = self.session.run(self.output_names, {self.session.get_inputs()[0].name: im}) + elif self.xml: # OpenVINO + im = im.cpu().numpy() # FP32 + y = list(self.ov_compiled_model(im).values()) + elif self.engine: # TensorRT + if self.dynamic and im.shape != self.bindings['images'].shape: + i = self.model.get_binding_index('images') + self.context.set_binding_shape(i, im.shape) # reshape if dynamic + self.bindings['images'] = self.bindings['images']._replace(shape=im.shape) + for name in self.output_names: + i = self.model.get_binding_index(name) + self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i))) + s = self.bindings['images'].shape + assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}" + self.binding_addrs['images'] = int(im.data_ptr()) + self.context.execute_v2(list(self.binding_addrs.values())) + y = [self.bindings[x].data for x in sorted(self.output_names)] + elif self.coreml: # CoreML + im = im[0].cpu().numpy() + im_pil = Image.fromarray((im * 255).astype('uint8')) + # im = 
im.resize((192, 320), Image.BILINEAR) + y = self.model.predict({'image': im_pil}) # coordinates are xywh normalized + if 'confidence' in y: + box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]]) # xyxy pixels + conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(np.float) + y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1) + elif len(y) == 1: # classification model + y = list(y.values()) + elif len(y) == 2: # segmentation model + y = list(reversed(y.values())) # reversed for segmentation models (pred, proto) + elif self.paddle: # PaddlePaddle + im = im.cpu().numpy().astype(np.float32) + self.input_handle.copy_from_cpu(im) + self.predictor.run() + y = [self.predictor.get_output_handle(x).copy_to_cpu() for x in self.output_names] + elif self.ncnn: # ncnn + mat_in = self.pyncnn.Mat(im[0].cpu().numpy()) + ex = self.net.create_extractor() + input_names, output_names = self.net.input_names(), self.net.output_names() + ex.input(input_names[0], mat_in) + y = [] + for output_name in output_names: + mat_out = self.pyncnn.Mat() + ex.extract(output_name, mat_out) + y.append(np.array(mat_out)[None]) + elif self.triton: # NVIDIA Triton Inference Server + y = self.model(im) + else: # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU) + im = im.cpu().numpy() + if self.saved_model: # SavedModel + y = self.model(im, training=False) if self.keras else self.model(im) + if not isinstance(y, list): + y = [y] + elif self.pb: # GraphDef + y = self.frozen_func(x=self.tf.constant(im)) + if len(y) == 2 and len(self.names) == 999: # segments and names not defined + ip, ib = (0, 1) if len(y[0].shape) == 4 else (1, 0) # index of protos, boxes + nc = y[ib].shape[1] - y[ip].shape[3] - 4 # y = (1, 160, 160, 32), (1, 116, 8400) + self.names = {i: f'class{i}' for i in range(nc)} + else: # Lite or Edge TPU + details = self.input_details[0] + integer = details['dtype'] in (np.int8, np.int16) # is TFLite quantized int8 or int16 model + if integer: + scale, zero_point = 
details['quantization'] + im = (im / scale + zero_point).astype(details['dtype']) # de-scale + self.interpreter.set_tensor(details['index'], im) + self.interpreter.invoke() + y = [] + for output in self.output_details: + x = self.interpreter.get_tensor(output['index']) + if integer: + scale, zero_point = output['quantization'] + x = (x.astype(np.float32) - zero_point) * scale # re-scale + if x.ndim > 2: # if task is not classification + # Denormalize xywh with input image size + # xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models + # See this PR for details: https://github.com/ultralytics/ultralytics/pull/1695 + x[:, 0] *= w + x[:, 1] *= h + x[:, 2] *= w + x[:, 3] *= h + y.append(x) + # TF segment fixes: export is reversed vs ONNX export and protos are transposed + if len(y) == 2: # segment with (det, proto) output order reversed + if len(y[1].shape) != 4: + y = list(reversed(y)) # should be y = (1, 116, 8400), (1, 160, 160, 32) + y[1] = np.transpose(y[1], (0, 3, 1, 2)) # should be y = (1, 116, 8400), (1, 32, 160, 160) + y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y] + + # for x in y: + # print(type(x), len(x)) if isinstance(x, (list, tuple)) else print(type(x), x.shape) # debug shapes + if isinstance(y, (list, tuple)): + return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y] + else: + return self.from_numpy(y) + + def from_numpy(self, x): + """ + Convert a numpy array to a tensor. + + Args: + x (np.ndarray): The array to be converted. + + Returns: + (torch.Tensor): The converted tensor + """ + return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x + + def warmup(self, imgsz=(1, 3, 640, 640)): + """ + Warm up the model by running one forward pass with a dummy input. 
+ + Args: + imgsz (tuple): The shape of the dummy input tensor in the format (batch_size, channels, height, width) + + Returns: + (None): This method runs the forward pass and don't return any value + """ + warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module + if any(warmup_types) and (self.device.type != 'cpu' or self.triton): + im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input + for _ in range(2 if self.jit else 1): # + self.forward(im) # warmup + + @staticmethod + def _apply_default_class_names(data): + """Applies default class names to an input YAML file or returns numerical class names.""" + with contextlib.suppress(Exception): + return yaml_load(check_yaml(data))['names'] + return {i: f'class{i}' for i in range(999)} # return default if above errors + + @staticmethod + def _model_type(p='path/to/model.pt'): + """ + This function takes a path to a model file and returns the model type + + Args: + p: path to the model file. Defaults to path/to/model.pt + """ + # Return model type from model path, i.e. 
path='path/to/model.onnx' -> type=onnx + # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle] + from ultralytics.engine.exporter import export_formats + sf = list(export_formats().Suffix) # export suffixes + if not is_url(p, check=False) and not isinstance(p, str): + check_suffix(p, sf) # checks + url = urlparse(p) # if url may be Triton inference server + types = [s in Path(p).name for s in sf] + types[8] &= not types[9] # tflite &= not edgetpu + triton = not any(types) and all([any(s in url.scheme for s in ['http', 'grpc']), url.netloc]) + return types + [triton] diff --git a/ultralytics/nn/modules/__init__.py b/ultralytics/nn/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d933928050b9ce8e7fcccdfd1547495a2eca926a --- /dev/null +++ b/ultralytics/nn/modules/__init__.py @@ -0,0 +1,31 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Ultralytics modules. Visualize with: + +from ultralytics.nn.modules import * +import torch +import os + +x = torch.ones(1, 128, 40, 40) +m = Conv(128, 128) +f = f'{m._get_name()}.onnx' +torch.onnx.export(m, x, f) +os.system(f'onnxsim {f} {f} && open {f}') +""" + +from .block import (C1, C2, C3, C3TR, DFL, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, GhostBottleneck, + HGBlock, HGStem, Proto, RepC3) +from .conv import (CBAM, ChannelAttention, Concat, Conv, Conv2, ConvTranspose, DWConv, DWConvTranspose2d, Focus, + GhostConv, LightConv, RepConv, SpatialAttention, + GAM_Attention,GCT,ShuffleAttention,ResBlock_CBAM,ECAAttention,MHSA,GlobalContext,GatherExcite) +from .head import Classify, Detect, Pose, RTDETRDecoder, Segment +from .transformer import (AIFI, MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer, LayerNorm2d, + MLPBlock, MSDeformAttn, TransformerBlock, TransformerEncoderLayer, TransformerLayer) + +__all__ = ('Conv', 'Conv2', 'LightConv', 'RepConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', + 'GhostConv', 
'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'TransformerLayer', + 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', + 'C2f', 'C3x', 'C3TR', 'C3Ghost', 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'Detect', + 'Segment', 'Pose', 'Classify', 'TransformerEncoderLayer', 'RepC3', 'RTDETRDecoder', 'AIFI', + 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP', + 'GAM_Attention','GCT','ShuffleAttention','ResBlock_CBAM','ECAAttention','MHSA','GatherExcite','GlobalContext') diff --git a/ultralytics/nn/modules/__pycache__/__init__.cpython-310.pyc b/ultralytics/nn/modules/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6a1c246e20ec2fb9d49d66310e1251fdc087a92 Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/__init__.cpython-39.pyc b/ultralytics/nn/modules/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f303d854a1983623cfccb0ff7371bcdab326022b Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/block.cpython-310.pyc b/ultralytics/nn/modules/__pycache__/block.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c93d8cb4a97cfce4dd3d47f40fc0b171b61ab228 Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/block.cpython-310.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/block.cpython-39.pyc b/ultralytics/nn/modules/__pycache__/block.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7a8b8c90c6ad199f10a9659bb70d9fd0d76e9cb Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/block.cpython-39.pyc differ diff --git 
a/ultralytics/nn/modules/__pycache__/conv.cpython-310.pyc b/ultralytics/nn/modules/__pycache__/conv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa399d0488eb03a84e0d8075413f5fa80b28ddc5 Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/conv.cpython-310.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/conv.cpython-39.pyc b/ultralytics/nn/modules/__pycache__/conv.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6ac278f01487923acf148172c96452e4b4aaba1 Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/conv.cpython-39.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/head.cpython-310.pyc b/ultralytics/nn/modules/__pycache__/head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c76cfb8460a4ab3c8ac2c772074a36095c19c5d4 Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/head.cpython-310.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/head.cpython-39.pyc b/ultralytics/nn/modules/__pycache__/head.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc432f49543d1cae816f9cfbffe621fa1678236d Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/head.cpython-39.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/transformer.cpython-310.pyc b/ultralytics/nn/modules/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a07216e2a18000b0bb87390e0a4d2580b36d24df Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/transformer.cpython-310.pyc differ diff --git a/ultralytics/nn/modules/__pycache__/transformer.cpython-39.pyc b/ultralytics/nn/modules/__pycache__/transformer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15e26298b74a4681be3e88d97a75b77a08dfa87f Binary files /dev/null and b/ultralytics/nn/modules/__pycache__/transformer.cpython-39.pyc differ 
class DFL(nn.Module):
    """
    Integral module of Distribution Focal Loss (DFL).

    Holds a frozen 1x1 convolution whose weights are the bin indices 0..c1-1, so applying it to a
    softmax over the bins yields the expected bin index.
    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
    """

    def __init__(self, c1=16):
        """Create the frozen 1x1 convolution with weights 0, 1, ..., c1 - 1 (c1 is the number of bins)."""
        super().__init__()
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        bin_values = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = bin_values.view(1, c1, 1, 1)
        self.c1 = c1

    def forward(self, x):
        """Map a (batch, 4 * c1, anchors) tensor to (batch, 4, anchors) expected bin values."""
        batch, _, anchors = x.shape  # batch, channels, anchors
        grouped = x.view(batch, 4, self.c1, anchors).transpose(2, 1)  # (batch, c1, 4, anchors)
        probs = grouped.softmax(1)  # normalise over the c1 bins
        return self.conv(probs).view(batch, 4, anchors)
class HGBlock(nn.Module):
    """
    HG_Block of PPHGNetV2: a chain of n convolutions whose outputs are all concatenated with the input.

    The concatenation is reduced by a squeeze conv and an excitation conv.
    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    """

    def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
        """Build n chained (Light)Conv layers of width cm plus the squeeze/excitation 1x1 convolutions."""
        super().__init__()
        layer_cls = LightConv if lightconv else Conv
        chain = []
        for idx in range(n):
            in_ch = cm if idx else c1  # only the first layer sees the block input
            chain.append(layer_cls(in_ch, cm, k=k, act=act))
        self.m = nn.ModuleList(chain)
        self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act)  # squeeze conv
        self.ec = Conv(c2 // 2, c2, 1, 1, act=act)  # excitation conv
        self.add = shortcut and c1 == c2  # residual only when requested and channel counts match

    def forward(self, x):
        """Run the chain, concatenate input and all intermediate outputs, then squeeze and excite."""
        feats = [x]
        for layer in self.m:
            feats.append(layer(feats[-1]))  # each layer consumes the previous layer's output
        out = self.ec(self.sc(torch.cat(feats, 1)))
        if self.add:
            return out + x
        return out
class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""

    def __init__(self, c1, c2, k=5):  # equivalent to SPP(k=(5, 9, 13))
        """Create the input/output 1x1 convolutions and the single stride-1 max-pool of kernel size k."""
        super().__init__()
        hidden = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden * 4, c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Apply the same max-pool three times in sequence and fuse the four stages with cv2."""
        stages = [self.cv1(x)]
        for _ in range(3):
            stages.append(self.m(stages[-1]))  # pool the previous stage again
        return self.cv2(torch.cat(stages, 1))
class RepC3(nn.Module):
    """
    Rep C3: two parallel 1x1 projections, one followed by a stack of n RepConv layers, summed and
    passed through an optional output projection.
    """

    def __init__(self, c1, c2, n=3, e=1.0):
        """
        Build the block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of stacked RepConv layers.
            e (float): Expansion ratio; the hidden width is int(c2 * e).
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        # Both branches must emit c_ channels: self.m is built from RepConv(c_, c_) and cv3 consumes c_.
        # Projecting to c2 here only worked for e == 1.0 (c_ == c2) and failed with a channel
        # mismatch for any other expansion ratio.
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)])
        self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()

    def forward(self, x):
        """Return cv3(m(cv1(x)) + cv2(x))."""
        return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """
    Return padding that gives 'same'-shape outputs.

    An explicit `p` is returned unchanged. Otherwise the padding is half of the effective kernel size,
    where a dilation d > 1 enlarges the kernel to d * (k - 1) + 1. `k` may be an int or an iterable
    of ints; an iterable yields a list.
    """
    is_scalar = isinstance(k, int)
    if d > 1:
        # actual kernel-size once dilation is taken into account
        k = d * (k - 1) + 1 if is_scalar else [d * (side - 1) + 1 for side in k]
    if p is not None:
        return p
    return k // 2 if is_scalar else [side // 2 for side in k]  # auto-pad
class Conv2(Conv):
    """Conv variant with an extra parallel 1x1 convolution that can be folded into the main kernel."""

    def __init__(self, c1, c2, k=3, s=1, p=None, g=1, d=1, act=True):
        """Build the base Conv layer, then add a bias-free 1x1 branch `cv2` with matching stride/groups/dilation."""
        super().__init__(c1, c2, k, s, p, g=g, d=d, act=act)
        pointwise_pad = autopad(1, p, d)
        self.cv2 = nn.Conv2d(c1, c2, 1, s, pointwise_pad, groups=g, dilation=d, bias=False)  # add 1x1 conv

    def forward(self, x):
        """Sum the main and 1x1 branches, then apply batch normalization and the activation."""
        branch_sum = self.conv(x) + self.cv2(x)
        return self.act(self.bn(branch_sum))

    def fuse_convs(self):
        """Add the 1x1 weights onto the centre tap of the main kernel and remove the `cv2` branch."""
        padded = torch.zeros_like(self.conv.weight.data)
        row, col = (size // 2 for size in padded.shape[2:])  # centre tap of the main kernel
        padded[:, :, row:row + 1, col:col + 1] = self.cv2.weight.data.clone()
        self.conv.weight.data += padded
        delattr(self, 'cv2')
class ConvTranspose(nn.Module):
    """Transposed 2D convolution followed by optional batch normalization and an activation."""
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
        """
        Create the transposed convolution, normalization and activation.

        The convolution carries a bias only when `bn` is false. `act=True` selects `default_act`, an `nn.Module`
        instance is used as given, and anything else disables the activation.
        """
        super().__init__()
        self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn)
        if bn:
            self.bn = nn.BatchNorm2d(c2)
        else:
            self.bn = nn.Identity()
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply transposed convolution, normalization, then activation."""
        upsampled = self.conv_transpose(x)
        return self.act(self.bn(upsampled))

    def forward_fuse(self, x):
        """Apply transposed convolution then activation, skipping the normalization layer."""
        upsampled = self.conv_transpose(x)
        return self.act(upsampled)
class GhostConv(nn.Module):
    """Ghost Convolution https://github.com/huawei-noah/ghostnet."""

    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):  # ch_in, ch_out, kernel, stride, groups
        """
        Build the two convolutions of the ghost block.

        `cv1` maps `c1` to `c_ = c2 // 2` channels with the requested kernel, stride and groups; `cv2` is a
        5x5, stride-1 convolution over those `c_` channels with `groups=c_`.
        """
        super().__init__()
        c_ = c2 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
        self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)  # positional args: kernel=5, stride=1, pad=auto, groups=c_

    def forward(self, x):
        """Apply `cv1`, then concatenate its output with `cv2` of that output along dim 1 (2 * c_ channels)."""
        y = self.cv1(x)
        return torch.cat((y, self.cv2(y)), 1)
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
    """Zero-pad a kernel by one element on each side of its last two dims; return 0 when no kernel is given."""
    if kernel1x1 is not None:
        return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
    return 0
class SpatialAttention(nn.Module):
    """Spatial-attention module."""

    def __init__(self, kernel_size=7):
        """Create the 2->1 channel convolution (kernel 3 or 7, 'same' padding, no bias) and the sigmoid gate."""
        super().__init__()
        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        padding = 1 if kernel_size != 7 else 3
        self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Scale `x` by a sigmoid map computed from its per-pixel channel mean and channel max."""
        channel_mean = torch.mean(x, 1, keepdim=True)
        channel_max = torch.max(x, 1, keepdim=True)[0]
        descriptor = torch.cat([channel_mean, channel_max], 1)
        gate = self.act(self.cv1(descriptor))
        return x * gate
class GCT(nn.Module):
    """
    Parameter-free channel gate.

    Each channel is globally average-pooled, the pooled values are standardized across the channel dimension,
    and every channel of the input is scaled by `exp(-c * z**2 / 2)` where `z` is its standardized value.
    """

    def __init__(self, channels, c=2, eps=1e-5):
        """
        Args:
            channels: Accepted for interface compatibility; unused, since the module has no learnable weights.
            c: Multiplier inside the exponent of the gate.
            eps: Added to the variance before the square root.
        """
        super().__init__()
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.eps = eps
        self.c = c

    def forward(self, x):
        """Return `x` scaled per channel by the gate; the output has the same shape as `x`."""
        y = self.avgpool(x)
        mean = y.mean(dim=1, keepdim=True)
        mean_x2 = (y ** 2).mean(dim=1, keepdim=True)
        # E[y^2] - E[y]^2 suffers from cancellation and can round slightly below zero; without the clamp a
        # value below -eps would make the square root (and the whole output) NaN.
        var = (mean_x2 - mean ** 2).clamp(min=0)
        y_norm = (y - mean) / torch.sqrt(var + self.eps)
        y_transform = torch.exp(-(y_norm ** 2 / 2 * self.c))
        return x * y_transform.expand_as(x)
def forward(self, x):
    """
    Apply shuffle attention to `x` of shape (b, c, h, w) and return a tensor of the same shape.

    The channels are split into `self.G` groups; within each group the first half of the channels is gated by
    a channel-attention term (global average pool, affine, sigmoid) and the second half by a spatial-attention
    term (group norm, affine, sigmoid). The halves are concatenated and the channels are shuffled in 2 groups.
    """
    b, c, h, w = x.size()
    # group into subfeatures
    # reshape (not view): channels-last or otherwise strided inputs cannot be viewed as (b*G, c//G, h, w)
    # and would raise a RuntimeError; reshape copies only when it has to.
    x = x.reshape(b * self.G, -1, h, w)  # bs*G,c//G,h,w

    # channel_split
    x_0, x_1 = x.chunk(2, dim=1)  # bs*G,c//(2*G),h,w

    # channel attention
    x_channel = self.avg_pool(x_0)  # bs*G,c//(2*G),1,1
    x_channel = self.cweight * x_channel + self.cbias  # bs*G,c//(2*G),1,1
    x_channel = x_0 * self.sigmoid(x_channel)

    # spatial attention
    x_spatial = self.gn(x_1)  # bs*G,c//(2*G),h,w
    x_spatial = self.sweight * x_spatial + self.sbias  # bs*G,c//(2*G),h,w
    x_spatial = x_1 * self.sigmoid(x_spatial)  # bs*G,c//(2*G),h,w

    # concatenate along channel axis
    out = torch.cat([x_channel, x_spatial], dim=1)  # bs*G,c//G,h,w
    out = out.contiguous().view(b, -1, h, w)

    # channel shuffle
    out = self.channel_shuffle(out, 2)
    return out
def forward(self, x):
    """Run bottleneck then CBAM on `x`, add the (optionally downsampled) input, and apply the final ReLU."""
    features = self.cbam(self.bottleneck(x))
    # The skip path is the raw input unless a downsampling projection was configured.
    shortcut = self.downsample(x) if self.downsampling else x
    features += shortcut
    return self.relu(features)
class MHSA(nn.Module):
    """Multi-head self-attention over the spatial positions of a 4-D feature map, with optional position term."""

    def __init__(self, n_dims, width=14, height=14, heads=4, pos_emb=False):
        """
        Create the 1x1 query/key/value projections and, when `pos_emb` is set, the relative position weights.

        Args:
            n_dims: Number of input (and output) channels.
            width, height: Sizes used for the two relative-position parameters; only read when `pos_emb` is set.
            heads: Number of attention heads the channels are split into.
            pos_emb: Whether to add the content-position term to the attention logits.
        """
        super().__init__()

        self.heads = heads
        self.query = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.key = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.value = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.pos = pos_emb
        if pos_emb:
            head_dim = n_dims // heads
            self.rel_h_weight = nn.Parameter(torch.randn([1, heads, head_dim, 1, int(height)]), requires_grad=True)
            self.rel_w_weight = nn.Parameter(torch.randn([1, heads, head_dim, int(width), 1]), requires_grad=True)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the attended features, reshaped back to the shape of `x`."""
        n_batch, channels, width, height = x.size()
        per_head = (n_batch, self.heads, channels // self.heads, -1)
        q = self.query(x).view(per_head)
        k = self.key(x).view(per_head)
        v = self.value(x).view(per_head)

        # (batch, heads, positions, positions) content-content logits
        energy = torch.matmul(q.permute(0, 1, 3, 2), k)
        if self.pos:
            rel = (self.rel_h_weight + self.rel_w_weight).view(1, self.heads, channels // self.heads, -1)
            content_position = torch.matmul(rel.permute(0, 1, 3, 2), q)
            if content_position.shape != energy.shape:
                # Trim the position term to the number of spatial positions actually present.
                content_position = content_position[:, :, :energy.size(2)]
            assert (energy.shape == content_position.shape)
            energy = energy + content_position

        attention = self.softmax(energy)
        attended = torch.matmul(v, attention.permute(0, 1, 3, 2))
        return attended.view(n_batch, channels, width, height)
def forward(self, x):
    """
    Pool `x` into a per-channel context vector and use it to rescale and/or shift `x`.

    The context is an attention-weighted spatial sum when `conv_attn` exists, otherwise the spatial mean.
    `mlp_scale` (through `gate`) multiplies the input and `mlp_add` adds to it; either may be absent.
    """
    batch, channels, height, width = x.shape

    if self.conv_attn is None:
        context = x.mean(dim=(2, 3), keepdim=True)
    else:
        positions = height * width
        weights = self.conv_attn(x).reshape(batch, 1, positions)  # (B, 1, H * W)
        weights = F.softmax(weights, dim=-1).unsqueeze(3)  # (B, 1, H * W, 1)
        weighted_sum = x.reshape(batch, channels, positions).unsqueeze(1) @ weights
        context = weighted_sum.view(batch, channels, 1, 1)

    if self.mlp_scale is not None:
        x = x * self.gate(self.mlp_scale(context))
    if self.mlp_add is not None:
        x = x + self.mlp_add(context)

    return x
/ 16, rd_channels=None, rd_divisor=1, add_maxpool=False, + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, gate_layer='sigmoid'): + super(GatherExcite, self).__init__() + self.add_maxpool = add_maxpool + act_layer = get_act_layer(act_layer) + self.extent = extent + if extra_params: + self.gather = nn.Sequential() + if extent == 0: + assert feat_size is not None, 'spatial feature size must be specified for global extent w/ params' + self.gather.add_module( + 'conv1', create_conv2d(channels, channels, kernel_size=feat_size, stride=1, depthwise=True)) + if norm_layer: + self.gather.add_module(f'norm1', nn.BatchNorm2d(channels)) + else: + assert extent % 2 == 0 + num_conv = int(math.log2(extent)) + for i in range(num_conv): + self.gather.add_module( + f'conv{i + 1}', + create_conv2d(channels, channels, kernel_size=3, stride=2, depthwise=True)) + if norm_layer: + self.gather.add_module(f'norm{i + 1}', nn.BatchNorm2d(channels)) + if i != num_conv - 1: + self.gather.add_module(f'act{i + 1}', act_layer(inplace=True)) + else: + self.gather = None + if self.extent == 0: + self.gk = 0 + self.gs = 0 + else: + assert extent % 2 == 0 + self.gk = self.extent * 2 - 1 + self.gs = self.extent + + if not rd_channels: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
def forward(self, x):
    """
    Gather spatial context from `x`, pass it through `mlp`, and gate `x` with the result.

    Context comes from the learned `gather` stack when present, otherwise from global pooling (`extent == 0`)
    or strided local pooling. A context map larger than 1x1 is resized back to the input's spatial size.
    """
    size = x.shape[-2:]
    if self.gather is not None:
        context = self.gather(x)
    elif self.extent == 0:
        # global extent
        context = x.mean(dim=(2, 3), keepdims=True)
        if self.add_maxpool:
            # experimental codepath, may remove or change
            context = 0.5 * context + 0.5 * x.amax((2, 3), keepdim=True)
    else:
        context = F.avg_pool2d(
            x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2, count_include_pad=False)
        if self.add_maxpool:
            # experimental codepath, may remove or change
            local_max = F.max_pool2d(x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2)
            context = 0.5 * context + 0.5 * local_max
    context = self.mlp(context)
    if context.shape[-1] != 1 or context.shape[-2] != 1:
        context = F.interpolate(context, size=size)
    return x * self.gate(context)
def forward(self, x):
    """
    Run the detection head on a list of per-level feature maps.

    Each `x[i]` is overwritten (the input list is mutated) with the channel-wise concatenation of the box
    branch `cv2[i]` and the class branch `cv3[i]`.

    Returns:
        In training mode, the list of raw per-level outputs. Otherwise `y`, the decoded boxes concatenated with
        sigmoid class scores along dim 1; when `self.export` is false the result is the tuple `(y, x)`.
    """
    shape = x[0].shape  # BCHW
    for i in range(self.nl):
        x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
    if self.training:
        return x
    elif self.dynamic or self.shape != shape:
        # Anchors/strides are cached and only rebuilt when the input shape changes (or `dynamic` forces it).
        self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
        self.shape = shape

    # Flatten every level to (batch, no, h*w) and join all levels along the last dim.
    x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
    # NOTE(review): `self.format` is not assigned anywhere in this class; presumably set by the exporter
    # before `export` is enabled -- confirm.
    if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'):  # avoid TF FlexSplitV ops
        box = x_cat[:, :self.reg_max * 4]
        cls = x_cat[:, self.reg_max * 4:]
    else:
        box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
    dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides

    if self.export and self.format in ('tflite', 'edgetpu'):
        # Normalize xywh with image size to mitigate quantization error of TFLite integer models as done in YOLOv5:
        # https://github.com/ultralytics/yolov5/blob/0c8de3fca4a702f8ff5c435e67f378d1fce70243/models/tf.py#L307-L309
        # See this PR for details: https://github.com/ultralytics/ultralytics/pull/1695
        img_h = shape[2] * self.stride[0]
        img_w = shape[3] * self.stride[0]
        img_size = torch.tensor([img_w, img_h, img_w, img_h], device=dbox.device).reshape(1, 4, 1)
        dbox /= img_size

    y = torch.cat((dbox, cls.sigmoid()), 1)
    return y if self.export else (y, x)
def kpts_decode(self, bs, kpts):
    """
    Decode raw keypoint predictions using the cached anchors and strides.

    x/y entries become `(raw * 2 + anchor - 0.5) * stride`; when each keypoint has 3 values, the third is
    passed through a sigmoid. The input tensor is not modified.
    """
    values_per_kpt = self.kpt_shape[1]
    if not self.export:
        decoded = kpts.clone()
        if values_per_kpt == 3:
            decoded[:, 2::3].sigmoid_()  # inplace sigmoid
        for axis in (0, 1):  # x channels, then y channels
            decoded[:, axis::values_per_kpt] = (decoded[:, axis::values_per_kpt] * 2.0 +
                                                (self.anchors[axis] - 0.5)) * self.strides
        return decoded

    # required for TFLite export to avoid 'PLACEHOLDER_FOR_GREATER_OP_CODES' bug
    grouped = kpts.view(bs, *self.kpt_shape, -1)
    points = (grouped[:, :, :2] * 2.0 + (self.anchors - 0.5)) * self.strides
    if values_per_kpt == 3:
        points = torch.cat((points, grouped[:, :, 2:3].sigmoid()), 2)
    return points.view(bs, self.nk, -1)
def forward(self, x):
    """
    Classify `x`: conv, global pool, flatten, dropout, linear.

    A list input is first concatenated along dim 1. Raw linear outputs are returned in training mode;
    otherwise a softmax over dim 1 is applied.
    """
    merged = torch.cat(x, 1) if isinstance(x, list) else x
    pooled = self.pool(self.conv(merged)).flatten(1)
    logits = self.linear(self.drop(pooled))
    if self.training:
        return logits
    return logits.softmax(1)
+ # self.input_proj = nn.ModuleList(Conv(x, hd, act=False) for x in ch) + + # Transformer module + decoder_layer = DeformableTransformerDecoderLayer(hd, nh, d_ffn, dropout, act, self.nl, ndp) + self.decoder = DeformableTransformerDecoder(hd, decoder_layer, ndl, eval_idx) + + # denoising part + self.denoising_class_embed = nn.Embedding(nc, hd) + self.num_denoising = nd + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(nq, hd) + self.query_pos_head = MLP(4, 2 * hd, hd, num_layers=2) + + # encoder head + self.enc_output = nn.Sequential(nn.Linear(hd, hd), nn.LayerNorm(hd)) + self.enc_score_head = nn.Linear(hd, nc) + self.enc_bbox_head = MLP(hd, hd, 4, num_layers=3) + + # decoder head + self.dec_score_head = nn.ModuleList([nn.Linear(hd, nc) for _ in range(ndl)]) + self.dec_bbox_head = nn.ModuleList([MLP(hd, hd, 4, num_layers=3) for _ in range(ndl)]) + + self._reset_parameters() + + def forward(self, x, batch=None): + from ultralytics.models.utils.ops import get_cdn_group + + # input projection and embedding + feats, shapes = self._get_encoder_input(x) + + # prepare denoising training + dn_embed, dn_bbox, attn_mask, dn_meta = \ + get_cdn_group(batch, + self.nc, + self.num_queries, + self.denoising_class_embed.weight, + self.num_denoising, + self.label_noise_ratio, + self.box_noise_scale, + self.training) + + embed, refer_bbox, enc_bboxes, enc_scores = \ + self._get_decoder_input(feats, shapes, dn_embed, dn_bbox) + + # decoder + dec_bboxes, dec_scores = self.decoder(embed, + refer_bbox, + feats, + shapes, + self.dec_bbox_head, + self.dec_score_head, + self.query_pos_head, + attn_mask=attn_mask) + x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta + if self.training: + return x + # (bs, 300, 4+nc) + y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1) + return y if self.export else 
def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
    """
    Build one anchor per cell of every feature level, expressed in logit space.

    Args:
        shapes: Iterable of (h, w) feature-map sizes, one per level.
        grid_size: Normalized anchor width/height at the first level; doubled at each following level.
        dtype: Dtype of the generated tensors.
        device: Device of the generated tensors.
        eps: An anchor is valid only if all four normalized values lie strictly inside (eps, 1 - eps).

    Returns:
        Tuple `(anchors, valid_mask)`: `anchors` has shape (1, sum(h*w), 4) holding logit(cx, cy, w, h) with
        invalid rows filled with inf; `valid_mask` is the matching boolean mask of shape (1, sum(h*w), 1).
    """
    anchors = []
    for i, (h, w) in enumerate(shapes):
        grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=dtype, device=device),
                                        torch.arange(end=w, dtype=dtype, device=device),
                                        indexing='ij')
        grid_xy = torch.stack([grid_x, grid_y], -1)  # (h, w, 2)

        # grid_xy is ordered (x, y), so it must be normalized by (w, h). Dividing by (h, w) pushed centres
        # outside [0, 1] on non-square feature maps and wrongly invalidated those anchors.
        valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
        grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # (1, h, w, 2)
        wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
        anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4))  # (1, h*w, 4)

    anchors = torch.cat(anchors, 1)  # (1, h*w*nl, 4)
    valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)  # 1, h*w*nl, 1
    anchors = torch.log(anchors / (1 - anchors))  # inverse sigmoid
    anchors = anchors.masked_fill(~valid_mask, float('inf'))
    return anchors, valid_mask
num_queries, 256) + top_k_features = features[batch_ind, topk_ind].view(bs, self.num_queries, -1) + # (bs, num_queries, 4) + top_k_anchors = anchors[:, topk_ind].view(bs, self.num_queries, -1) + + # dynamic anchors + static content + refer_bbox = self.enc_bbox_head(top_k_features) + top_k_anchors + + enc_bboxes = refer_bbox.sigmoid() + if dn_bbox is not None: + refer_bbox = torch.cat([dn_bbox, refer_bbox], 1) + enc_scores = enc_outputs_scores[batch_ind, topk_ind].view(bs, self.num_queries, -1) + + embeddings = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1) if self.learnt_init_query else top_k_features + if self.training: + refer_bbox = refer_bbox.detach() + if not self.learnt_init_query: + embeddings = embeddings.detach() + if dn_embed is not None: + embeddings = torch.cat([dn_embed, embeddings], 1) + + return embeddings, refer_bbox, enc_bboxes, enc_scores + + # TODO + def _reset_parameters(self): + # class and bbox head init + bias_cls = bias_init_with_prob(0.01) / 80 * self.nc + # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets. + # linear_init_(self.enc_score_head) + constant_(self.enc_score_head.bias, bias_cls) + constant_(self.enc_bbox_head.layers[-1].weight, 0.) + constant_(self.enc_bbox_head.layers[-1].bias, 0.) + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + # linear_init_(cls_) + constant_(cls_.bias, bias_cls) + constant_(reg_.layers[-1].weight, 0.) + constant_(reg_.layers[-1].bias, 0.) 
class TransformerEncoderLayer(nn.Module):
    """Transformer encoder layer: multi-head self-attention followed by a two-layer feed-forward network."""

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
        """
        Create the attention, feed-forward, normalisation and dropout submodules.

        Args:
            c1 (int): Embedding size of the input tokens.
            cm (int): Hidden size of the feed-forward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability used in attention and in all three dropout layers.
            act (nn.Module): Activation applied between the two feed-forward linears.
            normalize_before (bool): If True use the pre-norm variant, otherwise post-norm.
        """
        super().__init__()
        # Attention expects (batch, tokens, channels) because batch_first=True.
        self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)

        # Feed-forward network
        self.fc1 = nn.Linear(c1, cm)
        self.fc2 = nn.Linear(cm, c1)

        self.norm1 = nn.LayerNorm(c1)
        self.norm2 = nn.LayerNorm(c1)
        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.act = act
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos=None):
        """Add position embeddings if given."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Post-norm variant: each LayerNorm is applied after its residual addition."""
        query = key = self.with_pos_embed(src, pos)
        attn_out, _ = self.ma(query, key, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
        src = self.norm1(src + self.dropout1(attn_out))
        ffn_out = self.fc2(self.dropout(self.act(self.fc1(src))))
        return self.norm2(src + self.dropout2(ffn_out))

    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Pre-norm variant: each LayerNorm is applied to the sub-block input, the residual stays un-normalised."""
        normed = self.norm1(src)
        query = key = self.with_pos_embed(normed, pos)
        attn_out, _ = self.ma(query, key, value=normed, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
        src = src + self.dropout1(attn_out)
        ffn_in = self.norm2(src)
        ffn_out = self.fc2(self.dropout(self.act(self.fc1(ffn_in))))
        return src + self.dropout2(ffn_out)

    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Forward propagates the input through the encoder module."""
        branch = self.forward_pre if self.normalize_before else self.forward_post
        return branch(src, src_mask, src_key_padding_mask, pos)
class TransformerLayer(nn.Module):
    """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""

    def __init__(self, c, num_heads):
        """
        Create bias-free q/k/v projections, the attention module and a two-linear feed-forward path.

        Args:
            c (int): Channel (embedding) size of the tokens.
            num_heads (int): Number of attention heads.
        """
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        """Apply self-attention and the feed-forward path to x, each wrapped in a residual connection."""
        attended, _ = self.ma(self.q(x), self.k(x), self.v(x))
        x = attended + x
        return self.fc2(self.fc1(x)) + x
# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119  # noqa
class LayerNorm2d(nn.Module):
    """Layer normalisation over dim 1 of the input, with a learnable per-channel scale and shift."""

    def __init__(self, num_channels, eps=1e-6):
        """
        Args:
            num_channels (int): Size of dim 1 of the inputs; length of the scale and shift vectors.
            eps (float): Constant added to the variance before the square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        """Normalise x to zero mean and unit (biased) variance along dim 1, then scale and shift."""
        centered = x - x.mean(1, keepdim=True)
        variance = centered.pow(2).mean(1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.eps)
        # weight and bias are broadcast over the two trailing dims
        return self.weight[:, None, None] * normalized + self.bias[:, None, None]
def _reset_parameters(self):
    """
    Initialise the attention parameters.

    Sampling-offset weights are zeroed and the offset bias is set so that head `k` points in direction
    `2*pi*k / n_heads` (scaled so the larger component has magnitude 1) and sampling point `i` sits `i + 1`
    steps along that direction. Attention weights and biases are zeroed; the value and output projections
    get Xavier-uniform weights and zero biases.
    """
    n_heads, n_levels, n_points = self.n_heads, self.n_levels, self.n_points

    constant_(self.sampling_offsets.weight.data, 0.)

    angles = torch.arange(n_heads, dtype=torch.float32) * (2.0 * math.pi / n_heads)
    directions = torch.stack([angles.cos(), angles.sin()], -1)
    directions = directions / directions.abs().max(-1, keepdim=True)[0]
    offsets = directions.view(n_heads, 1, 1, 2).repeat(1, n_levels, n_points, 1)
    # point i is scaled by i + 1
    step = torch.arange(1, n_points + 1, dtype=torch.float32).view(1, 1, n_points, 1)
    offsets = offsets * step
    with torch.no_grad():
        self.sampling_offsets.bias = nn.Parameter(offsets.view(-1))

    constant_(self.attention_weights.weight.data, 0.)
    constant_(self.attention_weights.bias.data, 0.)
    xavier_uniform_(self.value_proj.weight.data)
    constant_(self.value_proj.bias.data, 0.)
    xavier_uniform_(self.output_proj.weight.data)
    constant_(self.output_proj.bias.data, 0.)
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
    """
    Apply multi-scale deformable attention: each query samples `n_points` locations on every feature level.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    Args:
        query (torch.Tensor): [bs, query_length, C]
        refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2] reference points or
            [bs, query_length, n_levels, 4] reference boxes; the size of the last dim selects the branch
            below, any other size raises. Documented upstream as range in [0, 1], top-left (0,0),
            bottom-right (1, 1), including padding area.
        value (torch.Tensor): [bs, value_length, C]
        value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        value_mask (Tensor): [bs, value_length]. Positions where the mask is True are zero-filled after the
            value projection. NOTE(review): the upstream docstring described True as "non-padding", which is
            the opposite of what `masked_fill` does here - confirm the intended convention with callers.

    Returns:
        output (Tensor): [bs, Length_{query}, C]

    Raises:
        ValueError: If the last dim of `refer_bbox` is neither 2 nor 4.
    """
    bs, len_q = query.shape[:2]
    len_v = value.shape[1]
    # the flattened value sequence must be exactly the concatenation of all levels
    assert sum(s[0] * s[1] for s in value_shapes) == len_v

    value = self.value_proj(value)
    if value_mask is not None:
        value = value.masked_fill(value_mask[..., None], float(0))
    # split channels into heads: (bs, len_v, n_heads, d_model // n_heads)
    value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)
    sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
    attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
    # softmax runs jointly over all levels and points of a head, then the two axes are separated again
    attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
    # N, Len_q, n_heads, n_levels, n_points, 2
    num_points = refer_bbox.shape[-1]
    if num_points == 2:
        # flip(-1) turns each level's (H, W) into (W, H) before the offsets are divided by it
        offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
        add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        sampling_locations = refer_bbox[:, :, None, :, None, :] + add
    elif num_points == 4:
        # offsets are scaled by the last two box components (times 0.5, divided by n_points)
        # and added to the first two
        add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
        sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
    else:
        raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
    output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
    output = self.output_proj(output)
    return output
class DeformableTransformerDecoder(nn.Module):
    """
    Stack of deformable decoder layers with iterative bounding-box refinement.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    """

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """
        Args:
            hidden_dim (int): Stored as `self.hidden_dim`.
            decoder_layer (nn.Module): Prototype layer; `num_layers` clones of it form the stack.
            num_layers (int): Number of decoder layers.
            eval_idx (int): Index of the layer whose output is returned outside training; negative values
                count from the end.
        """
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

    def forward(
            self,
            embed,  # decoder embeddings
            refer_bbox,  # anchor
            feats,  # image features
            shapes,  # feature shapes
            bbox_head,
            score_head,
            pos_mlp,
            attn_mask=None,
            padding_mask=None):
        """
        Run the decoder stack, refining the reference boxes after every layer.

        `bbox_head` and `score_head` are indexed by layer number. In training mode every layer contributes
        one entry to each output; otherwise only layer `eval_idx` does and the loop stops there.

        Returns:
            (tuple): Stacked boxes and stacked class scores, one leading entry per contributing layer.
        """
        output = embed
        dec_bboxes = []
        dec_cls = []
        last_refined_bbox = None
        refer_bbox = refer_bbox.sigmoid()
        for i, layer in enumerate(self.layers):
            output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))

            # Evaluate the box head once per layer; both refinement variants below reuse this delta.
            bbox_delta = bbox_head[i](output)

            # refine bboxes, (bs, num_queries+num_denoising, 4)
            refined_bbox = torch.sigmoid(bbox_delta + inverse_sigmoid(refer_bbox))

            if self.training:
                dec_cls.append(score_head[i](output))
                if i == 0:
                    dec_bboxes.append(refined_bbox)
                else:
                    # The box appended here is built on the previous layer's non-detached box,
                    # whereas refer_bbox was detached at the end of the previous iteration.
                    dec_bboxes.append(torch.sigmoid(bbox_delta + inverse_sigmoid(last_refined_bbox)))
            elif i == self.eval_idx:
                dec_cls.append(score_head[i](output))
                dec_bboxes.append(refined_bbox)
                break

            last_refined_bbox = refined_bbox
            refer_bbox = refined_bbox.detach() if self.training else refined_bbox

        return torch.stack(dec_bboxes), torch.stack(dec_cls)
def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
                                        sampling_locations: torch.Tensor,
                                        attention_weights: torch.Tensor) -> torch.Tensor:
    """
    Multi-scale deformable attention.
    https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py

    Args:
        value: (bs, sum(H*W), num_heads, embed_dims) features of all levels concatenated along dim 1.
        value_spatial_shapes: Iterable of (H, W) pairs, one per level, in concatenation order.
        sampling_locations: (bs, num_queries, num_heads, num_levels, num_points, 2); values are mapped
            from [0, 1] to the [-1, 1] grid convention before sampling.
        attention_weights: (bs, num_queries, num_heads, num_levels, num_points).

    Returns:
        Tensor of shape (bs, num_queries, num_heads * embed_dims).
    """
    batch, _, heads, head_dim = value.shape
    _, n_queries, heads, n_levels, n_points, _ = sampling_locations.shape

    level_values = value.split([h * w for h, w in value_spatial_shapes], dim=1)
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for lvl, (h, w) in enumerate(value_spatial_shapes):
        # (bs, h*w, heads, dim) -> (bs, h*w, heads*dim) -> (bs, heads*dim, h*w) -> (bs*heads, dim, h, w)
        feature_map = level_values[lvl].flatten(2).transpose(1, 2).reshape(batch * heads, head_dim, h, w)
        # (bs, queries, heads, points, 2) -> (bs, heads, queries, points, 2) -> (bs*heads, queries, points, 2)
        grid = grids[:, :, :, lvl].transpose(1, 2).flatten(0, 1)
        # (bs*heads, dim, queries, points)
        sampled_per_level.append(
            F.grid_sample(feature_map, grid, mode='bilinear', padding_mode='zeros', align_corners=False))

    # (bs, queries, heads, levels, points) -> (bs, heads, queries, levels, points)
    # -> (bs*heads, 1, queries, levels*points)
    weights = attention_weights.transpose(1, 2).reshape(batch * heads, 1, n_queries, n_levels * n_points)
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)  # (bs*heads, dim, queries, levels*points)
    attended = (stacked * weights).sum(-1).view(batch, heads * head_dim, n_queries)
    return attended.transpose(1, 2).contiguous()
+import torch +import torch.nn as nn + +from ultralytics.nn.modules import (AIFI, C1, C2, C3, C3TR, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, + Classify, Concat, Conv, Conv2, ConvTranspose, Detect, DWConv, DWConvTranspose2d, + Focus, GhostBottleneck, GhostConv, HGBlock, HGStem, Pose, RepC3, RepConv, + RTDETRDecoder, Segment, + GAM_Attention,GCT,ShuffleAttention,ResBlock_CBAM,ECAAttention,MHSA,GlobalContext,GatherExcite) + +from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load +from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml +from ultralytics.utils.loss import v8ClassificationLoss, v8DetectionLoss, v8PoseLoss, v8SegmentationLoss +from ultralytics.utils.plotting import feature_visualization +from ultralytics.utils.torch_utils import (fuse_conv_and_bn, fuse_deconv_and_bn, initialize_weights, intersect_dicts, + make_divisible, model_info, scale_img, time_sync) + +try: + import thop +except ImportError: + thop = None + + +class BaseModel(nn.Module): + """ + The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family. + """ + + def forward(self, x, *args, **kwargs): + """ + Forward pass of the model on a single scale. + Wrapper for `_forward_once` method. + + Args: + x (torch.Tensor | dict): The input image tensor or a dict including image tensor and gt labels. + + Returns: + (torch.Tensor): The output of the network. + """ + if isinstance(x, dict): # for cases of training and validating while training. + return self.loss(x, *args, **kwargs) + return self.predict(x, *args, **kwargs) + + def predict(self, x, profile=False, visualize=False, augment=False): + """ + Perform a forward pass through the network. + + Args: + x (torch.Tensor): The input tensor to the model. + profile (bool): Print the computation time of each layer if True, defaults to False. + visualize (bool): Save the feature maps of the model if True, defaults to False. 
+ augment (bool): Augment image during prediction, defaults to False. + + Returns: + (torch.Tensor): The last output of the model. + """ + if augment: + return self._predict_augment(x) + return self._predict_once(x, profile, visualize) + + def _predict_once(self, x, profile=False, visualize=False): + """ + Perform a forward pass through the network. + + Args: + x (torch.Tensor): The input tensor to the model. + profile (bool): Print the computation time of each layer if True, defaults to False. + visualize (bool): Save the feature maps of the model if True, defaults to False. + + Returns: + (torch.Tensor): The last output of the model. + """ + y, dt = [], [] # outputs + for m in self.model: + if m.f != -1: # if not from previous layer + x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers + if profile: + self._profile_one_layer(m, x, dt) + x = m(x) # run + y.append(x if m.i in self.save else None) # save output + if visualize: + feature_visualization(x, m.type, m.i, save_dir=visualize) + return x + + def _predict_augment(self, x): + """Perform augmentations on input image x and return augmented inference.""" + LOGGER.warning(f'WARNING ⚠️ {self.__class__.__name__} does not support augmented inference yet. ' + f'Reverting to single-scale inference instead.') + return self._predict_once(x) + + def _profile_one_layer(self, m, x, dt): + """ + Profile the computation time and FLOPs of a single layer of the model on a given input. + Appends the results to the provided list. + + Args: + m (nn.Module): The layer to be profiled. + x (torch.Tensor): The input data to the layer. + dt (list): A list to store the computation time of the layer. 
+ + Returns: + None + """ + c = m == self.model[-1] and isinstance(x, list) # is final layer list, copy input as inplace fix + flops = thop.profile(m, inputs=[x.copy() if c else x], verbose=False)[0] / 1E9 * 2 if thop else 0 # FLOPs + t = time_sync() + for _ in range(10): + m(x.copy() if c else x) + dt.append((time_sync() - t) * 100) + if m == self.model[0]: + LOGGER.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s} module") + LOGGER.info(f'{dt[-1]:10.2f} {flops:10.2f} {m.np:10.0f} {m.type}') + if c: + LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total") + + def fuse(self, verbose=True): + """ + Fuse the `Conv2d()` and `BatchNorm2d()` layers of the model into a single layer, in order to improve the + computation efficiency. + + Returns: + (nn.Module): The fused model is returned. + """ + if not self.is_fused(): + for m in self.model.modules(): + if isinstance(m, (Conv, Conv2, DWConv)) and hasattr(m, 'bn'): + if isinstance(m, Conv2): + m.fuse_convs() + m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv + delattr(m, 'bn') # remove batchnorm + m.forward = m.forward_fuse # update forward + if isinstance(m, ConvTranspose) and hasattr(m, 'bn'): + m.conv_transpose = fuse_deconv_and_bn(m.conv_transpose, m.bn) + delattr(m, 'bn') # remove batchnorm + m.forward = m.forward_fuse # update forward + if isinstance(m, RepConv): + m.fuse_convs() + m.forward = m.forward_fuse # update forward + self.info(verbose=verbose) + + return self + + def is_fused(self, thresh=10): + """ + Check if the model has less than a certain threshold of BatchNorm layers. + + Args: + thresh (int, optional): The threshold number of BatchNorm layers. Default is 10. + + Returns: + (bool): True if the number of BatchNorm layers in the model is less than the threshold, False otherwise. + """ + bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k) # normalization layers, i.e. 
BatchNorm2d() + return sum(isinstance(v, bn) for v in self.modules()) < thresh # True if < 'thresh' BatchNorm layers in model + + def info(self, detailed=False, verbose=True, imgsz=640): + """ + Prints model information + + Args: + verbose (bool): if True, prints out the model information. Defaults to False + imgsz (int): the size of the image that the model will be trained on. Defaults to 640 + """ + return model_info(self, detailed=detailed, verbose=verbose, imgsz=imgsz) + + def _apply(self, fn): + """ + `_apply()` is a function that applies a function to all the tensors in the model that are not + parameters or registered buffers + + Args: + fn: the function to apply to the model + + Returns: + A model that is a Detect() object. + """ + self = super()._apply(fn) + m = self.model[-1] # Detect() + if isinstance(m, (Detect, Segment)): + m.stride = fn(m.stride) + m.anchors = fn(m.anchors) + m.strides = fn(m.strides) + return self + + def load(self, weights, verbose=True): + """Load the weights into the model. + + Args: + weights (dict | torch.nn.Module): The pre-trained weights to be loaded. + verbose (bool, optional): Whether to log the transfer progress. Defaults to True. + """ + model = weights['model'] if isinstance(weights, dict) else weights # torchvision models are not dicts + csd = model.float().state_dict() # checkpoint state_dict as FP32 + csd = intersect_dicts(csd, self.state_dict()) # intersect + self.load_state_dict(csd, strict=False) # load + if verbose: + LOGGER.info(f'Transferred {len(csd)}/{len(self.model.state_dict())} items from pretrained weights') + + def loss(self, batch, preds=None): + """ + Compute loss + + Args: + batch (dict): Batch to compute loss on + preds (torch.Tensor | List[torch.Tensor]): Predictions. 
class DetectionModel(BaseModel):
    """YOLOv8 detection model."""

    def __init__(self, cfg='yolov8n.yaml', ch=3, nc=None, verbose=True):  # model, input channels, number of classes
        """
        Build the network from a YAML path or config dict, derive the head strides and initialise weights.

        Args:
            cfg (str | dict): Model YAML path or an already loaded config dict.
            ch (int): Input channels, used only when the config has no 'ch' entry.
            nc (int, optional): Class count; when truthy and different it overrides the config's 'nc'.
            verbose (bool): Passed to `parse_model`; also enables the model summary at the end.
        """
        super().__init__()
        self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict

        # Define model
        ch = self.yaml.get('ch', ch)  # input channels
        self.yaml['ch'] = ch
        if nc and nc != self.yaml['nc']:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
        self.names = {i: f'{i}' for i in range(self.yaml['nc'])}  # default names dict
        self.inplace = self.yaml.get('inplace', True)

        # Build strides
        head = self.model[-1]  # Detect()
        if isinstance(head, (Detect, Segment, Pose)):
            probe = 256  # 2x min stride
            head.inplace = self.inplace

            def run_probe(img):
                """Forward a dummy image; Segment/Pose heads return a tuple whose first item is used."""
                result = self.forward(img)
                return result[0] if isinstance(head, (Segment, Pose)) else result

            probe_outputs = run_probe(torch.zeros(1, ch, probe, probe))
            # stride of each output = probe size / output height
            head.stride = torch.tensor([probe / out.shape[-2] for out in probe_outputs])
            self.stride = head.stride
            head.bias_init()  # only run once
        else:
            self.stride = torch.Tensor([32])  # default stride for i.e. RTDETR

        # Init weights, biases
        initialize_weights(self)
        if verbose:
            self.info()
            LOGGER.info('')

    def _predict_augment(self, x):
        """Perform augmentations on input image x and return augmented inference and train outputs."""
        img_size = x.shape[-2:]  # height, width
        scales = [1, 0.83, 0.67]
        flip_dims = [None, 3, None]  # flips (2-ud, 3-lr)
        grid_stride = int(self.stride.max())
        outputs = []
        for scale, flip_dim in zip(scales, flip_dims):
            source = x.flip(flip_dim) if flip_dim else x
            scaled = scale_img(source, scale, gs=grid_stride)
            pred = super().predict(scaled)[0]  # forward
            outputs.append(self._descale_pred(pred, flip_dim, scale, img_size))
        outputs = self._clip_augmented(outputs)  # clip augmented tails
        return torch.cat(outputs, -1), None  # augmented inference, train

    @staticmethod
    def _descale_pred(p, flips, scale, img_size, dim=1):
        """De-scale predictions following augmented inference (inverse operation). Modifies `p` in place."""
        p[:, :4] /= scale  # de-scale
        x, y, wh, cls = p.split((1, 1, 2, p.shape[dim] - 4), dim)
        if flips == 2:
            y = img_size[0] - y  # de-flip ud
        elif flips == 3:
            x = img_size[1] - x  # de-flip lr
        return torch.cat((x, y, wh, cls), dim)

    def _clip_augmented(self, y):
        """Clip YOLOv5 augmented inference tails."""
        nl = self.model[-1].nl  # number of detection layers (P3-P5)
        grid_points = sum(4 ** k for k in range(nl))
        excluded = 1  # exclude layer count

        # first entry: drop the tail
        tail = (y[0].shape[-1] // grid_points) * sum(4 ** k for k in range(excluded))
        y[0] = y[0][..., :-tail]  # large

        # last entry: drop the head
        lead = (y[-1].shape[-1] // grid_points) * sum(4 ** (nl - 1 - k) for k in range(excluded))
        y[-1] = y[-1][..., lead:]  # small
        return y

    def init_criterion(self):
        """Return the detection loss object built from this model."""
        return v8DetectionLoss(self)
init_criterion(self): + return v8SegmentationLoss(self) + + def _predict_augment(self, x): + """Perform augmentations on input image x and return augmented inference.""" + LOGGER.warning( + f'WARNING ⚠️ {self.__class__.__name__} has not supported augment inference yet! Now using single-scale inference instead.' + ) + return self._predict_once(x) + + +class PoseModel(DetectionModel): + """YOLOv8 pose model.""" + + def __init__(self, cfg='yolov8n-pose.yaml', ch=3, nc=None, data_kpt_shape=(None, None), verbose=True): + """Initialize YOLOv8 Pose model.""" + if not isinstance(cfg, dict): + cfg = yaml_model_load(cfg) # load model YAML + if any(data_kpt_shape) and list(data_kpt_shape) != list(cfg['kpt_shape']): + LOGGER.info(f"Overriding model.yaml kpt_shape={cfg['kpt_shape']} with kpt_shape={data_kpt_shape}") + cfg['kpt_shape'] = data_kpt_shape + super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose) + + def init_criterion(self): + return v8PoseLoss(self) + + def _predict_augment(self, x): + """Perform augmentations on input image x and return augmented inference.""" + LOGGER.warning( + f'WARNING ⚠️ {self.__class__.__name__} has not supported augment inference yet! Now using single-scale inference instead.' 
class ClassificationModel(BaseModel):
    """YOLOv8 classification model."""

    def __init__(self,
                 cfg='yolov8n-cls.yaml',
                 model=None,
                 ch=3,
                 nc=None,
                 cutoff=10,
                 verbose=True):  # yaml, model, channels, number of classes, cutoff index, verbose flag
        """Build the classifier from an existing detection `model` when given, otherwise from the `cfg` YAML."""
        super().__init__()
        if model is not None:
            self._from_detection_model(model, nc, cutoff)
        else:
            self._from_yaml(cfg, ch, nc, verbose)

    def _from_detection_model(self, model, nc=1000, cutoff=10):
        """Create a YOLOv5 classification model from a YOLOv5 detection model."""
        from ultralytics.nn.autobackend import AutoBackend
        if isinstance(model, AutoBackend):
            model = model.model  # unwrap DetectMultiBackend
        model.model = model.model[:cutoff]  # backbone
        m = model.model[-1]  # last layer
        ch = m.conv.in_channels if hasattr(m, 'conv') else m.cv1.conv.in_channels  # ch into module
        c = Classify(ch, nc)  # Classify()
        c.i, c.f, c.type = m.i, m.f, 'models.common.Classify'  # index, from, type
        model.model[-1] = c  # replace
        self.model = model.model
        self.stride = model.stride
        self.save = []
        self.nc = nc

    def _from_yaml(self, cfg, ch, nc, verbose):
        """
        Set YOLOv8 model configurations and define the model architecture.

        Raises:
            ValueError: If neither `nc` nor the YAML provides a class count.
        """
        self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        # Read 'nc' with .get(): the YAML may omit it (the branch below handles exactly that case), and a
        # caller-supplied nc must then be accepted instead of failing with KeyError.
        yaml_nc = self.yaml.get('nc', None)
        if nc and nc != yaml_nc:
            LOGGER.info(f"Overriding model.yaml nc={yaml_nc} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        elif not nc and not yaml_nc:
            raise ValueError('nc not specified. Must specify nc in model.yaml or function arguments.')
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
        self.stride = torch.Tensor([1])  # no stride constraints
        self.names = {i: f'{i}' for i in range(self.yaml['nc'])}  # default names dict
        self.info()

    @staticmethod
    def reshape_outputs(model, nc):
        """Update a TorchVision classification model to class count 'n' if required."""
        # The last child is looked up on `model.model` when that attribute exists, so a replacement has to
        # be registered on that same parent; setting it on the outer wrapper would leave the real head as is.
        parent = model.model if hasattr(model, 'model') else model
        name, m = list(parent.named_children())[-1]  # last module
        if isinstance(m, Classify):  # YOLO Classify() head
            if m.linear.out_features != nc:
                m.linear = nn.Linear(m.linear.in_features, nc)
        elif isinstance(m, nn.Linear):  # ResNet, EfficientNet
            if m.out_features != nc:
                setattr(parent, name, nn.Linear(m.in_features, nc))
        elif isinstance(m, nn.Sequential):
            types = [type(x) for x in m]
            if nn.Linear in types:
                i = types.index(nn.Linear)  # nn.Linear index
                if m[i].out_features != nc:
                    m[i] = nn.Linear(m[i].in_features, nc)
            elif nn.Conv2d in types:
                i = types.index(nn.Conv2d)  # nn.Conv2d index
                if m[i].out_channels != nc:
                    m[i] = nn.Conv2d(m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None)

    def init_criterion(self):
        """Return the classification loss object."""
        return v8ClassificationLoss()
+ bs = len(img) + batch_idx = batch['batch_idx'] + gt_groups = [(batch_idx == i).sum().item() for i in range(bs)] + targets = { + 'cls': batch['cls'].to(img.device, dtype=torch.long).view(-1), + 'bboxes': batch['bboxes'].to(device=img.device), + 'batch_idx': batch_idx.to(img.device, dtype=torch.long).view(-1), + 'gt_groups': gt_groups} + + preds = self.predict(img, batch=targets) if preds is None else preds + dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta = preds if self.training else preds[1] + if dn_meta is None: + dn_bboxes, dn_scores = None, None + else: + dn_bboxes, dec_bboxes = torch.split(dec_bboxes, dn_meta['dn_num_split'], dim=2) + dn_scores, dec_scores = torch.split(dec_scores, dn_meta['dn_num_split'], dim=2) + + dec_bboxes = torch.cat([enc_bboxes.unsqueeze(0), dec_bboxes]) # (7, bs, 300, 4) + dec_scores = torch.cat([enc_scores.unsqueeze(0), dec_scores]) + + loss = self.criterion((dec_bboxes, dec_scores), + targets, + dn_bboxes=dn_bboxes, + dn_scores=dn_scores, + dn_meta=dn_meta) + # NOTE: There are like 12 losses in RTDETR, backward with all losses but only show the main three losses. + return sum(loss.values()), torch.as_tensor([loss[k].detach() for k in ['loss_giou', 'loss_class', 'loss_bbox']], + device=img.device) + + def predict(self, x, profile=False, visualize=False, batch=None, augment=False): + """ + Perform a forward pass through the network. + + Args: + x (torch.Tensor): The input tensor to the model + profile (bool): Print the computation time of each layer if True, defaults to False. + visualize (bool): Save the feature maps of the model if True, defaults to False + batch (dict): A dict including gt boxes and labels from dataloader. + + Returns: + (torch.Tensor): The last output of the model. 
@contextlib.contextmanager
def temporary_modules(modules=None):
    """
    Context manager for temporarily adding or modifying modules in Python's module cache (`sys.modules`).

    This function can be used to change the module paths during runtime. It's useful when refactoring code,
    where you've moved a module from one location to another, but you still want to support the old import
    paths for backwards compatibility.

    Args:
        modules (dict, optional): A dictionary mapping old module paths to new module paths.

    Example:
        ```python
        with temporary_modules({'old.module.path': 'new.module.path'}):
            import old.module.path  # this will now import new.module.path
        ```

    Note:
        The changes are only in effect inside the context manager and are undone once the context manager exits:
        aliases that did not exist before are removed, and aliases that shadowed an already-imported module are
        restored to that module. Be aware that directly manipulating `sys.modules` can lead to unpredictable
        results, especially in larger applications or libraries. Use this function with caution.
    """
    if not modules:
        modules = {}

    import importlib
    import sys

    missing = object()  # sentinel: the alias had no entry in sys.modules before we touched it
    previous = {}  # alias -> prior sys.modules entry (or `missing`), recorded only for aliases actually installed
    try:
        # Set modules in sys.modules under their old name
        for old, new in modules.items():
            prior = sys.modules.get(old, missing)
            sys.modules[old] = importlib.import_module(new)
            previous[old] = prior  # recorded after a successful import so a failure leaves this alias alone

        yield
    finally:
        # Undo exactly what was installed above, including after an import error part-way through
        for old, prior in previous.items():
            if prior is missing:
                sys.modules.pop(old, None)
            else:
                sys.modules[old] = prior
+ """ + from ultralytics.utils.downloads import attempt_download_asset + + check_suffix(file=weight, suffix='.pt') + file = attempt_download_asset(weight) # search online if missing locally + try: + with temporary_modules({ + 'ultralytics.yolo.utils': 'ultralytics.utils', + 'ultralytics.yolo.v8': 'ultralytics.models.yolo', + 'ultralytics.yolo.data': 'ultralytics.data'}): # for legacy 8.0 Classify and Pose models + return torch.load(file, map_location='cpu'), file # load + + except ModuleNotFoundError as e: # e.name is missing module name + if e.name == 'models': + raise TypeError( + emojis(f'ERROR ❌️ {weight} appears to be an Ultralytics YOLOv5 model originally trained ' + f'with https://github.com/ultralytics/yolov5.\nThis model is NOT forwards compatible with ' + f'YOLOv8 at https://github.com/ultralytics/ultralytics.' + f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to " + f"run a command with an official YOLOv8 model, i.e. 'yolo predict model=yolov8n.pt'")) from e + LOGGER.warning(f"WARNING ⚠️ {weight} appears to require '{e.name}', which is not in ultralytics requirements." + f"\nAutoInstall will run now for '{e.name}' but this feature will be removed in the future." + f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to " + f"run a command with an official YOLOv8 model, i.e. 
def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
    """
    Load one checkpoint or several checkpoints as an ensemble.

    Args:
        weights (str | list): A single weights path or a list of them.
        device: Passed to `.to()` on every loaded model.
        inplace (bool): Value written to the `inplace` attribute of supported activation / head modules.
        fuse (bool): Call `fuse()` on models that provide it before switching to eval mode.

    Returns:
        The model itself when exactly one checkpoint was loaded, otherwise the Ensemble. In the ensemble case
        `names`, `nc` and `yaml` are copied from the first member and `stride` from the member with the
        largest maximum stride.
    """
    ensemble = Ensemble()
    weight_list = weights if isinstance(weights, list) else [weights]
    for item in weight_list:
        ckpt, pt_file = torch_safe_load(item)  # checkpoint dict and resolved file path

        # Training args stored in the checkpoint take precedence over the package defaults
        if 'train_args' in ckpt:
            merged_args = {**DEFAULT_CFG_DICT, **ckpt['train_args']}
        else:
            merged_args = None
        net = (ckpt.get('ema') or ckpt['model']).to(device).float()  # FP32 model

        # Attach metadata expected by the rest of the package
        net.args = merged_args
        net.pt_path = pt_file
        net.task = guess_model_task(net)
        if not hasattr(net, 'stride'):
            net.stride = torch.tensor([32.])

        if fuse and hasattr(net, 'fuse'):
            net = net.fuse()
        ensemble.append(net.eval())  # model in eval mode

    # Module compatibility updates
    inplace_types = (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Segment)
    for module in ensemble.modules():
        kind = type(module)
        if kind in inplace_types:
            module.inplace = inplace  # torch 1.7.0 compatibility
        elif kind is nn.Upsample and not hasattr(module, 'recompute_scale_factor'):
            module.recompute_scale_factor = None  # torch 1.11.0 compatibility

    # A single checkpoint is returned unwrapped
    if len(ensemble) == 1:
        return ensemble[-1]

    # Several checkpoints: expose shared attributes on the ensemble object
    LOGGER.info(f'Ensemble created with {weights}\n')
    first = ensemble[0]
    for attr in ('names', 'nc', 'yaml'):
        setattr(ensemble, attr, getattr(first, attr))
    max_strides = torch.tensor([member.stride.max() for member in ensemble])
    widest = torch.argmax(max_strides).int()
    ensemble.stride = ensemble[widest].stride
    assert all(ensemble[0].nc == m.nc for m in ensemble), f'Models differ in class counts {[m.nc for m in ensemble]}'
    return ensemble
def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
    """
    Parse a YOLO model.yaml dictionary into a PyTorch model.

    Args:
        d (dict): Model dictionary. Keys read here: 'backbone' and 'head' (lists of
            [from, number, module, args] rows), and optionally 'nc', 'activation', 'scales', 'scale',
            'depth_multiple', 'width_multiple', 'kpt_shape'.
        ch (int): Number of input channels.
        verbose (bool): Log the activation override and one table row per layer.

    Returns:
        (tuple): `nn.Sequential` of the built layers and the sorted list of layer indices whose outputs
            are consumed by a later layer.

    Note:
        Side effects visible in this function: the `args` lists inside `d` are rewritten in place,
        `Conv.default_act` is reassigned when 'activation' is set, and a parameter count is stored as
        `np` on each module *class* that is instantiated.
    """
    import ast

    # Args
    max_channels = float('inf')
    nc, act, scales = (d.get(x) for x in ('nc', 'activation', 'scales'))
    # kpt_shape is not referenced by name below; it is reachable only through the locals() lookup that
    # resolves string args (see the inner loop), so a YAML arg 'kpt_shape' picks up this value.
    depth, width, kpt_shape = (d.get(x, 1.0) for x in ('depth_multiple', 'width_multiple', 'kpt_shape'))
    if scales:
        # A 'scales' table overrides depth/width multiples; fall back to its first entry if no scale is given
        scale = d.get('scale')
        if not scale:
            scale = tuple(scales.keys())[0]
            LOGGER.warning(f"WARNING ⚠️ no model scale passed. Assuming scale='{scale}'.")
        depth, width, max_channels = scales[scale]

    if act:
        # NOTE(review): eval() executes whatever expression the YAML 'activation' string contains;
        # only load model configs from trusted sources.
        Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
        if verbose:
            LOGGER.info(f"{colorstr('activation:')} {act}")  # print

    if verbose:
        LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10} {'module':<45}{'arguments':<30}")
    ch = [ch]  # becomes the per-layer output-channel list after the first layer (see reset at i == 0)
    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        # Names starting with 'nn.' come from torch.nn, everything else from this module's globals
        m = getattr(torch.nn, m[3:]) if 'nn.' in m else globals()[m]  # get module
        for j, a in enumerate(args):
            if isinstance(a, str):
                # A string arg is either the name of a local variable here (e.g. 'nc') or a Python
                # literal; strings for which literal_eval raises ValueError are left unchanged.
                with contextlib.suppress(ValueError):
                    args[j] = locals()[a] if a in locals() else ast.literal_eval(a)

        # n_ keeps the scaled repeat count for logging; n may be reset to 1 below
        n = n_ = max(round(n * depth), 1) if n > 1 else n  # depth gain
        if m in (Classify, Conv, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, Focus,
                 BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3,
                 GAM_Attention,GCT,ResBlock_CBAM,GlobalContext,GatherExcite):
            c1, c2 = ch[f], args[0]
            if c2 != nc:  # if c2 not equal to number of classes (i.e. for Classify() output)
                c2 = make_divisible(min(c2, max_channels) * width, 8)

            args = [c1, c2, *args[1:]]
            if m in (BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, C3x, RepC3):
                # These modules take the repeat count as a constructor argument instead of being stacked
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is AIFI:
            args = [ch[f], *args]  # c2 is not reassigned: the previous layer's value is appended to ch
        elif m in (HGStem, HGBlock):
            c1, cm, c2 = ch[f], args[0], args[1]
            args = [c1, cm, c2, *args[2:]]
            if m is HGBlock:
                args.insert(4, n)  # number of repeats
                n = 1

        elif m in {MHSA}:
            # Input channels are prepended; c2 is not reassigned, so the previous value is appended to ch
            args=[ch[f],*args]

        elif m is ShuffleAttention:
            # args[0] only determines the recorded output channels (c2); the module itself receives
            # the input channel count followed by the remaining args
            c1, c2 = ch[f], args[0]
            if c2 != nc:
                c2 = make_divisible(min(c2, max_channels) * width, 8)
            args = [c1, *args[1:]]

        elif m is ECAAttention:
            # Same handling as the ShuffleAttention branch above
            c1, c2 = ch[f], args[0]
            if c2 != nc:
                c2 = make_divisible(min(c2, max_channels) * width, 8)
            args = [c1, *args[1:]]

        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)  # output channels are the sum over all source layers
        elif m in (Detect, Segment, Pose):
            args.append([ch[x] for x in f])  # heads receive the channel counts of every source layer
            if m is Segment:
                args[2] = make_divisible(min(args[2], max_channels) * width, 8)
        elif m is RTDETRDecoder:  # special case, channels arg must be passed in index 1
            args.insert(1, [ch[x] for x in f])
        else:
            c2 = ch[f]  # channel-preserving module: output channels equal the source layer's

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        # NOTE(review): this stores the count on the class `m`, not on the instance `m_`; it is
        # overwritten each time the same class is instantiated and is only read by the log line below.
        m.np = sum(x.numel() for x in m_.parameters())  # number params
        m_.i, m_.f, m_.type = i, f, t  # attach index, 'from' index, type
        if verbose:
            LOGGER.info(f'{i:>3}{str(f):>20}{n_:>3}{m.np:10.0f} {t:<45}{str(args):<30}')  # print
        # `x % i` turns negative (relative) source indices into absolute layer indices.
        # Assumes the first layer (i == 0) only uses f == -1; any other value would divide by zero here.
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []  # drop the input-channel entry so that ch[k] is the output channel count of layer k
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)
def guess_model_scale(model_path):
    """
    Extract the scale character from a YOLO model YAML file name.

    The file stem is searched for `yolov<digits>` immediately followed by one of n, s, m, l or x,
    and that single character is returned.

    Args:
        model_path (str | Path): The path to the YOLO model's YAML file.

    Returns:
        (str): The scale character (n, s, m, l, or x), or an empty string when the stem does not match.
    """
    import re

    try:
        # re.search() returns None on no match, so .group raises AttributeError
        return re.search(r'yolov\d+([nslmx])', Path(model_path).stem).group(1)  # n, s, m, l, or x
    except AttributeError:
        return ''
+ """ + + def cfg2task(cfg): + """Guess from YAML dictionary.""" + m = cfg['head'][-1][-2].lower() # output module name + if m in ('classify', 'classifier', 'cls', 'fc'): + return 'classify' + if m == 'detect': + return 'detect' + if m == 'segment': + return 'segment' + if m == 'pose': + return 'pose' + + # Guess from model cfg + if isinstance(model, dict): + with contextlib.suppress(Exception): + return cfg2task(model) + + # Guess from PyTorch model + if isinstance(model, nn.Module): # PyTorch model + for x in 'model.args', 'model.model.args', 'model.model.model.args': + with contextlib.suppress(Exception): + return eval(x)['task'] + for x in 'model.yaml', 'model.model.yaml', 'model.model.model.yaml': + with contextlib.suppress(Exception): + return cfg2task(eval(x)) + + for m in model.modules(): + if isinstance(m, Detect): + return 'detect' + elif isinstance(m, Segment): + return 'segment' + elif isinstance(m, Classify): + return 'classify' + elif isinstance(m, Pose): + return 'pose' + + # Guess from model filename + if isinstance(model, (str, Path)): + model = Path(model) + if '-seg' in model.stem or 'segment' in model.parts: + return 'segment' + elif '-cls' in model.stem or 'classify' in model.parts: + return 'classify' + elif '-pose' in model.stem or 'pose' in model.parts: + return 'pose' + elif 'detect' in model.parts: + return 'detect' + + # Unable to determine task from model + LOGGER.warning("WARNING ⚠️ Unable to automatically guess model task, assuming 'task=detect'. " + "Explicitly define task for your model, i.e. 
class TrackState:
    """
    Integer codes for the lifecycle state of a track.

    New = 0, Tracked = 1, Lost = 2, Removed = 3.
    """

    # Consecutive integers starting at 0, in lifecycle order
    New, Tracked, Lost, Removed = range(4)
class BOTrack(STrack):
    """
    STrack variant that keeps a centre-x, centre-y, width, height Kalman state and an optional
    appearance feature smoothed with an exponential moving average.
    """

    shared_kalman = KalmanFilterXYWH()  # filter instance shared by all tracks for batched prediction

    def __init__(self, tlwh, score, cls, feat=None, feat_history=50):
        """
        Initialize the track and, when given, its first appearance feature.

        Args:
            tlwh: Forwarded unchanged to STrack.__init__ together with `score` and `cls`.
            score: Detection confidence.
            cls: Detection class.
            feat (np.ndarray, optional): Appearance feature vector of the detection.
            feat_history (int): Maximum number of past features kept in `self.features`.
        """
        super().__init__(tlwh, score, cls)

        self.smooth_feat = None
        self.curr_feat = None
        # These must exist before update_features() runs. Otherwise the first feature would be appended
        # to the class-level `features` list inherited from BaseTrack (shared by every track) and
        # would be missing from this track's own history.
        self.features = deque([], maxlen=feat_history)
        self.alpha = 0.9  # EMA weight given to the previous smoothed feature
        if feat is not None:
            self.update_features(feat)

    def update_features(self, feat):
        """
        Store `feat` as the current feature and fold it into the smoothed feature.

        The feature is L2-normalised into a new array, so the caller's array is not modified. The
        smoothed feature is the EMA of normalised features, re-normalised after every update.
        """
        feat = feat / np.linalg.norm(feat)
        self.curr_feat = feat
        if self.smooth_feat is None:
            smoothed = feat
        else:
            smoothed = self.alpha * self.smooth_feat + (1 - self.alpha) * feat
        self.features.append(feat)
        # Out-of-place division keeps smooth_feat from aliasing curr_feat / the stored history entry
        self.smooth_feat = smoothed / np.linalg.norm(smoothed)

    def predict(self):
        """Predict the mean and covariance of this track with its own Kalman filter."""
        mean_state = self.mean.copy()
        if self.state != TrackState.Tracked:
            # Zero state components 6 and 7 for tracks that are not currently tracked
            # (presumably the width/height velocity terms of the XYWH filter; confirm against the filter)
            mean_state[6] = 0
            mean_state[7] = 0

        self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance)

    def re_activate(self, new_track, frame_id, new_id=False):
        """Reactivate the track from `new_track`, updating the appearance feature first when one is present."""
        if new_track.curr_feat is not None:
            self.update_features(new_track.curr_feat)
        super().re_activate(new_track, frame_id, new_id)

    def update(self, new_track, frame_id):
        """Update the track from a matched detection, updating the appearance feature first when one is present."""
        if new_track.curr_feat is not None:
            self.update_features(new_track.curr_feat)
        super().update(new_track, frame_id)

    @property
    def tlwh(self):
        """Get current position in bounding box format `(top left x, top left y, width, height)`."""
        if self.mean is None:
            return self._tlwh.copy()
        ret = self.mean[:4].copy()
        ret[:2] -= ret[2:] / 2  # centre -> top-left corner
        return ret

    @staticmethod
    def multi_predict(stracks):
        """Predict mean and covariance for all given tracks in one batched call to the shared Kalman filter."""
        if len(stracks) <= 0:
            return
        multi_mean = np.asarray([st.mean.copy() for st in stracks])
        multi_covariance = np.asarray([st.covariance for st in stracks])
        for i, st in enumerate(stracks):
            if st.state != TrackState.Tracked:
                # Same zeroing as in predict()
                multi_mean[i][6] = 0
                multi_mean[i][7] = 0
        multi_mean, multi_covariance = BOTrack.shared_kalman.multi_predict(multi_mean, multi_covariance)
        for st, mean, cov in zip(stracks, multi_mean, multi_covariance):
            st.mean = mean
            st.covariance = cov

    def convert_coords(self, tlwh):
        """Convert a top-left/width/height box to the centre-x/centre-y/width/height measurement format."""
        return self.tlwh_to_xywh(tlwh)

    @staticmethod
    def tlwh_to_xywh(tlwh):
        """Convert bounding box to format `(center x, center y, width, height)`; the input is not modified."""
        ret = np.asarray(tlwh).copy()
        ret[:2] += ret[2:] / 2
        return ret
mode 100644 index 0000000000000000000000000000000000000000..93a02e23f2bbba7e69c66786759be27fc7a2f8ec --- /dev/null +++ b/ultralytics/trackers/byte_tracker.py @@ -0,0 +1,364 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import numpy as np + +from .basetrack import BaseTrack, TrackState +from .utils import matching +from .utils.kalman_filter import KalmanFilterXYAH + + +class STrack(BaseTrack): + shared_kalman = KalmanFilterXYAH() + + def __init__(self, tlwh, score, cls): + """wait activate.""" + self._tlwh = np.asarray(self.tlbr_to_tlwh(tlwh[:-1]), dtype=np.float32) + self.kalman_filter = None + self.mean, self.covariance = None, None + self.is_activated = False + + self.score = score + self.tracklet_len = 0 + self.cls = cls + self.idx = tlwh[-1] + + def predict(self): + """Predicts mean and covariance using Kalman filter.""" + mean_state = self.mean.copy() + if self.state != TrackState.Tracked: + mean_state[7] = 0 + self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) + + @staticmethod + def multi_predict(stracks): + """Perform multi-object predictive tracking using Kalman filter for given stracks.""" + if len(stracks) <= 0: + return + multi_mean = np.asarray([st.mean.copy() for st in stracks]) + multi_covariance = np.asarray([st.covariance for st in stracks]) + for i, st in enumerate(stracks): + if st.state != TrackState.Tracked: + multi_mean[i][7] = 0 + multi_mean, multi_covariance = STrack.shared_kalman.multi_predict(multi_mean, multi_covariance) + for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): + stracks[i].mean = mean + stracks[i].covariance = cov + + @staticmethod + def multi_gmc(stracks, H=np.eye(2, 3)): + """Update state tracks positions and covariances using a homography matrix.""" + if len(stracks) > 0: + multi_mean = np.asarray([st.mean.copy() for st in stracks]) + multi_covariance = np.asarray([st.covariance for st in stracks]) + + R = H[:2, :2] + R8x8 = np.kron(np.eye(4, dtype=float), R) + t = H[:2, 
2] + + for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): + mean = R8x8.dot(mean) + mean[:2] += t + cov = R8x8.dot(cov).dot(R8x8.transpose()) + + stracks[i].mean = mean + stracks[i].covariance = cov + + def activate(self, kalman_filter, frame_id): + """Start a new tracklet.""" + self.kalman_filter = kalman_filter + self.track_id = self.next_id() + self.mean, self.covariance = self.kalman_filter.initiate(self.convert_coords(self._tlwh)) + + self.tracklet_len = 0 + self.state = TrackState.Tracked + if frame_id == 1: + self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + """Reactivates a previously lost track with a new detection.""" + self.mean, self.covariance = self.kalman_filter.update(self.mean, self.covariance, + self.convert_coords(new_track.tlwh)) + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = self.next_id() + self.score = new_track.score + self.cls = new_track.cls + self.idx = new_track.idx + + def update(self, new_track, frame_id): + """ + Update a matched track + :type new_track: STrack + :type frame_id: int + :return: + """ + self.frame_id = frame_id + self.tracklet_len += 1 + + new_tlwh = new_track.tlwh + self.mean, self.covariance = self.kalman_filter.update(self.mean, self.covariance, + self.convert_coords(new_tlwh)) + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + self.cls = new_track.cls + self.idx = new_track.idx + + def convert_coords(self, tlwh): + """Convert a bounding box's top-left-width-height format to its x-y-angle-height equivalent.""" + return self.tlwh_to_xyah(tlwh) + + @property + def tlwh(self): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. 
+ """ + if self.mean is None: + return self._tlwh.copy() + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + @property + def tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. + """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + def tlwh_to_xyah(tlwh): + """Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. + """ + ret = np.asarray(tlwh).copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret + + @staticmethod + def tlbr_to_tlwh(tlbr): + """Converts top-left bottom-right format to top-left width height format.""" + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + def tlwh_to_tlbr(tlwh): + """Converts tlwh bounding box format to tlbr format.""" + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + """Return a string representation of the BYTETracker object with start and end frames and track ID.""" + return f'OT_{self.track_id}_({self.start_frame}-{self.end_frame})' + + +class BYTETracker: + + def __init__(self, args, frame_rate=30): + """Initialize a YOLOv8 object to track objects with given arguments and frame rate.""" + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.args = args + self.max_time_lost = int(frame_rate / 30.0 * args.track_buffer) + self.kalman_filter = self.get_kalmanfilter() + self.reset_id() + + def update(self, results, img=None): + """Updates object tracker with new detections and returns tracked object bounding boxes.""" + self.frame_id += 1 + activated_stracks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + scores = results.conf + bboxes = results.xyxy + # Add index + bboxes = np.concatenate([bboxes, 
np.arange(len(bboxes)).reshape(-1, 1)], axis=-1) + cls = results.cls + + remain_inds = scores > self.args.track_high_thresh + inds_low = scores > self.args.track_low_thresh + inds_high = scores < self.args.track_high_thresh + + inds_second = np.logical_and(inds_low, inds_high) + dets_second = bboxes[inds_second] + dets = bboxes[remain_inds] + scores_keep = scores[remain_inds] + scores_second = scores[inds_second] + cls_keep = cls[remain_inds] + cls_second = cls[inds_second] + + detections = self.init_track(dets, scores_keep, cls_keep, img) + # Add newly detected tracklets to tracked_stracks + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + # Step 2: First association, with high score detection boxes + strack_pool = self.joint_stracks(tracked_stracks, self.lost_stracks) + # Predict the current location with KF + self.multi_predict(strack_pool) + if hasattr(self, 'gmc') and img is not None: + warp = self.gmc.apply(img, dets) + STrack.multi_gmc(strack_pool, warp) + STrack.multi_gmc(unconfirmed, warp) + + dists = self.get_dists(strack_pool, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=self.args.match_thresh) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_stracks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + # Step 3: Second association, with low score detection boxes + # association the untrack to the low score detections + detections_second = self.init_track(dets_second, scores_second, cls_second, img) + r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state == TrackState.Tracked] + # TODO + dists = matching.iou_distance(r_tracked_stracks, detections_second) + 
matches, u_track, u_detection_second = matching.linear_assignment(dists, thresh=0.5) + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections_second[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_stracks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + for it in u_track: + track = r_tracked_stracks[it] + if track.state != TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + # Deal with unconfirmed tracks, usually tracks with only one beginning frame + detections = [detections[i] for i in u_detection] + dists = self.get_dists(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_stracks.append(unconfirmed[itracked]) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + # Step 4: Init new stracks + for inew in u_detection: + track = detections[inew] + if track.score < self.args.new_track_thresh: + continue + track.activate(self.kalman_filter, self.frame_id) + activated_stracks.append(track) + # Step 5: Update state + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = self.joint_stracks(self.tracked_stracks, activated_stracks) + self.tracked_stracks = self.joint_stracks(self.tracked_stracks, refind_stracks) + self.lost_stracks = self.sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = self.sub_stracks(self.lost_stracks, self.removed_stracks) + self.tracked_stracks, self.lost_stracks = 
self.remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + self.removed_stracks.extend(removed_stracks) + if len(self.removed_stracks) > 1000: + self.removed_stracks = self.removed_stracks[-999:] # clip remove stracks to 1000 maximum + return np.asarray( + [x.tlbr.tolist() + [x.track_id, x.score, x.cls, x.idx] for x in self.tracked_stracks if x.is_activated], + dtype=np.float32) + + def get_kalmanfilter(self): + """Returns a Kalman filter object for tracking bounding boxes.""" + return KalmanFilterXYAH() + + def init_track(self, dets, scores, cls, img=None): + """Initialize object tracking with detections and scores using STrack algorithm.""" + return [STrack(xyxy, s, c) for (xyxy, s, c) in zip(dets, scores, cls)] if len(dets) else [] # detections + + def get_dists(self, tracks, detections): + """Calculates the distance between tracks and detections using IOU and fuses scores.""" + dists = matching.iou_distance(tracks, detections) + # TODO: mot20 + # if not self.args.mot20: + dists = matching.fuse_score(dists, detections) + return dists + + def multi_predict(self, tracks): + """Returns the predicted tracks using the YOLOv8 network.""" + STrack.multi_predict(tracks) + + def reset_id(self): + """Resets the ID counter of STrack.""" + STrack.reset_id() + + @staticmethod + def joint_stracks(tlista, tlistb): + """Combine two lists of stracks into a single one.""" + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + + @staticmethod + def sub_stracks(tlista, tlistb): + """DEPRECATED CODE in https://github.com/ultralytics/ultralytics/pull/1890/ + stracks = {t.track_id: t for t in tlista} + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + """ + track_ids_b = {t.track_id for t in tlistb} + return [t for t in tlista if t.track_id not in track_ids_b] + + 
@staticmethod + def remove_duplicate_stracks(stracksa, stracksb): + """Remove duplicate stracks with non-maximum IOU distance.""" + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist < 0.15) + dupa, dupb = [], [] + for p, q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i, t in enumerate(stracksa) if i not in dupa] + resb = [t for i, t in enumerate(stracksb) if i not in dupb] + return resa, resb diff --git a/ultralytics/trackers/track.py b/ultralytics/trackers/track.py new file mode 100644 index 0000000000000000000000000000000000000000..9cb4da694c1da7b7753edfb83108b2e042e1b788 --- /dev/null +++ b/ultralytics/trackers/track.py @@ -0,0 +1,66 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from functools import partial + +import torch + +from ultralytics.utils import IterableSimpleNamespace, yaml_load +from ultralytics.utils.checks import check_yaml + +from .bot_sort import BOTSORT +from .byte_tracker import BYTETracker + +TRACKER_MAP = {'bytetrack': BYTETracker, 'botsort': BOTSORT} + + +def on_predict_start(predictor, persist=False): + """ + Initialize trackers for object tracking during prediction. + + Args: + predictor (object): The predictor object to initialize trackers for. + persist (bool, optional): Whether to persist the trackers if they already exist. Defaults to False. + + Raises: + AssertionError: If the tracker_type is not 'bytetrack' or 'botsort'. 
def on_predict_postprocess_end(predictor):
    """
    Run the per-image trackers on the freshly post-processed detections.

    For each image in the batch the detected boxes are handed to the matching tracker. When both the
    detections and the returned tracks are non-empty, the result is re-indexed by the last column of the
    tracks and its boxes are replaced by the remaining track columns.

    Args:
        predictor (object): Predictor holding `dataset.bs`, `batch`, `results` and `trackers`.
    """
    batch_size = predictor.dataset.bs
    frames = predictor.batch[1]
    for index in range(batch_size):
        boxes = predictor.results[index].boxes.cpu().numpy()
        if len(boxes) != 0:
            tracked = predictor.trackers[index].update(boxes, frames[index])
            if len(tracked) != 0:
                # Last column selects which detections survived tracking
                keep = tracked[:, -1].astype(int)
                predictor.results[index] = predictor.results[index][keep]
                predictor.results[index].update(boxes=torch.as_tensor(tracked[:, :-1]))
+ + """ + model.add_callback('on_predict_start', partial(on_predict_start, persist=persist)) + model.add_callback('on_predict_postprocess_end', on_predict_postprocess_end) diff --git a/ultralytics/utils/__init__.py b/ultralytics/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b30ca61f2a1b06a6d5f4b93033820e3e729dd49b --- /dev/null +++ b/ultralytics/utils/__init__.py @@ -0,0 +1,830 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import contextlib +import inspect +import logging.config +import os +import platform +import re +import subprocess +import sys +import threading +import urllib +import uuid +from pathlib import Path +from types import SimpleNamespace +from typing import Union + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import torch +import yaml + +from ultralytics import __version__ + +# PyTorch Multi-GPU DDP Constants +RANK = int(os.getenv('RANK', -1)) +LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html +WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) + +# Other Constants +FILE = Path(__file__).resolve() +ROOT = FILE.parents[1] # YOLO +DEFAULT_CFG_PATH = ROOT / 'cfg/default.yaml' +NUM_THREADS = min(8, max(1, os.cpu_count() - 1)) # number of YOLOv5 multiprocessing threads +AUTOINSTALL = str(os.getenv('YOLO_AUTOINSTALL', True)).lower() == 'true' # global auto-install mode +VERBOSE = str(os.getenv('YOLO_VERBOSE', True)).lower() == 'true' # global verbose mode +TQDM_BAR_FORMAT = '{l_bar}{bar:10}{r_bar}' # tqdm bar format +LOGGING_NAME = 'ultralytics' +MACOS, LINUX, WINDOWS = (platform.system() == x for x in ['Darwin', 'Linux', 'Windows']) # environment booleans +ARM64 = platform.machine() in ('arm64', 'aarch64') # ARM64 booleans +HELP_MSG = \ + """ + Usage examples for running YOLOv8: + + 1. Install the ultralytics package: + + pip install ultralytics + + 2. 
Use the Python SDK: + + from ultralytics import YOLO + + # Load a model + model = YOLO('yolov8n.yaml') # build a new model from scratch + model = YOLO("yolov8n.pt") # load a pretrained model (recommended for training) + + # Use the model + results = model.train(data="coco128.yaml", epochs=3) # train the model + results = model.val() # evaluate model performance on the validation set + results = model('https://ultralytics.com/images/bus.jpg') # predict on an image + success = model.export(format='onnx') # export the model to ONNX format + + 3. Use the command line interface (CLI): + + YOLOv8 'yolo' CLI commands use the following syntax: + + yolo TASK MODE ARGS + + Where TASK (optional) is one of [detect, segment, classify] + MODE (required) is one of [train, val, predict, export] + ARGS (optional) are any number of custom 'arg=value' pairs like 'imgsz=320' that override defaults. + See all ARGS at https://docs.ultralytics.com/usage/cfg or with 'yolo cfg' + + - Train a detection model for 10 epochs with an initial learning_rate of 0.01 + yolo detect train data=coco128.yaml model=yolov8n.pt epochs=10 lr0=0.01 + + - Predict a YouTube video using a pretrained segmentation model at image size 320: + yolo segment predict model=yolov8n-seg.pt source='https://youtu.be/Zgi9g1ksQHc' imgsz=320 + + - Val a pretrained detection model at batch-size 1 and image size 640: + yolo detect val model=yolov8n.pt data=coco128.yaml batch=1 imgsz=640 + + - Export a YOLOv8n classification model to ONNX format at image size 224 by 128 (no TASK required) + yolo export model=yolov8n-cls.pt format=onnx imgsz=224,128 + + - Run special commands: + yolo help + yolo checks + yolo version + yolo settings + yolo copy-cfg + yolo cfg + + Docs: https://docs.ultralytics.com + Community: https://community.ultralytics.com + GitHub: https://github.com/ultralytics/ultralytics + """ + +# Settings +torch.set_printoptions(linewidth=320, precision=4, profile='default') +np.set_printoptions(linewidth=320, 
class SimpleClass:
    """
    Ultralytics SimpleClass is a base class providing helpful string representation, error reporting, and attribute
    access methods for easier debugging and usage.
    """

    def __str__(self):
        """Build a multi-line listing of every public, non-callable attribute of the instance."""
        lines = []
        for key in dir(self):
            value = getattr(self, key)
            if key.startswith('_') or callable(value):
                continue
            if isinstance(value, SimpleClass):
                # Nested SimpleClass objects are summarised by module and class name only
                lines.append(f'{key}: {value.__module__}.{value.__class__.__name__} object')
            else:
                lines.append(f'{key}: {value!r}')
        header = f'{self.__module__}.{self.__class__.__name__} object with attributes:\n\n'
        return header + '\n'.join(lines)

    def __repr__(self):
        """Return the same text as `__str__`."""
        return self.__str__()

    def __getattr__(self, attr):
        """Raise AttributeError for a missing attribute, appending the class docstring to the message."""
        name = self.__class__.__name__
        raise AttributeError(f"'{name}' object has no attribute '{attr}'. See valid attributes below.\n{self.__doc__}")
def plt_settings(rcparams=None, backend='Agg'):
    """
    Decorator factory that runs a plotting function under temporary matplotlib rc parameters and backend.

    Usage:
        @plt_settings({"font.size": 12})
        def plot(...): ...

    Args:
        rcparams (dict, optional): rc parameters applied via `plt.rc_context` while the function runs.
            Defaults to {'font.size': 11}.
        backend (str, optional): Backend switched to for the duration of the call. Defaults to 'Agg'.

    Returns:
        (Callable): A decorator. The decorated function switches to `backend`, runs under `rcparams`, and
            switches back to the previously active backend afterwards, including when the function raises.
    """

    if rcparams is None:
        rcparams = {'font.size': 11}

    def decorator(func):
        """Wrap `func` so it executes with the temporary rc parameters and backend."""
        from functools import wraps

        @wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            """Switch backend, call the original function, and always restore the previous backend."""
            original_backend = plt.get_backend()
            plt.switch_backend(backend)
            try:
                with plt.rc_context(rcparams):
                    return func(*args, **kwargs)
            finally:
                # Restore even if func raised, so a failing plot cannot leave the backend switched
                plt.switch_backend(original_backend)

        return wrapper

    return decorator
def yaml_save(file='data.yaml', data=None):
    """
    Save a dictionary to a file in YAML format.

    Args:
        file (str | Path, optional): Destination file. Missing parent directories are created.
            Default is 'data.yaml'.
        data (dict, optional): Data to save. Top-level `Path` values are written as strings. The passed
            dictionary itself is not modified. Defaults to an empty dict.

    Returns:
        (None): Data is saved to the specified file.
    """
    if data is None:
        data = {}
    file = Path(file)
    # Create parent directories if they don't exist
    file.parent.mkdir(parents=True, exist_ok=True)

    # Convert Path objects to strings on a copy, so the caller's dict keeps its Path values
    serializable = {k: str(v) if isinstance(v, Path) else v for k, v in data.items()}

    # Write as UTF-8: allow_unicode=True emits non-ASCII characters verbatim, which a platform-default
    # encoding may not be able to encode
    with open(file, 'w', encoding='utf-8') as f:
        yaml.safe_dump(serializable, f, sort_keys=False, allow_unicode=True)
def is_kaggle():
    """
    Report whether the environment variables look like a Kaggle kernel.

    Returns:
        (bool): True when PWD is '/kaggle/working' and KAGGLE_URL_BASE is 'https://www.kaggle.com',
            False otherwise.
    """
    env = os.environ
    in_working_dir = env.get('PWD') == '/kaggle/working'
    return in_working_dir and env.get('KAGGLE_URL_BASE') == 'https://www.kaggle.com'
def is_pip_package(filepath: str = __name__) -> bool:
    """
    Check whether an import spec with a known origin can be found for the given name.

    Args:
        filepath (str): Name handed to `importlib.util.find_spec`. Defaults to this module's `__name__`.
            NOTE(review): despite the parameter name, `find_spec` expects a module name, not a filesystem path.

    Returns:
        (bool): True if a spec is found and its `origin` is not None, False otherwise.
    """
    from importlib.util import find_spec

    module_spec = find_spec(filepath)
    if module_spec is None:
        return False
    return module_spec.origin is not None
def get_git_origin_url():
    """
    Retrieve the origin URL of the git repository that contains this file.

    The `git` command is executed inside the detected repository root rather than the process working
    directory, so the answer does not depend on where the program was started.

    Returns:
        (str | None): The origin URL, or None if this file is not inside a git repository, the `git`
            command fails, or the `git` executable is not available.
    """
    git_dir = get_git_dir()
    if git_dir is None:
        return None
    # CalledProcessError: git exited non-zero (e.g. no 'origin' remote); FileNotFoundError: git not installed
    with contextlib.suppress(subprocess.CalledProcessError, FileNotFoundError):
        origin = subprocess.check_output(['git', 'config', '--get', 'remote.origin.url'], cwd=git_dir)
        return origin.decode().strip()
    return None  # on error
def get_default_args(func):
    """
    Collect the default values declared in a callable's signature.

    Args:
        func (callable): The function to inspect.

    Returns:
        (dict): Maps each parameter name that has a default to that default, in signature order.
            Parameters without a default are omitted.
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        if param.default is inspect.Parameter.empty:
            continue
        defaults[name] = param.default
    return defaults
def colorstr(*input):
    """
    Wrap a string in ANSI escape codes, i.e. colorstr('blue', 'hello world').

    The last positional argument is the text; any preceding arguments name the colors/styles to apply.
    With a single argument the text is rendered blue and bold. See
    https://en.wikipedia.org/wiki/ANSI_escape_code.

    Returns:
        (str): The escape codes for the requested styles, the text, then the reset code.

    Raises:
        KeyError: If a style name is not in the table below.
    """
    if len(input) > 1:
        *styles, text = input
    else:
        styles, text = ['blue', 'bold'], input[0]
    codes = {
        'black': '\033[30m',  # basic colors
        'red': '\033[31m',
        'green': '\033[32m',
        'yellow': '\033[33m',
        'blue': '\033[34m',
        'magenta': '\033[35m',
        'cyan': '\033[36m',
        'white': '\033[37m',
        'bright_black': '\033[90m',  # bright colors
        'bright_red': '\033[91m',
        'bright_green': '\033[92m',
        'bright_yellow': '\033[93m',
        'bright_blue': '\033[94m',
        'bright_magenta': '\033[95m',
        'bright_cyan': '\033[96m',
        'bright_white': '\033[97m',
        'end': '\033[0m',  # misc
        'bold': '\033[1m',
        'underline': '\033[4m'}
    prefix = ''.join(codes[style] for style in styles)
    return f"{prefix}{text}{codes['end']}"
def threaded(func):
    """
    Run the decorated function in a daemon thread. Usage: @threaded decorator.

    Args:
        func (callable): Function to execute in a background thread.

    Returns:
        (callable): Wrapper that starts a daemon `threading.Thread` running `func` with the given arguments
            and returns the started thread; `func`'s own return value is not available to the caller.
    """
    from functools import wraps

    @wraps(func)  # preserve the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        """Start `func` in a daemon thread and return the thread."""
        thread = threading.Thread(target=func, args=args, kwargs=kwargs, daemon=True)
        thread.start()
        return thread

    return wrapper
+ + Args: + event (dict): The event dictionary containing information about the error. + hint (dict): A dictionary containing additional information about the error. + + Returns: + dict: The modified event or None if the event should not be sent to Sentry. + """ + if 'exc_info' in hint: + exc_type, exc_value, tb = hint['exc_info'] + if exc_type in (KeyboardInterrupt, FileNotFoundError) \ + or 'out of memory' in str(exc_value): + return None # do not send event + + event['tags'] = { + 'sys_argv': sys.argv[0], + 'sys_argv_name': Path(sys.argv[0]).name, + 'install': 'git' if is_git_dir() else 'pip' if is_pip_package() else 'other', + 'os': ENVIRONMENT} + return event + + if SETTINGS['sync'] and \ + RANK in (-1, 0) and \ + Path(sys.argv[0]).name == 'yolo' and \ + not TESTS_RUNNING and \ + ONLINE and \ + is_pip_package() and \ + not is_git_dir(): + + # If sentry_sdk package is not installed then return and do not use Sentry + try: + import sentry_sdk # noqa + except ImportError: + return + + sentry_sdk.init( + dsn='https://5ff1556b71594bfea135ff0203a0d290@o4504521589325824.ingest.sentry.io/4504521592406016', + debug=False, + traces_sample_rate=1.0, + release=__version__, + environment='production', # 'dev' or 'production' + before_send=before_send, + ignore_errors=[KeyboardInterrupt, FileNotFoundError]) + sentry_sdk.set_user({'id': SETTINGS['uuid']}) # SHA-256 anonymized UUID hash + + # Disable all sentry logging + for logger in 'sentry_sdk', 'sentry_sdk.errors': + logging.getLogger(logger).setLevel(logging.CRITICAL) + + +class SettingsManager(dict): + """ + Manages Ultralytics settings stored in a YAML file. + + Args: + file (str | Path): Path to the Ultralytics settings YAML file. Default is USER_CONFIG_DIR / 'settings.yaml'. + version (str): Settings version. In case of local version mismatch, new default settings will be saved. 
+ """ + + def __init__(self, file=SETTINGS_YAML, version='0.0.4'): + import copy + import hashlib + + from ultralytics.utils.checks import check_version + from ultralytics.utils.torch_utils import torch_distributed_zero_first + + git_dir = get_git_dir() + root = git_dir or Path() + datasets_root = (root.parent if git_dir and is_dir_writeable(root.parent) else root).resolve() + + self.file = Path(file) + self.version = version + self.defaults = { + 'settings_version': version, + 'datasets_dir': str(datasets_root / 'datasets'), + 'weights_dir': str(root / 'weights'), + 'runs_dir': str(root / 'runs'), + 'uuid': hashlib.sha256(str(uuid.getnode()).encode()).hexdigest(), + 'sync': True, + 'api_key': '', + 'clearml': True, # integrations + 'comet': True, + 'dvc': True, + 'hub': True, + 'mlflow': True, + 'neptune': True, + 'raytune': True, + 'tensorboard': True, + 'wandb': True} + + super().__init__(copy.deepcopy(self.defaults)) + + with torch_distributed_zero_first(RANK): + if not self.file.exists(): + self.save() + + self.load() + correct_keys = self.keys() == self.defaults.keys() + correct_types = all(type(a) == type(b) for a, b in zip(self.values(), self.defaults.values())) + correct_version = check_version(self['settings_version'], self.version) + if not (correct_keys and correct_types and correct_version): + LOGGER.warning( + 'WARNING ⚠️ Ultralytics settings reset to default values. This may be due to a possible problem ' + 'with your settings or a recent ultralytics package update. ' + f"\nView settings with 'yolo settings' or at '{self.file}'" + "\nUpdate settings with 'yolo settings key=value', i.e. 
def clean_url(url):
    """
    Strip auth from URL, i.e. https://url.com/file.txt?auth -> https://url.com/file.txt.

    Args:
        url (str): URL to clean.

    Returns:
        (str): The URL with percent-escapes decoded and everything from the first '?' onwards removed.
    """
    # Import the submodule explicitly: a bare `import urllib` does not guarantee `urllib.parse` is loaded
    from urllib.parse import unquote

    url = Path(url).as_posix().replace(':/', '://')  # Pathlib turns :// -> :/, as_posix() for Windows
    return unquote(url).split('?')[0]  # '%2F' to '/', split https://url.com/file.txt?auth
https://url.com/file.txt?auth -> file.txt.""" + return Path(clean_url(url)).name + + +# Run below code on utils init ------------------------------------------------------------------------------------ + +# Check first-install steps +PREFIX = colorstr('Ultralytics: ') +SETTINGS = SettingsManager() # initialize settings +DATASETS_DIR = Path(SETTINGS['datasets_dir']) # global datasets directory +ENVIRONMENT = 'Colab' if is_colab() else 'Kaggle' if is_kaggle() else 'Jupyter' if is_jupyter() else \ + 'Docker' if is_docker() else platform.system() +TESTS_RUNNING = is_pytest_running() or is_github_actions_ci() +set_sentry() + +# Apply monkey patches if the script is being run from within the parent directory of the script's location +from .patches import imread, imshow, imwrite + +# torch.save = torch_save +if Path(inspect.stack()[0].filename).parent.parent.as_posix() in inspect.stack()[-1].filename: + cv2.imread, cv2.imwrite, cv2.imshow = imread, imwrite, imshow diff --git a/ultralytics/utils/__pycache__/__init__.cpython-310.pyc b/ultralytics/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0336801457061cd7c71898066b3ca8d0596b53f2 Binary files /dev/null and b/ultralytics/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/__init__.cpython-39.pyc b/ultralytics/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..846296a6b01dfacecba89ebdc3e6e992dc875a98 Binary files /dev/null and b/ultralytics/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/autobatch.cpython-310.pyc b/ultralytics/utils/__pycache__/autobatch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2aa8dc7560eed6e9987fa32db51625dd3aa53118 Binary files /dev/null and b/ultralytics/utils/__pycache__/autobatch.cpython-310.pyc differ diff --git 
a/ultralytics/utils/__pycache__/autobatch.cpython-39.pyc b/ultralytics/utils/__pycache__/autobatch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bf8836b23b1688eab001b77a41d3e979b58aff9 Binary files /dev/null and b/ultralytics/utils/__pycache__/autobatch.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/checks.cpython-310.pyc b/ultralytics/utils/__pycache__/checks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22c03712c02851c42108f9f9211363668cbd9705 Binary files /dev/null and b/ultralytics/utils/__pycache__/checks.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/checks.cpython-39.pyc b/ultralytics/utils/__pycache__/checks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a9c92375614bc43f0e1d68e9ab9e9f1a1b9e379 Binary files /dev/null and b/ultralytics/utils/__pycache__/checks.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/dist.cpython-310.pyc b/ultralytics/utils/__pycache__/dist.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..469b6ec55115d996afec65d6d50a805a254bb8c4 Binary files /dev/null and b/ultralytics/utils/__pycache__/dist.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/dist.cpython-39.pyc b/ultralytics/utils/__pycache__/dist.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa9feea99a2fc656338d6ee14c4840853e5d657f Binary files /dev/null and b/ultralytics/utils/__pycache__/dist.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/downloads.cpython-310.pyc b/ultralytics/utils/__pycache__/downloads.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50891a7ba043f85916f2d0cc37b586448c076750 Binary files /dev/null and b/ultralytics/utils/__pycache__/downloads.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/downloads.cpython-39.pyc 
b/ultralytics/utils/__pycache__/downloads.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99fdbdee6c31f9453874bcd0c175d5c6fbdea411 Binary files /dev/null and b/ultralytics/utils/__pycache__/downloads.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/files.cpython-310.pyc b/ultralytics/utils/__pycache__/files.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..712133c907d4f81f78b71f3939eeb2af2618af41 Binary files /dev/null and b/ultralytics/utils/__pycache__/files.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/files.cpython-39.pyc b/ultralytics/utils/__pycache__/files.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e0fbbe5d2e0b12232f6651fb770f21915f44687 Binary files /dev/null and b/ultralytics/utils/__pycache__/files.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/instance.cpython-310.pyc b/ultralytics/utils/__pycache__/instance.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cee6059c704d492eeeebcd6f8fe2c179e01092e Binary files /dev/null and b/ultralytics/utils/__pycache__/instance.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/instance.cpython-39.pyc b/ultralytics/utils/__pycache__/instance.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1825d22f1dbd8130c3bebdb7e0f0dd70a5358737 Binary files /dev/null and b/ultralytics/utils/__pycache__/instance.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/iou.cpython-310.pyc b/ultralytics/utils/__pycache__/iou.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1d8698198a2b43cba5bf3f0770e78d4a95e6f3a Binary files /dev/null and b/ultralytics/utils/__pycache__/iou.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/iou.cpython-39.pyc b/ultralytics/utils/__pycache__/iou.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..cc13806e2d43ab5c885b4353b4c4548e25057495 Binary files /dev/null and b/ultralytics/utils/__pycache__/iou.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/loss.cpython-310.pyc b/ultralytics/utils/__pycache__/loss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23dc635e1467accd8c98ac51027218ee312c2a95 Binary files /dev/null and b/ultralytics/utils/__pycache__/loss.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/loss.cpython-39.pyc b/ultralytics/utils/__pycache__/loss.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8c976d617049f8a5b9fbb120f94cd4b67266404 Binary files /dev/null and b/ultralytics/utils/__pycache__/loss.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/metrics.cpython-310.pyc b/ultralytics/utils/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b1bbdf5dd89654571abdc1da164a0eca1179374 Binary files /dev/null and b/ultralytics/utils/__pycache__/metrics.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/metrics.cpython-39.pyc b/ultralytics/utils/__pycache__/metrics.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed0990ccb63cb5670572ee18934d89fbcb07c4c7 Binary files /dev/null and b/ultralytics/utils/__pycache__/metrics.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/ops.cpython-310.pyc b/ultralytics/utils/__pycache__/ops.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b077fd1bdefcc1ffb274b2c73fdc604a164d3c5c Binary files /dev/null and b/ultralytics/utils/__pycache__/ops.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/ops.cpython-39.pyc b/ultralytics/utils/__pycache__/ops.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f709654a04907e4db3ee06efd8881002eb75f345 Binary files /dev/null and 
b/ultralytics/utils/__pycache__/ops.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/patches.cpython-310.pyc b/ultralytics/utils/__pycache__/patches.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b0adbaa3f3bb6d05cbf0f240da1091ddefa077f Binary files /dev/null and b/ultralytics/utils/__pycache__/patches.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/patches.cpython-39.pyc b/ultralytics/utils/__pycache__/patches.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1ab038b410d52451a7269250bd540e23ed9d7b9 Binary files /dev/null and b/ultralytics/utils/__pycache__/patches.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/plotting.cpython-310.pyc b/ultralytics/utils/__pycache__/plotting.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7a1e12429602560e2849f01e5a7636255fb1e2b Binary files /dev/null and b/ultralytics/utils/__pycache__/plotting.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/plotting.cpython-39.pyc b/ultralytics/utils/__pycache__/plotting.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f236a04919c167bf1de0190fa8555da2eb54de0c Binary files /dev/null and b/ultralytics/utils/__pycache__/plotting.cpython-39.pyc differ diff --git a/ultralytics/utils/__pycache__/tal.cpython-310.pyc b/ultralytics/utils/__pycache__/tal.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27eedf33a26181104bfc065b492ac938e420fe6f Binary files /dev/null and b/ultralytics/utils/__pycache__/tal.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/tal.cpython-39.pyc b/ultralytics/utils/__pycache__/tal.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02df9aff47b7a9cd5938b704866e9b45ff515542 Binary files /dev/null and b/ultralytics/utils/__pycache__/tal.cpython-39.pyc differ diff --git 
a/ultralytics/utils/__pycache__/torch_utils.cpython-310.pyc b/ultralytics/utils/__pycache__/torch_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77884925e4795710ff60c793bedea1ee183c834a Binary files /dev/null and b/ultralytics/utils/__pycache__/torch_utils.cpython-310.pyc differ diff --git a/ultralytics/utils/__pycache__/torch_utils.cpython-39.pyc b/ultralytics/utils/__pycache__/torch_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d0dc84c11bce175008206ccdf0e7c4cb56aba4b Binary files /dev/null and b/ultralytics/utils/__pycache__/torch_utils.cpython-39.pyc differ diff --git a/ultralytics/utils/autobatch.py b/ultralytics/utils/autobatch.py new file mode 100644 index 0000000000000000000000000000000000000000..94d3c80f8560ccd92e5c061a1b08bbddbc4893fe --- /dev/null +++ b/ultralytics/utils/autobatch.py @@ -0,0 +1,90 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Functions for estimating the best YOLO batch size to use a fraction of the available CUDA memory in PyTorch. +""" + +from copy import deepcopy + +import numpy as np +import torch + +from ultralytics.utils import DEFAULT_CFG, LOGGER, colorstr +from ultralytics.utils.torch_utils import profile + + +def check_train_batch_size(model, imgsz=640, amp=True): + """ + Check YOLO training batch size using the autobatch() function. + + Args: + model (torch.nn.Module): YOLO model to check batch size for. + imgsz (int): Image size used for training. + amp (bool): If True, use automatic mixed precision (AMP) for training. + + Returns: + (int): Optimal batch size computed using the autobatch() function. + """ + + with torch.cuda.amp.autocast(amp): + return autobatch(deepcopy(model).train(), imgsz) # compute optimal batch size + + +def autobatch(model, imgsz=640, fraction=0.67, batch_size=DEFAULT_CFG.batch): + """ + Automatically estimate the best YOLO batch size to use a fraction of the available CUDA memory. 
+ + Args: + model (torch.nn.module): YOLO model to compute batch size for. + imgsz (int, optional): The image size used as input for the YOLO model. Defaults to 640. + fraction (float, optional): The fraction of available CUDA memory to use. Defaults to 0.67. + batch_size (int, optional): The default batch size to use if an error is detected. Defaults to 16. + + Returns: + (int): The optimal batch size. + """ + + # Check device + prefix = colorstr('AutoBatch: ') + LOGGER.info(f'{prefix}Computing optimal batch size for imgsz={imgsz}') + device = next(model.parameters()).device # get model device + if device.type == 'cpu': + LOGGER.info(f'{prefix}CUDA not detected, using default CPU batch-size {batch_size}') + return batch_size + if torch.backends.cudnn.benchmark: + LOGGER.info(f'{prefix} ⚠️ Requires torch.backends.cudnn.benchmark=False, using default batch-size {batch_size}') + return batch_size + + # Inspect CUDA memory + gb = 1 << 30 # bytes to GiB (1024 ** 3) + d = str(device).upper() # 'CUDA:0' + properties = torch.cuda.get_device_properties(device) # device properties + t = properties.total_memory / gb # GiB total + r = torch.cuda.memory_reserved(device) / gb # GiB reserved + a = torch.cuda.memory_allocated(device) / gb # GiB allocated + f = t - (r + a) # GiB free + LOGGER.info(f'{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free') + + # Profile batch sizes + batch_sizes = [1, 2, 4, 8, 16] + try: + img = [torch.empty(b, 3, imgsz, imgsz) for b in batch_sizes] + results = profile(img, model, n=3, device=device) + + # Fit a solution + y = [x[2] for x in results if x] # memory [2] + p = np.polyfit(batch_sizes[:len(y)], y, deg=1) # first degree polynomial fit + b = int((f * fraction - p[1]) / p[0]) # y intercept (optimal batch size) + if None in results: # some sizes failed + i = results.index(None) # first fail index + if b >= batch_sizes[i]: # y intercept above failure point + b = batch_sizes[max(i - 1, 0)] # 
select prior safe point + if b < 1 or b > 1024: # b outside of safe range + b = batch_size + LOGGER.info(f'{prefix}WARNING ⚠️ CUDA anomaly detected, using default batch-size {batch_size}.') + + fraction = (np.polyval(p, b) + r + a) / t # actual fraction predicted + LOGGER.info(f'{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅') + return b + except Exception as e: + LOGGER.warning(f'{prefix}WARNING ⚠️ error detected: {e}, using default batch-size {batch_size}.') + return batch_size diff --git a/ultralytics/utils/benchmarks.py b/ultralytics/utils/benchmarks.py new file mode 100644 index 0000000000000000000000000000000000000000..91c7abb5dbee139da65c8cf9e3e29c6bf46d26b4 --- /dev/null +++ b/ultralytics/utils/benchmarks.py @@ -0,0 +1,363 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Benchmark a YOLO model formats for speed and accuracy + +Usage: + from ultralytics.utils.benchmarks import ProfileModels, benchmark + ProfileModels(['yolov8n.yaml', 'yolov8s.yaml']).profile() + benchmark(model='yolov8n.pt', imgsz=160) + +Format | `format=argument` | Model +--- | --- | --- +PyTorch | - | yolov8n.pt +TorchScript | `torchscript` | yolov8n.torchscript +ONNX | `onnx` | yolov8n.onnx +OpenVINO | `openvino` | yolov8n_openvino_model/ +TensorRT | `engine` | yolov8n.engine +CoreML | `coreml` | yolov8n.mlmodel +TensorFlow SavedModel | `saved_model` | yolov8n_saved_model/ +TensorFlow GraphDef | `pb` | yolov8n.pb +TensorFlow Lite | `tflite` | yolov8n.tflite +TensorFlow Edge TPU | `edgetpu` | yolov8n_edgetpu.tflite +TensorFlow.js | `tfjs` | yolov8n_web_model/ +PaddlePaddle | `paddle` | yolov8n_paddle_model/ +ncnn | `ncnn` | yolov8n_ncnn_model/ +""" + +import glob +import platform +import sys +import time +from pathlib import Path + +import numpy as np +import torch.cuda +from tqdm import tqdm + +from ultralytics import YOLO +from ultralytics.cfg import TASK2DATA, TASK2METRIC +from ultralytics.engine.exporter import export_formats +from 
ultralytics.utils import LINUX, LOGGER, MACOS, ROOT, SETTINGS +from ultralytics.utils.checks import check_requirements, check_yolo +from ultralytics.utils.downloads import download +from ultralytics.utils.files import file_size +from ultralytics.utils.torch_utils import select_device + + +def benchmark(model=Path(SETTINGS['weights_dir']) / 'yolov8n.pt', + data=None, + imgsz=160, + half=False, + int8=False, + device='cpu', + verbose=False): + """ + Benchmark a YOLO model across different formats for speed and accuracy. + + Args: + model (str | Path | optional): Path to the model file or directory. Default is + Path(SETTINGS['weights_dir']) / 'yolov8n.pt'. + data (str, optional): Dataset to evaluate on, inherited from TASK2DATA if not passed. Default is None. + imgsz (int, optional): Image size for the benchmark. Default is 160. + half (bool, optional): Use half-precision for the model if True. Default is False. + int8 (bool, optional): Use int8-precision for the model if True. Default is False. + device (str, optional): Device to run the benchmark on, either 'cpu' or 'cuda'. Default is 'cpu'. + verbose (bool | float | optional): If True or a float, assert benchmarks pass with given metric. + Default is False. + + Returns: + df (pandas.DataFrame): A pandas DataFrame with benchmark results for each format, including file size, + metric, and inference time. 
+ """ + + import pandas as pd + pd.options.display.max_columns = 10 + pd.options.display.width = 120 + device = select_device(device, verbose=False) + if isinstance(model, (str, Path)): + model = YOLO(model) + + y = [] + t0 = time.time() + for i, (name, format, suffix, cpu, gpu) in export_formats().iterrows(): # index, (name, format, suffix, CPU, GPU) + emoji, filename = '❌', None # export defaults + try: + assert i != 9 or LINUX, 'Edge TPU export only supported on Linux' + if i == 10: + assert MACOS or LINUX, 'TF.js export only supported on macOS and Linux' + elif i == 11: + assert sys.version_info < (3, 11), 'PaddlePaddle export only supported on Python<=3.10' + if 'cpu' in device.type: + assert cpu, 'inference not supported on CPU' + if 'cuda' in device.type: + assert gpu, 'inference not supported on GPU' + + # Export + if format == '-': + filename = model.ckpt_path or model.cfg + export = model # PyTorch format + else: + filename = model.export(imgsz=imgsz, format=format, half=half, int8=int8, device=device, verbose=False) + export = YOLO(filename, task=model.task) + assert suffix in str(filename), 'export failed' + emoji = '❎' # indicates export succeeded + + # Predict + assert model.task != 'pose' or i != 7, 'GraphDef Pose inference is not supported' + assert i not in (9, 10), 'inference not supported' # Edge TPU and TF.js are unsupported + assert i != 5 or platform.system() == 'Darwin', 'inference only supported on macOS>=10.13' # CoreML + if not (ROOT / 'assets/bus.jpg').exists(): + download(url='https://ultralytics.com/images/bus.jpg', dir=ROOT / 'assets') + export.predict(ROOT / 'assets/bus.jpg', imgsz=imgsz, device=device, half=half) + + # Validate + data = data or TASK2DATA[model.task] # task to dataset, i.e. coco8.yaml for task=detect + key = TASK2METRIC[model.task] # task to metric, i.e. 
metrics/mAP50-95(B) for task=detect + results = export.val(data=data, + batch=1, + imgsz=imgsz, + plots=False, + device=device, + half=half, + int8=int8, + verbose=False) + metric, speed = results.results_dict[key], results.speed['inference'] + y.append([name, '✅', round(file_size(filename), 1), round(metric, 4), round(speed, 2)]) + except Exception as e: + if verbose: + assert type(e) is AssertionError, f'Benchmark failure for {name}: {e}' + LOGGER.warning(f'ERROR ❌️ Benchmark failure for {name}: {e}') + y.append([name, emoji, round(file_size(filename), 1), None, None]) # mAP, t_inference + + # Print results + check_yolo(device=device) # print system info + df = pd.DataFrame(y, columns=['Format', 'Status❔', 'Size (MB)', key, 'Inference time (ms/im)']) + + name = Path(model.ckpt_path).name + s = f'\nBenchmarks complete for {name} on {data} at imgsz={imgsz} ({time.time() - t0:.2f}s)\n{df}\n' + LOGGER.info(s) + with open('benchmarks.log', 'a', errors='ignore', encoding='utf-8') as f: + f.write(s) + + if verbose and isinstance(verbose, float): + metrics = df[key].array # values to compare to floor + floor = verbose # minimum metric floor to pass, i.e. = 0.29 mAP for YOLOv5n + assert all(x > floor for x in metrics if pd.notna(x)), f'Benchmark failure: metric(s) < floor {floor}' + + return df + + +class ProfileModels: + """ + ProfileModels class for profiling different models on ONNX and TensorRT. + + This class profiles the performance of different models, provided their paths. The profiling includes parameters such as + model speed and FLOPs. + + Attributes: + paths (list): Paths of the models to profile. + num_timed_runs (int): Number of timed runs for the profiling. Default is 100. + num_warmup_runs (int): Number of warmup runs before profiling. Default is 10. + min_time (float): Minimum number of seconds to profile for. Default is 60. + imgsz (int): Image size used in the models. Default is 640. + + Methods: + profile(): Profiles the models and prints the result. 
+ """ + + def __init__(self, + paths: list, + num_timed_runs=100, + num_warmup_runs=10, + min_time=60, + imgsz=640, + trt=True, + device=None): + self.paths = paths + self.num_timed_runs = num_timed_runs + self.num_warmup_runs = num_warmup_runs + self.min_time = min_time + self.imgsz = imgsz + self.trt = trt # run TensorRT profiling + self.device = device or torch.device(0 if torch.cuda.is_available() else 'cpu') + + def profile(self): + files = self.get_files() + + if not files: + print('No matching *.pt or *.onnx files found.') + return + + table_rows = [] + output = [] + for file in files: + engine_file = file.with_suffix('.engine') + if file.suffix in ('.pt', '.yaml', '.yml'): + model = YOLO(str(file)) + model.fuse() # to report correct params and GFLOPs in model.info() + model_info = model.info() + if self.trt and self.device.type != 'cpu' and not engine_file.is_file(): + engine_file = model.export(format='engine', + half=True, + imgsz=self.imgsz, + device=self.device, + verbose=False) + onnx_file = model.export(format='onnx', + half=True, + imgsz=self.imgsz, + simplify=True, + device=self.device, + verbose=False) + elif file.suffix == '.onnx': + model_info = self.get_onnx_model_info(file) + onnx_file = file + else: + continue + + t_engine = self.profile_tensorrt_model(str(engine_file)) + t_onnx = self.profile_onnx_model(str(onnx_file)) + table_rows.append(self.generate_table_row(file.stem, t_onnx, t_engine, model_info)) + output.append(self.generate_results_dict(file.stem, t_onnx, t_engine, model_info)) + + self.print_table(table_rows) + return output + + def get_files(self): + files = [] + for path in self.paths: + path = Path(path) + if path.is_dir(): + extensions = ['*.pt', '*.onnx', '*.yaml'] + files.extend([file for ext in extensions for file in glob.glob(str(path / ext))]) + elif path.suffix in ('.pt', '.yaml', '.yml'): # add non-existing + files.append(str(path)) + else: + files.extend(glob.glob(str(path))) + + print(f'Profiling: {sorted(files)}') + 
return [Path(file) for file in sorted(files)] + + def get_onnx_model_info(self, onnx_file: str): + # return (num_layers, num_params, num_gradients, num_flops) + return 0.0, 0.0, 0.0, 0.0 + + def iterative_sigma_clipping(self, data, sigma=2, max_iters=3): + data = np.array(data) + for _ in range(max_iters): + mean, std = np.mean(data), np.std(data) + clipped_data = data[(data > mean - sigma * std) & (data < mean + sigma * std)] + if len(clipped_data) == len(data): + break + data = clipped_data + return data + + def profile_tensorrt_model(self, engine_file: str): + if not self.trt or not Path(engine_file).is_file(): + return 0.0, 0.0 + + # Model and input + model = YOLO(engine_file) + input_data = np.random.rand(self.imgsz, self.imgsz, 3).astype(np.float32) # must be FP32 + + # Warmup runs + elapsed = 0.0 + for _ in range(3): + start_time = time.time() + for _ in range(self.num_warmup_runs): + model(input_data, imgsz=self.imgsz, verbose=False) + elapsed = time.time() - start_time + + # Compute number of runs as higher of min_time or num_timed_runs + num_runs = max(round(self.min_time / elapsed * self.num_warmup_runs), self.num_timed_runs * 50) + + # Timed runs + run_times = [] + for _ in tqdm(range(num_runs), desc=engine_file): + results = model(input_data, imgsz=self.imgsz, verbose=False) + run_times.append(results[0].speed['inference']) # Convert to milliseconds + + run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=3) # sigma clipping + return np.mean(run_times), np.std(run_times) + + def profile_onnx_model(self, onnx_file: str): + check_requirements('onnxruntime') + import onnxruntime as ort + + # Session with either 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider' + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + sess_options.intra_op_num_threads = 8 # Limit the number of threads + sess = ort.InferenceSession(onnx_file, 
sess_options, providers=['CPUExecutionProvider']) + + input_tensor = sess.get_inputs()[0] + input_type = input_tensor.type + + # Mapping ONNX datatype to numpy datatype + if 'float16' in input_type: + input_dtype = np.float16 + elif 'float' in input_type: + input_dtype = np.float32 + elif 'double' in input_type: + input_dtype = np.float64 + elif 'int64' in input_type: + input_dtype = np.int64 + elif 'int32' in input_type: + input_dtype = np.int32 + else: + raise ValueError(f'Unsupported ONNX datatype {input_type}') + + input_data = np.random.rand(*input_tensor.shape).astype(input_dtype) + input_name = input_tensor.name + output_name = sess.get_outputs()[0].name + + # Warmup runs + elapsed = 0.0 + for _ in range(3): + start_time = time.time() + for _ in range(self.num_warmup_runs): + sess.run([output_name], {input_name: input_data}) + elapsed = time.time() - start_time + + # Compute number of runs as higher of min_time or num_timed_runs + num_runs = max(round(self.min_time / elapsed * self.num_warmup_runs), self.num_timed_runs) + + # Timed runs + run_times = [] + for _ in tqdm(range(num_runs), desc=onnx_file): + start_time = time.time() + sess.run([output_name], {input_name: input_data}) + run_times.append((time.time() - start_time) * 1000) # Convert to milliseconds + + run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=5) # sigma clipping + return np.mean(run_times), np.std(run_times) + + def generate_table_row(self, model_name, t_onnx, t_engine, model_info): + layers, params, gradients, flops = model_info + return f'| {model_name:18s} | {self.imgsz} | - | {t_onnx[0]:.2f} ± {t_onnx[1]:.2f} ms | {t_engine[0]:.2f} ± {t_engine[1]:.2f} ms | {params / 1e6:.1f} | {flops:.1f} |' + + def generate_results_dict(self, model_name, t_onnx, t_engine, model_info): + layers, params, gradients, flops = model_info + return { + 'model/name': model_name, + 'model/parameters': params, + 'model/GFLOPs': round(flops, 3), + 'model/speed_ONNX(ms)': 
round(t_onnx[0], 3), + 'model/speed_TensorRT(ms)': round(t_engine[0], 3)} + + def print_table(self, table_rows): + gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'GPU' + header = f'| Model | size
(pixels) | mAPval
50-95 | Speed
CPU ONNX
(ms) | Speed
{gpu} TensorRT
(ms) | params
(M) | FLOPs
(B) |' + separator = '|-------------|---------------------|--------------------|------------------------------|-----------------------------------|------------------|-----------------|' + + print(f'\n\n{header}') + print(separator) + for row in table_rows: + print(row) + + +if __name__ == '__main__': + # Benchmark all export formats + benchmark() + + # Profiling models on ONNX and TensorRT + ProfileModels(['yolov8n.yaml', 'yolov8s.yaml']) diff --git a/ultralytics/utils/callbacks/__init__.py b/ultralytics/utils/callbacks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..43fd2635d6c135c60c93f02efdbf476d3293de89 --- /dev/null +++ b/ultralytics/utils/callbacks/__init__.py @@ -0,0 +1,5 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .base import add_integration_callbacks, default_callbacks, get_default_callbacks + +__all__ = 'add_integration_callbacks', 'default_callbacks', 'get_default_callbacks' diff --git a/ultralytics/utils/callbacks/__pycache__/__init__.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a3dd70d47663b8daabdeceee44e4c1d9fd0c53c Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/__init__.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/__init__.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35b203935d9dcfd463f77dd570b37024c27f80c7 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/__init__.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/base.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..442b9f00dc762708b88eac0d330d5acc1f3d4ba3 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/base.cpython-310.pyc differ diff 
--git a/ultralytics/utils/callbacks/__pycache__/base.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/base.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18a8a5c586077b8731240bbd54ec08db24fea330 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/base.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/clearml.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/clearml.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..380746ca460e3b197a83402ecf07b1cca65923b2 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/clearml.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/clearml.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/clearml.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f9039582d5ad13c949d0c6bb6212b5342add4cc Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/clearml.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/comet.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/comet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d946390d10bdf8a9a1773320555657c500fd122 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/comet.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/comet.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/comet.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ece42b8fef3fe0256a2e0162e7b942f2ef0beeb Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/comet.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/dvc.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/dvc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dd1ef7981c8f82de18aae447351167785b764ba Binary files /dev/null 
and b/ultralytics/utils/callbacks/__pycache__/dvc.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/dvc.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/dvc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62053d7d0606cd8e3ae4afec18e46f94a3cca979 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/dvc.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/hub.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/hub.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41741134786d640c9bca4fec878769a0c802610f Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/hub.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/hub.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/hub.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b84ad5243f6baef4f6b3d689510d2aec54c7555 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/hub.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/mlflow.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/mlflow.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02a88615b51f512f526de4a18d5ec03e80b6949d Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/mlflow.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/mlflow.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/mlflow.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd3fa16024acf9b2c4e6e3cca842a037dd7d06a5 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/mlflow.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/neptune.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/neptune.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..bc5ecb9116c574cd5b6764c653907df7f5cedd83 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/neptune.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/neptune.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/neptune.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2214a8ba4c9594c8a67fd09ce093474c5abbfeb8 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/neptune.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/raytune.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/raytune.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bf84fe4f6b02c09dcd3fe4d80e2ba7d287b1b49 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/raytune.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/raytune.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/raytune.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ecdc571b7cd7184cb0e8a68683f5eba1f3fbb9d Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/raytune.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/tensorboard.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/tensorboard.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc94b61b5ffc44c228b0276645420ad9ed8deb3a Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/tensorboard.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/tensorboard.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/tensorboard.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa28ff98df0b16697f3a25bc8bccbd97ad516012 Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/tensorboard.cpython-39.pyc differ diff --git 
a/ultralytics/utils/callbacks/__pycache__/wb.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/wb.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee8d8a1aaa8e1466ac4fd9ca53d8fcb65e48ba1a Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/wb.cpython-310.pyc differ diff --git a/ultralytics/utils/callbacks/__pycache__/wb.cpython-39.pyc b/ultralytics/utils/callbacks/__pycache__/wb.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d2bcd4e3516504fdfefcba8ed7fa8d1fcef339d Binary files /dev/null and b/ultralytics/utils/callbacks/__pycache__/wb.cpython-39.pyc differ diff --git a/ultralytics/utils/callbacks/base.py b/ultralytics/utils/callbacks/base.py new file mode 100644 index 0000000000000000000000000000000000000000..6ec8aaad348546572802772fc210dba44a127c30 --- /dev/null +++ b/ultralytics/utils/callbacks/base.py @@ -0,0 +1,212 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Base callbacks +""" + +from collections import defaultdict +from copy import deepcopy + +# Trainer callbacks ---------------------------------------------------------------------------------------------------- + + +def on_pretrain_routine_start(trainer): + """Called before the pretraining routine starts.""" + pass + + +def on_pretrain_routine_end(trainer): + """Called after the pretraining routine ends.""" + pass + + +def on_train_start(trainer): + """Called when the training starts.""" + pass + + +def on_train_epoch_start(trainer): + """Called at the start of each training epoch.""" + pass + + +def on_train_batch_start(trainer): + """Called at the start of each training batch.""" + pass + + +def optimizer_step(trainer): + """Called when the optimizer takes a step.""" + pass + + +def on_before_zero_grad(trainer): + """Called before the gradients are set to zero.""" + pass + + +def on_train_batch_end(trainer): + """Called at the end of each training batch.""" + pass + + +def on_train_epoch_end(trainer): + 
"""Called at the end of each training epoch.""" + pass + + +def on_fit_epoch_end(trainer): + """Called at the end of each fit epoch (train + val).""" + pass + + +def on_model_save(trainer): + """Called when the model is saved.""" + pass + + +def on_train_end(trainer): + """Called when the training ends.""" + pass + + +def on_params_update(trainer): + """Called when the model parameters are updated.""" + pass + + +def teardown(trainer): + """Called during the teardown of the training process.""" + pass + + +# Validator callbacks -------------------------------------------------------------------------------------------------- + + +def on_val_start(validator): + """Called when the validation starts.""" + pass + + +def on_val_batch_start(validator): + """Called at the start of each validation batch.""" + pass + + +def on_val_batch_end(validator): + """Called at the end of each validation batch.""" + pass + + +def on_val_end(validator): + """Called when the validation ends.""" + pass + + +# Predictor callbacks -------------------------------------------------------------------------------------------------- + + +def on_predict_start(predictor): + """Called when the prediction starts.""" + pass + + +def on_predict_batch_start(predictor): + """Called at the start of each prediction batch.""" + pass + + +def on_predict_batch_end(predictor): + """Called at the end of each prediction batch.""" + pass + + +def on_predict_postprocess_end(predictor): + """Called after the post-processing of the prediction ends.""" + pass + + +def on_predict_end(predictor): + """Called when the prediction ends.""" + pass + + +# Exporter callbacks --------------------------------------------------------------------------------------------------- + + +def on_export_start(exporter): + """Called when the model export starts.""" + pass + + +def on_export_end(exporter): + """Called when the model export ends.""" + pass + + +default_callbacks = { + # Run in trainer + 'on_pretrain_routine_start': 
[on_pretrain_routine_start], + 'on_pretrain_routine_end': [on_pretrain_routine_end], + 'on_train_start': [on_train_start], + 'on_train_epoch_start': [on_train_epoch_start], + 'on_train_batch_start': [on_train_batch_start], + 'optimizer_step': [optimizer_step], + 'on_before_zero_grad': [on_before_zero_grad], + 'on_train_batch_end': [on_train_batch_end], + 'on_train_epoch_end': [on_train_epoch_end], + 'on_fit_epoch_end': [on_fit_epoch_end], # fit = train + val + 'on_model_save': [on_model_save], + 'on_train_end': [on_train_end], + 'on_params_update': [on_params_update], + 'teardown': [teardown], + + # Run in validator + 'on_val_start': [on_val_start], + 'on_val_batch_start': [on_val_batch_start], + 'on_val_batch_end': [on_val_batch_end], + 'on_val_end': [on_val_end], + + # Run in predictor + 'on_predict_start': [on_predict_start], + 'on_predict_batch_start': [on_predict_batch_start], + 'on_predict_postprocess_end': [on_predict_postprocess_end], + 'on_predict_batch_end': [on_predict_batch_end], + 'on_predict_end': [on_predict_end], + + # Run in exporter + 'on_export_start': [on_export_start], + 'on_export_end': [on_export_end]} + + +def get_default_callbacks(): + """ + Return a copy of the default_callbacks dictionary with lists as default values. + + Returns: + (defaultdict): A defaultdict with keys from default_callbacks and empty lists as default values. + """ + return defaultdict(list, deepcopy(default_callbacks)) + + +def add_integration_callbacks(instance): + """ + Add integration callbacks from various sources to the instance's callbacks. + + Args: + instance (Trainer, Predictor, Validator, Exporter): An object with a 'callbacks' attribute that is a dictionary + of callback lists. 
+ """ + from .clearml import callbacks as clearml_cb + from .comet import callbacks as comet_cb + from .dvc import callbacks as dvc_cb + from .hub import callbacks as hub_cb + from .mlflow import callbacks as mlflow_cb + from .neptune import callbacks as neptune_cb + from .raytune import callbacks as tune_cb + from .tensorboard import callbacks as tensorboard_cb + from .wb import callbacks as wb_cb + + for x in clearml_cb, comet_cb, hub_cb, mlflow_cb, neptune_cb, tune_cb, tensorboard_cb, wb_cb, dvc_cb: + for k, v in x.items(): + if v not in instance.callbacks[k]: # prevent duplicate callbacks addition + instance.callbacks[k].append(v) # callback[name].append(func) diff --git a/ultralytics/utils/callbacks/clearml.py b/ultralytics/utils/callbacks/clearml.py new file mode 100644 index 0000000000000000000000000000000000000000..cb2db240f21365778caa567a5b15b51ffb20fd15 --- /dev/null +++ b/ultralytics/utils/callbacks/clearml.py @@ -0,0 +1,144 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import re + +import matplotlib.image as mpimg +import matplotlib.pyplot as plt + +from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING +from ultralytics.utils.torch_utils import model_info_for_loggers + +try: + import clearml + from clearml import Task + from clearml.binding.frameworks.pytorch_bind import PatchPyTorchModelIO + from clearml.binding.matplotlib_bind import PatchedMatplotlib + + assert hasattr(clearml, '__version__') # verify package is not directory + assert not TESTS_RUNNING # do not log pytest + assert SETTINGS['clearml'] is True # verify integration is enabled +except (ImportError, AssertionError): + clearml = None + + +def _log_debug_samples(files, title='Debug Samples') -> None: + """ + Log files (images) as debug samples in the ClearML task. + + Args: + files (list): A list of file paths in PosixPath format. + title (str): A title that groups together images with the same values. 
+ """ + task = Task.current_task() + if task: + for f in files: + if f.exists(): + it = re.search(r'_batch(\d+)', f.name) + iteration = int(it.groups()[0]) if it else 0 + task.get_logger().report_image(title=title, + series=f.name.replace(it.group(), ''), + local_path=str(f), + iteration=iteration) + + +def _log_plot(title, plot_path) -> None: + """ + Log an image as a plot in the plot section of ClearML. + + Args: + title (str): The title of the plot. + plot_path (str): The path to the saved image file. + """ + img = mpimg.imread(plot_path) + fig = plt.figure() + ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect='auto', xticks=[], yticks=[]) # no ticks + ax.imshow(img) + + Task.current_task().get_logger().report_matplotlib_figure(title=title, + series='', + figure=fig, + report_interactive=False) + + +def on_pretrain_routine_start(trainer): + """Runs at start of pretraining routine; initializes and connects/ logs task to ClearML.""" + try: + task = Task.current_task() + if task: + # Make sure the automatic pytorch and matplotlib bindings are disabled! + # We are logging these plots and model files manually in the integration + PatchPyTorchModelIO.update_current_task(None) + PatchedMatplotlib.update_current_task(None) + else: + task = Task.init(project_name=trainer.args.project or 'YOLOv8', + task_name=trainer.args.name, + tags=['YOLOv8'], + output_uri=True, + reuse_last_task_id=False, + auto_connect_frameworks={ + 'pytorch': False, + 'matplotlib': False}) + LOGGER.warning('ClearML Initialized a new task. If you want to run remotely, ' + 'please add clearml-init and connect your arguments before initializing YOLO.') + task.connect(vars(trainer.args), name='General') + except Exception as e: + LOGGER.warning(f'WARNING ⚠️ ClearML installed but not initialized correctly, not logging this run. 
{e}') + + +def on_train_epoch_end(trainer): + task = Task.current_task() + + if task: + """Logs debug samples for the first epoch of YOLO training.""" + if trainer.epoch == 1: + _log_debug_samples(sorted(trainer.save_dir.glob('train_batch*.jpg')), 'Mosaic') + """Report the current training progress.""" + for k, v in trainer.validator.metrics.results_dict.items(): + task.get_logger().report_scalar('train', k, v, iteration=trainer.epoch) + + +def on_fit_epoch_end(trainer): + """Reports model information to logger at the end of an epoch.""" + task = Task.current_task() + if task: + # You should have access to the validation bboxes under jdict + task.get_logger().report_scalar(title='Epoch Time', + series='Epoch Time', + value=trainer.epoch_time, + iteration=trainer.epoch) + if trainer.epoch == 0: + for k, v in model_info_for_loggers(trainer).items(): + task.get_logger().report_single_value(k, v) + + +def on_val_end(validator): + """Logs validation results including labels and predictions.""" + if Task.current_task(): + # Log val_labels and val_pred + _log_debug_samples(sorted(validator.save_dir.glob('val*.jpg')), 'Validation') + + +def on_train_end(trainer): + """Logs final model and its name on training completion.""" + task = Task.current_task() + if task: + # Log final results, CM matrix + PR plots + files = [ + 'results.png', 'confusion_matrix.png', 'confusion_matrix_normalized.png', + *(f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R'))] + files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()] # filter + for f in files: + _log_plot(title=f.stem, plot_path=f) + # Report final metrics + for k, v in trainer.validator.metrics.results_dict.items(): + task.get_logger().report_single_value(k, v) + # Log the final model + task.update_output_model(model_path=str(trainer.best), model_name=trainer.args.name, auto_delete_file=False) + + +callbacks = { + 'on_pretrain_routine_start': on_pretrain_routine_start, + 'on_train_epoch_end': 
on_train_epoch_end, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_val_end': on_val_end, + 'on_train_end': on_train_end} if clearml else {} diff --git a/ultralytics/utils/callbacks/comet.py b/ultralytics/utils/callbacks/comet.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ca78e5e97b3c65db444acfbdd4cea890a6abcf --- /dev/null +++ b/ultralytics/utils/callbacks/comet.py @@ -0,0 +1,369 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import os +from pathlib import Path + +from ultralytics.utils import LOGGER, RANK, SETTINGS, TESTS_RUNNING, ops +from ultralytics.utils.torch_utils import model_info_for_loggers + +try: + import comet_ml + + assert not TESTS_RUNNING # do not log pytest + assert hasattr(comet_ml, '__version__') # verify package is not directory + assert SETTINGS['comet'] is True # verify integration is enabled +except (ImportError, AssertionError): + comet_ml = None + +# Ensures certain logging functions only run for supported tasks +COMET_SUPPORTED_TASKS = ['detect'] + +# Names of plots created by YOLOv8 that are logged to Comet +EVALUATION_PLOT_NAMES = 'F1_curve', 'P_curve', 'R_curve', 'PR_curve', 'confusion_matrix' +LABEL_PLOT_NAMES = 'labels', 'labels_correlogram' + +_comet_image_prediction_count = 0 + + +def _get_comet_mode(): + return os.getenv('COMET_MODE', 'online') + + +def _get_comet_model_name(): + return os.getenv('COMET_MODEL_NAME', 'YOLOv8') + + +def _get_eval_batch_logging_interval(): + return int(os.getenv('COMET_EVAL_BATCH_LOGGING_INTERVAL', 1)) + + +def _get_max_image_predictions_to_log(): + return int(os.getenv('COMET_MAX_IMAGE_PREDICTIONS', 100)) + + +def _scale_confidence_score(score): + scale = float(os.getenv('COMET_MAX_CONFIDENCE_SCORE', 100.0)) + return score * scale + + +def _should_log_confusion_matrix(): + return os.getenv('COMET_EVAL_LOG_CONFUSION_MATRIX', 'false').lower() == 'true' + + +def _should_log_image_predictions(): + return os.getenv('COMET_EVAL_LOG_IMAGE_PREDICTIONS', 'true').lower() == 'true' + + 
def _get_experiment_type(mode, project_name):
    """
    Return a Comet experiment object matching the requested mode.

    Args:
        mode (str): Comet mode; 'offline' selects an OfflineExperiment, anything else an online Experiment.
        project_name (str): Project name forwarded to the Comet experiment constructor.

    Returns:
        A `comet_ml.OfflineExperiment` when mode is 'offline', otherwise a `comet_ml.Experiment`.
    """
    if mode == 'offline':
        return comet_ml.OfflineExperiment(project_name=project_name)

    return comet_ml.Experiment(project_name=project_name)


def _create_experiment(args):
    """
    Create a Comet experiment and log the run arguments and Comet logging settings to it.

    The experiment is only created when RANK is -1 or 0, so that a single process creates it during distributed
    training. Any exception raised while creating or populating the experiment is logged as a warning instead of
    being propagated.

    Args:
        args: Run arguments; `vars(args)` is logged as parameters and `args.project` is the fallback project name
            when the COMET_PROJECT_NAME environment variable is not set.
    """
    if RANK not in (-1, 0):
        return
    try:
        comet_mode = _get_comet_mode()
        _project_name = os.getenv('COMET_PROJECT_NAME', args.project)
        experiment = _get_experiment_type(comet_mode, _project_name)
        experiment.log_parameters(vars(args))
        experiment.log_others({
            'eval_batch_logging_interval': _get_eval_batch_logging_interval(),
            'log_confusion_matrix_on_eval': _should_log_confusion_matrix(),
            'log_image_predictions': _should_log_image_predictions(),
            'max_image_predictions': _get_max_image_predictions_to_log(), })
        experiment.log_other('Created from', 'yolov8')

    except Exception as e:
        LOGGER.warning(f'WARNING ⚠️ Comet installed but not initialized correctly, not logging this run. {e}')


def _fetch_trainer_metadata(trainer):
    """
    Return metadata for YOLO training including epoch, step and asset saving status.

    Args:
        trainer: Object exposing `epoch`, `epochs`, `batch_size`, `train_loader.dataset`, `args.save` and
            `args.save_period`.

    Returns:
        (dict): Keys 'curr_epoch' (1-based epoch), 'curr_step' (curr_epoch * steps per epoch), 'save_assets' and
            'final_epoch'.
    """
    curr_epoch = trainer.epoch + 1

    train_num_steps_per_epoch = len(trainer.train_loader.dataset) // trainer.batch_size
    curr_step = curr_epoch * train_num_steps_per_epoch
    final_epoch = curr_epoch == trainer.epochs

    save = trainer.args.save
    save_period = trainer.args.save_period
    # The modulo is evaluated only after `save_period > 0` has passed, so save_period == 0 can no longer raise
    # ZeroDivisionError; for every non-zero save_period the result is the same as before.
    save_assets = save and save_period > 0 and curr_epoch % save_period == 0 and not final_epoch

    return dict(
        curr_epoch=curr_epoch,
        curr_step=curr_step,
        save_assets=save_assets,
        final_epoch=final_epoch,
    )


def _scale_bounding_box_to_original_image_shape(box, resized_image_shape, original_image_shape, ratio_pad):
    """
    Rescale a normalized bounding box label to the original image shape.

    YOLOv8 resizes images during training and the label values are normalized based on this resized shape.
    This function rescales the bounding box labels to the original image shape.

    Args:
        box: Normalized xywh box, as accepted by `ops.xywhn2xyxy`.
        resized_image_shape: (height, width) of the resized image.
        original_image_shape: Shape of the original image, as accepted by `ops.scale_boxes`.
        ratio_pad: Ratio/padding information forwarded to `ops.scale_boxes`.

    Returns:
        (list): Box as [x, y, w, h] where x, y is the top-left corner in original image scale.
    """
    resized_image_height, resized_image_width = resized_image_shape

    # Convert normalized xywh format predictions to xyxy in resized scale format
    box = ops.xywhn2xyxy(box, h=resized_image_height, w=resized_image_width)
    # Scale box predictions from resized image scale back to original image scale
    box = ops.scale_boxes(resized_image_shape, box, original_image_shape, ratio_pad)
    # Convert bounding box format from xyxy to xywh for Comet logging
    box = ops.xyxy2xywh(box)
    # Adjust xy center to correspond top-left corner
    box[:2] -= box[2:] / 2
    box = box.tolist()

    return box


def _format_ground_truth_annotations_for_detection(img_idx, image_path, batch, class_name_map=None):
    """
    Format the ground truth annotations of one image of a batch for Comet.

    Args:
        img_idx (int): Index of the image inside the batch.
        image_path: Path of the image; only used in the debug message.
        batch (dict): Batch with keys 'batch_idx', 'bboxes', 'cls', 'ori_shape', 'resized_shape' and 'ratio_pad'.
        class_name_map (optional): Mapping from class label to class name; when given, labels are replaced by
            `str(class_name_map[label])`.

    Returns:
        (dict | None): {'name': 'ground_truth', 'data': [...]} with one entry per box, or None when the image has
            no boxes.
    """
    indices = batch['batch_idx'] == img_idx
    bboxes = batch['bboxes'][indices]
    if len(bboxes) == 0:
        LOGGER.debug(f'COMET WARNING: Image: {image_path} has no bounding boxes labels')
        return None

    cls_labels = batch['cls'][indices].squeeze(1).tolist()
    if class_name_map:
        cls_labels = [str(class_name_map[label]) for label in cls_labels]

    original_image_shape = batch['ori_shape'][img_idx]
    resized_image_shape = batch['resized_shape'][img_idx]
    ratio_pad = batch['ratio_pad'][img_idx]

    data = []
    for box, label in zip(bboxes, cls_labels):
        box = _scale_bounding_box_to_original_image_shape(box, resized_image_shape, original_image_shape, ratio_pad)
        data.append({
            'boxes': [box],
            'label': f'gt_{label}',
            'score': _scale_confidence_score(1.0), })

    return {'name': 'ground_truth', 'data': data}


def _format_prediction_annotations_for_detection(image_path, metadata, class_label_map=None):
    """
    Format the model predictions of one image for Comet.

    Args:
        image_path (Path): Path of the image; its stem (converted to int when numeric) is the lookup key.
        metadata (dict): Predictions grouped by image id, as built by `_create_prediction_metadata_map`.
        class_label_map (optional): Mapping from category id to class name; when given, labels are replaced by
            `str(class_label_map[category_id])`.

    Returns:
        (dict | None): {'name': 'prediction', 'data': [...]} with one entry per prediction, or None when the image
            has no predictions.
    """
    stem = image_path.stem
    image_id = int(stem) if stem.isnumeric() else stem

    predictions = metadata.get(image_id)
    if not predictions:
        LOGGER.debug(f'COMET WARNING: Image: {image_path} has no bounding boxes predictions')
        return None

    data = []
    for prediction in predictions:
        boxes = prediction['bbox']
        score = _scale_confidence_score(prediction['score'])
        cls_label = prediction['category_id']
        if class_label_map:
            cls_label = str(class_label_map[cls_label])

        data.append({'boxes': [boxes], 'label': cls_label, 'score': score})

    return {'name': 'prediction', 'data': data}


def _fetch_annotations(img_idx, image_path, batch, prediction_metadata_map, class_label_map):
    """
    Join the ground truth and prediction annotations of one image if they exist.

    Returns:
        (list | None): A one-element list wrapping the list of available annotation dicts, or None when neither
            ground truth nor predictions are available.
    """
    ground_truth_annotations = _format_ground_truth_annotations_for_detection(img_idx, image_path, batch,
                                                                              class_label_map)
    prediction_annotations = _format_prediction_annotations_for_detection(image_path, prediction_metadata_map,
                                                                          class_label_map)

    annotations = [
        annotation for annotation in [ground_truth_annotations, prediction_annotations] if annotation is not None]
    return [annotations] if annotations else None


def _create_prediction_metadata_map(model_predictions):
    """
    Group model predictions by image id.

    Args:
        model_predictions (iterable): Prediction dicts, each containing an 'image_id' key.

    Returns:
        (dict): Maps each image id to the list of its predictions, in input order.
    """
    pred_metadata_map = {}
    for prediction in model_predictions:
        pred_metadata_map.setdefault(prediction['image_id'], []).append(prediction)

    return pred_metadata_map


def _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch):
    """Log the validator's confusion matrix to the Comet experiment, labelled with class names plus 'background'."""
    conf_mat = trainer.validator.confusion_matrix.matrix
    names = list(trainer.data['names'].values()) + ['background']
    experiment.log_confusion_matrix(
        matrix=conf_mat,
        labels=names,
        max_categories=len(names),
        epoch=curr_epoch,
        step=curr_step,
    )


def _log_images(experiment, image_paths, curr_step, annotations=None):
    """
    Log images to the experiment with optional annotations.

    Args:
        experiment: Comet experiment used for logging.
        image_paths (iterable): Paths of the images to log; each image is named after its file stem.
        curr_step: Step value attached to each logged image.
        annotations (optional): Annotations paired with `image_paths` by position; when falsy, images are logged
            without annotations.
    """
    if annotations:
        for image_path, annotation in zip(image_paths, annotations):
            experiment.log_image(image_path, name=image_path.stem, step=curr_step, annotations=annotation)

    else:
        for image_path in image_paths:
            experiment.log_image(image_path, name=image_path.stem, step=curr_step)


def _log_image_predictions(experiment, validator, curr_step):
    """
    Log validation images together with their ground truth and predicted boxes.

    Nothing is logged when the validator task is not in COMET_SUPPORTED_TASKS or when `validator.jdict` is empty.
    Only every n-th batch is processed, where n is the eval batch logging interval, and logging stops once the
    module-level image counter reaches the configured maximum number of image predictions.
    """
    global _comet_image_prediction_count

    task = validator.args.task
    if task not in COMET_SUPPORTED_TASKS:
        return

    jdict = validator.jdict
    if not jdict:
        return

    predictions_metadata_map = _create_prediction_metadata_map(jdict)
    dataloader = validator.dataloader
    class_label_map = validator.names

    batch_logging_interval = _get_eval_batch_logging_interval()
    max_image_predictions = _get_max_image_predictions_to_log()

    for batch_idx, batch in enumerate(dataloader):
        if (batch_idx + 1) % batch_logging_interval != 0:
            continue

        image_paths = batch['im_file']
        for img_idx, image_path in enumerate(image_paths):
            if _comet_image_prediction_count >= max_image_predictions:
                return

            image_path = Path(image_path)
            annotations = _fetch_annotations(
                img_idx,
                image_path,
                batch,
                predictions_metadata_map,
                class_label_map,
            )
            _log_images(
                experiment,
                [image_path],
                curr_step,
                annotations=annotations,
            )
            _comet_image_prediction_count += 1


def _log_plots(experiment, trainer):
    """Log the evaluation plots (.png) and label plots (.jpg) found by name under `trainer.save_dir`."""
    plot_filenames = [trainer.save_dir / f'{plots}.png' for plots in EVALUATION_PLOT_NAMES]
    _log_images(experiment, plot_filenames, None)

    label_plot_filenames = [trainer.save_dir / f'{labels}.jpg' for labels in LABEL_PLOT_NAMES]
    _log_images(experiment, label_plot_filenames, None)


def _log_model(experiment, trainer):
    """Log the best-trained model (`trainer.best`) to Comet.ml as 'best.pt', overwriting any existing file."""
    model_name = _get_comet_model_name()
    experiment.log_model(
        model_name,
        file_or_folder=str(trainer.best),
        file_name='best.pt',
        overwrite=True,
    )
+def on_pretrain_routine_start(trainer): + """Creates or resumes a CometML experiment at the start of a YOLO pre-training routine.""" + experiment = comet_ml.get_global_experiment() + is_alive = getattr(experiment, 'alive', False) + if not experiment or not is_alive: + _create_experiment(trainer.args) + + +def on_train_epoch_end(trainer): + """Log metrics and save batch images at the end of training epochs.""" + experiment = comet_ml.get_global_experiment() + if not experiment: + return + + metadata = _fetch_trainer_metadata(trainer) + curr_epoch = metadata['curr_epoch'] + curr_step = metadata['curr_step'] + + experiment.log_metrics( + trainer.label_loss_items(trainer.tloss, prefix='train'), + step=curr_step, + epoch=curr_epoch, + ) + + if curr_epoch == 1: + _log_images(experiment, trainer.save_dir.glob('train_batch*.jpg'), curr_step) + + +def on_fit_epoch_end(trainer): + """Logs model assets at the end of each epoch.""" + experiment = comet_ml.get_global_experiment() + if not experiment: + return + + metadata = _fetch_trainer_metadata(trainer) + curr_epoch = metadata['curr_epoch'] + curr_step = metadata['curr_step'] + save_assets = metadata['save_assets'] + + experiment.log_metrics(trainer.metrics, step=curr_step, epoch=curr_epoch) + experiment.log_metrics(trainer.lr, step=curr_step, epoch=curr_epoch) + if curr_epoch == 1: + experiment.log_metrics(model_info_for_loggers(trainer), step=curr_step, epoch=curr_epoch) + + if not save_assets: + return + + _log_model(experiment, trainer) + if _should_log_confusion_matrix(): + _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch) + if _should_log_image_predictions(): + _log_image_predictions(experiment, trainer.validator, curr_step) + + +def on_train_end(trainer): + """Perform operations at the end of training.""" + experiment = comet_ml.get_global_experiment() + if not experiment: + return + + metadata = _fetch_trainer_metadata(trainer) + curr_epoch = metadata['curr_epoch'] + curr_step = metadata['curr_step'] 
+ plots = trainer.args.plots + + _log_model(experiment, trainer) + if plots: + _log_plots(experiment, trainer) + + _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch) + _log_image_predictions(experiment, trainer.validator, curr_step) + experiment.end() + + global _comet_image_prediction_count + _comet_image_prediction_count = 0 + + +callbacks = { + 'on_pretrain_routine_start': on_pretrain_routine_start, + 'on_train_epoch_end': on_train_epoch_end, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_train_end': on_train_end} if comet_ml else {} diff --git a/ultralytics/utils/callbacks/dvc.py b/ultralytics/utils/callbacks/dvc.py new file mode 100644 index 0000000000000000000000000000000000000000..dd931ea14a16d0bd89a405984a4e81f3eb5eb49e --- /dev/null +++ b/ultralytics/utils/callbacks/dvc.py @@ -0,0 +1,137 @@ +# Ultralytics YOLO 🚀, GPL-3.0 license +import os + +import pkg_resources as pkg + +from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING +from ultralytics.utils.torch_utils import model_info_for_loggers + +try: + from importlib.metadata import version + + import dvclive + + assert not TESTS_RUNNING # do not log pytest + assert SETTINGS['dvc'] is True # verify integration is enabled + + ver = version('dvclive') + if pkg.parse_version(ver) < pkg.parse_version('2.11.0'): + LOGGER.debug(f'DVCLive is detected but version {ver} is incompatible (>=2.11 required).') + dvclive = None # noqa: F811 +except (ImportError, AssertionError, TypeError): + dvclive = None + +# DVCLive logger instance +live = None +_processed_plots = {} + +# `on_fit_epoch_end` is called on final validation (probably need to be fixed) +# for now this is the way we distinguish final evaluation of the best model vs +# last epoch validation +_training_epoch = False + + +def _logger_disabled(): + return os.getenv('ULTRALYTICS_DVC_DISABLED', 'false').lower() == 'true' + + +def _log_images(image_path, prefix=''): + if live: + live.log_image(os.path.join(prefix, image_path.name), image_path) 
+ + +def _log_plots(plots, prefix=''): + for name, params in plots.items(): + timestamp = params['timestamp'] + if _processed_plots.get(name) != timestamp: + _log_images(name, prefix) + _processed_plots[name] = timestamp + + +def _log_confusion_matrix(validator): + targets = [] + preds = [] + matrix = validator.confusion_matrix.matrix + names = list(validator.names.values()) + if validator.confusion_matrix.task == 'detect': + names += ['background'] + + for ti, pred in enumerate(matrix.T.astype(int)): + for pi, num in enumerate(pred): + targets.extend([names[ti]] * num) + preds.extend([names[pi]] * num) + + live.log_sklearn_plot('confusion_matrix', targets, preds, name='cf.json', normalized=True) + + +def on_pretrain_routine_start(trainer): + try: + global live + if not _logger_disabled(): + live = dvclive.Live(save_dvc_exp=True, cache_images=True) + LOGGER.info( + 'DVCLive is detected and auto logging is enabled (can be disabled with `ULTRALYTICS_DVC_DISABLED=true`).' + ) + else: + LOGGER.debug('DVCLive is detected and auto logging is disabled via `ULTRALYTICS_DVC_DISABLED`.') + live = None + except Exception as e: + LOGGER.warning(f'WARNING ⚠️ DVCLive installed but not initialized correctly, not logging this run. 
{e}') + + +def on_pretrain_routine_end(trainer): + _log_plots(trainer.plots, 'train') + + +def on_train_start(trainer): + if live: + live.log_params(trainer.args) + + +def on_train_epoch_start(trainer): + global _training_epoch + _training_epoch = True + + +def on_fit_epoch_end(trainer): + global _training_epoch + if live and _training_epoch: + all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics, **trainer.lr} + for metric, value in all_metrics.items(): + live.log_metric(metric, value) + + if trainer.epoch == 0: + for metric, value in model_info_for_loggers(trainer).items(): + live.log_metric(metric, value, plot=False) + + _log_plots(trainer.plots, 'train') + _log_plots(trainer.validator.plots, 'val') + + live.next_step() + _training_epoch = False + + +def on_train_end(trainer): + if live: + # At the end log the best metrics. It runs validator on the best model internally. + all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics, **trainer.lr} + for metric, value in all_metrics.items(): + live.log_metric(metric, value, plot=False) + + _log_plots(trainer.plots, 'val') + _log_plots(trainer.validator.plots, 'val') + _log_confusion_matrix(trainer.validator) + + if trainer.best.exists(): + live.log_artifact(trainer.best, copy=True) + + live.end() + + +callbacks = { + 'on_pretrain_routine_start': on_pretrain_routine_start, + 'on_pretrain_routine_end': on_pretrain_routine_end, + 'on_train_start': on_train_start, + 'on_train_epoch_start': on_train_epoch_start, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_train_end': on_train_end} if dvclive else {} diff --git a/ultralytics/utils/callbacks/hub.py b/ultralytics/utils/callbacks/hub.py new file mode 100644 index 0000000000000000000000000000000000000000..9a02bcfdfef08edfacbf8749dd7cf41b0364ff05 --- /dev/null +++ b/ultralytics/utils/callbacks/hub.py @@ -0,0 +1,87 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import json +from time import time + +from 
ultralytics.hub.utils import HUB_WEB_ROOT, PREFIX, events +from ultralytics.utils import LOGGER, SETTINGS +from ultralytics.utils.torch_utils import model_info_for_loggers + + +def on_pretrain_routine_end(trainer): + """Logs info before starting timer for upload rate limit.""" + session = getattr(trainer, 'hub_session', None) + if session: + # Start timer for upload rate limit + LOGGER.info(f'{PREFIX}View model at {HUB_WEB_ROOT}/models/{session.model_id} 🚀') + session.timers = {'metrics': time(), 'ckpt': time()} # start timer on session.rate_limit + + +def on_fit_epoch_end(trainer): + """Uploads training progress metrics at the end of each epoch.""" + session = getattr(trainer, 'hub_session', None) + if session: + # Upload metrics after val end + all_plots = {**trainer.label_loss_items(trainer.tloss, prefix='train'), **trainer.metrics} + if trainer.epoch == 0: + all_plots = {**all_plots, **model_info_for_loggers(trainer)} + session.metrics_queue[trainer.epoch] = json.dumps(all_plots) + if time() - session.timers['metrics'] > session.rate_limits['metrics']: + session.upload_metrics() + session.timers['metrics'] = time() # reset timer + session.metrics_queue = {} # reset queue + + +def on_model_save(trainer): + """Saves checkpoints to Ultralytics HUB with rate limiting.""" + session = getattr(trainer, 'hub_session', None) + if session: + # Upload checkpoints with rate limiting + is_best = trainer.best_fitness == trainer.fitness + if time() - session.timers['ckpt'] > session.rate_limits['ckpt']: + LOGGER.info(f'{PREFIX}Uploading checkpoint {HUB_WEB_ROOT}/models/{session.model_id}') + session.upload_model(trainer.epoch, trainer.last, is_best) + session.timers['ckpt'] = time() # reset timer + + +def on_train_end(trainer): + """Upload final model and metrics to Ultralytics HUB at the end of training.""" + session = getattr(trainer, 'hub_session', None) + if session: + # Upload final model and metrics with exponential standoff + LOGGER.info(f'{PREFIX}Syncing final 
model...') + session.upload_model(trainer.epoch, trainer.best, map=trainer.metrics.get('metrics/mAP50-95(B)', 0), final=True) + session.alive = False # stop heartbeats + LOGGER.info(f'{PREFIX}Done ✅\n' + f'{PREFIX}View model at {HUB_WEB_ROOT}/models/{session.model_id} 🚀') + + +def on_train_start(trainer): + """Run events on train start.""" + events(trainer.args) + + +def on_val_start(validator): + """Runs events on validation start.""" + events(validator.args) + + +def on_predict_start(predictor): + """Run events on predict start.""" + events(predictor.args) + + +def on_export_start(exporter): + """Run events on export start.""" + events(exporter.args) + + +callbacks = { + 'on_pretrain_routine_end': on_pretrain_routine_end, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_model_save': on_model_save, + 'on_train_end': on_train_end, + 'on_train_start': on_train_start, + 'on_val_start': on_val_start, + 'on_predict_start': on_predict_start, + 'on_export_start': on_export_start} if SETTINGS['hub'] is True else {} # verify enabled diff --git a/ultralytics/utils/callbacks/mlflow.py b/ultralytics/utils/callbacks/mlflow.py new file mode 100644 index 0000000000000000000000000000000000000000..26841ed9fae489563650bdce60dcd73b418ab8ef --- /dev/null +++ b/ultralytics/utils/callbacks/mlflow.py @@ -0,0 +1,72 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import os +import re +from pathlib import Path + +from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, colorstr + +try: + import mlflow + + assert not TESTS_RUNNING # do not log pytest + assert hasattr(mlflow, '__version__') # verify package is not directory + assert SETTINGS['mlflow'] is True # verify integration is enabled +except (ImportError, AssertionError): + mlflow = None + + +def on_pretrain_routine_end(trainer): + """Logs training parameters to MLflow.""" + global mlflow, run, run_id, experiment_name + + if os.environ.get('MLFLOW_TRACKING_URI') is None: + mlflow = None + + if mlflow: + mlflow_location = 
os.environ['MLFLOW_TRACKING_URI'] # "http://192.168.xxx.xxx:5000" + mlflow.set_tracking_uri(mlflow_location) + + experiment_name = os.environ.get('MLFLOW_EXPERIMENT_NAME') or trainer.args.project or '/Shared/YOLOv8' + run_name = os.environ.get('MLFLOW_RUN') or trainer.args.name + experiment = mlflow.get_experiment_by_name(experiment_name) + if experiment is None: + mlflow.create_experiment(experiment_name) + mlflow.set_experiment(experiment_name) + + prefix = colorstr('MLFlow: ') + try: + run, active_run = mlflow, mlflow.active_run() + if not active_run: + active_run = mlflow.start_run(experiment_id=experiment.experiment_id, run_name=run_name) + run_id = active_run.info.run_id + LOGGER.info(f'{prefix}Using run_id({run_id}) at {mlflow_location}') + run.log_params(vars(trainer.model.args)) + except Exception as err: + LOGGER.error(f'{prefix}Failing init - {repr(err)}') + LOGGER.warning(f'{prefix}Continuing without Mlflow') + + +def on_fit_epoch_end(trainer): + """Logs training metrics to Mlflow.""" + if mlflow: + metrics_dict = {f"{re.sub('[()]', '', k)}": float(v) for k, v in trainer.metrics.items()} + run.log_metrics(metrics=metrics_dict, step=trainer.epoch) + + +def on_train_end(trainer): + """Called at end of train loop to log model artifact info.""" + if mlflow: + root_dir = Path(__file__).resolve().parents[3] + run.log_artifact(trainer.last) + run.log_artifact(trainer.best) + run.pyfunc.log_model(artifact_path=experiment_name, + code_path=[str(root_dir)], + artifacts={'model_path': str(trainer.save_dir)}, + python_model=run.pyfunc.PythonModel()) + + +callbacks = { + 'on_pretrain_routine_end': on_pretrain_routine_end, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_train_end': on_train_end} if mlflow else {} diff --git a/ultralytics/utils/callbacks/neptune.py b/ultralytics/utils/callbacks/neptune.py new file mode 100644 index 0000000000000000000000000000000000000000..ce059cd2ebaa10acba50dc70912e24325e6c1564 --- /dev/null +++ 
b/ultralytics/utils/callbacks/neptune.py @@ -0,0 +1,104 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import matplotlib.image as mpimg +import matplotlib.pyplot as plt + +from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING +from ultralytics.utils.torch_utils import model_info_for_loggers + +try: + import neptune + from neptune.types import File + + assert not TESTS_RUNNING # do not log pytest + assert hasattr(neptune, '__version__') + assert SETTINGS['neptune'] is True # verify integration is enabled +except (ImportError, AssertionError): + neptune = None + +run = None # NeptuneAI experiment logger instance + + +def _log_scalars(scalars, step=0): + """Log scalars to the NeptuneAI experiment logger.""" + if run: + for k, v in scalars.items(): + run[k].append(value=v, step=step) + + +def _log_images(imgs_dict, group=''): + """Log scalars to the NeptuneAI experiment logger.""" + if run: + for k, v in imgs_dict.items(): + run[f'{group}/{k}'].upload(File(v)) + + +def _log_plot(title, plot_path): + """Log plots to the NeptuneAI experiment logger.""" + """ + Log image as plot in the plot section of NeptuneAI + + arguments: + title (str) Title of the plot + plot_path (PosixPath or str) Path to the saved image file + """ + img = mpimg.imread(plot_path) + fig = plt.figure() + ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect='auto', xticks=[], yticks=[]) # no ticks + ax.imshow(img) + run[f'Plots/{title}'].upload(fig) + + +def on_pretrain_routine_start(trainer): + """Callback function called before the training routine starts.""" + try: + global run + run = neptune.init_run(project=trainer.args.project or 'YOLOv8', name=trainer.args.name, tags=['YOLOv8']) + run['Configuration/Hyperparameters'] = {k: '' if v is None else v for k, v in vars(trainer.args).items()} + except Exception as e: + LOGGER.warning(f'WARNING ⚠️ NeptuneAI installed but not initialized correctly, not logging this run. 
{e}') + + +def on_train_epoch_end(trainer): + """Callback function called at end of each training epoch.""" + _log_scalars(trainer.label_loss_items(trainer.tloss, prefix='train'), trainer.epoch + 1) + _log_scalars(trainer.lr, trainer.epoch + 1) + if trainer.epoch == 1: + _log_images({f.stem: str(f) for f in trainer.save_dir.glob('train_batch*.jpg')}, 'Mosaic') + + +def on_fit_epoch_end(trainer): + """Callback function called at end of each fit (train+val) epoch.""" + if run and trainer.epoch == 0: + run['Configuration/Model'] = model_info_for_loggers(trainer) + _log_scalars(trainer.metrics, trainer.epoch + 1) + + +def on_val_end(validator): + """Callback function called at end of each validation.""" + if run: + # Log val_labels and val_pred + _log_images({f.stem: str(f) for f in validator.save_dir.glob('val*.jpg')}, 'Validation') + + +def on_train_end(trainer): + """Callback function called at end of training.""" + if run: + # Log final results, CM matrix + PR plots + files = [ + 'results.png', 'confusion_matrix.png', 'confusion_matrix_normalized.png', + *(f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R'))] + files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()] # filter + for f in files: + _log_plot(title=f.stem, plot_path=f) + # Log the final model + run[f'weights/{trainer.args.name or trainer.args.task}/{str(trainer.best.name)}'].upload(File(str( + trainer.best))) + + +callbacks = { + 'on_pretrain_routine_start': on_pretrain_routine_start, + 'on_train_epoch_end': on_train_epoch_end, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_val_end': on_val_end, + 'on_train_end': on_train_end} if neptune else {} diff --git a/ultralytics/utils/callbacks/raytune.py b/ultralytics/utils/callbacks/raytune.py new file mode 100644 index 0000000000000000000000000000000000000000..8f9e472120404346e26dbd1f341de244d3e894ff --- /dev/null +++ b/ultralytics/utils/callbacks/raytune.py @@ -0,0 +1,24 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from 
ultralytics.utils import SETTINGS + +try: + import ray + from ray import tune + from ray.air import session + + assert SETTINGS['raytune'] is True # verify integration is enabled +except (ImportError, AssertionError): + tune = None + + +def on_fit_epoch_end(trainer): + """Sends training metrics to Ray Tune at end of each epoch.""" + if ray.tune.is_session_enabled(): + metrics = trainer.metrics + metrics['epoch'] = trainer.epoch + session.report(metrics) + + +callbacks = { + 'on_fit_epoch_end': on_fit_epoch_end, } if tune else {} diff --git a/ultralytics/utils/callbacks/tensorboard.py b/ultralytics/utils/callbacks/tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..6485a293b130cbc0d2cdb161ff6ffb5293c2eee6 --- /dev/null +++ b/ultralytics/utils/callbacks/tensorboard.py @@ -0,0 +1,50 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, colorstr + +try: + from torch.utils.tensorboard import SummaryWriter + + assert not TESTS_RUNNING # do not log pytest + assert SETTINGS['tensorboard'] is True # verify integration is enabled + +# TypeError for handling 'Descriptors cannot not be created directly.' protobuf errors in Windows +except (ImportError, AssertionError, TypeError): + SummaryWriter = None + +writer = None # TensorBoard SummaryWriter instance + + +def _log_scalars(scalars, step=0): + """Logs scalar values to TensorBoard.""" + if writer: + for k, v in scalars.items(): + writer.add_scalar(k, v, step) + + +def on_pretrain_routine_start(trainer): + """Initialize TensorBoard logging with SummaryWriter.""" + if SummaryWriter: + try: + global writer + writer = SummaryWriter(str(trainer.save_dir)) + prefix = colorstr('TensorBoard: ') + LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/") + except Exception as e: + LOGGER.warning(f'WARNING ⚠️ TensorBoard not initialized correctly, not logging this run. 
{e}') + + +def on_batch_end(trainer): + """Logs scalar statistics at the end of a training batch.""" + _log_scalars(trainer.label_loss_items(trainer.tloss, prefix='train'), trainer.epoch + 1) + + +def on_fit_epoch_end(trainer): + """Logs epoch metrics at end of training epoch.""" + _log_scalars(trainer.metrics, trainer.epoch + 1) + + +callbacks = { + 'on_pretrain_routine_start': on_pretrain_routine_start, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_batch_end': on_batch_end} diff --git a/ultralytics/utils/callbacks/wb.py b/ultralytics/utils/callbacks/wb.py new file mode 100644 index 0000000000000000000000000000000000000000..d67f12e31b0edf674294b183347c11e29ea6b393 --- /dev/null +++ b/ultralytics/utils/callbacks/wb.py @@ -0,0 +1,61 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +from ultralytics.utils import SETTINGS, TESTS_RUNNING +from ultralytics.utils.torch_utils import model_info_for_loggers + +try: + import wandb as wb + + assert hasattr(wb, '__version__') + assert not TESTS_RUNNING # do not log pytest + assert SETTINGS['wandb'] is True # verify integration is enabled +except (ImportError, AssertionError): + wb = None + +_processed_plots = {} + + +def _log_plots(plots, step): + for name, params in plots.items(): + timestamp = params['timestamp'] + if _processed_plots.get(name) != timestamp: + wb.run.log({name.stem: wb.Image(str(name))}, step=step) + _processed_plots[name] = timestamp + + +def on_pretrain_routine_start(trainer): + """Initiate and start project if module is present.""" + wb.run or wb.init(project=trainer.args.project or 'YOLOv8', name=trainer.args.name, config=vars(trainer.args)) + + +def on_fit_epoch_end(trainer): + """Logs training metrics and model information at the end of an epoch.""" + wb.run.log(trainer.metrics, step=trainer.epoch + 1) + _log_plots(trainer.plots, step=trainer.epoch + 1) + _log_plots(trainer.validator.plots, step=trainer.epoch + 1) + if trainer.epoch == 0: + wb.run.log(model_info_for_loggers(trainer), step=trainer.epoch + 1) 
+ + +def on_train_epoch_end(trainer): + """Log metrics and save images at the end of each training epoch.""" + wb.run.log(trainer.label_loss_items(trainer.tloss, prefix='train'), step=trainer.epoch + 1) + wb.run.log(trainer.lr, step=trainer.epoch + 1) + if trainer.epoch == 1: + _log_plots(trainer.plots, step=trainer.epoch + 1) + + +def on_train_end(trainer): + """Save the best model as an artifact at end of training.""" + _log_plots(trainer.validator.plots, step=trainer.epoch + 1) + _log_plots(trainer.plots, step=trainer.epoch + 1) + art = wb.Artifact(type='model', name=f'run_{wb.run.id}_model') + if trainer.best.exists(): + art.add_file(trainer.best) + wb.run.log_artifact(art, aliases=['best']) + + +callbacks = { + 'on_pretrain_routine_start': on_pretrain_routine_start, + 'on_train_epoch_end': on_train_epoch_end, + 'on_fit_epoch_end': on_fit_epoch_end, + 'on_train_end': on_train_end} if wb else {} diff --git a/ultralytics/utils/checks.py b/ultralytics/utils/checks.py new file mode 100644 index 0000000000000000000000000000000000000000..4a8b1adaf7b4a3ad2af41c6376ec3659e6688846 --- /dev/null +++ b/ultralytics/utils/checks.py @@ -0,0 +1,457 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +import contextlib +import glob +import inspect +import math +import os +import platform +import re +import shutil +import subprocess +import time +from pathlib import Path +from typing import Optional + +import cv2 +import numpy as np +import pkg_resources as pkg +import psutil +import requests +import torch +from matplotlib import font_manager + +from ultralytics.utils import (AUTOINSTALL, LOGGER, ONLINE, ROOT, USER_CONFIG_DIR, ThreadingLocked, TryExcept, + clean_url, colorstr, downloads, emojis, is_colab, is_docker, is_jupyter, is_kaggle, + is_online, is_pip_package, url2file) + + +def is_ascii(s) -> bool: + """ + Check if a string is composed of only ASCII characters. + + Args: + s (str): String to be checked. 
+ + Returns: + bool: True if the string is composed only of ASCII characters, False otherwise. + """ + # Convert list, tuple, None, etc. to string + s = str(s) + + # Check if the string is composed of only ASCII characters + return all(ord(c) < 128 for c in s) + + +def check_imgsz(imgsz, stride=32, min_dim=1, max_dim=2, floor=0): + """ + Verify image size is a multiple of the given stride in each dimension. If the image size is not a multiple of the + stride, update it to the nearest multiple of the stride that is greater than or equal to the given floor value. + + Args: + imgsz (int | cList[int]): Image size. + stride (int): Stride value. + min_dim (int): Minimum number of dimensions. + floor (int): Minimum allowed value for image size. + + Returns: + (List[int]): Updated image size. + """ + # Convert stride to integer if it is a tensor + stride = int(stride.max() if isinstance(stride, torch.Tensor) else stride) + + # Convert image size to list if it is an integer + if isinstance(imgsz, int): + imgsz = [imgsz] + elif isinstance(imgsz, (list, tuple)): + imgsz = list(imgsz) + else: + raise TypeError(f"'imgsz={imgsz}' is of invalid type {type(imgsz).__name__}. " + f"Valid imgsz types are int i.e. 'imgsz=640' or list i.e. 'imgsz=[640,640]'") + + # Apply max_dim + if len(imgsz) > max_dim: + msg = "'train' and 'val' imgsz must be an integer, while 'predict' and 'export' imgsz may be a [h, w] list " \ + "or an integer, i.e. 'yolo export imgsz=640,480' or 'yolo export imgsz=640'" + if max_dim != 1: + raise ValueError(f'imgsz={imgsz} is not a valid image size. {msg}') + LOGGER.warning(f"WARNING ⚠️ updating to 'imgsz={max(imgsz)}'. 
{msg}") + imgsz = [max(imgsz)] + # Make image size a multiple of the stride + sz = [max(math.ceil(x / stride) * stride, floor) for x in imgsz] + + # Print warning message if image size was updated + if sz != imgsz: + LOGGER.warning(f'WARNING ⚠️ imgsz={imgsz} must be multiple of max stride {stride}, updating to {sz}') + + # Add missing dimensions if necessary + sz = [sz[0], sz[0]] if min_dim == 2 and len(sz) == 1 else sz[0] if min_dim == 1 and len(sz) == 1 else sz + + return sz + + +def check_version(current: str = '0.0.0', + minimum: str = '0.0.0', + name: str = 'version ', + pinned: bool = False, + hard: bool = False, + verbose: bool = False) -> bool: + """ + Check current version against the required minimum version. + + Args: + current (str): Current version. + minimum (str): Required minimum version. + name (str): Name to be used in warning message. + pinned (bool): If True, versions must match exactly. If False, minimum version must be satisfied. + hard (bool): If True, raise an AssertionError if the minimum version is not met. + verbose (bool): If True, print warning message if minimum version is not met. + + Returns: + (bool): True if minimum version is met, False otherwise. + """ + current, minimum = (pkg.parse_version(x) for x in (current, minimum)) + result = (current == minimum) if pinned else (current >= minimum) # bool + warning_message = f'WARNING ⚠️ {name}{minimum} is required by YOLOv8, but {name}{current} is currently installed' + if hard: + assert result, emojis(warning_message) # assert min requirements met + if verbose and not result: + LOGGER.warning(warning_message) + return result + + +def check_latest_pypi_version(package_name='ultralytics'): + """ + Returns the latest version of a PyPI package without downloading or installing it. + + Parameters: + package_name (str): The name of the package to find the latest version for. + + Returns: + (str): The latest version of the package. 
+ """ + with contextlib.suppress(Exception): + requests.packages.urllib3.disable_warnings() # Disable the InsecureRequestWarning + response = requests.get(f'https://pypi.org/pypi/{package_name}/json', timeout=3) + if response.status_code == 200: + return response.json()['info']['version'] + return None + + +def check_pip_update_available(): + """ + Checks if a new version of the ultralytics package is available on PyPI. + + Returns: + (bool): True if an update is available, False otherwise. + """ + if ONLINE and is_pip_package(): + with contextlib.suppress(Exception): + from ultralytics import __version__ + latest = check_latest_pypi_version() + if pkg.parse_version(__version__) < pkg.parse_version(latest): # update is available + LOGGER.info(f'New https://pypi.org/project/ultralytics/{latest} available 😃 ' + f"Update with 'pip install -U ultralytics'") + return True + return False + + +@ThreadingLocked() +def check_font(font='Arial.ttf'): + """ + Find font locally or download to user's configuration directory if it does not already exist. + + Args: + font (str): Path or name of font. + + Returns: + file (Path): Resolved font file path. + """ + name = Path(font).name + + # Check USER_CONFIG_DIR + file = USER_CONFIG_DIR / name + if file.exists(): + return file + + # Check system fonts + matches = [s for s in font_manager.findSystemFonts() if font in s] + if any(matches): + return matches[0] + + # Download to USER_CONFIG_DIR if missing + url = f'https://ultralytics.com/assets/{name}' + if downloads.is_url(url): + downloads.safe_download(url=url, file=file) + return file + + +def check_python(minimum: str = '3.7.0') -> bool: + """ + Check current python version against the required minimum version. + + Args: + minimum (str): Required minimum version of python. 
+ + Returns: + None + """ + return check_version(platform.python_version(), minimum, name='Python ', hard=True) + + +@TryExcept() +def check_requirements(requirements=ROOT.parent / 'requirements.txt', exclude=(), install=True, cmds=''): + """ + Check if installed dependencies meet YOLOv8 requirements and attempt to auto-update if needed. + + Args: + requirements (Union[Path, str, List[str]]): Path to a requirements.txt file, a single package requirement as a + string, or a list of package requirements as strings. + exclude (Tuple[str]): Tuple of package names to exclude from checking. + install (bool): If True, attempt to auto-update packages that don't meet requirements. + cmds (str): Additional commands to pass to the pip install command when auto-updating. + """ + prefix = colorstr('red', 'bold', 'requirements:') + check_python() # check python version + check_torchvision() # check torch-torchvision compatibility + if isinstance(requirements, Path): # requirements.txt file + file = requirements.resolve() + assert file.exists(), f'{prefix} {file} not found, check failed.' 
+ with file.open() as f: + requirements = [f'{x.name}{x.specifier}' for x in pkg.parse_requirements(f) if x.name not in exclude] + elif isinstance(requirements, str): + requirements = [requirements] + + s = '' # console string + pkgs = [] + for r in requirements: + r_stripped = r.split('/')[-1].replace('.git', '') # replace git+https://org/repo.git -> 'repo' + try: + pkg.require(r_stripped) + except (pkg.VersionConflict, pkg.DistributionNotFound): # exception if requirements not met + try: # attempt to import (slower but more accurate) + import importlib + importlib.import_module(next(pkg.parse_requirements(r_stripped)).name) + except ImportError: + s += f'"{r}" ' + pkgs.append(r) + + if s: + if install and AUTOINSTALL: # check environment variable + n = len(pkgs) # number of packages updates + LOGGER.info(f"{prefix} Ultralytics requirement{'s' * (n > 1)} {pkgs} not found, attempting AutoUpdate...") + try: + t = time.time() + assert is_online(), 'AutoUpdate skipped (offline)' + LOGGER.info(subprocess.check_output(f'pip install --no-cache {s} {cmds}', shell=True).decode()) + dt = time.time() - t + LOGGER.info( + f"{prefix} AutoUpdate success ✅ {dt:.1f}s, installed {n} package{'s' * (n > 1)}: {pkgs}\n" + f"{prefix} ⚠️ {colorstr('bold', 'Restart runtime or rerun command for updates to take effect')}\n") + except Exception as e: + LOGGER.warning(f'{prefix} ❌ {e}') + return False + else: + return False + + return True + + +def check_torchvision(): + """ + Checks the installed versions of PyTorch and Torchvision to ensure they're compatible. + + This function checks the installed versions of PyTorch and Torchvision, and warns if they're incompatible according + to the provided compatibility table based on https://github.com/pytorch/vision#installation. The + compatibility table is a dictionary where the keys are PyTorch versions and the values are lists of compatible + Torchvision versions. 
+ """ + + import torchvision + + # Compatibility table + compatibility_table = {'2.0': ['0.15'], '1.13': ['0.14'], '1.12': ['0.13']} + + # Extract only the major and minor versions + v_torch = '.'.join(torch.__version__.split('+')[0].split('.')[:2]) + v_torchvision = '.'.join(torchvision.__version__.split('+')[0].split('.')[:2]) + + if v_torch in compatibility_table: + compatible_versions = compatibility_table[v_torch] + if all(pkg.parse_version(v_torchvision) != pkg.parse_version(v) for v in compatible_versions): + print(f'WARNING ⚠️ torchvision=={v_torchvision} is incompatible with torch=={v_torch}.\n' + f"Run 'pip install torchvision=={compatible_versions[0]}' to fix torchvision or " + "'pip install -U torch torchvision' to update both.\n" + 'For a full compatibility table see https://github.com/pytorch/vision#installation') + + +def check_suffix(file='yolov8n.pt', suffix='.pt', msg=''): + """Check file(s) for acceptable suffix.""" + if file and suffix: + if isinstance(suffix, str): + suffix = (suffix, ) + for f in file if isinstance(file, (list, tuple)) else [file]: + s = Path(f).suffix.lower().strip() # file suffix + if len(s): + assert s in suffix, f'{msg}{f} acceptable suffix is {suffix}, not {s}' + + +def check_yolov5u_filename(file: str, verbose: bool = True): + """Replace legacy YOLOv5 filenames with updated YOLOv5u filenames.""" + if ('yolov3' in file or 'yolov5' in file) and 'u' not in file: + original_file = file + file = re.sub(r'(.*yolov5([nsmlx]))\.pt', '\\1u.pt', file) # i.e. yolov5n.pt -> yolov5nu.pt + file = re.sub(r'(.*yolov5([nsmlx])6)\.pt', '\\1u.pt', file) # i.e. yolov5n6.pt -> yolov5n6u.pt + file = re.sub(r'(.*yolov3(|-tiny|-spp))\.pt', '\\1u.pt', file) # i.e. 
yolov3-spp.pt -> yolov3-sppu.pt + if file != original_file and verbose: + LOGGER.info(f"PRO TIP 💡 Replace 'model={original_file}' with new 'model={file}'.\nYOLOv5 'u' models are " + f'trained with https://github.com/ultralytics/ultralytics and feature improved performance vs ' + f'standard YOLOv5 models trained with https://github.com/ultralytics/yolov5.\n') + return file + + +def check_file(file, suffix='', download=True, hard=True): + """Search/download file (if necessary) and return path.""" + check_suffix(file, suffix) # optional + file = str(file).strip() # convert to string and strip spaces + file = check_yolov5u_filename(file) # yolov5n -> yolov5nu + if not file or ('://' not in file and Path(file).exists()): # exists ('://' check required in Windows Python<3.10) + return file + elif download and file.lower().startswith(('https://', 'http://', 'rtsp://', 'rtmp://')): # download + url = file # warning: Pathlib turns :// -> :/ + file = url2file(file) # '%2F' to '/', split https://url.com/file.txt?auth + if Path(file).exists(): + LOGGER.info(f'Found {clean_url(url)} locally at {file}') # file already exists + else: + downloads.safe_download(url=url, file=file, unzip=False) + return file + else: # search + files = glob.glob(str(ROOT / 'cfg' / '**' / file), recursive=True) # find file + if not files and hard: + raise FileNotFoundError(f"'{file}' does not exist") + elif len(files) > 1 and hard: + raise FileNotFoundError(f"Multiple files match '{file}', specify exact path: {files}") + return files[0] if len(files) else [] # return file + + +def check_yaml(file, suffix=('.yaml', '.yml'), hard=True): + """Search/download YAML file (if necessary) and return path, checking suffix.""" + return check_file(file, suffix, hard=hard) + + +def check_imshow(warn=False): + """Check if environment supports image displays.""" + try: + assert not any((is_colab(), is_kaggle(), is_docker())) + cv2.imshow('test', np.zeros((1, 1, 3))) + cv2.waitKey(1) + cv2.destroyAllWindows() + 
cv2.waitKey(1) + return True + except Exception as e: + if warn: + LOGGER.warning(f'WARNING ⚠️ Environment does not support cv2.imshow() or PIL Image.show()\n{e}') + return False + + +def check_yolo(verbose=True, device=''): + """Return a human-readable YOLO software and hardware summary.""" + from ultralytics.utils.torch_utils import select_device + + if is_jupyter(): + if check_requirements('wandb', install=False): + os.system('pip uninstall -y wandb') # uninstall wandb: unwanted account creation prompt with infinite hang + if is_colab(): + shutil.rmtree('sample_data', ignore_errors=True) # remove colab /sample_data directory + + if verbose: + # System info + gib = 1 << 30 # bytes per GiB + ram = psutil.virtual_memory().total + total, used, free = shutil.disk_usage('/') + s = f'({os.cpu_count()} CPUs, {ram / gib:.1f} GB RAM, {(total - free) / gib:.1f}/{total / gib:.1f} GB disk)' + with contextlib.suppress(Exception): # clear display if ipython is installed + from IPython import display + display.clear_output() + else: + s = '' + + select_device(device=device, newline=False) + LOGGER.info(f'Setup complete ✅ {s}') + + +def check_amp(model): + """ + This function checks the PyTorch Automatic Mixed Precision (AMP) functionality of a YOLOv8 model. + If the checks fail, it means there are anomalies with AMP on the system that may cause NaN losses or zero-mAP + results, so AMP will be disabled during training. + + Args: + model (nn.Module): A YOLOv8 model instance. + + Returns: + (bool): Returns True if the AMP functionality works correctly with YOLOv8 model, else False. + + Raises: + AssertionError: If the AMP checks fail, indicating anomalies with the AMP functionality on the system. 
+ """ + device = next(model.parameters()).device # get model device + if device.type in ('cpu', 'mps'): + return False # AMP only used on CUDA devices + + def amp_allclose(m, im): + """All close FP32 vs AMP results.""" + a = m(im, device=device, verbose=False)[0].boxes.data # FP32 inference + with torch.cuda.amp.autocast(True): + b = m(im, device=device, verbose=False)[0].boxes.data # AMP inference + del m + return a.shape == b.shape and torch.allclose(a, b.float(), atol=0.5) # close to 0.5 absolute tolerance + + f = ROOT / 'assets/bus.jpg' # image to check + im = f if f.exists() else 'https://ultralytics.com/images/bus.jpg' if ONLINE else np.ones((640, 640, 3)) + prefix = colorstr('AMP: ') + LOGGER.info(f'{prefix}running Automatic Mixed Precision (AMP) checks with YOLOv8n...') + warning_msg = "Setting 'amp=True'. If you experience zero-mAP or NaN losses you can disable AMP with amp=False." + try: + from ultralytics import YOLO + assert amp_allclose(YOLO('yolov8n.pt'), im) + LOGGER.info(f'{prefix}checks passed ✅') + except ConnectionError: + LOGGER.warning(f'{prefix}checks skipped ⚠️, offline and unable to download YOLOv8n. {warning_msg}') + except (AttributeError, ModuleNotFoundError): + LOGGER.warning( + f'{prefix}checks skipped ⚠️. Unable to load YOLOv8n due to possible Ultralytics package modifications. {warning_msg}' + ) + except AssertionError: + LOGGER.warning(f'{prefix}checks failed ❌. Anomalies were detected with AMP on your system that may lead to ' + f'NaN losses or zero-mAP results, so AMP will be disabled during training.') + return False + return True + + +def git_describe(path=ROOT): # path must be a directory + """Return human-readable git description, i.e. 
v5.0-5-g3e25f1e https://git-scm.com/docs/git-describe.""" + try: + assert (Path(path) / '.git').is_dir() + return subprocess.check_output(f'git -C {path} describe --tags --long --always', shell=True).decode()[:-1] + except AssertionError: + return '' + + +def print_args(args: Optional[dict] = None, show_file=True, show_func=False): + """Print function arguments (optional args dict).""" + + def strip_auth(v): + """Clean longer Ultralytics HUB URLs by stripping potential authentication information.""" + return clean_url(v) if (isinstance(v, str) and v.startswith('http') and len(v) > 100) else v + + x = inspect.currentframe().f_back # previous frame + file, _, func, _, _ = inspect.getframeinfo(x) + if args is None: # get args automatically + args, _, _, frm = inspect.getargvalues(x) + args = {k: v for k, v in frm.items() if k in args} + try: + file = Path(file).resolve().relative_to(ROOT).with_suffix('') + except ValueError: + file = Path(file).stem + s = (f'{file}: ' if show_file else '') + (f'{func}: ' if show_func else '') + LOGGER.info(colorstr(s) + ', '.join(f'{k}={strip_auth(v)}' for k, v in args.items())) diff --git a/ultralytics/utils/dist.py b/ultralytics/utils/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..8ba232b314c7a6105d9339a443f02941ff40b1df --- /dev/null +++ b/ultralytics/utils/dist.py @@ -0,0 +1,67 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import os +import re +import shutil +import socket +import sys +import tempfile +from pathlib import Path + +from . import USER_CONFIG_DIR +from .torch_utils import TORCH_1_9 + + +def find_free_network_port() -> int: + """Finds a free port on localhost. + + It is useful in single-node training when we don't want to connect to a real main node but have to set the + `MASTER_PORT` environment variable. 
+ """ + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('127.0.0.1', 0)) + return s.getsockname()[1] # port + + +def generate_ddp_file(trainer): + """Generates a DDP file and returns its file name.""" + module, name = f'{trainer.__class__.__module__}.{trainer.__class__.__name__}'.rsplit('.', 1) + + content = f'''overrides = {vars(trainer.args)} \nif __name__ == "__main__": + from {module} import {name} + from ultralytics.utils import DEFAULT_CFG_DICT + + cfg = DEFAULT_CFG_DICT.copy() + cfg.update(save_dir='') # handle the extra key 'save_dir' + trainer = {name}(cfg=cfg, overrides=overrides) + trainer.train()''' + (USER_CONFIG_DIR / 'DDP').mkdir(exist_ok=True) + with tempfile.NamedTemporaryFile(prefix='_temp_', + suffix=f'{id(trainer)}.py', + mode='w+', + encoding='utf-8', + dir=USER_CONFIG_DIR / 'DDP', + delete=False) as file: + file.write(content) + return file.name + + +def generate_ddp_command(world_size, trainer): + """Generates and returns command for distributed training.""" + import __main__ # noqa local import to avoid https://github.com/Lightning-AI/lightning/issues/15218 + if not trainer.resume: + shutil.rmtree(trainer.save_dir) # remove the save_dir + file = str(Path(sys.argv[0]).resolve()) + safe_pattern = re.compile(r'^[a-zA-Z0-9_. 
/\\-]{1,128}$') # allowed characters and maximum of 100 characters + if not (safe_pattern.match(file) and Path(file).exists() and file.endswith('.py')): # using CLI + file = generate_ddp_file(trainer) + dist_cmd = 'torch.distributed.run' if TORCH_1_9 else 'torch.distributed.launch' + port = find_free_network_port() + cmd = [sys.executable, '-m', dist_cmd, '--nproc_per_node', f'{world_size}', '--master_port', f'{port}', file] + return cmd, file + + +def ddp_cleanup(trainer, file): + """Delete temp file if created.""" + if f'{id(trainer)}.py' in file: # if temp_file suffix in file + os.remove(file) diff --git a/ultralytics/utils/downloads.py b/ultralytics/utils/downloads.py new file mode 100644 index 0000000000000000000000000000000000000000..c506c09371a4996dc2d541d7cd4d59c9a40db2bb --- /dev/null +++ b/ultralytics/utils/downloads.py @@ -0,0 +1,275 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import contextlib +import shutil +import subprocess +from itertools import repeat +from multiprocessing.pool import ThreadPool +from pathlib import Path +from urllib import parse, request +from zipfile import BadZipFile, ZipFile, is_zipfile + +import requests +import torch +from tqdm import tqdm + +from ultralytics.utils import LOGGER, checks, clean_url, emojis, is_online, url2file + +GITHUB_ASSET_NAMES = [f'yolov8{k}{suffix}.pt' for k in 'nsmlx' for suffix in ('', '6', '-cls', '-seg', '-pose')] + \ + [f'yolov5{k}u.pt' for k in 'nsmlx'] + \ + [f'yolov3{k}u.pt' for k in ('', '-spp', '-tiny')] + \ + [f'yolo_nas_{k}.pt' for k in 'sml'] + \ + [f'sam_{k}.pt' for k in 'bl'] + \ + [f'FastSAM-{k}.pt' for k in 'sx'] + \ + [f'rtdetr-{k}.pt' for k in 'lx'] + \ + ['mobile_sam.pt'] +GITHUB_ASSET_STEMS = [Path(k).stem for k in GITHUB_ASSET_NAMES] + + +def is_url(url, check=True): + """Check if string is URL and check if URL exists.""" + with contextlib.suppress(Exception): + url = str(url) + result = parse.urlparse(url) + assert all([result.scheme, result.netloc]) # check if is url + if check: 
+ with request.urlopen(url) as response: + return response.getcode() == 200 # check if exists online + return True + return False + + +def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX'), exist_ok=False): + """ + Unzips a *.zip file to the specified path, excluding files containing strings in the exclude list. + + If the zipfile does not contain a single top-level directory, the function will create a new + directory with the same name as the zipfile (without the extension) to extract its contents. + If a path is not provided, the function will use the parent directory of the zipfile as the default path. + + Args: + file (str): The path to the zipfile to be extracted. + path (str, optional): The path to extract the zipfile to. Defaults to None. + exclude (tuple, optional): A tuple of filename strings to be excluded. Defaults to ('.DS_Store', '__MACOSX'). + exist_ok (bool, optional): Whether to overwrite existing contents if they exist. Defaults to False. + + Raises: + BadZipFile: If the provided file does not exist or is not a valid zipfile. + + Returns: + (Path): The path to the directory where the zipfile was extracted. 
+ """ + if not (Path(file).exists() and is_zipfile(file)): + raise BadZipFile(f"File '{file}' does not exist or is a bad zip file.") + if path is None: + path = Path(file).parent # default path + + # Unzip the file contents + with ZipFile(file) as zipObj: + file_list = [f for f in zipObj.namelist() if all(x not in f for x in exclude)] + top_level_dirs = {Path(f).parts[0] for f in file_list} + + if len(top_level_dirs) > 1 or not file_list[0].endswith('/'): + path = Path(path) / Path(file).stem # define new unzip directory + + # Check if destination directory already exists and contains files + extract_path = Path(path) / list(top_level_dirs)[0] + if extract_path.exists() and any(extract_path.iterdir()) and not exist_ok: + # If it exists and is not empty, return the path without unzipping + LOGGER.info(f'Skipping {file} unzip (already unzipped)') + return path + + for f in file_list: + zipObj.extract(f, path=path) + + return path # return unzip dir + + +def check_disk_space(url='https://ultralytics.com/assets/coco128.zip', sf=1.5, hard=True): + """ + Check if there is sufficient disk space to download and store a file. + + Args: + url (str, optional): The URL to the file. Defaults to 'https://ultralytics.com/assets/coco128.zip'. + sf (float, optional): Safety factor, the multiplier for the required free space. Defaults to 2.0. + hard (bool, optional): Whether to throw an error or not on insufficient disk space. Defaults to True. + + Returns: + (bool): True if there is sufficient disk space, False otherwise. 
+ """ + with contextlib.suppress(Exception): + gib = 1 << 30 # bytes per GiB + data = int(requests.head(url).headers['Content-Length']) / gib # file size (GB) + total, used, free = (x / gib for x in shutil.disk_usage('/')) # bytes + if data * sf < free: + return True # sufficient space + + # Insufficient space + text = (f'WARNING ⚠️ Insufficient free disk space {free:.1f} GB < {data * sf:.3f} GB required, ' + f'Please free {data * sf - free:.1f} GB additional disk space and try again.') + if hard: + raise MemoryError(text) + else: + LOGGER.warning(text) + return False + + # Pass if error + return True + + +def safe_download(url, + file=None, + dir=None, + unzip=True, + delete=False, + curl=False, + retry=3, + min_bytes=1E0, + progress=True): + """ + Downloads files from a URL, with options for retrying, unzipping, and deleting the downloaded file. + + Args: + url (str): The URL of the file to be downloaded. + file (str, optional): The filename of the downloaded file. + If not provided, the file will be saved with the same name as the URL. + dir (str, optional): The directory to save the downloaded file. + If not provided, the file will be saved in the current working directory. + unzip (bool, optional): Whether to unzip the downloaded file. Default: True. + delete (bool, optional): Whether to delete the downloaded file after unzipping. Default: False. + curl (bool, optional): Whether to use curl command line tool for downloading. Default: False. + retry (int, optional): The number of times to retry the download in case of failure. Default: 3. + min_bytes (float, optional): The minimum number of bytes that the downloaded file should have, to be considered + a successful download. Default: 1E0. + progress (bool, optional): Whether to display a progress bar during the download. Default: True. 
+ """ + f = dir / url2file(url) if dir else Path(file) # URL converted to filename + if '://' not in str(url) and Path(url).is_file(): # URL exists ('://' check required in Windows Python<3.10) + f = Path(url) # filename + elif not f.is_file(): # URL and file do not exist + assert dir or file, 'dir or file required for download' + f = dir / url2file(url) if dir else Path(file) + desc = f"Downloading {clean_url(url)} to '{f}'" + LOGGER.info(f'{desc}...') + f.parent.mkdir(parents=True, exist_ok=True) # make directory if missing + check_disk_space(url) + for i in range(retry + 1): + try: + if curl or i > 0: # curl download with retry, continue + s = 'sS' * (not progress) # silent + r = subprocess.run(['curl', '-#', f'-{s}L', url, '-o', f, '--retry', '3', '-C', '-']).returncode + assert r == 0, f'Curl return value {r}' + else: # urllib download + method = 'torch' + if method == 'torch': + torch.hub.download_url_to_file(url, f, progress=progress) + else: + from ultralytics.utils import TQDM_BAR_FORMAT + with request.urlopen(url) as response, tqdm(total=int(response.getheader('Content-Length', 0)), + desc=desc, + disable=not progress, + unit='B', + unit_scale=True, + unit_divisor=1024, + bar_format=TQDM_BAR_FORMAT) as pbar: + with open(f, 'wb') as f_opened: + for data in response: + f_opened.write(data) + pbar.update(len(data)) + + if f.exists(): + if f.stat().st_size > min_bytes: + break # success + f.unlink() # remove partial downloads + except Exception as e: + if i == 0 and not is_online(): + raise ConnectionError(emojis(f'❌ Download failure for {url}. Environment is not online.')) from e + elif i >= retry: + raise ConnectionError(emojis(f'❌ Download failure for {url}. 
Retry limit reached.')) from e + LOGGER.warning(f'⚠️ Download failure, retrying {i + 1}/{retry} {url}...') + + if unzip and f.exists() and f.suffix in ('', '.zip', '.tar', '.gz'): + unzip_dir = dir or f.parent # unzip to dir if provided else unzip in place + LOGGER.info(f'Unzipping {f} to {unzip_dir.absolute()}...') + if is_zipfile(f): + unzip_dir = unzip_file(file=f, path=unzip_dir) # unzip + elif f.suffix == '.tar': + subprocess.run(['tar', 'xf', f, '--directory', unzip_dir], check=True) # unzip + elif f.suffix == '.gz': + subprocess.run(['tar', 'xfz', f, '--directory', unzip_dir], check=True) # unzip + if delete: + f.unlink() # remove zip + return unzip_dir + + +def get_github_assets(repo='ultralytics/assets', version='latest', retry=False): + """Return GitHub repo tag and assets (i.e. ['yolov8n.pt', 'yolov8s.pt', ...]).""" + if version != 'latest': + version = f'tags/{version}' # i.e. tags/v6.2 + url = f'https://api.github.com/repos/{repo}/releases/{version}' + r = requests.get(url) # github api + if r.status_code != 200 and retry: + r = requests.get(url) # try again + data = r.json() + return data['tag_name'], [x['name'] for x in data['assets']] # tag, assets + + +def attempt_download_asset(file, repo='ultralytics/assets', release='v0.0.0'): + """Attempt file download from GitHub release assets if not found locally. release = 'latest', 'v6.2', etc.""" + from ultralytics.utils import SETTINGS # scoped for circular import + + # YOLOv3/5u updates + file = str(file) + file = checks.check_yolov5u_filename(file) + file = Path(file.strip().replace("'", '')) + if file.exists(): + return str(file) + elif (SETTINGS['weights_dir'] / file).exists(): + return str(SETTINGS['weights_dir'] / file) + else: + # URL specified + name = Path(parse.unquote(str(file))).name # decode '%2F' to '/' etc. 
+ if str(file).startswith(('http:/', 'https:/')): # download + url = str(file).replace(':/', '://') # Pathlib turns :// -> :/ + file = url2file(name) # parse authentication https://url.com/file.txt?auth... + if Path(file).is_file(): + LOGGER.info(f'Found {clean_url(url)} locally at {file}') # file already exists + else: + safe_download(url=url, file=file, min_bytes=1E5) + return file + + # GitHub assets + assets = GITHUB_ASSET_NAMES + try: + tag, assets = get_github_assets(repo, release) + except Exception: + try: + tag, assets = get_github_assets(repo) # latest release + except Exception: + try: + tag = subprocess.check_output(['git', 'tag']).decode().split()[-1] + except Exception: + tag = release + + file.parent.mkdir(parents=True, exist_ok=True) # make parent dir (if required) + if name in assets: + safe_download(url=f'https://github.com/{repo}/releases/download/{tag}/{name}', file=file, min_bytes=1E5) + + return str(file) + + +def download(url, dir=Path.cwd(), unzip=True, delete=False, curl=False, threads=1, retry=3): + """Downloads and unzips files concurrently if threads > 1, else sequentially.""" + dir = Path(dir) + dir.mkdir(parents=True, exist_ok=True) # make directory + if threads > 1: + with ThreadPool(threads) as pool: + pool.map( + lambda x: safe_download( + url=x[0], dir=x[1], unzip=unzip, delete=delete, curl=curl, retry=retry, progress=threads <= 1), + zip(url, repeat(dir))) + pool.close() + pool.join() + else: + for u in [url] if isinstance(url, (str, Path)) else url: + safe_download(url=u, dir=dir, unzip=unzip, delete=delete, curl=curl, retry=retry) diff --git a/ultralytics/utils/errors.py b/ultralytics/utils/errors.py new file mode 100644 index 0000000000000000000000000000000000000000..0d776f330635f89668b5a1939d8bd11cd47d6e0c --- /dev/null +++ b/ultralytics/utils/errors.py @@ -0,0 +1,10 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from ultralytics.utils import emojis + + +class HUBModelError(Exception): + + def __init__(self, message='Model not 
found. Please check model URL and try again.'): + """Create an exception for when a model is not found.""" + super().__init__(emojis(message)) diff --git a/ultralytics/utils/files.py b/ultralytics/utils/files.py new file mode 100644 index 0000000000000000000000000000000000000000..0c757fa75cc5bc971c20278da5ffb87c74b0710c --- /dev/null +++ b/ultralytics/utils/files.py @@ -0,0 +1,155 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import contextlib +import glob +import os +import shutil +import tempfile +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path + + +class WorkingDirectory(contextlib.ContextDecorator): + """Usage: @WorkingDirectory(dir) decorator or 'with WorkingDirectory(dir):' context manager.""" + + def __init__(self, new_dir): + """Sets the working directory to 'new_dir' upon instantiation.""" + self.dir = new_dir # new dir + self.cwd = Path.cwd().resolve() # current dir + + def __enter__(self): + """Changes the current directory to the specified directory.""" + os.chdir(self.dir) + + def __exit__(self, exc_type, exc_val, exc_tb): + """Restore the current working directory on context exit.""" + os.chdir(self.cwd) + + +@contextmanager +def spaces_in_path(path): + """ + Context manager to handle paths with spaces in their names. + If a path contains spaces, it replaces them with underscores, copies the file/directory to the new path, + executes the context code block, then copies the file/directory back to its original location. + + Args: + path (str | Path): The original path. + + Yields: + (Path): Temporary path with spaces replaced by underscores if spaces were present, otherwise the original path. 
+ + Example: + ```python + with spaces_in_path('/path/with spaces') as new_path: + # your code here + ``` + """ + + # If path has spaces, replace them with underscores + if ' ' in str(path): + string = isinstance(path, str) # input type + path = Path(path) + + # Create a temporary directory and construct the new path + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) / path.name.replace(' ', '_') + + # Copy file/directory + if path.is_dir(): + # tmp_path.mkdir(parents=True, exist_ok=True) + shutil.copytree(path, tmp_path) + elif path.is_file(): + tmp_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(path, tmp_path) + + try: + # Yield the temporary path + yield str(tmp_path) if string else tmp_path + + finally: + # Copy file/directory back + if tmp_path.is_dir(): + shutil.copytree(tmp_path, path, dirs_exist_ok=True) + elif tmp_path.is_file(): + shutil.copy2(tmp_path, path) # Copy back the file + + else: + # If there are no spaces, just yield the original path + yield path + + +def increment_path(path, exist_ok=False, sep='', mkdir=False): + """ + Increments a file or directory path, i.e. runs/exp --> runs/exp{sep}2, runs/exp{sep}3, ... etc. + + If the path exists and exist_ok is not set to True, the path will be incremented by appending a number and sep to + the end of the path. If the path is a file, the file extension will be preserved. If the path is a directory, the + number will be appended directly to the end of the path. If mkdir is set to True, the path will be created as a + directory if it does not already exist. + + Args: + path (str, pathlib.Path): Path to increment. + exist_ok (bool, optional): If True, the path will not be incremented and returned as-is. Defaults to False. + sep (str, optional): Separator to use between the path and the incrementation number. Defaults to ''. + mkdir (bool, optional): Create a directory if it does not exist. Defaults to False. + + Returns: + (pathlib.Path): Incremented path. 
+ """ + path = Path(path) # os-agnostic + if path.exists() and not exist_ok: + path, suffix = (path.with_suffix(''), path.suffix) if path.is_file() else (path, '') + + # Method 1 + for n in range(2, 9999): + p = f'{path}{sep}{n}{suffix}' # increment path + if not os.path.exists(p): # + break + path = Path(p) + + if mkdir: + path.mkdir(parents=True, exist_ok=True) # make directory + + return path + + +def file_age(path=__file__): + """Return days since last file update.""" + dt = (datetime.now() - datetime.fromtimestamp(Path(path).stat().st_mtime)) # delta + return dt.days # + dt.seconds / 86400 # fractional days + + +def file_date(path=__file__): + """Return human-readable file modification date, i.e. '2021-3-26'.""" + t = datetime.fromtimestamp(Path(path).stat().st_mtime) + return f'{t.year}-{t.month}-{t.day}' + + +def file_size(path): + """Return file/dir size (MB).""" + if isinstance(path, (str, Path)): + mb = 1 << 20 # bytes to MiB (1024 ** 2) + path = Path(path) + if path.is_file(): + return path.stat().st_size / mb + elif path.is_dir(): + return sum(f.stat().st_size for f in path.glob('**/*') if f.is_file()) / mb + return 0.0 + + +def get_latest_run(search_dir='.'): + """Return path to most recent 'last.pt' in /runs (i.e. 
to --resume from).""" + last_list = glob.glob(f'{search_dir}/**/last*.pt', recursive=True) + return max(last_list, key=os.path.getctime) if last_list else '' + + +def make_dirs(dir='new_dir/'): + """Create directories.""" + dir = Path(dir) + if dir.exists(): + shutil.rmtree(dir) # delete dir + for p in dir, dir / 'labels', dir / 'images': + p.mkdir(parents=True, exist_ok=True) # make dir + return dir diff --git a/ultralytics/utils/instance.py b/ultralytics/utils/instance.py new file mode 100644 index 0000000000000000000000000000000000000000..7d82274aff47723bbaf5731dc8f7481835ee2045 --- /dev/null +++ b/ultralytics/utils/instance.py @@ -0,0 +1,392 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from collections import abc +from itertools import repeat +from numbers import Number +from typing import List + +import numpy as np + +from .ops import ltwh2xywh, ltwh2xyxy, resample_segments, xywh2ltwh, xywh2xyxy, xyxy2ltwh, xyxy2xywh + + +def _ntuple(n): + """From PyTorch internals.""" + + def parse(x): + """Parse bounding boxes format between XYWH and LTWH.""" + return x if isinstance(x, abc.Iterable) else tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) +to_4tuple = _ntuple(4) + +# `xyxy` means left top and right bottom +# `xywh` means center x, center y and width, height(yolo format) +# `ltwh` means left top and width, height(coco format) +_formats = ['xyxy', 'xywh', 'ltwh'] + +__all__ = 'Bboxes', # tuple or list + + +class Bboxes: + """Now only numpy is supported.""" + + def __init__(self, bboxes, format='xyxy') -> None: + assert format in _formats, f'Invalid bounding box format: {format}, format must be one of {_formats}' + bboxes = bboxes[None, :] if bboxes.ndim == 1 else bboxes + assert bboxes.ndim == 2 + assert bboxes.shape[1] == 4 + self.bboxes = bboxes + self.format = format + # self.normalized = normalized + + # def convert(self, format): + # assert format in _formats + # if self.format == format: + # bboxes = self.bboxes + # elif self.format == 
"xyxy": + # if format == "xywh": + # bboxes = xyxy2xywh(self.bboxes) + # else: + # bboxes = xyxy2ltwh(self.bboxes) + # elif self.format == "xywh": + # if format == "xyxy": + # bboxes = xywh2xyxy(self.bboxes) + # else: + # bboxes = xywh2ltwh(self.bboxes) + # else: + # if format == "xyxy": + # bboxes = ltwh2xyxy(self.bboxes) + # else: + # bboxes = ltwh2xywh(self.bboxes) + # + # return Bboxes(bboxes, format) + + def convert(self, format): + """Converts bounding box format from one type to another.""" + assert format in _formats, f'Invalid bounding box format: {format}, format must be one of {_formats}' + if self.format == format: + return + elif self.format == 'xyxy': + bboxes = xyxy2xywh(self.bboxes) if format == 'xywh' else xyxy2ltwh(self.bboxes) + elif self.format == 'xywh': + bboxes = xywh2xyxy(self.bboxes) if format == 'xyxy' else xywh2ltwh(self.bboxes) + else: + bboxes = ltwh2xyxy(self.bboxes) if format == 'xyxy' else ltwh2xywh(self.bboxes) + self.bboxes = bboxes + self.format = format + + def areas(self): + """Return box areas.""" + self.convert('xyxy') + return (self.bboxes[:, 2] - self.bboxes[:, 0]) * (self.bboxes[:, 3] - self.bboxes[:, 1]) + + # def denormalize(self, w, h): + # if not self.normalized: + # return + # assert (self.bboxes <= 1.0).all() + # self.bboxes[:, 0::2] *= w + # self.bboxes[:, 1::2] *= h + # self.normalized = False + # + # def normalize(self, w, h): + # if self.normalized: + # return + # assert (self.bboxes > 1.0).any() + # self.bboxes[:, 0::2] /= w + # self.bboxes[:, 1::2] /= h + # self.normalized = True + + def mul(self, scale): + """ + Args: + scale (tuple | list | int): the scale for four coords. 
+ """ + if isinstance(scale, Number): + scale = to_4tuple(scale) + assert isinstance(scale, (tuple, list)) + assert len(scale) == 4 + self.bboxes[:, 0] *= scale[0] + self.bboxes[:, 1] *= scale[1] + self.bboxes[:, 2] *= scale[2] + self.bboxes[:, 3] *= scale[3] + + def add(self, offset): + """ + Args: + offset (tuple | list | int): the offset for four coords. + """ + if isinstance(offset, Number): + offset = to_4tuple(offset) + assert isinstance(offset, (tuple, list)) + assert len(offset) == 4 + self.bboxes[:, 0] += offset[0] + self.bboxes[:, 1] += offset[1] + self.bboxes[:, 2] += offset[2] + self.bboxes[:, 3] += offset[3] + + def __len__(self): + """Return the number of boxes.""" + return len(self.bboxes) + + @classmethod + def concatenate(cls, boxes_list: List['Bboxes'], axis=0) -> 'Bboxes': + """ + Concatenate a list of Bboxes objects into a single Bboxes object. + + Args: + boxes_list (List[Bboxes]): A list of Bboxes objects to concatenate. + axis (int, optional): The axis along which to concatenate the bounding boxes. + Defaults to 0. + + Returns: + Bboxes: A new Bboxes object containing the concatenated bounding boxes. + + Note: + The input should be a list or tuple of Bboxes objects. + """ + assert isinstance(boxes_list, (list, tuple)) + if not boxes_list: + return cls(np.empty(0)) + assert all(isinstance(box, Bboxes) for box in boxes_list) + + if len(boxes_list) == 1: + return boxes_list[0] + return cls(np.concatenate([b.bboxes for b in boxes_list], axis=axis)) + + def __getitem__(self, index) -> 'Bboxes': + """ + Retrieve a specific bounding box or a set of bounding boxes using indexing. + + Args: + index (int, slice, or np.ndarray): The index, slice, or boolean array to select + the desired bounding boxes. + + Returns: + Bboxes: A new Bboxes object containing the selected bounding boxes. + + Raises: + AssertionError: If the indexed bounding boxes do not form a 2-dimensional matrix. 
+ + Note: + When using boolean indexing, make sure to provide a boolean array with the same + length as the number of bounding boxes. + """ + if isinstance(index, int): + return Bboxes(self.bboxes[index].view(1, -1)) + b = self.bboxes[index] + assert b.ndim == 2, f'Indexing on Bboxes with {index} failed to return a matrix!' + return Bboxes(b) + + +class Instances: + + def __init__(self, bboxes, segments=None, keypoints=None, bbox_format='xywh', normalized=True) -> None: + """ + Args: + bboxes (ndarray): bboxes with shape [N, 4]. + segments (list | ndarray): segments. + keypoints (ndarray): keypoints(x, y, visible) with shape [N, 17, 3]. + """ + if segments is None: + segments = [] + self._bboxes = Bboxes(bboxes=bboxes, format=bbox_format) + self.keypoints = keypoints + self.normalized = normalized + + if len(segments) > 0: + # list[np.array(1000, 2)] * num_samples + segments = resample_segments(segments) + # (N, 1000, 2) + segments = np.stack(segments, axis=0) + else: + segments = np.zeros((0, 1000, 2), dtype=np.float32) + self.segments = segments + + def convert_bbox(self, format): + """Convert bounding box format.""" + self._bboxes.convert(format=format) + + @property + def bbox_areas(self): + """Calculate the area of bounding boxes.""" + return self._bboxes.areas() + + def scale(self, scale_w, scale_h, bbox_only=False): + """this might be similar with denormalize func but without normalized sign.""" + self._bboxes.mul(scale=(scale_w, scale_h, scale_w, scale_h)) + if bbox_only: + return + self.segments[..., 0] *= scale_w + self.segments[..., 1] *= scale_h + if self.keypoints is not None: + self.keypoints[..., 0] *= scale_w + self.keypoints[..., 1] *= scale_h + + def denormalize(self, w, h): + """Denormalizes boxes, segments, and keypoints from normalized coordinates.""" + if not self.normalized: + return + self._bboxes.mul(scale=(w, h, w, h)) + self.segments[..., 0] *= w + self.segments[..., 1] *= h + if self.keypoints is not None: + self.keypoints[..., 0] *= w + 
self.keypoints[..., 1] *= h + self.normalized = False + + def normalize(self, w, h): + """Normalize bounding boxes, segments, and keypoints to image dimensions.""" + if self.normalized: + return + self._bboxes.mul(scale=(1 / w, 1 / h, 1 / w, 1 / h)) + self.segments[..., 0] /= w + self.segments[..., 1] /= h + if self.keypoints is not None: + self.keypoints[..., 0] /= w + self.keypoints[..., 1] /= h + self.normalized = True + + def add_padding(self, padw, padh): + """Handle rect and mosaic situation.""" + assert not self.normalized, 'you should add padding with absolute coordinates.' + self._bboxes.add(offset=(padw, padh, padw, padh)) + self.segments[..., 0] += padw + self.segments[..., 1] += padh + if self.keypoints is not None: + self.keypoints[..., 0] += padw + self.keypoints[..., 1] += padh + + def __getitem__(self, index) -> 'Instances': + """ + Retrieve a specific instance or a set of instances using indexing. + + Args: + index (int, slice, or np.ndarray): The index, slice, or boolean array to select + the desired instances. + + Returns: + Instances: A new Instances object containing the selected bounding boxes, + segments, and keypoints if present. + + Note: + When using boolean indexing, make sure to provide a boolean array with the same + length as the number of instances. 
+ """ + segments = self.segments[index] if len(self.segments) else self.segments + keypoints = self.keypoints[index] if self.keypoints is not None else None + bboxes = self.bboxes[index] + bbox_format = self._bboxes.format + return Instances( + bboxes=bboxes, + segments=segments, + keypoints=keypoints, + bbox_format=bbox_format, + normalized=self.normalized, + ) + + def flipud(self, h): + """Flips the coordinates of bounding boxes, segments, and keypoints vertically.""" + if self._bboxes.format == 'xyxy': + y1 = self.bboxes[:, 1].copy() + y2 = self.bboxes[:, 3].copy() + self.bboxes[:, 1] = h - y2 + self.bboxes[:, 3] = h - y1 + else: + self.bboxes[:, 1] = h - self.bboxes[:, 1] + self.segments[..., 1] = h - self.segments[..., 1] + if self.keypoints is not None: + self.keypoints[..., 1] = h - self.keypoints[..., 1] + + def fliplr(self, w): + """Reverses the order of the bounding boxes and segments horizontally.""" + if self._bboxes.format == 'xyxy': + x1 = self.bboxes[:, 0].copy() + x2 = self.bboxes[:, 2].copy() + self.bboxes[:, 0] = w - x2 + self.bboxes[:, 2] = w - x1 + else: + self.bboxes[:, 0] = w - self.bboxes[:, 0] + self.segments[..., 0] = w - self.segments[..., 0] + if self.keypoints is not None: + self.keypoints[..., 0] = w - self.keypoints[..., 0] + + def clip(self, w, h): + """Clips bounding boxes, segments, and keypoints values to stay within image boundaries.""" + ori_format = self._bboxes.format + self.convert_bbox(format='xyxy') + self.bboxes[:, [0, 2]] = self.bboxes[:, [0, 2]].clip(0, w) + self.bboxes[:, [1, 3]] = self.bboxes[:, [1, 3]].clip(0, h) + if ori_format != 'xyxy': + self.convert_bbox(format=ori_format) + self.segments[..., 0] = self.segments[..., 0].clip(0, w) + self.segments[..., 1] = self.segments[..., 1].clip(0, h) + if self.keypoints is not None: + self.keypoints[..., 0] = self.keypoints[..., 0].clip(0, w) + self.keypoints[..., 1] = self.keypoints[..., 1].clip(0, h) + + def remove_zero_area_boxes(self): + """Remove zero-area boxes, i.e. 
after clipping some boxes may have zero width or height. This removes them.""" + good = self.bbox_areas > 0 + if not all(good): + self._bboxes = self._bboxes[good] + if len(self.segments): + self.segments = self.segments[good] + if self.keypoints is not None: + self.keypoints = self.keypoints[good] + return good + + def update(self, bboxes, segments=None, keypoints=None): + """Updates instance variables.""" + self._bboxes = Bboxes(bboxes, format=self._bboxes.format) + if segments is not None: + self.segments = segments + if keypoints is not None: + self.keypoints = keypoints + + def __len__(self): + """Return the length of the instance list.""" + return len(self.bboxes) + + @classmethod + def concatenate(cls, instances_list: List['Instances'], axis=0) -> 'Instances': + """ + Concatenates a list of Instances objects into a single Instances object. + + Args: + instances_list (List[Instances]): A list of Instances objects to concatenate. + axis (int, optional): The axis along which the arrays will be concatenated. Defaults to 0. + + Returns: + Instances: A new Instances object containing the concatenated bounding boxes, + segments, and keypoints if present. + + Note: + The `Instances` objects in the list should have the same properties, such as + the format of the bounding boxes, whether keypoints are present, and if the + coordinates are normalized. 
class IoU_Cal:
    """Lazily evaluated IoU-family regression losses for axis-aligned boxes.

    ``pred`` and ``target`` hold boxes as ``x0, y0, x1, y1`` in their last dimension.
    Every intermediate quantity (centers, sizes, intersection, enclosing box, ...) is
    registered in ``self._fget`` as a zero-argument callable; the first attribute access
    evaluates it and stores the result in place of the callable.

    Note: the ``iou`` quantity is the IoU *loss*, i.e. ``1 - intersection / union``.

    Class-level state (shared by every instance):
        iou_mean: Running mean of the IoU loss, refreshed at construction while in train mode.
        monotonous: Focusing mechanism applied by ``_scaled_loss``:
            ``None`` = no scaling (v1), ``True`` = monotonic (v2), ``False`` = non-monotonic (v3).
        momentum: Momentum of the running mean; can be set with ``momentum_estimation``.
    """
    iou_mean = 1.
    monotonous = True  # v1: None, v2: True, v3: False
    momentum = 1 - 0.5 ** (1 / 7000)
    _is_train = True

    @classmethod
    def momentum_estimation(cls, n, t):
        """Set the class-wide momentum from the training schedule and return it.

        Args:
            n: Number of batches per training epoch.
            t: The epoch when mAP's ascension slowed significantly.
        """
        time_to_real = n * t
        cls.momentum = 1 - pow(0.05, 1 / time_to_real)
        return cls.momentum

    def __init__(self, pred, target):
        """Register the lazy quantities for ``pred``/``target`` and update the running mean."""
        self.pred, self.target = pred, target
        self._fget = {
            # x,y,w,h
            'pred_xy': lambda: (self.pred[..., :2] + self.pred[..., 2: 4]) / 2,
            'pred_wh': lambda: self.pred[..., 2: 4] - self.pred[..., :2],
            'target_xy': lambda: (self.target[..., :2] + self.target[..., 2: 4]) / 2,
            'target_wh': lambda: self.target[..., 2: 4] - self.target[..., :2],
            # x0,y0,x1,y1
            'min_coord': lambda: torch.minimum(self.pred[..., :4], self.target[..., :4]),
            'max_coord': lambda: torch.maximum(self.pred[..., :4], self.target[..., :4]),
            # The overlapping region
            'wh_inter': lambda: torch.relu(self.min_coord[..., 2: 4] - self.max_coord[..., :2]),
            's_inter': lambda: torch.prod(self.wh_inter, dim=-1),
            # The area covered
            's_union': lambda: torch.prod(self.pred_wh, dim=-1) +
                               torch.prod(self.target_wh, dim=-1) - self.s_inter,
            # The smallest enclosing box
            'wh_box': lambda: self.max_coord[..., 2: 4] - self.min_coord[..., :2],
            's_box': lambda: torch.prod(self.wh_box, dim=-1),
            'l2_box': lambda: torch.square(self.wh_box).sum(dim=-1),
            # The central points' connection of the bounding boxes
            'd_center': lambda: self.pred_xy - self.target_xy,
            'l2_center': lambda: torch.square(self.d_center).sum(dim=-1),
            # IoU loss (1 - IoU)
            'iou': lambda: 1 - self.s_inter / self.s_union
        }
        self._update(self)

    def __setitem__(self, key, value):
        """Register or override the lazy quantity ``key`` (a value or a zero-argument callable)."""
        self._fget[key] = value

    def __getattr__(self, item):
        """Resolve ``item`` from the lazy table, evaluating and caching it on first use.

        Only invoked when normal attribute lookup fails.

        Raises:
            AttributeError: If ``item`` is not a registered quantity, so that ``hasattr``,
                ``getattr(obj, name, default)``, ``copy`` and ``pickle`` behave normally.
        """
        # Go through __dict__: on an instance without `_fget` (e.g. one freshly created by
        # copy/pickle) `self._fget` would re-enter __getattr__ and recurse without bound.
        table = self.__dict__.get('_fget')
        if table is None or item not in table:
            raise AttributeError(f'{type(self).__name__!r} object has no attribute {item!r}')
        value = table[item]
        if callable(value):
            value = table[item] = value()
        return value

    @classmethod
    def train(cls):
        """Enable running-mean updates (class-wide)."""
        cls._is_train = True

    @classmethod
    def eval(cls):
        """Disable running-mean updates (class-wide)."""
        cls._is_train = False

    @classmethod
    def _update(cls, self):
        """Fold the mean IoU loss of ``self`` into ``cls.iou_mean`` when in train mode."""
        if cls._is_train: cls.iou_mean = (1 - cls.momentum) * cls.iou_mean + \
                                         cls.momentum * self.iou.detach().mean().item()

    def _scaled_loss(self, loss, alpha=1.9, delta=3):
        """Apply the focusing mechanism selected by ``monotonous`` to ``loss`` (in place) and return it."""
        if isinstance(self.monotonous, bool):
            beta = self.iou.detach() / self.iou_mean
            if self.monotonous:
                loss *= beta.sqrt()
            else:
                divisor = delta * torch.pow(alpha, beta - delta)
                loss *= beta / divisor
        return loss

    @classmethod
    def IoU(cls, pred, target, self=None):
        """Return the plain IoU loss, ``1 - IoU`` (never scaled)."""
        self = self if self else cls(pred, target)
        return self.iou

    @classmethod
    def WIoU(cls, pred, target, self=None):
        """Return the scaled IoU loss multiplied by exp(center distance / enclosing-box diagonal)."""
        self = self if self else cls(pred, target)
        dist = torch.exp(self.l2_center / self.l2_box.detach())
        return self._scaled_loss(dist * self.iou)

    @classmethod
    def EIoU(cls, pred, target, self=None):
        """Return the scaled EIoU loss as computed below (IoU loss plus two center-distance penalties)."""
        self = self if self else cls(pred, target)
        penalty = self.l2_center / self.l2_box.detach() \
                  + torch.square(self.d_center / self.wh_box).sum(dim=-1)
        return self._scaled_loss(self.iou + penalty)

    @classmethod
    def GIoU(cls, pred, target, self=None):
        """Return the scaled GIoU loss (IoU loss plus the empty fraction of the enclosing box)."""
        self = self if self else cls(pred, target)
        return self._scaled_loss(self.iou + (self.s_box - self.s_union) / self.s_box)

    @classmethod
    def DIoU(cls, pred, target, self=None):
        """Return the scaled DIoU loss (IoU loss plus normalized squared center distance)."""
        self = self if self else cls(pred, target)
        return self._scaled_loss(self.iou + self.l2_center / self.l2_box)

    @classmethod
    def CIoU(cls, pred, target, eps=1e-4, self=None):
        """Return the scaled CIoU loss (DIoU terms plus an aspect-ratio consistency term)."""
        self = self if self else cls(pred, target)
        v = 4 / math.pi ** 2 * \
            (torch.atan(self.pred_wh[..., 0] / (self.pred_wh[..., 1] + eps)) -
             torch.atan(self.target_wh[..., 0] / (self.target_wh[..., 1] + eps))) ** 2
        alpha = v / (self.iou + v)
        return self._scaled_loss(self.iou + self.l2_center / self.l2_box + alpha.detach() * v)

    @classmethod
    def SIoU(cls, pred, target, theta=4, self=None):
        """Return the scaled SIoU loss (IoU loss plus the mean of the distance and shape costs)."""
        self = self if self else cls(pred, target)
        # Angle Cost
        angle = torch.arcsin(torch.abs(self.d_center).min(dim=-1)[0] / (self.l2_center.sqrt() + 1e-4))
        angle = torch.sin(2 * angle) - 2
        # Dist Cost
        dist = angle[..., None] * torch.square(self.d_center / self.wh_box)
        dist = 2 - torch.exp(dist[..., 0]) - torch.exp(dist[..., 1])
        # Shape Cost
        d_shape = torch.abs(self.pred_wh - self.target_wh)
        big_shape = torch.maximum(self.pred_wh, self.target_wh)
        w_shape = 1 - torch.exp(- d_shape[..., 0] / big_shape[..., 0])
        h_shape = 1 - torch.exp(- d_shape[..., 1] / big_shape[..., 1])
        shape = w_shape ** theta + h_shape ** theta
        return self._scaled_loss(self.iou + (dist + shape) / 2)
criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5).""" + + def __init__(self, ): + super().__init__() + + def forward(self, pred, label, gamma=1.5, alpha=0.25): + """Calculates and updates confusion matrix for object detection/classification tasks.""" + loss = F.binary_cross_entropy_with_logits(pred, label, reduction='none') + # p_t = torch.exp(-loss) + # loss *= self.alpha * (1.000001 - p_t) ** self.gamma # non-zero power for gradient stability + + # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py + pred_prob = pred.sigmoid() # prob from logits + p_t = label * pred_prob + (1 - label) * (1 - pred_prob) + modulating_factor = (1.0 - p_t) ** gamma + loss *= modulating_factor + if alpha > 0: + alpha_factor = label * alpha + (1 - label) * (1 - alpha) + loss *= alpha_factor + return loss.mean(1).sum() + + +class BboxLoss(nn.Module): + + def __init__(self, reg_max, use_dfl=False): + """Initialize the BboxLoss module with regularization maximum and DFL settings.""" + super().__init__() + self.reg_max = reg_max + self.use_dfl = use_dfl + + def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask): + """IoU loss.""" + weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1) + iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True) + loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum + + + # DFL loss + if self.use_dfl: + target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max) + loss_dfl = self._df_loss(pred_dist[fg_mask].view(-1, self.reg_max + 1), target_ltrb[fg_mask]) * weight + loss_dfl = loss_dfl.sum() / target_scores_sum + else: + loss_dfl = torch.tensor(0.0).to(pred_dist.device) + + return loss_iou, loss_dfl + + @staticmethod + def _df_loss(pred_dist, target): + """Return sum of left and right DFL losses.""" + # Distribution Focal Loss (DFL) proposed in Generalized Focal Loss 
# Criterion class for computing Detection training losses
class v8DetectionLoss:
    """Detection criterion combining box, classification and DFL terms."""

    def __init__(self, model):  # model must be de-paralleled
        """Read the detection head's settings from ``model`` and build the assigner and box loss."""
        device = next(model.parameters()).device  # get model device
        hyp = model.args  # hyperparameters
        head = model.model[-1]  # Detect() module

        self.device = device
        self.hyp = hyp
        self.bce = nn.BCEWithLogitsLoss(reduction='none')
        self.stride = head.stride  # model strides
        self.nc = head.nc  # number of classes
        self.no = head.no
        self.reg_max = head.reg_max
        self.use_dfl = head.reg_max > 1

        self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
        self.bbox_loss = BboxLoss(head.reg_max - 1, use_dfl=self.use_dfl).to(device)
        self.proj = torch.arange(head.reg_max, dtype=torch.float, device=device)

    def preprocess(self, targets, batch_size, scale_tensor):
        """Group flat targets per image into a zero-padded (batch_size, max_count, 5) tensor.

        Column 0 of ``targets`` is the image index; the remaining five columns are copied,
        and the last four are scaled by ``scale_tensor`` and converted with ``xywh2xyxy``.
        """
        if targets.shape[0] == 0:
            return torch.zeros(batch_size, 0, 5, device=self.device)

        image_index = targets[:, 0]
        _, per_image = image_index.unique(return_counts=True)
        per_image = per_image.to(dtype=torch.int32)
        out = torch.zeros(batch_size, per_image.max(), 5, device=self.device)
        for img in range(batch_size):
            selected = image_index == img
            count = selected.sum()
            if count:
                out[img, :count] = targets[selected, 1:]
        out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
        return out

    def bbox_decode(self, anchor_points, pred_dist):
        """Decode predicted object bounding box coordinates from anchor points and distribution."""
        if self.use_dfl:
            n_batch, n_anchors, n_channels = pred_dist.shape
            # Expected value of each side's softmax distribution over the bins in self.proj
            bins = self.proj.type(pred_dist.dtype)
            pred_dist = pred_dist.view(n_batch, n_anchors, 4, n_channels // 4).softmax(3).matmul(bins)
        return dist2bbox(pred_dist, anchor_points, xywh=False)

    def __call__(self, preds, batch):
        """Calculate the sum of the loss for box, cls and dfl multiplied by batch size."""
        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
        feats = preds[1] if isinstance(preds, tuple) else preds

        # Flatten every feature level and split channels into distribution / class scores
        flat = torch.cat([level.view(feats[0].shape[0], self.no, -1) for level in feats], 2)
        pred_distri, pred_scores = flat.split((self.reg_max * 4, self.nc), 1)
        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()

        dtype = pred_scores.dtype
        batch_size = pred_scores.shape[0]
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # (h,w)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

        # targets
        flat_targets = torch.cat((batch['batch_idx'].view(-1, 1), batch['cls'].view(-1, 1), batch['bboxes']), 1)
        targets = self.preprocess(flat_targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)

        # pboxes
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)

        _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)

        target_scores_sum = max(target_scores.sum(), 1)

        # cls loss (BCE)
        loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum

        # bbox loss, only when at least one anchor was assigned
        if fg_mask.sum():
            target_bboxes /= stride_tensor
            loss[0], loss[2] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores,
                                              target_scores_sum, fg_mask)

        loss[0] *= self.hyp.box  # box gain
        loss[1] *= self.hyp.cls  # cls gain
        loss[2] *= self.hyp.dfl  # dfl gain

        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
+ pred_scores = pred_scores.permute(0, 2, 1).contiguous() + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + pred_masks = pred_masks.permute(0, 2, 1).contiguous() + + dtype = pred_scores.dtype + imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w) + anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5) + + # targets + try: + batch_idx = batch['batch_idx'].view(-1, 1) + targets = torch.cat((batch_idx, batch['cls'].view(-1, 1), batch['bboxes']), 1) + targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]]) + gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy + mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0) + except RuntimeError as e: + raise TypeError('ERROR ❌ segment dataset incorrectly formatted or not a segment dataset.\n' + "This error can occur when incorrectly training a 'segment' model on a 'detect' dataset, " + "i.e. 'yolo train model=yolov8n-seg.pt data=coco128.yaml'.\nVerify your dataset is a " + "correctly formatted 'segment' dataset using 'data=coco128-seg.yaml' " + 'as an example.\nSee https://docs.ultralytics.com/tasks/segment/ for help.') from e + + # pboxes + pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4) + + _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner( + pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype), + anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt) + + target_scores_sum = max(target_scores.sum(), 1) + + # cls loss + # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way + loss[2] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE + + if fg_mask.sum(): + # bbox loss + loss[0], loss[3] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes / stride_tensor, + target_scores, target_scores_sum, fg_mask) + # 
# Criterion class for computing training losses
class v8PoseLoss(v8DetectionLoss):
    """Pose criterion: the detection terms plus keypoint location and keypoint objectness terms."""

    def __init__(self, model):  # model must be de-paralleled
        """Extend the detection criterion with the keypoint shape, sigmas and keypoint losses."""
        super().__init__(model)
        self.kpt_shape = model.model[-1].kpt_shape
        self.bce_pose = nn.BCEWithLogitsLoss()
        n_kpt = self.kpt_shape[0]  # number of keypoints
        if self.kpt_shape == [17, 3]:
            sigmas = torch.from_numpy(OKS_SIGMA).to(self.device)
        else:
            sigmas = torch.ones(n_kpt, device=self.device) / n_kpt
        self.keypoint_loss = KeypointLoss(sigmas=sigmas)

    def __call__(self, preds, batch):
        """Calculate the total loss and detach it."""
        loss = torch.zeros(5, device=self.device)  # slots: box, pose, kobj, cls, dfl
        feats, pred_kpts = preds if isinstance(preds[0], list) else preds[1]

        flat = torch.cat([level.view(feats[0].shape[0], self.no, -1) for level in feats], 2)
        pred_distri, pred_scores = flat.split((self.reg_max * 4, self.nc), 1)

        # b, grids, ..
        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
        pred_kpts = pred_kpts.permute(0, 2, 1).contiguous()

        dtype = pred_scores.dtype
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # (h,w)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

        # targets
        batch_size = pred_scores.shape[0]
        batch_idx = batch['batch_idx'].view(-1, 1)
        flat_targets = torch.cat((batch_idx, batch['cls'].view(-1, 1), batch['bboxes']), 1)
        targets = self.preprocess(flat_targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)

        # pboxes
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
        pred_kpts = self.kpts_decode(anchor_points, pred_kpts.view(batch_size, -1, *self.kpt_shape))

        _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)

        target_scores_sum = max(target_scores.sum(), 1)

        # cls loss (BCE)
        loss[3] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum

        # bbox and keypoint losses, only when at least one anchor was assigned
        if fg_mask.sum():
            target_bboxes /= stride_tensor
            loss[0], loss[4] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores,
                                              target_scores_sum, fg_mask)
            # Scale a private copy of the ground-truth keypoints by image width / height
            keypoints = batch['keypoints'].to(self.device).float().clone()
            keypoints[..., 0] *= imgsz[1]
            keypoints[..., 1] *= imgsz[0]
            image_of_target = batch_idx.view(-1)
            for i in range(batch_size):
                fg = fg_mask[i]
                if not fg.sum():
                    continue
                gt_kpt = keypoints[image_of_target == i][target_gt_idx[i][fg]]
                # Divide by the per-anchor stride, matching the division applied to target_bboxes above
                gt_kpt[..., 0] /= stride_tensor[fg]
                gt_kpt[..., 1] /= stride_tensor[fg]
                area = xyxy2xywh(target_bboxes[i][fg])[:, 2:].prod(1, keepdim=True)
                pred_kpt = pred_kpts[i][fg]
                kpt_mask = gt_kpt[..., 2] != 0
                loss[1] += self.keypoint_loss(pred_kpt, gt_kpt, kpt_mask, area)  # pose loss
                # kpt_score loss
                if pred_kpt.shape[-1] == 3:
                    loss[2] += self.bce_pose(pred_kpt[..., 2], kpt_mask.float())  # keypoint obj loss

        loss[0] *= self.hyp.box  # box gain
        loss[1] *= self.hyp.pose / batch_size  # pose gain
        loss[2] *= self.hyp.kobj / batch_size  # kobj gain
        loss[3] *= self.hyp.cls  # cls gain
        loss[4] *= self.hyp.dfl  # dfl gain

        return loss.sum() * batch_size, loss.detach()  # loss(box, pose, kobj, cls, dfl)

    def kpts_decode(self, anchor_points, pred_kpts):
        """Decode raw keypoint predictions: double x/y and offset them by ``anchor - 0.5``."""
        decoded = pred_kpts.clone()
        decoded[..., :2] *= 2.0
        decoded[..., 0] += anchor_points[:, [0]] - 0.5
        decoded[..., 1] += anchor_points[:, [1]] - 0.5
        return decoded
class VarifocalLoss(nn.Module):
    """Varifocal loss by Zhang et al. https://arxiv.org/abs/2008.13367."""

    def __init__(self):
        """Initialize the VarifocalLoss class."""
        super().__init__()

    def forward(self, pred_score, gt_score, label, alpha=0.75, gamma=2.0):
        """Return the varifocal loss: weighted BCE, averaged over dim 1 and summed over the rest.

        Entries with ``label == 1`` are weighted by ``gt_score``; entries with ``label == 0``
        are weighted by ``alpha * sigmoid(pred_score) ** gamma``.
        """
        negative_weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label)
        positive_weight = gt_score * label
        weight = negative_weight + positive_weight
        # The BCE is evaluated on float32 inputs with CUDA autocast disabled
        with torch.cuda.amp.autocast(enabled=False):
            bce = F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(), reduction='none')
            return (bce * weight).mean(1).sum()
class BboxLoss(nn.Module):
    """Box regression criterion: a WIoU term from ``bbox_iou`` plus an optional DFL term."""

    def __init__(self, reg_max, use_dfl=False):
        """Initialize the BboxLoss module with regularization maximum and DFL settings."""
        super().__init__()
        self.reg_max = reg_max
        self.use_dfl = use_dfl

    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
        """Return ``(loss_iou, loss_dfl)`` computed over the anchors selected by ``fg_mask``."""
        fg_weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
        # NOTE(review): the IoU term is not multiplied by `fg_weight` (only the DFL term is), and the
        # second value returned by bbox_iou is unused - confirm against bbox_iou(type_='WIoU').
        box_loss, _ = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, type_='WIoU')
        loss_iou = box_loss.sum() / target_scores_sum

        if not self.use_dfl:
            return loss_iou, torch.tensor(0.0).to(pred_dist.device)

        # DFL loss
        target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
        per_anchor = self._df_loss(pred_dist[fg_mask].view(-1, self.reg_max + 1), target_ltrb[fg_mask]) * fg_weight
        return loss_iou, per_anchor.sum() / target_scores_sum

    @staticmethod
    def _df_loss(pred_dist, target):
        """Return sum of left and right DFL losses."""
        # Distribution Focal Loss (DFL) proposed in Generalized Focal Loss
        # https://ieeexplore.ieee.org/document/9792391
        left_bin = target.long()  # integer bin at or below the target
        right_bin = left_bin + 1  # next bin up
        left_w = right_bin - target  # closer to the left bin -> larger left weight
        right_w = 1 - left_w
        ce_left = F.cross_entropy(pred_dist, left_bin.view(-1), reduction='none').view(left_bin.shape)
        ce_right = F.cross_entropy(pred_dist, right_bin.view(-1), reduction='none').view(left_bin.shape)
        return (ce_left * left_w + ce_right * right_w).mean(-1, keepdim=True)
0, 5, device=self.device) + else: + i = targets[:, 0] # image index + _, counts = i.unique(return_counts=True) + counts = counts.to(dtype=torch.int32) + out = torch.zeros(batch_size, counts.max(), 5, device=self.device) + for j in range(batch_size): + matches = i == j + n = matches.sum() + if n: + out[j, :n] = targets[matches, 1:] + out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor)) + return out + + def bbox_decode(self, anchor_points, pred_dist): + """Decode predicted object bounding box coordinates from anchor points and distribution.""" + if self.use_dfl: + b, a, c = pred_dist.shape # batch, anchors, channels + pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype)) + # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype)) + # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2) + return dist2bbox(pred_dist, anchor_points, xywh=False) + + def __call__(self, preds, batch): + """Calculate the sum of the loss for box, cls and dfl multiplied by batch size.""" + loss = torch.zeros(3, device=self.device) # box, cls, dfl + feats = preds[1] if isinstance(preds, tuple) else preds + pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split( + (self.reg_max * 4, self.nc), 1) + + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + + dtype = pred_scores.dtype + batch_size = pred_scores.shape[0] + imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w) + anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5) + + # targets + targets = torch.cat((batch['batch_idx'].view(-1, 1), batch['cls'].view(-1, 1), batch['bboxes']), 1) + targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]]) + gt_labels, 
# Criterion class for computing training losses
class v8SegmentationLoss(v8DetectionLoss):
    """Segmentation criterion: the detection terms plus a per-instance mask term."""

    def __init__(self, model):  # model must be de-paralleled
        """Extend the detection criterion with the mask count and the overlap-mask flag."""
        super().__init__(model)
        self.nm = model.model[-1].nm  # number of masks
        self.overlap = model.args.overlap_mask

    def __call__(self, preds, batch):
        """Calculate and return the loss for the YOLO model."""
        loss = torch.zeros(4, device=self.device)  # slots: box, seg, cls, dfl
        feats, pred_masks, proto = preds if len(preds) == 3 else preds[1]
        batch_size, _, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width

        flat = torch.cat([level.view(feats[0].shape[0], self.no, -1) for level in feats], 2)
        pred_distri, pred_scores = flat.split((self.reg_max * 4, self.nc), 1)

        # b, grids, ..
        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
        pred_masks = pred_masks.permute(0, 2, 1).contiguous()

        def zero_mask_term():
            # WARNING: keeps proto / pred_masks in the graph to prevent Multi-GPU DDP
            # 'unused gradient' PyTorch errors, do not remove. inf sums may lead to nan loss
            return (proto * 0).sum() + (pred_masks * 0).sum()

        dtype = pred_scores.dtype
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # (h,w)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

        # targets
        try:
            batch_idx = batch['batch_idx'].view(-1, 1)
            flat_targets = torch.cat((batch_idx, batch['cls'].view(-1, 1), batch['bboxes']), 1)
            targets = self.preprocess(flat_targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
            gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
            mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
        except RuntimeError as e:
            raise TypeError('ERROR ❌ segment dataset incorrectly formatted or not a segment dataset.\n'
                            "This error can occur when incorrectly training a 'segment' model on a 'detect' dataset, "
                            "i.e. 'yolo train model=yolov8n-seg.pt data=coco128.yaml'.\nVerify your dataset is a "
                            "correctly formatted 'segment' dataset using 'data=coco128-seg.yaml' "
                            'as an example.\nSee https://docs.ultralytics.com/tasks/segment/ for help.') from e

        # pboxes
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)

        _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
            pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt)

        target_scores_sum = max(target_scores.sum(), 1)

        # cls loss (BCE)
        loss[2] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum

        if not fg_mask.sum():
            loss[1] += zero_mask_term()
        else:
            # bbox loss
            loss[0], loss[3] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes / stride_tensor,
                                              target_scores, target_scores_sum, fg_mask)
            # masks loss
            masks = batch['masks'].to(self.device).float()
            if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
                masks = F.interpolate(masks[None], (mask_h, mask_w), mode='nearest')[0]

            for i in range(batch_size):
                fg = fg_mask[i]
                if not fg.sum():
                    loss[1] += zero_mask_term()
                    continue
                mask_idx = target_gt_idx[i][fg]
                if self.overlap:
                    # Select each instance by comparing the image's mask values with index + 1
                    gt_mask = torch.where(masks[[i]] == (mask_idx + 1).view(-1, 1, 1), 1.0, 0.0)
                else:
                    gt_mask = masks[batch_idx.view(-1) == i][mask_idx]
                xyxyn = target_bboxes[i][fg] / imgsz[[1, 0, 1, 0]]
                marea = xyxy2xywh(xyxyn)[:, 2:].prod(1)
                mxyxy = xyxyn * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device)
                loss[1] += self.single_mask_loss(gt_mask, pred_masks[i][fg], proto[i], mxyxy, marea)  # seg

        loss[0] *= self.hyp.box  # box gain
        loss[1] *= self.hyp.box / batch_size  # seg gain
        loss[2] *= self.hyp.cls  # cls gain
        loss[3] *= self.hyp.dfl  # dfl gain

        return loss.sum() * batch_size, loss.detach()  # loss(box, seg, cls, dfl)

    def single_mask_loss(self, gt_mask, pred, proto, xyxy, area):
        """Mask loss for one image."""
        # (n, 32) @ (32,80,80) -> (n,80,80)
        mask_logits = (pred @ proto.view(self.nm, -1)).view(-1, *proto.shape[1:])
        pixel_loss = F.binary_cross_entropy_with_logits(mask_logits, gt_mask, reduction='none')
        cropped = crop_mask(pixel_loss, xyxy)
        return (cropped.mean(dim=(1, 2)) / area).mean()
self.kpt_shape == [17, 3] + nkpt = self.kpt_shape[0] # number of keypoints + sigmas = torch.from_numpy(OKS_SIGMA).to(self.device) if is_pose else torch.ones(nkpt, device=self.device) / nkpt + self.keypoint_loss = KeypointLoss(sigmas=sigmas) + + def __call__(self, preds, batch): + """Calculate the total loss and detach it.""" + loss = torch.zeros(5, device=self.device) # box, cls, dfl, kpt_location, kpt_visibility + feats, pred_kpts = preds if isinstance(preds[0], list) else preds[1] + pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split( + (self.reg_max * 4, self.nc), 1) + + # b, grids, .. + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + pred_kpts = pred_kpts.permute(0, 2, 1).contiguous() + + dtype = pred_scores.dtype + imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w) + anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5) + + # targets + batch_size = pred_scores.shape[0] + batch_idx = batch['batch_idx'].view(-1, 1) + targets = torch.cat((batch_idx, batch['cls'].view(-1, 1), batch['bboxes']), 1) + targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]]) + gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy + mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0) + + # pboxes + pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4) + pred_kpts = self.kpts_decode(anchor_points, pred_kpts.view(batch_size, -1, *self.kpt_shape)) # (b, h*w, 17, 3) + + _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner( + pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype), + anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt) + + target_scores_sum = max(target_scores.sum(), 1) + + # cls loss + # loss[1] = self.varifocal_loss(pred_scores, target_scores, 
class v8ClassificationLoss:
    """Criterion for classification training: summed cross-entropy scaled by a fixed divisor."""

    def __call__(self, preds, batch):
        """
        Compute the classification loss between predictions and true labels.

        Args:
            preds (torch.Tensor): Model outputs, passed to cross-entropy as the input.
            batch (dict): Batch dictionary; only ``batch['cls']`` (the targets) is read.

        Returns:
            (tuple): ``(loss, loss_items)`` where ``loss`` is the summed cross-entropy divided by 64 and
                ``loss_items`` is the same value detached from the autograd graph.
        """
        targets = batch['cls']
        summed = torch.nn.functional.cross_entropy(preds, targets, reduction='sum')
        loss = summed / 64  # fixed divisor; it does not follow the actual batch size
        return loss, loss.detach()
def bbox_ioa(box1, box2, eps=1e-7):
    """
    Compute, for every pair (box1[i], box2[j]), the intersection area divided by the area of box2[j].

    Boxes are in x1y1x2y2 format.

    Args:
        box1 (np.array): Array of shape (n, 4) holding n boxes.
        box2 (np.array): Array of shape (m, 4) holding m boxes.
        eps (float, optional): Added to the box2 area to avoid division by zero. Defaults to 1e-7.

    Returns:
        (np.array): Array of shape (n, m) with intersection-over-box2-area values.
    """
    left1, top1, right1, bottom1 = box1.T
    left2, top2, right2, bottom2 = box2.T

    # Broadcast box1 along a new trailing axis so every box1 is compared against every box2
    overlap_w = (np.minimum(right1[:, None], right2) - np.maximum(left1[:, None], left2)).clip(0)
    overlap_h = (np.minimum(bottom1[:, None], bottom2) - np.maximum(top1[:, None], top2)).clip(0)

    area2 = (right2 - left2) * (bottom2 - top2) + eps
    return overlap_w * overlap_h / area2
def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
    """
    Calculate IoU (or a GIoU / DIoU / CIoU variant) between box1 and box2.

    Flag precedence is CIoU, then DIoU, then GIoU; with no flag set plain IoU is returned.

    Args:
        box1 (torch.Tensor): Boxes whose last dimension has size 4, e.g. shape (1, 4).
        box2 (torch.Tensor): Boxes whose last dimension has size 4, e.g. shape (n, 4).
        xywh (bool, optional): If True, boxes are (x, y, w, h) with (x, y) the centre; otherwise
            (x1, y1, x2, y2). Defaults to True.
        GIoU (bool, optional): If True, calculate Generalized IoU. Defaults to False.
        DIoU (bool, optional): If True, calculate Distance IoU. Defaults to False.
        CIoU (bool, optional): If True, calculate Complete IoU. Defaults to False.
        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.

    Returns:
        (torch.Tensor): IoU, GIoU, DIoU, or CIoU values depending on the specified flags.
    """
    if xywh:
        # Centre/size -> corner coordinates
        (cx1, cy1, w1, h1), (cx2, cy2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
        half_w1, half_h1 = w1 / 2, h1 / 2
        half_w2, half_h2 = w2 / 2, h2 / 2
        b1_x1, b1_x2 = cx1 - half_w1, cx1 + half_w1
        b1_y1, b1_y2 = cy1 - half_h1, cy1 + half_h1
        b2_x1, b2_x2 = cx2 - half_w2, cx2 + half_w2
        b2_y1, b2_y2 = cy2 - half_h2, cy2 + half_h2
    else:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
        # Only the heights receive eps in this branch
        w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
        w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps

    # Overlap extents, clamped at zero for disjoint boxes
    inter_w = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0)
    inter_h = (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp_(0)
    inter = inter_w * inter_h

    union = w1 * h1 + w2 * h2 - inter + eps
    iou = inter / union
    if not (CIoU or DIoU or GIoU):
        return iou  # IoU

    # Smallest enclosing (convex) box
    cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)
    ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)
    if not (CIoU or DIoU):
        c_area = cw * ch + eps
        return iou - (c_area - union) / c_area  # GIoU https://arxiv.org/pdf/1902.09630.pdf

    # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
    c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
    rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # centre dist ** 2
    if not CIoU:
        return iou - rho2 / c2  # DIoU

    # Aspect-ratio consistency term; alpha is computed without tracking gradients
    v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
    with torch.no_grad():
        alpha = v / (v - iou + (1 + eps))
    return iou - (rho2 / c2 + v * alpha)  # CIoU
def kpt_iou(kpt1, kpt2, area, sigma, eps=1e-7):
    """
    Calculate Object Keypoint Similarity (OKS) between every ground-truth / prediction pair.

    Args:
        kpt1 (torch.Tensor): Ground-truth keypoints of shape (N, K, 3); a zero in the last channel marks a
            keypoint that is excluded from the score.
        kpt2 (torch.Tensor): Predicted keypoints of shape (M, K, 3); only the x and y channels are read.
        area (torch.Tensor): Tensor of shape (N,) with ground-truth areas.
        sigma (list): K per-keypoint scale values.
        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.

    Returns:
        (torch.Tensor): Tensor of shape (N, M) with keypoint similarities.
    """
    # Squared distance per keypoint for each (gt, pred) pair -> (N, M, K)
    dx = kpt1[:, None, :, 0] - kpt2[..., 0]
    dy = kpt1[:, None, :, 1] - kpt2[..., 1]
    sq_dist = dx ** 2 + dy ** 2

    sigma = torch.tensor(sigma, device=kpt1.device, dtype=kpt1.dtype)  # (K, )
    visible = kpt1[..., 2] != 0  # (N, K)
    scale = area[:, None, None] + eps

    exponent = sq_dist / (2 * sigma) ** 2 / scale / 2  # from cocoeval
    similarity = torch.exp(-exponent) * visible[:, None]
    return similarity.sum(-1) / (visible.sum(-1)[:, None] + eps)
def process_batch(self, detections, labels):
    """
    Update confusion matrix for object detection task.

    Matrix rows are indexed by predicted class and columns by ground-truth class; index ``self.nc`` is
    the background row/column.

    Args:
        detections (Array[N, 6] | None): Detected bounding boxes and their associated information.
            Each row should contain (x1, y1, x2, y2, conf, class). ``None`` means nothing was
            predicted, in which case every label is counted as missed.
        labels (Array[M, 5]): Ground truth bounding boxes and their associated class labels.
            Each row should contain (class, x1, y1, x2, y2).
    """
    if detections is None:
        # NOTE(review): labels is iterated directly as class ids here (no column slice), so this branch
        # presumably receives a 1-D class tensor rather than (M, 5) rows - confirm against the caller.
        gt_classes = labels.int()
        for gc in gt_classes:
            self.matrix[self.nc, gc] += 1  # background FN
        return

    detections = detections[detections[:, 4] > self.conf]  # drop low-confidence detections
    gt_classes = labels[:, 0].int()
    detection_classes = detections[:, 5].int()
    iou = box_iou(labels[:, 1:], detections[:, :4])  # (M labels, N detections)

    x = torch.where(iou > self.iou_thres)
    if x[0].shape[0]:
        # Candidate rows are (label index, detection index, iou)
        matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()
        if x[0].shape[0] > 1:
            # Keep the highest-IoU candidate per detection, then per label, so matching is one-to-one
            matches = matches[matches[:, 2].argsort()[::-1]]
            matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
            matches = matches[matches[:, 2].argsort()[::-1]]
            matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
    else:
        matches = np.zeros((0, 3))

    n = matches.shape[0] > 0
    m0, m1, _ = matches.transpose().astype(int)  # matched label indices, matched detection indices
    for i, gc in enumerate(gt_classes):
        j = m0 == i
        if n and j.sum() == 1:
            self.matrix[detection_classes[m1[j]], gc] += 1  # label matched to a detection
        else:
            self.matrix[self.nc, gc] += 1  # label missed: counted in the background row

    # Every detection without a matched label is a false positive. This must also run when there are
    # no matches at all; guarding it on `n` silently dropped those false positives.
    for i, dc in enumerate(detection_classes):
        if not (m1 == i).any():
            self.matrix[dc, self.nc] += 1  # detection with no label: counted in the background column
+ names (tuple): Names of classes, used as labels on the plot. + on_plot (func): An optional callback to pass plots path and data when they are rendered. + """ + import seaborn as sn + + array = self.matrix / ((self.matrix.sum(0).reshape(1, -1) + 1E-9) if normalize else 1) # normalize columns + array[array < 0.005] = np.nan # don't annotate (would appear as 0.00) + + fig, ax = plt.subplots(1, 1, figsize=(12, 9), tight_layout=True) + nc, nn = self.nc, len(names) # number of classes, names + sn.set(font_scale=1.0 if nc < 50 else 0.8) # for label size + labels = (0 < nn < 99) and (nn == nc) # apply names to ticklabels + ticklabels = (list(names) + ['background']) if labels else 'auto' + with warnings.catch_warnings(): + warnings.simplefilter('ignore') # suppress empty matrix RuntimeWarning: All-NaN slice encountered + sn.heatmap(array, + ax=ax, + annot=nc < 30, + annot_kws={ + 'size': 8}, + cmap='Blues', + fmt='.2f' if normalize else '.0f', + square=True, + vmin=0.0, + xticklabels=ticklabels, + yticklabels=ticklabels).set_facecolor((1, 1, 1)) + title = 'Confusion Matrix' + ' Normalized' * normalize + ax.set_xlabel('True') + ax.set_ylabel('Predicted') + ax.set_title(title) + plot_fname = Path(save_dir) / f'{title.lower().replace(" ", "_")}.png' + fig.savefig(plot_fname, dpi=250) + plt.close(fig) + if on_plot: + on_plot(plot_fname) + + def print(self): + """ + Print the confusion matrix to the console. 
def smooth(y, f=0.05):
    """
    Apply a box (moving-average) filter whose width is derived from fraction ``f`` of ``len(y)``.

    The input is edge-padded with copies of its first and last values before a 'valid' convolution.

    Args:
        y (np.ndarray): 1-D sequence to smooth.
        f (float, optional): Fraction of the sequence length used to size the filter. Defaults to 0.05.

    Returns:
        (np.ndarray): The smoothed sequence.
    """
    # NOTE(review): the tap count is even for some lengths (e.g. len(y)=100, f=0.05 gives 6), in which
    # case the output is one sample longer than y.
    taps = round(len(y) * f * 2) // 2 + 1
    pad = np.ones(taps // 2)
    left, right = pad * y[0], pad * y[-1]
    padded = np.concatenate((left, y, right), 0)
    kernel = np.ones(taps) / taps
    return np.convolve(padded, kernel, mode='valid')
def compute_ap(recall, precision, method='interp'):
    """
    Compute the average precision (AP) given the recall and precision curves.

    Arguments:
        recall (list): The recall curve.
        precision (list): The precision curve.
        method (str, optional): 'interp' integrates a 101-point interpolation of the precision envelope
            (COCO style); 'continuous' sums the area at every point where recall changes.
            Defaults to 'interp', which is the previous hard-coded behaviour.

    Returns:
        (float): Average precision.
        (np.ndarray): Precision envelope curve.
        (np.ndarray): Modified recall curve with sentinel values added at the beginning and end.

    Raises:
        ValueError: If ``method`` is neither 'interp' nor 'continuous'.
    """
    if method not in ('interp', 'continuous'):
        raise ValueError(f"method must be 'interp' or 'continuous', got {method!r}")

    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([1.0], precision, [0.0]))

    # Precision envelope: running maximum taken from the right makes the curve non-increasing
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    if method == 'interp':
        x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
        # np.trapz is deprecated in NumPy 2.0 in favour of np.trapezoid; use whichever is available
        integrate = getattr(np, 'trapezoid', None) or np.trapz
        ap = integrate(np.interp(x, mrec, mpre), x)
    else:  # 'continuous'
        i = np.where(mrec[1:] != mrec[:-1])[0]  # points where x-axis (recall) changes
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under curve

    return ap, mpre, mrec
Defaults to None. + save_dir (Path, optional): Directory to save the PR curves. Defaults to an empty path. + names (tuple, optional): Tuple of class names to plot PR curves. Defaults to an empty tuple. + eps (float, optional): A small value to avoid division by zero. Defaults to 1e-16. + prefix (str, optional): A prefix string for saving the plot files. Defaults to an empty string. + + Returns: + (tuple): A tuple of six arrays and one array of unique classes, where: + tp (np.ndarray): True positive counts for each class. + fp (np.ndarray): False positive counts for each class. + p (np.ndarray): Precision values at each confidence threshold. + r (np.ndarray): Recall values at each confidence threshold. + f1 (np.ndarray): F1-score values at each confidence threshold. + ap (np.ndarray): Average precision for each class at different IoU thresholds. + unique_classes (np.ndarray): An array of unique classes that have data. + + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes, nt = np.unique(target_cls, return_counts=True) + nc = unique_classes.shape[0] # number of classes, number of detections + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000)) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = nt[ci] # number of labels + n_p = i.sum() # number of predictions + if n_p == 0 or n_l == 0: + continue + + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + eps) # recall curve + r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0) # negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score + + # AP from recall-precision curve + 
class Metric(SimpleClass):
    """
    Holds per-class evaluation results and derives summary metrics from them.

    Attributes:
        p (list): Precision for each class. Shape: (nc,).
        r (list): Recall for each class. Shape: (nc,).
        f1 (list): F1 score for each class. Shape: (nc,).
        all_ap (list): AP scores for all classes and all IoU thresholds. Shape: (nc, 10).
        ap_class_index (list): Index of class for each AP score. Shape: (nc,).
        nc (int): Number of classes.

    Every summary property falls back to an empty list or 0.0 while the backing attribute is empty.
    """

    def __init__(self) -> None:
        """Start with empty results; `update` fills them in."""
        self.p = []  # (nc, )
        self.r = []  # (nc, )
        self.f1 = []  # (nc, )
        self.all_ap = []  # (nc, 10)
        self.ap_class_index = []  # (nc, )
        self.nc = 0

    @property
    def ap50(self):
        """
        Per-class AP at the first IoU threshold (0.5).

        Returns:
            (np.ndarray, list): Column 0 of `all_ap`, or an empty list if no results are stored.
        """
        if not len(self.all_ap):
            return []
        return self.all_ap[:, 0]

    @property
    def ap(self):
        """
        Per-class AP averaged over all IoU thresholds (0.5-0.95).

        Returns:
            (np.ndarray, list): Row-wise mean of `all_ap`, or an empty list if no results are stored.
        """
        if not len(self.all_ap):
            return []
        return self.all_ap.mean(1)

    @property
    def mp(self):
        """
        Mean precision over all classes.

        Returns:
            (float): Mean of `p`, or 0.0 if no results are stored.
        """
        if not len(self.p):
            return 0.0
        return self.p.mean()

    @property
    def mr(self):
        """
        Mean recall over all classes.

        Returns:
            (float): Mean of `r`, or 0.0 if no results are stored.
        """
        if not len(self.r):
            return 0.0
        return self.r.mean()

    @property
    def map50(self):
        """
        Mean AP at an IoU threshold of 0.5.

        Returns:
            (float): Mean of column 0 of `all_ap`, or 0.0 if no results are stored.
        """
        if not len(self.all_ap):
            return 0.0
        return self.all_ap[:, 0].mean()

    @property
    def map75(self):
        """
        Mean AP at an IoU threshold of 0.75.

        Returns:
            (float): Mean of column 5 of `all_ap`, or 0.0 if no results are stored.
        """
        if not len(self.all_ap):
            return 0.0
        return self.all_ap[:, 5].mean()

    @property
    def map(self):
        """
        Mean AP over IoU thresholds of 0.5 - 0.95 in steps of 0.05.

        Returns:
            (float): Mean of every entry of `all_ap`, or 0.0 if no results are stored.
        """
        if not len(self.all_ap):
            return 0.0
        return self.all_ap.mean()

    def mean_results(self):
        """Return the summary list [mp, mr, map50, map]."""
        return [self.mp, self.mr, self.map50, self.map]

    def class_result(self, i):
        """Return (p[i], r[i], ap50[i], ap[i]) for the i-th stored class entry."""
        return self.p[i], self.r[i], self.ap50[i], self.ap[i]

    @property
    def maps(self):
        """Per-class mAP array of length nc; classes without a stored AP keep the overall mAP."""
        per_class = np.zeros(self.nc) + self.map
        for position, class_id in enumerate(self.ap_class_index):
            per_class[class_id] = self.ap[position]
        return per_class

    def fitness(self):
        """Model fitness as a weighted combination of metrics."""
        weights = [0.0, 0.0, 0.1, 0.9]  # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
        weighted = np.array(self.mean_results()) * weights
        return weighted.sum()

    def update(self, results):
        """
        Store a new set of evaluation results.

        Args:
            results (tuple): A 5-tuple unpacked, in order, into p, r, f1, all_ap and ap_class_index.
        """
        self.p, self.r, self.f1, self.all_ap, self.ap_class_index = results
+ + Attributes: + save_dir (Path): A path to the directory where the output plots will be saved. + plot (bool): A flag that indicates whether to plot the precision-recall curves for each class. + on_plot (func): An optional callback to pass plots path and data when they are rendered. + names (tuple of str): A tuple of strings that represents the names of the classes. + box (Metric): An instance of the Metric class for storing the results of the detection metrics. + speed (dict): A dictionary for storing the execution time of different parts of the detection process. + + Methods: + process(tp, conf, pred_cls, target_cls): Updates the metric results with the latest batch of predictions. + keys: Returns a list of keys for accessing the computed detection metrics. + mean_results: Returns a list of mean values for the computed detection metrics. + class_result(i): Returns a list of values for the computed detection metrics for a specific class. + maps: Returns a dictionary of mean average precision (mAP) values for different IoU thresholds. + fitness: Computes the fitness score based on the computed detection metrics. + ap_class_index: Returns a list of class indices sorted by their average precision (AP) values. + results_dict: Returns a dictionary that maps detection metric keys to their computed values. 
+ """ + + def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None: + self.save_dir = save_dir + self.plot = plot + self.on_plot = on_plot + self.names = names + self.box = Metric() + self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0} + + def process(self, tp, conf, pred_cls, target_cls): + """Process predicted results for object detection and update metrics.""" + results = ap_per_class(tp, + conf, + pred_cls, + target_cls, + plot=self.plot, + save_dir=self.save_dir, + names=self.names, + on_plot=self.on_plot)[2:] + self.box.nc = len(self.names) + self.box.update(results) + + @property + def keys(self): + """Returns a list of keys for accessing specific metrics.""" + return ['metrics/precision(B)', 'metrics/recall(B)', 'metrics/mAP50(B)', 'metrics/mAP50-95(B)'] + + def mean_results(self): + """Calculate mean of detected objects & return precision, recall, mAP50, and mAP50-95.""" + return self.box.mean_results() + + def class_result(self, i): + """Return the result of evaluating the performance of an object detection model on a specific class.""" + return self.box.class_result(i) + + @property + def maps(self): + """Returns mean Average Precision (mAP) scores per class.""" + return self.box.maps + + @property + def fitness(self): + """Returns the fitness of box object.""" + return self.box.fitness() + + @property + def ap_class_index(self): + """Returns the average precision index per class.""" + return self.box.ap_class_index + + @property + def results_dict(self): + """Returns dictionary of computed performance metrics and statistics.""" + return dict(zip(self.keys + ['fitness'], self.mean_results() + [self.fitness])) + + +class SegmentMetrics(SimpleClass): + """ + Calculates and aggregates detection and segmentation metrics over a given set of classes. + + Args: + save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory. 
+ plot (bool): Whether to save the detection and segmentation plots. Default is False. + on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None. + names (list): List of class names. Default is an empty list. + + Attributes: + save_dir (Path): Path to the directory where the output plots should be saved. + plot (bool): Whether to save the detection and segmentation plots. + on_plot (func): An optional callback to pass plots path and data when they are rendered. + names (list): List of class names. + box (Metric): An instance of the Metric class to calculate box detection metrics. + seg (Metric): An instance of the Metric class to calculate mask segmentation metrics. + speed (dict): Dictionary to store the time taken in different phases of inference. + + Methods: + process(tp_m, tp_b, conf, pred_cls, target_cls): Processes metrics over the given set of predictions. + mean_results(): Returns the mean of the detection and segmentation metrics over all the classes. + class_result(i): Returns the detection and segmentation metrics of class `i`. + maps: Returns the mean Average Precision (mAP) scores for IoU thresholds ranging from 0.50 to 0.95. + fitness: Returns the fitness scores, which are a single weighted combination of metrics. + ap_class_index: Returns the list of indices of classes used to compute Average Precision (AP). + results_dict: Returns the dictionary containing all the detection and segmentation metrics and fitness score. + """ + + def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None: + self.save_dir = save_dir + self.plot = plot + self.on_plot = on_plot + self.names = names + self.box = Metric() + self.seg = Metric() + self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0} + + def process(self, tp_b, tp_m, conf, pred_cls, target_cls): + """ + Processes the detection and segmentation metrics over the given set of predictions. 
class PoseMetrics(SegmentMetrics):
    """
    Calculates and aggregates detection and pose (keypoint) metrics over a given set of classes.

    Args:
        save_dir (Path): Directory where output plots are saved. Default is the current directory.
        plot (bool): Whether to save the detection and pose plots. Default is False.
        on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None.
        names: Class names, forwarded unchanged to `ap_per_class`. Default is an empty tuple.
            NOTE(review): `ap_per_class` calls `names.items()`, so a mapping of class index -> name appears to be
            expected in practice - confirm against callers.

    Attributes:
        save_dir (Path): Directory where output plots are saved.
        plot (bool): Whether to save the detection and pose plots.
        on_plot (func): Optional plot callback.
        names: Class names as passed to the constructor.
        box (Metric): Box detection metrics.
        pose (Metric): Pose (keypoint) metrics.
        seg (Metric): Created by `SegmentMetrics.__init__`; never updated by this class.
        speed (dict): Time taken in the different phases of inference.

    Methods:
        process(tp_b, tp_p, conf, pred_cls, target_cls): Processes metrics over the given set of predictions.
        mean_results(): Returns the mean box metrics followed by the mean pose metrics.
        class_result(i): Returns the box metrics followed by the pose metrics of class `i`.
        maps: Returns the sum of the per-class box and pose mAP arrays.
        fitness: Returns the sum of the pose and box fitness scores.
        ap_class_index: Inherited; returns the class indices used to compute box AP.
        results_dict: Inherited; maps `keys + ['fitness']` to `mean_results() + [fitness]`.
    """

    def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None:
        """Initialise the shared state through SegmentMetrics and add the pose metric container."""
        # The parent signature is (save_dir, plot, on_plot, names). The previous positional call
        # `super().__init__(save_dir, plot, names)` bound `names` to `on_plot`; keywords make the binding explicit.
        # The parent sets save_dir, plot, on_plot, names, box, seg and speed, so they are not repeated here.
        super().__init__(save_dir=save_dir, plot=plot, on_plot=on_plot, names=names)
        self.pose = Metric()

    def __getattr__(self, attr):
        """Raises an AttributeError if an invalid attribute is accessed."""
        name = self.__class__.__name__
        raise AttributeError(f"'{name}' object has no attribute '{attr}'. See valid attributes below.\n{self.__doc__}")

    def process(self, tp_b, tp_p, conf, pred_cls, target_cls):
        """
        Processes the detection and pose metrics over the given set of predictions.

        Args:
            tp_b (list): List of True Positive boxes.
            tp_p (list): List of True Positive keypoints.
            conf (list): List of confidence scores.
            pred_cls (list): List of predicted classes.
            target_cls (list): List of target classes.
        """
        # Pose metrics are computed first, then box metrics; each call may write its own prefixed plots.
        results_pose = ap_per_class(tp_p,
                                    conf,
                                    pred_cls,
                                    target_cls,
                                    plot=self.plot,
                                    on_plot=self.on_plot,
                                    save_dir=self.save_dir,
                                    names=self.names,
                                    prefix='Pose')[2:]  # drop the leading (tp, fp) entries
        self.pose.nc = len(self.names)
        self.pose.update(results_pose)
        results_box = ap_per_class(tp_b,
                                   conf,
                                   pred_cls,
                                   target_cls,
                                   plot=self.plot,
                                   on_plot=self.on_plot,
                                   save_dir=self.save_dir,
                                   names=self.names,
                                   prefix='Box')[2:]
        self.box.nc = len(self.names)
        self.box.update(results_box)

    @property
    def keys(self):
        """Returns list of evaluation metric keys: four box (B) keys followed by four pose (P) keys."""
        return [
            'metrics/precision(B)', 'metrics/recall(B)', 'metrics/mAP50(B)', 'metrics/mAP50-95(B)',
            'metrics/precision(P)', 'metrics/recall(P)', 'metrics/mAP50(P)', 'metrics/mAP50-95(P)']

    def mean_results(self):
        """Return the mean results of box followed by pose."""
        return self.box.mean_results() + self.pose.mean_results()

    def class_result(self, i):
        """Return the class-wise box results followed by the pose results for class index `i`."""
        return self.box.class_result(i) + self.pose.class_result(i)

    @property
    def maps(self):
        """Returns the sum of the per-class mAP arrays of the box and pose metrics."""
        return self.box.maps + self.pose.maps

    @property
    def fitness(self):
        """Returns the sum of the pose fitness and the box fitness."""
        return self.pose.fitness() + self.box.fitness()
def bbox_ioa(box1, box2, eps=1e-7):
    """
    Compute, for every pair of boxes, the intersection area divided by the area of the box from `box2`.

    Boxes use the x1y1x2y2 corner format.

    Args:
        box1 (np.array): Array of shape (n, 4) holding n boxes.
        box2 (np.array): Array of shape (m, 4) holding m boxes.
        eps (float, optional): Added to the box2 area so the division never hits zero. Defaults to 1e-7.

    Returns:
        (np.array): Array of shape (n, m) with intersection / box2-area for each pair.
    """
    left1, top1, right1, bottom1 = box1.T
    left2, top2, right2, bottom2 = box2.T

    # Overlap extent along each axis; negative extents (disjoint boxes) are clipped to zero
    overlap_w = (np.minimum(right1[:, None], right2) - np.maximum(left1[:, None], left2)).clip(0)
    overlap_h = (np.minimum(bottom1[:, None], bottom2) - np.maximum(top1[:, None], top2)).clip(0)

    # Area of every box2 entry, offset by eps
    area2 = (right2 - left2) * (bottom2 - top2) + eps

    return overlap_w * overlap_h / area2
def bbox_iou(box1, box2, xywh=True, type_='CIoU', eps=1e-7):
    """
    Compute an IoU-based loss and its companion IoU term for two sets of boxes via `IoU_Cal`.

    Args:
        box1 (torch.Tensor): Boxes whose last dimension holds the 4 box coordinates.
        box2 (torch.Tensor): Boxes whose last dimension holds the 4 box coordinates.
        xywh (bool, optional): If True the inputs are (x_center, y_center, width, height) and are converted to
            corners; otherwise they are taken as (x1, y1, x2, y2). Defaults to True.
        type_ (str, optional): Name of the `IoU_Cal` attribute that is called to produce the loss. Defaults to 'CIoU'.
        eps (float, optional): Unused; kept only so existing callers that pass it keep working.

    Returns:
        (tuple): `(loss, iou)` where `loss` is the value returned by `IoU_Cal.<type_>(b1, b2, self=calc)` and
            `iou` is `1 - calc.iou`.
            NOTE(review): this assumes `IoU_Cal.iou` stores the IoU loss (1 - IoU) - confirm in
            ultralytics/utils/iou.py.
    """
    if xywh:  # transform from xywh to xyxy
        (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
        w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
        b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
        b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
    else:  # inputs are already x1, y1, x2, y2
        # The widths/heights that used to be derived here were never read, so they are no longer computed.
        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)

    # Re-assemble the corner coordinates into one tensor per box set
    b1 = torch.stack([b1_x1, b1_y1, b1_x2, b1_y2], dim=-1)
    b2 = torch.stack([b2_x1, b2_y1, b2_x2, b2_y2], dim=-1)

    # `IoU_Cal` methods accept a pre-built instance through their `self` keyword, so the instance is built once
    # here and handed to whichever loss variant `type_` selects.
    calc = IoU_Cal(b1, b2)
    loss = getattr(IoU_Cal, type_)(b1, b2, self=calc)
    iou = 1 - calc.iou
    return loss, iou
def kpt_iou(kpt1, kpt2, area, sigma, eps=1e-7):
    """
    Compute the Object Keypoint Similarity (OKS) between ground-truth and predicted keypoints.

    Args:
        kpt1 (torch.Tensor): Ground-truth keypoints of shape (N, K, 3); the last channel is read as a
            visibility flag (non-zero means the keypoint counts).
        kpt2 (torch.Tensor): Predicted keypoints of shape (M, K, 3); only the x and y channels are read.
        area (torch.Tensor): Tensor of shape (N,) with the ground-truth areas.
        sigma (list): K per-keypoint scale values.
        eps (float, optional): Small value guarding both divisions. Defaults to 1e-7.

    Returns:
        (torch.Tensor): Tensor of shape (N, M) with the keypoint similarities.
    """
    # Squared Euclidean distance between every ground-truth/prediction pair, per keypoint -> (N, M, K)
    dx = kpt1[:, None, :, 0] - kpt2[..., 0]
    dy = kpt1[:, None, :, 1] - kpt2[..., 1]
    sq_dist = dx ** 2 + dy ** 2

    scales = torch.tensor(sigma, device=kpt1.device, dtype=kpt1.dtype)  # (K, )
    visible = kpt1[..., 2] != 0  # (N, K)

    # Normalisation follows the cocoeval formulation
    exponent = sq_dist / (2 * scales) ** 2 / (area[:, None, None] + eps) / 2

    # Average exp(-e) over the visible ground-truth keypoints only
    similarity = torch.exp(-exponent) * visible[:, None]
    return similarity.sum(-1) / (visible.sum(-1)[:, None] + eps)
+ """ + + def __init__(self, nc, conf=0.25, iou_thres=0.45, task='detect'): + """Initialize attributes for the YOLO model.""" + self.task = task + self.matrix = np.zeros((nc + 1, nc + 1)) if self.task == 'detect' else np.zeros((nc, nc)) + self.nc = nc # number of classes + self.conf = conf + self.iou_thres = iou_thres + + def process_cls_preds(self, preds, targets): + """ + Update confusion matrix for classification task + + Args: + preds (Array[N, min(nc,5)]): Predicted class labels. + targets (Array[N, 1]): Ground truth class labels. + """ + preds, targets = torch.cat(preds)[:, 0], torch.cat(targets) + for p, t in zip(preds.cpu().numpy(), targets.cpu().numpy()): + self.matrix[p][t] += 1 + + def process_batch(self, detections, labels): + """ + Update confusion matrix for object detection task. + + Args: + detections (Array[N, 6]): Detected bounding boxes and their associated information. + Each row should contain (x1, y1, x2, y2, conf, class). + labels (Array[M, 5]): Ground truth bounding boxes and their associated class labels. + Each row should contain (class, x1, y1, x2, y2). 
+ """ + if detections is None: + gt_classes = labels.int() + for gc in gt_classes: + self.matrix[self.nc, gc] += 1 # background FN + return + + detections = detections[detections[:, 4] > self.conf] + gt_classes = labels[:, 0].int() + detection_classes = detections[:, 5].int() + iou = box_iou(labels[:, 1:], detections[:, :4]) + + x = torch.where(iou > self.iou_thres) + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + else: + matches = np.zeros((0, 3)) + + n = matches.shape[0] > 0 + m0, m1, _ = matches.transpose().astype(int) + for i, gc in enumerate(gt_classes): + j = m0 == i + if n and sum(j) == 1: + self.matrix[detection_classes[m1[j]], gc] += 1 # correct + else: + self.matrix[self.nc, gc] += 1 # true background + + if n: + for i, dc in enumerate(detection_classes): + if not any(m1 == i): + self.matrix[dc, self.nc] += 1 # predicted background + + def matrix(self): + """Returns the confusion matrix.""" + return self.matrix + + def tp_fp(self): + """Returns true positives and false positives.""" + tp = self.matrix.diagonal() # true positives + fp = self.matrix.sum(1) - tp # false positives + # fn = self.matrix.sum(0) - tp # false negatives (missed detections) + return (tp[:-1], fp[:-1]) if self.task == 'detect' else (tp, fp) # remove background class if task=detect + + @TryExcept('WARNING ⚠️ ConfusionMatrix plot failure') + @plt_settings() + def plot(self, normalize=True, save_dir='', names=(), on_plot=None): + """ + Plot the confusion matrix using seaborn and save it to a file. + + Args: + normalize (bool): Whether to normalize the confusion matrix. + save_dir (str): Directory where the plot will be saved. 
+ names (tuple): Names of classes, used as labels on the plot. + on_plot (func): An optional callback to pass plots path and data when they are rendered. + """ + import seaborn as sn + + array = self.matrix / ((self.matrix.sum(0).reshape(1, -1) + 1E-9) if normalize else 1) # normalize columns + array[array < 0.005] = np.nan # don't annotate (would appear as 0.00) + + fig, ax = plt.subplots(1, 1, figsize=(12, 9), tight_layout=True) + nc, nn = self.nc, len(names) # number of classes, names + sn.set(font_scale=1.0 if nc < 50 else 0.8) # for label size + labels = (0 < nn < 99) and (nn == nc) # apply names to ticklabels + ticklabels = (list(names) + ['background']) if labels else 'auto' + with warnings.catch_warnings(): + warnings.simplefilter('ignore') # suppress empty matrix RuntimeWarning: All-NaN slice encountered + sn.heatmap(array, + ax=ax, + annot=nc < 30, + annot_kws={ + 'size': 8}, + cmap='Blues', + fmt='.2f' if normalize else '.0f', + square=True, + vmin=0.0, + xticklabels=ticklabels, + yticklabels=ticklabels).set_facecolor((1, 1, 1)) + title = 'Confusion Matrix' + ' Normalized' * normalize + ax.set_xlabel('True') + ax.set_ylabel('Predicted') + ax.set_title(title) + plot_fname = Path(save_dir) / f'{title.lower().replace(" ", "_")}.png' + fig.savefig(plot_fname, dpi=250) + plt.close(fig) + if on_plot: + on_plot(plot_fname) + + def print(self): + """ + Print the confusion matrix to the console. 
def smooth(y, f=0.05):
    """
    Apply a box (moving-average) filter whose width is a fraction `f` of the input length.

    The input is padded on both ends by repeating its first and last value, then convolved in 'valid' mode
    with a uniform kernel.

    Args:
        y (np.ndarray): 1-D sequence to smooth.
        f (float, optional): Fraction of `len(y)` that determines the filter width. Defaults to 0.05.

    Returns:
        (np.ndarray): The filtered sequence.
    """
    window = round(len(y) * f * 2) // 2 + 1  # number of filter elements
    half = window // 2
    head = np.ones(half) * y[0]  # left padding: first value repeated
    tail = np.ones(half) * y[-1]  # right padding: last value repeated
    padded = np.concatenate((head, y, tail), 0)
    kernel = np.ones(window) / window
    return np.convolve(padded, kernel, mode='valid')
def compute_ap(recall, precision, method='interp'):
    """
    Compute the average precision (AP) given the recall and precision curves.

    Arguments:
        recall (list): The recall curve.
        precision (list): The precision curve.
        method (str, optional): Integration method. 'interp' integrates a 101-point interpolation of the
            precision envelope (COCO style); 'continuous' sums the rectangles between recall changes.
            Defaults to 'interp', which is what this function always used before the parameter existed.

    Returns:
        (float): Average precision.
        (np.ndarray): Precision envelope curve.
        (np.ndarray): Modified recall curve with sentinel values added at the beginning and end.

    Raises:
        ValueError: If `method` is neither 'interp' nor 'continuous'.
    """
    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([1.0], precision, [0.0]))

    # Compute the precision envelope (running maximum taken from the right)
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    # Integrate area under curve
    if method == 'interp':
        x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
        # NumPy 2.0 deprecated `np.trapz` in favour of `np.trapezoid`; use the new name when it exists so
        # the function keeps working (and stays warning-free) on both old and new NumPy releases.
        integrate = getattr(np, 'trapezoid', None) or np.trapz
        ap = integrate(np.interp(x, mrec, mpre), x)
    elif method == 'continuous':
        i = np.where(mrec[1:] != mrec[:-1])[0]  # points where x-axis (recall) changes
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under curve
    else:
        raise ValueError(f"Unknown AP integration method '{method}'; expected 'interp' or 'continuous'.")

    return ap, mpre, mrec
Defaults to None. + save_dir (Path, optional): Directory to save the PR curves. Defaults to an empty path. + names (tuple, optional): Tuple of class names to plot PR curves. Defaults to an empty tuple. + eps (float, optional): A small value to avoid division by zero. Defaults to 1e-16. + prefix (str, optional): A prefix string for saving the plot files. Defaults to an empty string. + + Returns: + (tuple): A tuple of six arrays and one array of unique classes, where: + tp (np.ndarray): True positive counts for each class. + fp (np.ndarray): False positive counts for each class. + p (np.ndarray): Precision values at each confidence threshold. + r (np.ndarray): Recall values at each confidence threshold. + f1 (np.ndarray): F1-score values at each confidence threshold. + ap (np.ndarray): Average precision for each class at different IoU thresholds. + unique_classes (np.ndarray): An array of unique classes that have data. + + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes, nt = np.unique(target_cls, return_counts=True) + nc = unique_classes.shape[0] # number of classes, number of detections + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000)) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = nt[ci] # number of labels + n_p = i.sum() # number of predictions + if n_p == 0 or n_l == 0: + continue + + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + eps) # recall curve + r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0) # negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score + + # AP from recall-precision curve + 
class Metric(SimpleClass):
    """
    Container for per-class evaluation results and the aggregate metrics derived from them.

    Attributes:
        p (list): Precision for each class. Shape: (nc,).
        r (list): Recall for each class. Shape: (nc,).
        f1 (list): F1 score for each class. Shape: (nc,).
        all_ap (list): AP scores for all classes and all IoU thresholds. Shape: (nc, 10).
        ap_class_index (list): Index of class for each AP score. Shape: (nc,).
        nc (int): Number of classes.

    Properties:
        ap50: AP at IoU threshold 0.5 for all classes. Shape: (nc,) or [].
        ap: AP averaged over all IoU thresholds for all classes. Shape: (nc,) or [].
        mp: Mean precision of all classes.
        mr: Mean recall of all classes.
        map50: Mean AP at IoU threshold 0.5.
        map75: Mean AP at IoU threshold 0.75.
        map: Mean AP over all IoU thresholds.
        maps: mAP of each class, shape (nc,).

    Methods:
        mean_results(): Returns [mp, mr, map50, map].
        class_result(i): Returns p[i], r[i], ap50[i], ap[i].
        fitness(): Model fitness as a weighted combination of metrics.
        update(results): Replace the stored results with a new (p, r, f1, all_ap, ap_class_index) tuple.
    """

    def __init__(self) -> None:
        """Start with empty results and zero classes."""
        self.p = []  # (nc, )
        self.r = []  # (nc, )
        self.f1 = []  # (nc, )
        self.all_ap = []  # (nc, 10)
        self.ap_class_index = []  # (nc, )
        self.nc = 0

    @property
    def ap50(self):
        """
        Per-class AP at the first IoU threshold (0.5).

        Returns:
            (np.ndarray, list): Array of shape (nc,) with AP50 values, or an empty list when nothing is stored.
        """
        if len(self.all_ap):
            return self.all_ap[:, 0]
        return []

    @property
    def ap(self):
        """
        Per-class AP averaged over all stored IoU thresholds (0.5-0.95).

        Returns:
            (np.ndarray, list): Array of shape (nc,) with AP50-95 values, or an empty list when nothing is stored.
        """
        if len(self.all_ap):
            return self.all_ap.mean(1)
        return []

    @property
    def mp(self):
        """
        Mean precision over all classes.

        Returns:
            (float): The mean precision, or 0.0 when nothing is stored.
        """
        if len(self.p):
            return self.p.mean()
        return 0.0

    @property
    def mr(self):
        """
        Mean recall over all classes.

        Returns:
            (float): The mean recall, or 0.0 when nothing is stored.
        """
        if len(self.r):
            return self.r.mean()
        return 0.0

    @property
    def map50(self):
        """
        Mean Average Precision (mAP) at an IoU threshold of 0.5.

        Returns:
            (float): The mAP50, or 0.0 when nothing is stored.
        """
        if len(self.all_ap):
            return self.all_ap[:, 0].mean()
        return 0.0

    @property
    def map75(self):
        """
        Mean Average Precision (mAP) at an IoU threshold of 0.75 (column 5 of `all_ap`).

        Returns:
            (float): The mAP75, or 0.0 when nothing is stored.
        """
        if len(self.all_ap):
            return self.all_ap[:, 5].mean()
        return 0.0

    @property
    def map(self):
        """
        Mean Average Precision (mAP) over IoU thresholds of 0.5 - 0.95 in steps of 0.05.

        Returns:
            (float): The mean of every entry of `all_ap`, or 0.0 when nothing is stored.
        """
        if len(self.all_ap):
            return self.all_ap.mean()
        return 0.0

    def mean_results(self):
        """Return the aggregate results as the list [mp, mr, map50, map]."""
        return [self.mp, self.mr, self.map50, self.map]

    def class_result(self, i):
        """Return the class-wise results (p[i], r[i], ap50[i], ap[i]) for position `i`."""
        return self.p[i], self.r[i], self.ap50[i], self.ap[i]

    @property
    def maps(self):
        """Per-class mAP: every class starts at the overall mAP and classes with data get their own AP."""
        per_class = np.zeros(self.nc) + self.map
        for position, class_id in enumerate(self.ap_class_index):
            per_class[class_id] = self.ap[position]
        return per_class

    def fitness(self):
        """Model fitness as a weighted combination of metrics."""
        weights = [0.0, 0.0, 0.1, 0.9]  # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
        weighted = np.array(self.mean_results()) * weights
        return weighted.sum()

    def update(self, results):
        """
        Replace the stored results.

        Args:
            results (tuple): A tuple of (p, r, f1, all_ap, ap_class_index), unpacked in that order.
        """
        self.p, self.r, self.f1, self.all_ap, self.ap_class_index = results
+ + Attributes: + save_dir (Path): A path to the directory where the output plots will be saved. + plot (bool): A flag that indicates whether to plot the precision-recall curves for each class. + on_plot (func): An optional callback to pass plots path and data when they are rendered. + names (tuple of str): A tuple of strings that represents the names of the classes. + box (Metric): An instance of the Metric class for storing the results of the detection metrics. + speed (dict): A dictionary for storing the execution time of different parts of the detection process. + + Methods: + process(tp, conf, pred_cls, target_cls): Updates the metric results with the latest batch of predictions. + keys: Returns a list of keys for accessing the computed detection metrics. + mean_results: Returns a list of mean values for the computed detection metrics. + class_result(i): Returns a list of values for the computed detection metrics for a specific class. + maps: Returns a dictionary of mean average precision (mAP) values for different IoU thresholds. + fitness: Computes the fitness score based on the computed detection metrics. + ap_class_index: Returns a list of class indices sorted by their average precision (AP) values. + results_dict: Returns a dictionary that maps detection metric keys to their computed values. 
+ """ + + def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None: + self.save_dir = save_dir + self.plot = plot + self.on_plot = on_plot + self.names = names + self.box = Metric() + self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0} + + def process(self, tp, conf, pred_cls, target_cls): + """Process predicted results for object detection and update metrics.""" + results = ap_per_class(tp, + conf, + pred_cls, + target_cls, + plot=self.plot, + save_dir=self.save_dir, + names=self.names, + on_plot=self.on_plot)[2:] + self.box.nc = len(self.names) + self.box.update(results) + + @property + def keys(self): + """Returns a list of keys for accessing specific metrics.""" + return ['metrics/precision(B)', 'metrics/recall(B)', 'metrics/mAP50(B)', 'metrics/mAP50-95(B)'] + + def mean_results(self): + """Calculate mean of detected objects & return precision, recall, mAP50, and mAP50-95.""" + return self.box.mean_results() + + def class_result(self, i): + """Return the result of evaluating the performance of an object detection model on a specific class.""" + return self.box.class_result(i) + + @property + def maps(self): + """Returns mean Average Precision (mAP) scores per class.""" + return self.box.maps + + @property + def fitness(self): + """Returns the fitness of box object.""" + return self.box.fitness() + + @property + def ap_class_index(self): + """Returns the average precision index per class.""" + return self.box.ap_class_index + + @property + def results_dict(self): + """Returns dictionary of computed performance metrics and statistics.""" + return dict(zip(self.keys + ['fitness'], self.mean_results() + [self.fitness])) + + +class SegmentMetrics(SimpleClass): + """ + Calculates and aggregates detection and segmentation metrics over a given set of classes. + + Args: + save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory. 
+ plot (bool): Whether to save the detection and segmentation plots. Default is False. + on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None. + names (list): List of class names. Default is an empty list. + + Attributes: + save_dir (Path): Path to the directory where the output plots should be saved. + plot (bool): Whether to save the detection and segmentation plots. + on_plot (func): An optional callback to pass plots path and data when they are rendered. + names (list): List of class names. + box (Metric): An instance of the Metric class to calculate box detection metrics. + seg (Metric): An instance of the Metric class to calculate mask segmentation metrics. + speed (dict): Dictionary to store the time taken in different phases of inference. + + Methods: + process(tp_m, tp_b, conf, pred_cls, target_cls): Processes metrics over the given set of predictions. + mean_results(): Returns the mean of the detection and segmentation metrics over all the classes. + class_result(i): Returns the detection and segmentation metrics of class `i`. + maps: Returns the mean Average Precision (mAP) scores for IoU thresholds ranging from 0.50 to 0.95. + fitness: Returns the fitness scores, which are a single weighted combination of metrics. + ap_class_index: Returns the list of indices of classes used to compute Average Precision (AP). + results_dict: Returns the dictionary containing all the detection and segmentation metrics and fitness score. + """ + + def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None: + self.save_dir = save_dir + self.plot = plot + self.on_plot = on_plot + self.names = names + self.box = Metric() + self.seg = Metric() + self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0} + + def process(self, tp_b, tp_m, conf, pred_cls, target_cls): + """ + Processes the detection and segmentation metrics over the given set of predictions. 
class PoseMetrics(SegmentMetrics):
    """
    Calculates and aggregates detection and pose metrics over a given set of classes.

    Args:
        save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory.
        plot (bool): Whether to save the detection and pose plots. Default is False.
        on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None.
        names (list): List of class names. Default is an empty tuple.

    Attributes:
        save_dir (Path): Path to the directory where the output plots should be saved.
        plot (bool): Whether to save the detection and pose plots.
        on_plot (func): An optional callback to pass plots path and data when they are rendered.
        names (list): List of class names.
        box (Metric): An instance of the Metric class to calculate box detection metrics.
        pose (Metric): An instance of the Metric class to calculate pose (keypoint) metrics.
        seg (Metric): Created by SegmentMetrics.__init__; not used by the methods defined on this class.
        speed (dict): Dictionary to store the time taken in different phases of inference.

    Methods:
        process(tp_b, tp_p, conf, pred_cls, target_cls): Processes metrics over the given set of predictions.
        mean_results(): Returns the mean box metrics followed by the mean pose metrics.
        class_result(i): Returns the box metrics followed by the pose metrics of class `i`.
        maps: Returns `box.maps + pose.maps`.
        fitness: Returns the sum of the pose and box fitness scores.
        ap_class_index: Inherited from SegmentMetrics; returns `box.ap_class_index`.
        results_dict: Inherited from SegmentMetrics; maps `keys + ['fitness']` to the mean results and fitness.
    """

    def __init__(self, save_dir=Path('.'), plot=False, on_plot=None, names=()) -> None:
        """Initialize the shared SegmentMetrics state and add a Metric for pose results."""
        # Pass by keyword: the parent signature is (save_dir, plot, on_plot, names), so a positional
        # `names` in third place would be stored as the plot callback.
        super().__init__(save_dir=save_dir, plot=plot, on_plot=on_plot, names=names)
        self.pose = Metric()

    def __getattr__(self, attr):
        """Raise an AttributeError that includes the class docstring when normal attribute lookup fails."""
        name = self.__class__.__name__
        raise AttributeError(f"'{name}' object has no attribute '{attr}'. See valid attributes below.\n{self.__doc__}")

    def process(self, tp_b, tp_p, conf, pred_cls, target_cls):
        """
        Processes the detection and pose metrics over the given set of predictions.

        Args:
            tp_b (list): List of True Positive boxes.
            tp_p (list): List of True Positive keypoints.
            conf (list): List of confidence scores.
            pred_cls (list): List of predicted classes.
            target_cls (list): List of target classes.
        """

        # Pose statistics are computed first, then box statistics; both reuse conf/pred_cls/target_cls.
        results_pose = ap_per_class(tp_p,
                                    conf,
                                    pred_cls,
                                    target_cls,
                                    plot=self.plot,
                                    on_plot=self.on_plot,
                                    save_dir=self.save_dir,
                                    names=self.names,
                                    prefix='Pose')[2:]  # the first two returned items are not stored
        self.pose.nc = len(self.names)
        self.pose.update(results_pose)
        results_box = ap_per_class(tp_b,
                                   conf,
                                   pred_cls,
                                   target_cls,
                                   plot=self.plot,
                                   on_plot=self.on_plot,
                                   save_dir=self.save_dir,
                                   names=self.names,
                                   prefix='Box')[2:]
        self.box.nc = len(self.names)
        self.box.update(results_box)

    @property
    def keys(self):
        """Returns list of evaluation metric keys: box (B) keys followed by pose (P) keys."""
        return [
            'metrics/precision(B)', 'metrics/recall(B)', 'metrics/mAP50(B)', 'metrics/mAP50-95(B)',
            'metrics/precision(P)', 'metrics/recall(P)', 'metrics/mAP50(P)', 'metrics/mAP50-95(P)']

    def mean_results(self):
        """Return the mean results of box followed by the mean results of pose."""
        return self.box.mean_results() + self.pose.mean_results()

    def class_result(self, i):
        """Return the box results followed by the pose results for class index i."""
        return self.box.class_result(i) + self.pose.class_result(i)

    @property
    def maps(self):
        """Returns `box.maps + pose.maps`, the combined per-class mAP of box and pose detections."""
        return self.box.maps + self.pose.maps

    @property
    def fitness(self):
        """Returns the sum of the pose fitness and the box fitness."""
        return self.pose.fitness() + self.box.fitness()
+ """ + + def __init__(self) -> None: + self.top1 = 0 + self.top5 = 0 + self.speed = {'preprocess': 0.0, 'inference': 0.0, 'loss': 0.0, 'postprocess': 0.0} + + def process(self, targets, pred): + """Target classes and predicted classes.""" + pred, targets = torch.cat(pred), torch.cat(targets) + correct = (targets[:, None] == pred).float() + acc = torch.stack((correct[:, 0], correct.max(1).values), dim=1) # (top1, top5) accuracy + self.top1, self.top5 = acc.mean(0).tolist() + + @property + def fitness(self): + """Returns mean of top-1 and top-5 accuracies as fitness score.""" + return (self.top1 + self.top5) / 2 + + @property + def results_dict(self): + """Returns a dictionary with model's performance metrics and fitness score.""" + return dict(zip(self.keys + ['fitness'], [self.top1, self.top5, self.fitness])) + + @property + def keys(self): + """Returns a list of keys for the results_dict property.""" + return ['metrics/accuracy_top1', 'metrics/accuracy_top5'] diff --git a/ultralytics/utils/ops.py b/ultralytics/utils/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..5c9a9aff111990a8899fd0777de11f05ddd9f286 --- /dev/null +++ b/ultralytics/utils/ops.py @@ -0,0 +1,738 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import contextlib +import math +import re +import time + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import torchvision + +from ultralytics.utils import LOGGER + +from .metrics import box_iou + + +class Profile(contextlib.ContextDecorator): + """ + YOLOv8 Profile class. + Usage: as a decorator with @Profile() or as a context manager with 'with Profile():' + """ + + def __init__(self, t=0.0): + """ + Initialize the Profile class. + + Args: + t (float): Initial time. Defaults to 0.0. + """ + self.t = t + self.cuda = torch.cuda.is_available() + + def __enter__(self): + """ + Start timing. 
def segment2box(segment, width=640, height=640):
    """
    Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to (xyxy)

    Args:
        segment (np.ndarray): the segment label as (n, 2) points in (x, y) order.
            NOTE(review): the result is built with np.array(..., dtype=segment.dtype), so a NumPy
            array is assumed here - confirm against callers.
        width (int): the width of the image. Defaults to 640
        height (int): The height of the image. Defaults to 640

    Returns:
        (np.ndarray): [x_min, y_min, x_max, y_max] over the points that lie inside the image, in
            segment.dtype; four zeros when no point lies inside the image.
    """
    x, y = segment.T  # segment xy
    inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height)
    x, y = x[inside], y[inside]
    # Test emptiness by length, not by value: points lying on x == 0 are valid in-image points.
    if not len(x):
        return np.zeros(4, dtype=segment.dtype)
    return np.array([x.min(), y.min(), x.max(), y.max()], dtype=segment.dtype)  # xyxy
def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        classes=None,
        agnostic=False,
        multi_label=False,
        labels=(),
        max_det=300,
        nc=0,  # number of classes (optional)
        max_time_img=0.05,
        max_nms=30000,
        max_wh=7680,
):
    """
    Perform non-maximum suppression (NMS) on a set of boxes, with support for masks and multiple labels per box.

    Arguments:
        prediction (torch.Tensor): A tensor of shape (batch_size, num_classes + 4 + num_masks, num_boxes)
            containing the predicted boxes, classes, and masks. The tensor should be in the format
            output by a model, such as YOLO. A list/tuple is also accepted, in which case only its
            first element is used.
        conf_thres (float): The confidence threshold below which boxes will be filtered out.
            Valid values are between 0.0 and 1.0.
        iou_thres (float): The IoU threshold passed to torchvision.ops.nms; a box overlapping a
            higher-scoring kept box by more than this is suppressed. Valid values are between 0.0 and 1.0.
        classes (List[int]): A list of class indices to consider. If None, all classes will be considered.
        agnostic (bool): If True, NMS ignores the class of each box, so boxes of different classes
            can suppress each other.
        multi_label (bool): If True, each box may have multiple labels. Only effective when nc > 1.
        labels (List[List[Union[int, float, torch.Tensor]]]): A list of lists, where each inner
            list contains the apriori labels for a given image. The list should be in the format
            output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
        max_det (int): The maximum number of boxes to keep after NMS.
        nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
        max_time_img (float): The maximum time (seconds) for processing one image.
        max_nms (int): The maximum number of boxes into torchvision.ops.nms().
        max_wh (int): The maximum box width and height in pixels; also used as the per-class
            coordinate offset for class-aware NMS.

    Returns:
        (List[torch.Tensor]): A list of length batch_size, where each element is a tensor of
            shape (num_boxes, 6 + num_masks) containing the kept boxes, with columns
            (x1, y1, x2, y2, confidence, class, mask1, mask2, ...). Images that are skipped (no
            candidates, or not reached before the time limit) keep an empty (0, 6 + num_masks) tensor.

    Raises:
        AssertionError: If conf_thres or iou_thres is outside [0, 1].

    Note:
        The xywh -> xyxy conversion is written into a transposed view of `prediction`, so unless the
        tensor was first copied off an MPS device, the caller's tensor is modified in place.
    """

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
    if isinstance(prediction, (list, tuple)):  # YOLOv8 model in validation model, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
        prediction = prediction.cpu()
    bs = prediction.shape[0]  # batch size
    nc = nc or (prediction.shape[1] - 4)  # number of classes
    nm = prediction.shape[1] - nc - 4  # number of mask coefficients
    mi = 4 + nc  # mask start index
    xc = prediction[:, 4:mi].amax(1) > conf_thres  # candidates

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    time_limit = 0.5 + max_time_img * bs  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    prediction = prediction.transpose(-1, -2)  # shape(1,84,6300) to shape(1,6300,84)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])  # xywh to xyxy

    t = time.time()
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            # One row per label with the same width as x (4 box + nc class + nm mask columns),
            # otherwise the concatenation below fails on a column-count mismatch.
            v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[range(len(lb)), lb[:, 0].long() + 4] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Detections matrix nx6 (xyxy, conf, cls)
        box, cls, mask = x.split((4, nc, nm), 1)

        if multi_label:
            i, j = torch.where(cls > conf_thres)
            x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = cls.max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        if n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes

        # Batched NMS: shifting each class by max_wh keeps boxes of different classes from overlapping
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        i = i[:max_det]  # limit detections
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # Update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)  # move results back to the original device
        if (time.time() - t) > time_limit:
            LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
            break  # time limit exceeded

    return output
def clip_coords(coords, shape):
    """
    Clamp point coordinates so they lie within the image.

    Args:
        coords (torch.Tensor | numpy.ndarray): Coordinates whose last axis holds x at index 0 and y at index 1.
        shape (tuple): Image size as (height, width).

    Returns:
        (None): `coords` is modified in place; x is limited to [0, width] and y to [0, height].
    """
    # (last-axis index, upper bound): x is bounded by the width, y by the height
    bounds = ((0, shape[1]), (1, shape[0]))
    if isinstance(coords, torch.Tensor):
        for axis, upper in bounds:
            coords[..., axis].clamp_(0, upper)  # in-place clamp on a view of coords
        return
    for axis, upper in bounds:  # np.array
        coords[..., axis] = coords[..., axis].clip(0, upper)
def xyxy2xywh(x):
    """
    Convert boxes from corner format (x1, y1, x2, y2) to center format (x, y, width, height).

    Args:
        x (np.ndarray | torch.Tensor): Boxes whose last axis starts with (x1, y1, x2, y2).
    Returns:
        boxes (np.ndarray | torch.Tensor): A copy of `x` with its first four last-axis entries replaced by
            (x center, y center, width, height); the input is left untouched.
    """
    boxes = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    left, top, right, bottom = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
    boxes[..., 0] = (left + right) / 2  # x center
    boxes[..., 1] = (top + bottom) / 2  # y center
    boxes[..., 2] = right - left  # width
    boxes[..., 3] = bottom - top  # height
    return boxes
def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
    """
    Convert normalized center-format boxes (x, y, width, height) to pixel corner-format boxes.

    Args:
        x (np.ndarray | torch.Tensor): Boxes whose last axis starts with normalized (x, y, width, height).
        w (int): Width of the image. Defaults to 640
        h (int): Height of the image. Defaults to 640
        padw (int): Offset added to the x coordinates. Defaults to 0
        padh (int): Offset added to the y coordinates. Defaults to 0
    Returns:
        corners (np.ndarray | torch.Tensor): A copy of `x` holding [x1, y1, x2, y2], where x1,y1 is the
            top-left corner and x2,y2 is the bottom-right corner of the box.
    """
    corners = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    cx, cy = x[..., 0], x[..., 1]
    half_w, half_h = x[..., 2] / 2, x[..., 3] / 2
    corners[..., 0] = w * (cx - half_w) + padw  # top left x
    corners[..., 1] = h * (cy - half_h) + padh  # top left y
    corners[..., 2] = w * (cx + half_w) + padw  # bottom right x
    corners[..., 3] = h * (cy + half_h) + padh  # bottom right y
    return corners
def xyn2xy(x, w=640, h=640, padw=0, padh=0):
    """
    Convert normalized point coordinates to pixel coordinates.

    Args:
        x (np.ndarray | torch.Tensor): Points whose last axis starts with normalized (x, y)
        w (int): The width of the image. Defaults to 640
        h (int): The height of the image. Defaults to 640
        padw (int): Offset added to the x coordinates. Defaults to 0
        padh (int): Offset added to the y coordinates. Defaults to 0
    Returns:
        pixels (np.ndarray | torch.Tensor): A copy of `x` with x scaled by `w` and shifted by `padw`, and
            y scaled by `h` and shifted by `padh`
    """
    pixels = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    # axis 0 is x (scaled by width), axis 1 is y (scaled by height)
    for axis, (scale, offset) in enumerate(((w, padw), (h, padh))):
        pixels[..., axis] = scale * x[..., axis] + offset
    return pixels
def xyxy2ltwh(x):
    """
    Convert bounding boxes from [x1, y1, x2, y2] to [x1, y1, w, h], where xy1=top-left, xy2=bottom-right

    Args:
        x (np.ndarray | torch.Tensor): Boxes in xyxy format along the last axis; any leading shape is accepted
    Returns:
        y (np.ndarray | torch.Tensor): A copy of `x` with entries 2 and 3 replaced by width and height
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    # Index the last axis with `...` (as the other converters in this file do) so single boxes and
    # batched boxes work as well as (n, 4) inputs.
    y[..., 2] = x[..., 2] - x[..., 0]  # width
    y[..., 3] = x[..., 3] - x[..., 1]  # height
    return y


def ltwh2xywh(x):
    """
    Convert boxes from [x1, y1, w, h] to [x, y, w, h] where xy1=top-left, xy=center

    Args:
        x (np.ndarray | torch.Tensor): Boxes in ltwh format along the last axis; any leading shape is accepted
    Returns:
        y (np.ndarray | torch.Tensor): A copy of `x` with entries 0 and 1 replaced by the box center
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 0] = x[..., 0] + x[..., 2] / 2  # center x
    y[..., 1] = x[..., 1] + x[..., 3] / 2  # center y
    return y


def ltwh2xyxy(x):
    """
    Convert boxes from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right

    Args:
        x (np.ndarray | torch.Tensor): Boxes in ltwh format along the last axis; any leading shape is accepted

    Returns:
        y (np.ndarray | torch.Tensor): A copy of `x` with entries 2 and 3 replaced by the bottom-right corner
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 2] = x[..., 2] + x[..., 0]  # bottom right x
    y[..., 3] = x[..., 3] + x[..., 1]  # bottom right y
    return y
def resample_segments(segments, n=1000):
    """
    Resample every (m,2) segment in a list to n points by linear interpolation along the closed outline.

    Args:
        segments (list): a list of (m,2) arrays of xy points; the list is updated in place.
        n (int): number of points to resample the segment to. Defaults to 1000

    Returns:
        segments (list): the same list object, each entry replaced by an (n,2) float32 array.
    """
    for idx, points in enumerate(segments):
        closed = np.concatenate((points, points[0:1, :]), axis=0)  # append the first point to close the outline
        count = len(closed)
        sample_at = np.linspace(0, count - 1, n)  # fractional point indices to sample
        knots = np.arange(count)
        xs = np.interp(sample_at, knots, closed[:, 0])
        ys = np.interp(sample_at, knots, closed[:, 1])
        segments[idx] = np.concatenate([xs, ys], dtype=np.float32).reshape(2, -1).T  # segment xy
    return segments


def crop_mask(masks, boxes):
    """
    Zero out every mask pixel that lies outside that mask's bounding box.

    Args:
        masks (torch.Tensor): [n, h, w] tensor of masks
        boxes (torch.Tensor): [n, 4] tensor of (x1, y1, x2, y2) box coordinates in mask pixel units

    Returns:
        (torch.Tensor): The masks multiplied by an indicator that is true where x1 <= x < x2 and y1 <= y < y2.
    """
    _, height, width = masks.shape
    left, top, right, bottom = torch.chunk(boxes[:, :, None], 4, 1)  # each of shape(n,1,1)
    xs = torch.arange(width, device=masks.device, dtype=left.dtype)[None, None, :]  # x index, shape(1,1,w)
    ys = torch.arange(height, device=masks.device, dtype=left.dtype)[None, :, None]  # y index, shape(1,h,1)
    inside_x = (xs >= left) * (xs < right)
    inside_y = (ys >= top) * (ys < bottom)
    return masks * (inside_x * inside_y)
+ + Args: + protos (torch.Tensor): [mask_dim, mask_h, mask_w] + masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms + bboxes (torch.Tensor): [n, 4], n is number of masks after nms + shape (tuple): the size of the input image (h,w) + + Returns: + (torch.Tensor): The upsampled masks. + """ + c, mh, mw = protos.shape # CHW + masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw) + masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW + masks = crop_mask(masks, bboxes) # CHW + return masks.gt_(0.5) + + +def process_mask(protos, masks_in, bboxes, shape, upsample=False): + """ + Apply masks to bounding boxes using the output of the mask head. + + Args: + protos (torch.Tensor): A tensor of shape [mask_dim, mask_h, mask_w]. + masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS. + bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS. + shape (tuple): A tuple of integers representing the size of the input image in the format (h, w). + upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False. + + Returns: + (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w + are the height and width of the input image. The mask is applied to the bounding boxes. 
def scale_masks(masks, shape, padding=True):
    """
    Crop the padding border off a batch of masks and resize the remainder to ``shape``.

    Args:
        masks (torch.Tensor): (N, C, H, W).
        shape (tuple): Target height and width.
        padding (bool): If True, the padding is assumed to be split evenly on both sides (yolo style
            augmentation) and is trimmed from every edge. If False, nothing is trimmed from the top/left and the
            whole padding is trimmed from the bottom/right.

    Returns:
        (torch.Tensor): the bilinearly resized masks, (N, C, shape[0], shape[1]).
    """
    src_h, src_w = masks.shape[2:]
    dst_h, dst_w = shape[0], shape[1]
    gain = min(src_h / dst_h, src_w / dst_w)  # gain = old / new
    pad_w = src_w - dst_w * gain
    pad_h = src_h - dst_h * gain
    if padding:
        pad_w /= 2
        pad_h /= 2
        top, left = int(pad_h), int(pad_w)
    else:
        top, left = 0, 0
    bottom = int(src_h - pad_h)
    right = int(src_w - pad_w)

    cropped = masks[..., top:bottom, left:right]
    return F.interpolate(cropped, shape, mode='bilinear', align_corners=False)  # NCHW
def masks2segments(masks, strategy='largest'):
    """
    Convert a batch of masks into one polygon (array of xy contour points) per mask.

    Args:
        masks (torch.Tensor): masks of shape (n, h, w); values are cast to uint8 before contour extraction.
        strategy (str): 'concat' joins the points of every external contour, 'largest' keeps only the contour
            with the most points. Defaults to largest

    Returns:
        segments (List): one float32 array of shape (k, 2) per mask; shape (0, 2) when a mask has no contour.

    Raises:
        ValueError: if ``strategy`` is neither 'concat' nor 'largest'.
    """
    if strategy not in ('concat', 'largest'):
        # An unknown strategy used to leave the raw contour container untouched and then fail on `.astype`
        # with an unrelated AttributeError; reject it explicitly instead.
        raise ValueError(f"strategy must be 'concat' or 'largest', got {strategy!r}")

    segments = []
    for mask in masks.int().cpu().numpy().astype('uint8'):
        contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
        if not contours:
            points = np.zeros((0, 2))  # no segments found
        elif strategy == 'concat':  # concatenate all segments
            points = np.concatenate([contour.reshape(-1, 2) for contour in contours])
        else:  # 'largest': keep the contour with the most points (the first one on ties)
            points = np.array(max(contours, key=len)).reshape(-1, 2)
        segments.append(points.astype('float32'))
    return segments


def clean_str(s):
    """
    Replace every special character in a string with an underscore.

    Args:
        s (str): a string needing special characters replaced

    Returns:
        (str): a copy of ``s`` in which each character from the set in the pattern below is replaced by ``_``
    """
    return re.sub(pattern='[|@#!¡·$€%&()=?¿^*;:,¨´><+]', repl='_', string=s)
_torch_save = torch.save  # copy to avoid recursion errors


def torch_save(*args, **kwargs):
    """
    Call the saved ``torch.save`` with dill as the pickle module when dill is installed.

    dill is preferred because it can serialize lambda functions, which pickle cannot. When dill cannot be
    imported the standard pickle module is used. A ``pickle_module`` given by the caller is always respected.

    Args:
        *args: positional arguments forwarded unchanged to ``torch.save``.
        **kwargs: keyword arguments forwarded to ``torch.save``; ``pickle_module`` is filled in if absent.

    Returns:
        Whatever the underlying ``torch.save`` call returns.
    """
    try:
        import dill as serializer
    except ImportError:
        import pickle as serializer

    kwargs.setdefault('pickle_module', serializer)
    return _torch_save(*args, **kwargs)
class Colors:
    """Ultralytics default color palette https://ultralytics.com/.

    Holds a fixed list of RGB colors built from hex codes and hands them out by index, wrapping around when the
    index exceeds the palette size.

    Attributes:
        palette (list of tuple): List of RGB color values.
        n (int): The number of colors in the palette.
        pose_palette (np.array): A specific color palette array with dtype np.uint8.
    """

    def __init__(self):
        """Build the RGB palette from the hard-coded hex codes and create the pose palette array."""
        hex_codes = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334',
                     '00D4BB', '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
                     'FF95C8', 'FF37C7')
        self.palette = [self.hex2rgb('#' + code) for code in hex_codes]
        self.n = len(self.palette)
        self.pose_palette = np.array(
            [[255, 128, 0], [255, 153, 51], [255, 178, 102], [230, 230, 0], [255, 153, 255], [153, 204, 255],
             [255, 102, 255], [255, 51, 255], [102, 178, 255], [51, 153, 255], [255, 153, 153], [255, 102, 102],
             [255, 51, 51], [153, 255, 153], [102, 255, 102], [51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0],
             [255, 255, 255]],
            dtype=np.uint8)

    def __call__(self, i, bgr=False):
        """Return palette color number ``i`` (modulo the palette size) as an RGB tuple, or BGR if ``bgr``."""
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return rgb[::-1]
        return rgb

    @staticmethod
    def hex2rgb(h):
        """Converts a '#RRGGBB' hex color code to an (R, G, B) tuple of ints (i.e. default PIL order)."""
        return tuple(int(h[start:start + 2], 16) for start in (1, 3, 5))


colors = Colors()  # create instance for 'from utils.plots import colors'
    def __init__(self, im, line_width=None, font_size=None, font='Arial.ttf', pil=False, example='abc'):
        """Initialize the Annotator class with image and line width along with color palette for keypoints and limbs.

        Args:
            im: The image to annotate. It is wrapped in a PIL image in PIL mode and stored as is in cv2 mode.
            line_width (int, optional): Line width for drawing. When falsy it is derived from the image shape.
            font_size (int, optional): Font size used in PIL mode. When falsy it is derived from the image size.
            font (str): Font name passed to `check_font` in PIL mode (replaced by 'Arial.Unicode.ttf' when
                `example` is not ASCII).
            pil (bool): Request PIL drawing. PIL is also used whenever `example` is not ASCII.
            example (str): Sample label text, only inspected with `is_ascii` to choose the drawing backend.
        """
        # NOTE(review): this assert and the line-width default below read numpy-style attributes (`im.data`,
        # `im.shape`) from the argument, although the PIL branch also accepts an `Image.Image` — confirm PIL
        # images are never passed in directly.
        assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to Annotator() input images.'
        non_ascii = not is_ascii(example)  # non-latin labels, i.e. asian, arabic, cyrillic
        self.pil = pil or non_ascii
        if self.pil:  # use PIL
            self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
            self.draw = ImageDraw.Draw(self.im)
            try:
                font = check_font('Arial.Unicode.ttf' if non_ascii else font)
                size = font_size or max(round(sum(self.im.size) / 2 * 0.035), 12)
                self.font = ImageFont.truetype(str(font), size)
            except Exception:
                # Any failure while locating or loading the font falls back to PIL's built-in default font
                self.font = ImageFont.load_default()
            # Deprecation fix for w, h = getsize(string) -> _, _, w, h = getbox(string)
            if check_version(pil_version, '9.2.0'):
                self.font.getsize = lambda x: self.font.getbbox(x)[2:4]  # text width, height
        else:  # use cv2
            self.im = im
        # Set in both modes: the PIL drawing paths use self.lw as well
        self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2)  # line width
        # Pose
        # Pairs of 1-based keypoint numbers joined by a limb (kpts() subtracts 1 before indexing)
        self.skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], [7, 9],
                         [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]

        # One pose_palette row per skeleton entry (19) and per keypoint (17)
        self.limb_color = colors.pose_palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
        self.kpt_color = colors.pose_palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]
torch.Tensor): + box = box.tolist() + if self.pil or not is_ascii(label): + self.draw.rectangle(box, width=self.lw, outline=color) # box + if label: + w, h = self.font.getsize(label) # text width, height + outside = box[1] - h >= 0 # label fits outside box + self.draw.rectangle( + (box[0], box[1] - h if outside else box[1], box[0] + w + 1, + box[1] + 1 if outside else box[1] + h + 1), + fill=color, + ) + # self.draw.text((box[0], box[1]), label, fill=txt_color, font=self.font, anchor='ls') # for PIL>8.0 + self.draw.text((box[0], box[1] - h if outside else box[1]), label, fill=txt_color, font=self.font) + else: # cv2 + p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3])) + cv2.rectangle(self.im, p1, p2, color, thickness=self.lw, lineType=cv2.LINE_AA) + if label: + tf = max(self.lw - 1, 1) # font thickness + w, h = cv2.getTextSize(label, 0, fontScale=self.lw / 3, thickness=tf)[0] # text width, height + outside = p1[1] - h >= 3 + p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3 + cv2.rectangle(self.im, p1, p2, color, -1, cv2.LINE_AA) # filled + cv2.putText(self.im, + label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), + 0, + self.lw / 3, + txt_color, + thickness=tf, + lineType=cv2.LINE_AA) + + def masks(self, masks, colors, im_gpu, alpha=0.5, retina_masks=False): + """Plot masks at once. 
+ + Args: + masks (tensor): predicted masks on cuda, shape: [n, h, w] + colors (List[List[Int]]): colors for predicted masks, [[r, g, b] * n] + im_gpu (tensor): img is in cuda, shape: [3, h, w], range: [0, 1] + alpha (float): mask transparency: 0.0 fully transparent, 1.0 opaque + """ + if self.pil: + # Convert to numpy first + self.im = np.asarray(self.im).copy() + if len(masks) == 0: + self.im[:] = im_gpu.permute(1, 2, 0).contiguous().cpu().numpy() * 255 + if im_gpu.device != masks.device: + im_gpu = im_gpu.to(masks.device) + colors = torch.tensor(colors, device=masks.device, dtype=torch.float32) / 255.0 # shape(n,3) + colors = colors[:, None, None] # shape(n,1,1,3) + masks = masks.unsqueeze(3) # shape(n,h,w,1) + masks_color = masks * (colors * alpha) # shape(n,h,w,3) + + inv_alph_masks = (1 - masks * alpha).cumprod(0) # shape(n,h,w,1) + mcs = masks_color.max(dim=0).values # shape(n,h,w,3) + + im_gpu = im_gpu.flip(dims=[0]) # flip channel + im_gpu = im_gpu.permute(1, 2, 0).contiguous() # shape(h,w,3) + im_gpu = im_gpu * inv_alph_masks[-1] + mcs + im_mask = (im_gpu * 255) + im_mask_np = im_mask.byte().cpu().numpy() + self.im[:] = im_mask_np if retina_masks else scale_image(im_mask_np, self.im.shape) + if self.pil: + # Convert im back to PIL and update draw + self.fromarray(self.im) + + def kpts(self, kpts, shape=(640, 640), radius=5, kpt_line=True): + """Plot keypoints on the image. + + Args: + kpts (tensor): Predicted keypoints with shape [17, 3]. Each keypoint has (x, y, confidence). + shape (tuple): Image shape as a tuple (h, w), where h is the height and w is the width. + radius (int, optional): Radius of the drawn keypoints. Default is 5. + kpt_line (bool, optional): If True, the function will draw lines connecting keypoints + for human pose. Default is True. + + Note: `kpt_line=True` currently only supports human pose plotting. 
+ """ + if self.pil: + # Convert to numpy first + self.im = np.asarray(self.im).copy() + nkpt, ndim = kpts.shape + is_pose = nkpt == 17 and ndim == 3 + kpt_line &= is_pose # `kpt_line=True` for now only supports human pose plotting + for i, k in enumerate(kpts): + color_k = [int(x) for x in self.kpt_color[i]] if is_pose else colors(i) + x_coord, y_coord = k[0], k[1] + if x_coord % shape[1] != 0 and y_coord % shape[0] != 0: + if len(k) == 3: + conf = k[2] + if conf < 0.5: + continue + cv2.circle(self.im, (int(x_coord), int(y_coord)), radius, color_k, -1, lineType=cv2.LINE_AA) + + if kpt_line: + ndim = kpts.shape[-1] + for i, sk in enumerate(self.skeleton): + pos1 = (int(kpts[(sk[0] - 1), 0]), int(kpts[(sk[0] - 1), 1])) + pos2 = (int(kpts[(sk[1] - 1), 0]), int(kpts[(sk[1] - 1), 1])) + if ndim == 3: + conf1 = kpts[(sk[0] - 1), 2] + conf2 = kpts[(sk[1] - 1), 2] + if conf1 < 0.5 or conf2 < 0.5: + continue + if pos1[0] % shape[1] == 0 or pos1[1] % shape[0] == 0 or pos1[0] < 0 or pos1[1] < 0: + continue + if pos2[0] % shape[1] == 0 or pos2[1] % shape[0] == 0 or pos2[0] < 0 or pos2[1] < 0: + continue + cv2.line(self.im, pos1, pos2, [int(x) for x in self.limb_color[i]], thickness=2, lineType=cv2.LINE_AA) + if self.pil: + # Convert im back to PIL and update draw + self.fromarray(self.im) + + def rectangle(self, xy, fill=None, outline=None, width=1): + """Add rectangle to image (PIL-only).""" + self.draw.rectangle(xy, fill, outline, width) + + def text(self, xy, text, txt_color=(255, 255, 255), anchor='top', box_style=False): + """Adds text to an image using PIL or cv2.""" + if anchor == 'bottom': # start y from font bottom + w, h = self.font.getsize(text) # text width, height + xy[1] += 1 - h + if self.pil: + if box_style: + w, h = self.font.getsize(text) + self.draw.rectangle((xy[0], xy[1], xy[0] + w + 1, xy[1] + h + 1), fill=txt_color) + # Using `txt_color` for background and draw fg with white color + txt_color = (255, 255, 255) + if '\n' in text: + lines = 
text.split('\n') + _, h = self.font.getsize(text) + for line in lines: + self.draw.text(xy, line, fill=txt_color, font=self.font) + xy[1] += h + else: + self.draw.text(xy, text, fill=txt_color, font=self.font) + else: + if box_style: + tf = max(self.lw - 1, 1) # font thickness + w, h = cv2.getTextSize(text, 0, fontScale=self.lw / 3, thickness=tf)[0] # text width, height + outside = xy[1] - h >= 3 + p2 = xy[0] + w, xy[1] - h - 3 if outside else xy[1] + h + 3 + cv2.rectangle(self.im, xy, p2, txt_color, -1, cv2.LINE_AA) # filled + # Using `txt_color` for background and draw fg with white color + txt_color = (255, 255, 255) + tf = max(self.lw - 1, 1) # font thickness + cv2.putText(self.im, text, xy, 0, self.lw / 3, txt_color, thickness=tf, lineType=cv2.LINE_AA) + + def fromarray(self, im): + """Update self.im from a numpy array.""" + self.im = im if isinstance(im, Image.Image) else Image.fromarray(im) + self.draw = ImageDraw.Draw(self.im) + + def result(self): + """Return annotated image as array.""" + return np.asarray(self.im) + + +@TryExcept() # known issue https://github.com/ultralytics/yolov5/issues/5395 +@plt_settings() +def plot_labels(boxes, cls, names=(), save_dir=Path(''), on_plot=None): + """Save and plot image with no axis or spines.""" + import pandas as pd + import seaborn as sn + + # Filter matplotlib>=3.7.2 warning + warnings.filterwarnings('ignore', category=UserWarning, message='The figure layout has changed to tight') + + # Plot dataset labels + LOGGER.info(f"Plotting labels to {save_dir / 'labels.jpg'}... 
def save_one_box(xyxy, im, file=Path('im.jpg'), gain=1.02, pad=10, square=False, BGR=False, save=True):
    """Save image crop as {file} with crop size multiple {gain} and {pad} pixels. Save and/or return crop.

    This function takes a bounding box and an image, and then saves a cropped portion of the image according
    to the bounding box. Optionally, the crop can be squared, and the function allows for gain and padding
    adjustments to the bounding box.

    Args:
        xyxy (torch.Tensor or list): A tensor, a list of tensors, or a list of plain numbers representing the
            bounding box in xyxy format.
        im (numpy.ndarray): The input image.
        file (Path or str, optional): The path where the cropped image will be saved. Defaults to 'im.jpg'.
        gain (float, optional): A multiplicative factor to increase the size of the bounding box. Defaults to 1.02.
        pad (int, optional): The number of pixels to add to the width and height of the bounding box. Defaults to 10.
        square (bool, optional): If True, the bounding box will be transformed into a square. Defaults to False.
        BGR (bool, optional): If True, the returned crop keeps the channel order of `im`; otherwise the channel
            axis is reversed. Defaults to False.
        save (bool, optional): If True, the cropped image will be saved to disk. Defaults to True.

    Returns:
        (numpy.ndarray): The cropped image.

    Example:
        ```python
        from ultralytics.utils.plotting import save_one_box

        xyxy = [50, 50, 150, 150]
        im = cv2.imread('image.jpg')
        cropped_im = save_one_box(xyxy, im, file='cropped.jpg', square=True)
        ```
    """

    if not isinstance(xyxy, torch.Tensor):  # may be list
        if all(isinstance(v, torch.Tensor) for v in xyxy):
            xyxy = torch.stack(xyxy)
        else:
            # Plain numbers (as in the example above) cannot be stacked; build a float tensor so that the
            # gain/pad arithmetic below is not truncated to integers.
            xyxy = torch.tensor(xyxy, dtype=torch.float32)
    b = xyxy2xywh(xyxy.view(-1, 4))  # boxes
    if square:
        b[:, 2:] = b[:, 2:].max(1)[0].unsqueeze(1)  # attempt rectangle to square
    b[:, 2:] = b[:, 2:] * gain + pad  # box wh * gain + pad
    xyxy = xywh2xyxy(b).long()
    clip_boxes(xyxy, im.shape)
    crop = im[int(xyxy[0, 1]):int(xyxy[0, 3]), int(xyxy[0, 0]):int(xyxy[0, 2]), ::(1 if BGR else -1)]
    if save:
        file = Path(file)  # accept plain string paths as well as Path objects
        file.parent.mkdir(parents=True, exist_ok=True)  # make directory
        f = str(increment_path(file).with_suffix('.jpg'))
        # cv2.imwrite(f, crop)  # save BGR, https://github.com/ultralytics/yolov5/issues/7007 chroma subsampling issue
        # NOTE(review): the crop's channel axis is reversed again here regardless of `BGR`, so the saved file
        # only has RGB order when BGR=True — confirm against callers.
        Image.fromarray(crop[..., ::-1]).save(f, quality=95, subsampling=0)  # save RGB
    return crop
bboxes=np.zeros(0, dtype=np.float32), + masks=np.zeros(0, dtype=np.uint8), + kpts=np.zeros((0, 51), dtype=np.float32), + paths=None, + fname='images.jpg', + names=None, + on_plot=None): + """Plot image grid with labels.""" + if isinstance(images, torch.Tensor): + images = images.cpu().float().numpy() + if isinstance(cls, torch.Tensor): + cls = cls.cpu().numpy() + if isinstance(bboxes, torch.Tensor): + bboxes = bboxes.cpu().numpy() + if isinstance(masks, torch.Tensor): + masks = masks.cpu().numpy().astype(int) + if isinstance(kpts, torch.Tensor): + kpts = kpts.cpu().numpy() + if isinstance(batch_idx, torch.Tensor): + batch_idx = batch_idx.cpu().numpy() + + max_size = 1920 # max image size + max_subplots = 16 # max image subplots, i.e. 4x4 + bs, _, h, w = images.shape # batch size, _, height, width + bs = min(bs, max_subplots) # limit plot images + ns = np.ceil(bs ** 0.5) # number of subplots (square) + if np.max(images[0]) <= 1: + images *= 255 # de-normalise (optional) + + # Build Image + mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype=np.uint8) # init + for i, im in enumerate(images): + if i == max_subplots: # if last batch has fewer images than we expect + break + x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin + im = im.transpose(1, 2, 0) + mosaic[y:y + h, x:x + w, :] = im + + # Resize (optional) + scale = max_size / ns / max(h, w) + if scale < 1: + h = math.ceil(scale * h) + w = math.ceil(scale * w) + mosaic = cv2.resize(mosaic, tuple(int(x * ns) for x in (w, h))) + + # Annotate + fs = int((h + w) * ns * 0.01) # font size + annotator = Annotator(mosaic, line_width=round(fs / 10), font_size=fs, pil=True, example=names) + for i in range(i + 1): + x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin + annotator.rectangle([x, y, x + w, y + h], None, (255, 255, 255), width=2) # borders + if paths: + annotator.text((x + 5, y + 5), text=Path(paths[i]).name[:40], txt_color=(220, 220, 220)) # filenames + if len(cls) > 0: + idx = batch_idx 
== i + classes = cls[idx].astype('int') + + if len(bboxes): + boxes = xywh2xyxy(bboxes[idx, :4]).T + labels = bboxes.shape[1] == 4 # labels if no conf column + conf = None if labels else bboxes[idx, 4] # check for confidence presence (label vs pred) + + if boxes.shape[1]: + if boxes.max() <= 1.01: # if normalized with tolerance 0.01 + boxes[[0, 2]] *= w # scale to pixels + boxes[[1, 3]] *= h + elif scale < 1: # absolute coords need scale if image scales + boxes *= scale + boxes[[0, 2]] += x + boxes[[1, 3]] += y + for j, box in enumerate(boxes.T.tolist()): + c = classes[j] + color = colors(c) + c = names.get(c, c) if names else c + if labels or conf[j] > 0.25: # 0.25 conf thresh + label = f'{c}' if labels else f'{c} {conf[j]:.1f}' + annotator.box_label(box, label, color=color) + elif len(classes): + for c in classes: + color = colors(c) + c = names.get(c, c) if names else c + annotator.text((x, y), f'{c}', txt_color=color, box_style=True) + + # Plot keypoints + if len(kpts): + kpts_ = kpts[idx].copy() + if len(kpts_): + if kpts_[..., 0].max() <= 1.01 or kpts_[..., 1].max() <= 1.01: # if normalized with tolerance .01 + kpts_[..., 0] *= w # scale to pixels + kpts_[..., 1] *= h + elif scale < 1: # absolute coords need scale if image scales + kpts_ *= scale + kpts_[..., 0] += x + kpts_[..., 1] += y + for j in range(len(kpts_)): + if labels or conf[j] > 0.25: # 0.25 conf thresh + annotator.kpts(kpts_[j]) + + # Plot masks + if len(masks): + if idx.shape[0] == masks.shape[0]: # overlap_masks=False + image_masks = masks[idx] + else: # overlap_masks=True + image_masks = masks[[i]] # (1, 640, 640) + nl = idx.sum() + index = np.arange(nl).reshape((nl, 1, 1)) + 1 + image_masks = np.repeat(image_masks, nl, axis=0) + image_masks = np.where(image_masks == index, 1.0, 0.0) + + im = np.asarray(annotator.im).copy() + for j, box in enumerate(boxes.T.tolist()): + if labels or conf[j] > 0.25: # 0.25 conf thresh + color = colors(classes[j]) + mh, mw = image_masks[j].shape + if mh != h or 
mw != w: + mask = image_masks[j].astype(np.uint8) + mask = cv2.resize(mask, (w, h)) + mask = mask.astype(bool) + else: + mask = image_masks[j].astype(bool) + with contextlib.suppress(Exception): + im[y:y + h, x:x + w, :][mask] = im[y:y + h, x:x + w, :][mask] * 0.4 + np.array(color) * 0.6 + annotator.fromarray(im) + annotator.im.save(fname) # save + if on_plot: + on_plot(fname) + + +@plt_settings() +def plot_results(file='path/to/results.csv', dir='', segment=False, pose=False, classify=False, on_plot=None): + """Plot training results.csv. Usage: from utils.plots import *; plot_results('path/to/results.csv').""" + import pandas as pd + save_dir = Path(file).parent if file else Path(dir) + if classify: + fig, ax = plt.subplots(2, 2, figsize=(6, 6), tight_layout=True) + index = [1, 4, 2, 3] + elif segment: + fig, ax = plt.subplots(2, 8, figsize=(18, 6), tight_layout=True) + index = [1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 16, 7, 8, 11, 12] + elif pose: + fig, ax = plt.subplots(2, 9, figsize=(21, 6), tight_layout=True) + index = [1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, 16, 17, 18, 8, 9, 12, 13] + else: + fig, ax = plt.subplots(2, 5, figsize=(12, 6), tight_layout=True) + index = [1, 2, 3, 4, 5, 8, 9, 10, 6, 7] + ax = ax.ravel() + files = list(save_dir.glob('results*.csv')) + assert len(files), f'No results.csv files found in {save_dir.resolve()}, nothing to plot.' 
def output_to_target(output, max_det=300):
    """
    Convert model output to target format [batch_id, class_id, x, y, w, h, conf] for plotting.

    Args:
        output: an iterable with one 2-D tensor per image; the first six columns of each row are split into
            4 box values, 1 confidence and 1 class value.
        max_det (int): only the first `max_det` rows of each image are kept. Defaults to 300.

    Returns:
        A tuple of three numpy arrays: the batch_id column, the class_id column, and the remaining
        (x, y, w, h, conf) columns.
    """
    rows = []
    for image_index, detections in enumerate(output):
        kept = detections[:max_det, :6].cpu()
        box, conf, cls = kept.split((4, 1, 1), 1)
        batch_col = torch.full((conf.shape[0], 1), image_index)  # image index repeated for every detection
        rows.append(torch.cat((batch_col, cls, xyxy2xywh(box), conf), 1))
    stacked = torch.cat(rows, 0).numpy()
    return stacked[:, 0], stacked[:, 1], stacked[:, 2:]
def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9):
    """Mark the anchor centers that lie strictly inside each gt box.

    Args:
        xy_centers (Tensor): shape(h*w, 2)
        gt_bboxes (Tensor): shape(b, n_boxes, 4)
        eps (float): minimum distance from every box edge for a center to count as inside
    Return:
        (Tensor): shape(b, n_boxes, h*w), 1 where the center is inside the box and 0 elsewhere
    """
    num_anchors = xy_centers.shape[0]
    batch, num_boxes, _ = gt_bboxes.shape
    top_left, bottom_right = gt_bboxes.view(-1, 1, 4).chunk(2, 2)  # left-top, right-bottom
    centers = xy_centers[None]
    deltas = torch.cat((centers - top_left, bottom_right - centers), dim=2)  # distances to the four edges
    deltas = deltas.view(batch, num_boxes, num_anchors, -1)
    smallest = deltas.amin(3)
    return smallest.gt_(eps)


def select_highest_overlaps(mask_pos, overlaps, n_max_boxes):
    """If an anchor box is assigned to multiple gts, the one with the highest iou will be selected.

    Args:
        mask_pos (Tensor): shape(b, n_max_boxes, h*w)
        overlaps (Tensor): shape(b, n_max_boxes, h*w)
        n_max_boxes (int): size of the gt dimension, used to expand the multi-assignment mask
    Return:
        target_gt_idx (Tensor): shape(b, h*w)
        fg_mask (Tensor): shape(b, h*w)
        mask_pos (Tensor): shape(b, n_max_boxes, h*w)
    """
    # (b, n_max_boxes, h*w) -> (b, h*w)
    fg_mask = mask_pos.sum(-2)
    has_conflict = fg_mask.max() > 1  # one anchor is assigned to multiple gt_bboxes
    if has_conflict:
        contested = (fg_mask.unsqueeze(1) > 1).expand(-1, n_max_boxes, -1)  # (b, n_max_boxes, h*w)
        best_gt = overlaps.argmax(1)  # (b, h*w)

        best_onehot = torch.zeros(mask_pos.shape, dtype=mask_pos.dtype, device=mask_pos.device)
        best_onehot.scatter_(1, best_gt.unsqueeze(1), 1)

        # Contested anchors keep only their best-overlap gt; all other anchors keep their assignment
        mask_pos = torch.where(contested, best_onehot, mask_pos).float()  # (b, n_max_boxes, h*w)
        fg_mask = mask_pos.sum(-2)
    # Find each grid serve which gt(index)
    target_gt_idx = mask_pos.argmax(-2)  # (b, h*w)
    return target_gt_idx, fg_mask, mask_pos
+ """ + + def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0, eps=1e-9): + """Initialize a TaskAlignedAssigner object with customizable hyperparameters.""" + super().__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + self.alpha = alpha + self.beta = beta + self.eps = eps + + @torch.no_grad() + def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt): + """ + Compute the task-aligned assignment. + Reference https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py + + Args: + pd_scores (Tensor): shape(bs, num_total_anchors, num_classes) + pd_bboxes (Tensor): shape(bs, num_total_anchors, 4) + anc_points (Tensor): shape(num_total_anchors, 2) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + target_gt_idx (Tensor): shape(bs, num_total_anchors) + """ + self.bs = pd_scores.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return (torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), torch.zeros_like(pd_bboxes).to(device), + torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device), + torch.zeros_like(pd_scores[..., 0]).to(device)) + + mask_pos, align_metric, overlaps = self.get_pos_mask(pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, + mask_gt) + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes) + + # Assigned target + target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask) + + # Normalize + align_metric *= mask_pos + 
pos_align_metrics = align_metric.amax(axis=-1, keepdim=True) # b, max_num_obj + pos_overlaps = (overlaps * mask_pos).amax(axis=-1, keepdim=True) # b, max_num_obj + norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1) + target_scores = target_scores * norm_align_metric + + return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx + + def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt): + """Get in_gts mask, (b, max_num_obj, h*w).""" + mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes) + # Get anchor_align metric, (b, max_num_obj, h*w) + align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts * mask_gt) + # Get topk_metric mask, (b, max_num_obj, h*w) + mask_topk = self.select_topk_candidates(align_metric, topk_mask=mask_gt.expand(-1, -1, self.topk).bool()) + # Merge all mask to a final mask, (b, max_num_obj, h*w) + mask_pos = mask_topk * mask_in_gts * mask_gt + + return mask_pos, align_metric, overlaps + + def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_gt): + """Compute alignment metric given predicted and ground truth bounding boxes.""" + na = pd_bboxes.shape[-2] + mask_gt = mask_gt.bool() # b, max_num_obj, h*w + overlaps = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device) + bbox_scores = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device) + + ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) # 2, b, max_num_obj + ind[0] = torch.arange(end=self.bs).view(-1, 1).expand(-1, self.n_max_boxes) # b, max_num_obj + ind[1] = gt_labels.squeeze(-1) # b, max_num_obj + # Get the scores of each grid for each gt cls + bbox_scores[mask_gt] = pd_scores[ind[0], :, ind[1]][mask_gt] # b, max_num_obj, h*w + + # (b, max_num_obj, 1, 4), (b, 1, h*w, 4) + pd_boxes = 
pd_bboxes.unsqueeze(1).expand(-1, self.n_max_boxes, -1, -1)[mask_gt] + gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[mask_gt] + overlaps[mask_gt] = bbox_iou(gt_boxes, pd_boxes, xywh=False, CIoU=True).squeeze(-1).clamp_(0) + + align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) + return align_metric, overlaps + + def select_topk_candidates(self, metrics, largest=True, topk_mask=None): + """ + Select the top-k candidates based on the given metrics. + + Args: + metrics (Tensor): A tensor of shape (b, max_num_obj, h*w), where b is the batch size, + max_num_obj is the maximum number of objects, and h*w represents the + total number of anchor points. + largest (bool): If True, select the largest values; otherwise, select the smallest values. + topk_mask (Tensor): An optional boolean tensor of shape (b, max_num_obj, topk), where + topk is the number of top candidates to consider. If not provided, + the top-k values are automatically computed based on the given metrics. + + Returns: + (Tensor): A tensor of shape (b, max_num_obj, h*w) containing the selected top-k candidates. 
+ """ + + # (b, max_num_obj, topk) + topk_metrics, topk_idxs = torch.topk(metrics, self.topk, dim=-1, largest=largest) + if topk_mask is None: + topk_mask = (topk_metrics.max(-1, keepdim=True)[0] > self.eps).expand_as(topk_idxs) + # (b, max_num_obj, topk) + topk_idxs.masked_fill_(~topk_mask, 0) + + # (b, max_num_obj, topk, h*w) -> (b, max_num_obj, h*w) + count_tensor = torch.zeros(metrics.shape, dtype=torch.int8, device=topk_idxs.device) + ones = torch.ones_like(topk_idxs[:, :, :1], dtype=torch.int8, device=topk_idxs.device) + for k in range(self.topk): + # Expand topk_idxs for each value of k and add 1 at the specified positions + count_tensor.scatter_add_(-1, topk_idxs[:, :, k:k + 1], ones) + # count_tensor.scatter_add_(-1, topk_idxs, torch.ones_like(topk_idxs, dtype=torch.int8, device=topk_idxs.device)) + # filter invalid bboxes + count_tensor.masked_fill_(count_tensor > 1, 0) + + return count_tensor.to(metrics.dtype) + + def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, fg_mask): + """ + Compute target labels, target bounding boxes, and target scores for the positive anchor points. + + Args: + gt_labels (Tensor): Ground truth labels of shape (b, max_num_obj, 1), where b is the + batch size and max_num_obj is the maximum number of objects. + gt_bboxes (Tensor): Ground truth bounding boxes of shape (b, max_num_obj, 4). + target_gt_idx (Tensor): Indices of the assigned ground truth objects for positive + anchor points, with shape (b, h*w), where h*w is the total + number of anchor points. + fg_mask (Tensor): A boolean tensor of shape (b, h*w) indicating the positive + (foreground) anchor points. + + Returns: + (Tuple[Tensor, Tensor, Tensor]): A tuple containing the following tensors: + - target_labels (Tensor): Shape (b, h*w), containing the target labels for + positive anchor points. + - target_bboxes (Tensor): Shape (b, h*w, 4), containing the target bounding boxes + for positive anchor points. 
+ - target_scores (Tensor): Shape (b, h*w, num_classes), containing the target scores + for positive anchor points, where num_classes is the number + of object classes. + """ + + # Assigned target labels, (b, 1) + batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[..., None] + target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes # (b, h*w) + target_labels = gt_labels.long().flatten()[target_gt_idx] # (b, h*w) + + # Assigned target boxes, (b, max_num_obj, 4) -> (b, h*w) + target_bboxes = gt_bboxes.view(-1, 4)[target_gt_idx] + + # Assigned target scores + target_labels.clamp_(0) + + # 10x faster than F.one_hot() + target_scores = torch.zeros((target_labels.shape[0], target_labels.shape[1], self.num_classes), + dtype=torch.int64, + device=target_labels.device) # (b, h*w, 80) + target_scores.scatter_(2, target_labels.unsqueeze(-1), 1) + + fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) # (b, h*w, 80) + target_scores = torch.where(fg_scores_mask > 0, target_scores, 0) + + return target_labels, target_bboxes, target_scores + + +def make_anchors(feats, strides, grid_cell_offset=0.5): + """Generate anchors from features.""" + anchor_points, stride_tensor = [], [] + assert feats is not None + dtype, device = feats[0].dtype, feats[0].device + for i, stride in enumerate(strides): + _, _, h, w = feats[i].shape + sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset # shift x + sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y + sy, sx = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx) + anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2)) + stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device)) + return torch.cat(anchor_points), torch.cat(stride_tensor) + + +def dist2bbox(distance, anchor_points, xywh=True, dim=-1): + """Transform distance(ltrb) to box(xywh or xyxy).""" + lt, rb = distance.chunk(2, dim) + 
x1y1 = anchor_points - lt + x2y2 = anchor_points + rb + if xywh: + c_xy = (x1y1 + x2y2) / 2 + wh = x2y2 - x1y1 + return torch.cat((c_xy, wh), dim) # xywh bbox + return torch.cat((x1y1, x2y2), dim) # xyxy bbox + + +def bbox2dist(anchor_points, bbox, reg_max): + """Transform bbox(xyxy) to dist(ltrb).""" + x1y1, x2y2 = bbox.chunk(2, -1) + return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp_(0, reg_max - 0.01) # dist (lt, rb) diff --git a/ultralytics/utils/tal.py b/ultralytics/utils/tal.py new file mode 100644 index 0000000000000000000000000000000000000000..a455dbf705c051df1290c0a945d8f9fd0bbc7dd9 --- /dev/null +++ b/ultralytics/utils/tal.py @@ -0,0 +1,276 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import torch +import torch.nn as nn + +from .checks import check_version +from .metrics import bbox_iou + +TORCH_1_10 = check_version(torch.__version__, '1.10.0') + + +def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9): + """select the positive anchor center in gt + + Args: + xy_centers (Tensor): shape(h*w, 4) + gt_bboxes (Tensor): shape(b, n_boxes, 4) + Return: + (Tensor): shape(b, n_boxes, h*w) + """ + n_anchors = xy_centers.shape[0] + bs, n_boxes, _ = gt_bboxes.shape + lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2) # left-top, right-bottom + bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1) + # return (bbox_deltas.min(3)[0] > eps).to(gt_bboxes.dtype) + return bbox_deltas.amin(3).gt_(eps) + + +def select_highest_overlaps(mask_pos, overlaps, n_max_boxes): + """if an anchor box is assigned to multiple gts, + the one with the highest iou will be selected. 
+ + Args: + mask_pos (Tensor): shape(b, n_max_boxes, h*w) + overlaps (Tensor): shape(b, n_max_boxes, h*w) + Return: + target_gt_idx (Tensor): shape(b, h*w) + fg_mask (Tensor): shape(b, h*w) + mask_pos (Tensor): shape(b, n_max_boxes, h*w) + """ + # (b, n_max_boxes, h*w) -> (b, h*w) + fg_mask = mask_pos.sum(-2) + if fg_mask.max() > 1: # one anchor is assigned to multiple gt_bboxes + mask_multi_gts = (fg_mask.unsqueeze(1) > 1).expand(-1, n_max_boxes, -1) # (b, n_max_boxes, h*w) + max_overlaps_idx = overlaps.argmax(1) # (b, h*w) + + is_max_overlaps = torch.zeros(mask_pos.shape, dtype=mask_pos.dtype, device=mask_pos.device) + is_max_overlaps.scatter_(1, max_overlaps_idx.unsqueeze(1), 1) + + mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos).float() # (b, n_max_boxes, h*w) + fg_mask = mask_pos.sum(-2) + # Find each grid serve which gt(index) + target_gt_idx = mask_pos.argmax(-2) # (b, h*w) + return target_gt_idx, fg_mask, mask_pos + + +class TaskAlignedAssigner(nn.Module): + """ + A task-aligned assigner for object detection. + + This class assigns ground-truth (gt) objects to anchors based on the task-aligned metric, + which combines both classification and localization information. + + Attributes: + topk (int): The number of top candidates to consider. + num_classes (int): The number of object classes. + alpha (float): The alpha parameter for the classification component of the task-aligned metric. + beta (float): The beta parameter for the localization component of the task-aligned metric. + eps (float): A small value to prevent division by zero. 
+ """ + + def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0, eps=1e-9): + """Initialize a TaskAlignedAssigner object with customizable hyperparameters.""" + super().__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + self.alpha = alpha + self.beta = beta + self.eps = eps + + @torch.no_grad() + def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt): + """ + Compute the task-aligned assignment. + Reference https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py + + Args: + pd_scores (Tensor): shape(bs, num_total_anchors, num_classes) + pd_bboxes (Tensor): shape(bs, num_total_anchors, 4) + anc_points (Tensor): shape(num_total_anchors, 2) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + target_gt_idx (Tensor): shape(bs, num_total_anchors) + """ + self.bs = pd_scores.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return (torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), torch.zeros_like(pd_bboxes).to(device), + torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device), + torch.zeros_like(pd_scores[..., 0]).to(device)) + + mask_pos, align_metric, overlaps = self.get_pos_mask(pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, + mask_gt) + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes) + + # Assigned target + target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask) + + # Normalize + align_metric *= mask_pos + 
pos_align_metrics = align_metric.amax(axis=-1, keepdim=True) # b, max_num_obj + pos_overlaps = (overlaps * mask_pos).amax(axis=-1, keepdim=True) # b, max_num_obj + norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1) + target_scores = target_scores * norm_align_metric + + return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx + + def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt): + """Get in_gts mask, (b, max_num_obj, h*w).""" + mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes) + # Get anchor_align metric, (b, max_num_obj, h*w) + align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts * mask_gt) + # Get topk_metric mask, (b, max_num_obj, h*w) + mask_topk = self.select_topk_candidates(align_metric, topk_mask=mask_gt.expand(-1, -1, self.topk).bool()) + # Merge all mask to a final mask, (b, max_num_obj, h*w) + mask_pos = mask_topk * mask_in_gts * mask_gt + + return mask_pos, align_metric, overlaps + + def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_gt): + """Compute alignment metric given predicted and ground truth bounding boxes.""" + na = pd_bboxes.shape[-2] + mask_gt = mask_gt.bool() # b, max_num_obj, h*w + overlaps = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device) + bbox_scores = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device) + + ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) # 2, b, max_num_obj + ind[0] = torch.arange(end=self.bs).view(-1, 1).expand(-1, self.n_max_boxes) # b, max_num_obj + ind[1] = gt_labels.squeeze(-1) # b, max_num_obj + # Get the scores of each grid for each gt cls + bbox_scores[mask_gt] = pd_scores[ind[0], :, ind[1]][mask_gt] # b, max_num_obj, h*w + + # (b, max_num_obj, 1, 4), (b, 1, h*w, 4) + pd_boxes = 
pd_bboxes.unsqueeze(1).expand(-1, self.n_max_boxes, -1, -1)[mask_gt] + gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[mask_gt] + overlaps[mask_gt] = bbox_iou(gt_boxes, pd_boxes, xywh=False, type_='WIoU')[1].squeeze(-1).clamp_(0) + + align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) + return align_metric, overlaps + + def select_topk_candidates(self, metrics, largest=True, topk_mask=None): + """ + Select the top-k candidates based on the given metrics. + + Args: + metrics (Tensor): A tensor of shape (b, max_num_obj, h*w), where b is the batch size, + max_num_obj is the maximum number of objects, and h*w represents the + total number of anchor points. + largest (bool): If True, select the largest values; otherwise, select the smallest values. + topk_mask (Tensor): An optional boolean tensor of shape (b, max_num_obj, topk), where + topk is the number of top candidates to consider. If not provided, + the top-k values are automatically computed based on the given metrics. + + Returns: + (Tensor): A tensor of shape (b, max_num_obj, h*w) containing the selected top-k candidates. 
+ """ + + # (b, max_num_obj, topk) + topk_metrics, topk_idxs = torch.topk(metrics, self.topk, dim=-1, largest=largest) + if topk_mask is None: + topk_mask = (topk_metrics.max(-1, keepdim=True)[0] > self.eps).expand_as(topk_idxs) + # (b, max_num_obj, topk) + topk_idxs.masked_fill_(~topk_mask, 0) + + # (b, max_num_obj, topk, h*w) -> (b, max_num_obj, h*w) + count_tensor = torch.zeros(metrics.shape, dtype=torch.int8, device=topk_idxs.device) + ones = torch.ones_like(topk_idxs[:, :, :1], dtype=torch.int8, device=topk_idxs.device) + for k in range(self.topk): + # Expand topk_idxs for each value of k and add 1 at the specified positions + count_tensor.scatter_add_(-1, topk_idxs[:, :, k:k + 1], ones) + # count_tensor.scatter_add_(-1, topk_idxs, torch.ones_like(topk_idxs, dtype=torch.int8, device=topk_idxs.device)) + # filter invalid bboxes + count_tensor.masked_fill_(count_tensor > 1, 0) + + return count_tensor.to(metrics.dtype) + + def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, fg_mask): + """ + Compute target labels, target bounding boxes, and target scores for the positive anchor points. + + Args: + gt_labels (Tensor): Ground truth labels of shape (b, max_num_obj, 1), where b is the + batch size and max_num_obj is the maximum number of objects. + gt_bboxes (Tensor): Ground truth bounding boxes of shape (b, max_num_obj, 4). + target_gt_idx (Tensor): Indices of the assigned ground truth objects for positive + anchor points, with shape (b, h*w), where h*w is the total + number of anchor points. + fg_mask (Tensor): A boolean tensor of shape (b, h*w) indicating the positive + (foreground) anchor points. + + Returns: + (Tuple[Tensor, Tensor, Tensor]): A tuple containing the following tensors: + - target_labels (Tensor): Shape (b, h*w), containing the target labels for + positive anchor points. + - target_bboxes (Tensor): Shape (b, h*w, 4), containing the target bounding boxes + for positive anchor points. 
+ - target_scores (Tensor): Shape (b, h*w, num_classes), containing the target scores + for positive anchor points, where num_classes is the number + of object classes. + """ + + # Assigned target labels, (b, 1) + batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[..., None] + target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes # (b, h*w) + target_labels = gt_labels.long().flatten()[target_gt_idx] # (b, h*w) + + # Assigned target boxes, (b, max_num_obj, 4) -> (b, h*w) + target_bboxes = gt_bboxes.view(-1, 4)[target_gt_idx] + + # Assigned target scores + target_labels.clamp_(0) + + # 10x faster than F.one_hot() + target_scores = torch.zeros((target_labels.shape[0], target_labels.shape[1], self.num_classes), + dtype=torch.int64, + device=target_labels.device) # (b, h*w, 80) + target_scores.scatter_(2, target_labels.unsqueeze(-1), 1) + + fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) # (b, h*w, 80) + target_scores = torch.where(fg_scores_mask > 0, target_scores, 0) + + return target_labels, target_bboxes, target_scores + + +def make_anchors(feats, strides, grid_cell_offset=0.5): + """Generate anchors from features.""" + anchor_points, stride_tensor = [], [] + assert feats is not None + dtype, device = feats[0].dtype, feats[0].device + for i, stride in enumerate(strides): + _, _, h, w = feats[i].shape + sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset # shift x + sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y + sy, sx = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx) + anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2)) + stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device)) + return torch.cat(anchor_points), torch.cat(stride_tensor) + + +def dist2bbox(distance, anchor_points, xywh=True, dim=-1): + """Transform distance(ltrb) to box(xywh or xyxy).""" + lt, rb = distance.chunk(2, dim) + 
x1y1 = anchor_points - lt + x2y2 = anchor_points + rb + if xywh: + c_xy = (x1y1 + x2y2) / 2 + wh = x2y2 - x1y1 + return torch.cat((c_xy, wh), dim) # xywh bbox + return torch.cat((x1y1, x2y2), dim) # xyxy bbox + + +def bbox2dist(anchor_points, bbox, reg_max): + """Transform bbox(xyxy) to dist(ltrb).""" + x1y1, x2y2 = bbox.chunk(2, -1) + return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp_(0, reg_max - 0.01) # dist (lt, rb) diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2d383ea4af64d0c7b5ec05eaaa261f20ad520af5 --- /dev/null +++ b/ultralytics/utils/torch_utils.py @@ -0,0 +1,520 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +import math +import os +import platform +import random +import time +from contextlib import contextmanager +from copy import deepcopy +from pathlib import Path +from typing import Union + +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, RANK, __version__ +from ultralytics.utils.checks import check_version + +try: + import thop +except ImportError: + thop = None + +TORCHVISION_0_10 = check_version(torchvision.__version__, '0.10.0') +TORCH_1_9 = check_version(torch.__version__, '1.9.0') +TORCH_1_11 = check_version(torch.__version__, '1.11.0') +TORCH_1_12 = check_version(torch.__version__, '1.12.0') +TORCH_2_0 = check_version(torch.__version__, '2.0.0') + + +@contextmanager +def torch_distributed_zero_first(local_rank: int): + """Decorator to make all processes in distributed training wait for each local_master to do something.""" + initialized = torch.distributed.is_available() and torch.distributed.is_initialized() + if initialized and local_rank not in (-1, 0): + dist.barrier(device_ids=[local_rank]) + yield + if initialized and local_rank == 0: + 
dist.barrier(device_ids=[0]) + + +def smart_inference_mode(): + """Applies torch.inference_mode() decorator if torch>=1.9.0 else torch.no_grad() decorator.""" + + def decorate(fn): + """Applies appropriate torch decorator for inference mode based on torch version.""" + return (torch.inference_mode if TORCH_1_9 else torch.no_grad)()(fn) + + return decorate + + +def get_cpu_info(): + """Return a string with system CPU information, i.e. 'Apple M2'.""" + import cpuinfo # pip install py-cpuinfo + + k = 'brand_raw', 'hardware_raw', 'arch_string_raw' # info keys sorted by preference (not all keys always available) + info = cpuinfo.get_cpu_info() # info dict + string = info.get(k[0] if k[0] in info else k[1] if k[1] in info else k[2], 'unknown') + return string.replace('(R)', '').replace('CPU ', '').replace('@ ', '') + + +def select_device(device='', batch=0, newline=False, verbose=True): + """Selects PyTorch Device. Options are device = None or 'cpu' or 0 or '0' or '0,1,2,3'.""" + s = f'Ultralytics YOLOv{__version__} 🚀 Python-{platform.python_version()} torch-{torch.__version__} ' + device = str(device).lower() + for remove in 'cuda:', 'none', '(', ')', '[', ']', "'", ' ': + device = device.replace(remove, '') # to string, 'cuda:0' -> '0' and '(0, 1)' -> '0,1' + cpu = device == 'cpu' + mps = device == 'mps' # Apple Metal Performance Shaders (MPS) + if cpu or mps: + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # force torch.cuda.is_available() = False + elif device: # non-cpu device requested + if device == 'cuda': + device = '0' + visible = os.environ.get('CUDA_VISIBLE_DEVICES', None) + os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable - must be before assert is_available() + if not (torch.cuda.is_available() and torch.cuda.device_count() >= len(device.replace(',', ''))): + LOGGER.info(s) + install = 'See https://pytorch.org/get-started/locally/ for up-to-date torch install instructions if no ' \ + 'CUDA devices are seen by torch.\n' if 
torch.cuda.device_count() == 0 else '' + raise ValueError(f"Invalid CUDA 'device={device}' requested." + f" Use 'device=cpu' or pass valid CUDA device(s) if available," + f" i.e. 'device=0' or 'device=0,1,2,3' for Multi-GPU.\n" + f'\ntorch.cuda.is_available(): {torch.cuda.is_available()}' + f'\ntorch.cuda.device_count(): {torch.cuda.device_count()}' + f"\nos.environ['CUDA_VISIBLE_DEVICES']: {visible}\n" + f'{install}') + + if not cpu and not mps and torch.cuda.is_available(): # prefer GPU if available + devices = device.split(',') if device else '0' # range(torch.cuda.device_count()) # i.e. 0,1,6,7 + n = len(devices) # device count + if n > 1 and batch > 0 and batch % n != 0: # check batch_size is divisible by device_count + raise ValueError(f"'batch={batch}' must be a multiple of GPU count {n}. Try 'batch={batch // n * n}' or " + f"'batch={batch // n * n + n}', the nearest batch sizes evenly divisible by {n}.") + space = ' ' * (len(s) + 1) + for i, d in enumerate(devices): + p = torch.cuda.get_device_properties(i) + s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n" # bytes to MB + arg = 'cuda:0' + elif mps and getattr(torch, 'has_mps', False) and torch.backends.mps.is_available() and TORCH_2_0: + # Prefer MPS if available + s += f'MPS ({get_cpu_info()})\n' + arg = 'mps' + else: # revert to CPU + s += f'CPU ({get_cpu_info()})\n' + arg = 'cpu' + + if verbose and RANK == -1: + LOGGER.info(s if newline else s.rstrip()) + return torch.device(arg) + + +def time_sync(): + """PyTorch-accurate time.""" + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() + + +def fuse_conv_and_bn(conv, bn): + """Fuse Conv2d() and BatchNorm2d() layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/.""" + fusedconv = nn.Conv2d(conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + dilation=conv.dilation, + groups=conv.groups, + 
bias=True).requires_grad_(False).to(conv.weight.device) + + # Prepare filters + w_conv = conv.weight.clone().view(conv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) + + # Prepare spatial bias + b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) + fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fusedconv + + +def fuse_deconv_and_bn(deconv, bn): + """Fuse ConvTranspose2d() and BatchNorm2d() layers.""" + fuseddconv = nn.ConvTranspose2d(deconv.in_channels, + deconv.out_channels, + kernel_size=deconv.kernel_size, + stride=deconv.stride, + padding=deconv.padding, + output_padding=deconv.output_padding, + dilation=deconv.dilation, + groups=deconv.groups, + bias=True).requires_grad_(False).to(deconv.weight.device) + + # Prepare filters + w_deconv = deconv.weight.clone().view(deconv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fuseddconv.weight.copy_(torch.mm(w_bn, w_deconv).view(fuseddconv.weight.shape)) + + # Prepare spatial bias + b_conv = torch.zeros(deconv.weight.size(1), device=deconv.weight.device) if deconv.bias is None else deconv.bias + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) + fuseddconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fuseddconv + + +def model_info(model, detailed=False, verbose=True, imgsz=640): + """Model information. imgsz may be int or list, i.e. 
imgsz=640 or imgsz=[640, 320].""" + if not verbose: + return + n_p = get_num_params(model) # number of parameters + n_g = get_num_gradients(model) # number of gradients + n_l = len(list(model.modules())) # number of layers + if detailed: + LOGGER.info( + f"{'layer':>5} {'name':>40} {'gradient':>9} {'parameters':>12} {'shape':>20} {'mu':>10} {'sigma':>10}") + for i, (name, p) in enumerate(model.named_parameters()): + name = name.replace('module_list.', '') + LOGGER.info('%5g %40s %9s %12g %20s %10.3g %10.3g %10s' % + (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std(), p.dtype)) + + flops = get_flops(model, imgsz) + fused = ' (fused)' if getattr(model, 'is_fused', lambda: False)() else '' + fs = f', {flops:.1f} GFLOPs' if flops else '' + yaml_file = getattr(model, 'yaml_file', '') or getattr(model, 'yaml', {}).get('yaml_file', '') + model_name = Path(yaml_file).stem.replace('yolo', 'YOLO') or 'Model' + LOGGER.info(f'{model_name} summary{fused}: {n_l} layers, {n_p} parameters, {n_g} gradients{fs}') + return n_l, n_p, n_g, flops + + +def get_num_params(model): + """Return the total number of parameters in a YOLO model.""" + return sum(x.numel() for x in model.parameters()) + + +def get_num_gradients(model): + """Return the total number of parameters with gradients in a YOLO model.""" + return sum(x.numel() for x in model.parameters() if x.requires_grad) + + +def model_info_for_loggers(trainer): + """ + Return model info dict with useful model information. 
+ + Example for YOLOv8n: + {'model/parameters': 3151904, + 'model/GFLOPs': 8.746, + 'model/speed_ONNX(ms)': 41.244, + 'model/speed_TensorRT(ms)': 3.211, + 'model/speed_PyTorch(ms)': 18.755} + """ + if trainer.args.profile: # profile ONNX and TensorRT times + from ultralytics.utils.benchmarks import ProfileModels + results = ProfileModels([trainer.last], device=trainer.device).profile()[0] + results.pop('model/name') + else: # only return PyTorch times from most recent validation + results = { + 'model/parameters': get_num_params(trainer.model), + 'model/GFLOPs': round(get_flops(trainer.model), 3)} + results['model/speed_PyTorch(ms)'] = round(trainer.validator.speed['inference'], 3) + return results + + +def get_flops(model, imgsz=640): + """Return a YOLO model's FLOPs.""" + try: + model = de_parallel(model) + p = next(model.parameters()) + stride = max(int(model.stride.max()), 32) if hasattr(model, 'stride') else 32 # max stride + im = torch.empty((1, p.shape[1], stride, stride), device=p.device) # input image in BCHW format + flops = thop.profile(deepcopy(model), inputs=[im], verbose=False)[0] / 1E9 * 2 if thop else 0 # stride GFLOPs + imgsz = imgsz if isinstance(imgsz, list) else [imgsz, imgsz] # expand if int/float + return flops * imgsz[0] / stride * imgsz[1] / stride # 640x640 GFLOPs + except Exception: + return 0 + + +def get_flops_with_torch_profiler(model, imgsz=640): + """Compute model FLOPs (thop alternative).""" + model = de_parallel(model) + p = next(model.parameters()) + stride = (max(int(model.stride.max()), 32) if hasattr(model, 'stride') else 32) * 2 # max stride + im = torch.zeros((1, p.shape[1], stride, stride), device=p.device) # input image in BCHW format + with torch.profiler.profile(with_flops=True) as prof: + model(im) + flops = sum(x.flops for x in prof.key_averages()) / 1E9 + imgsz = imgsz if isinstance(imgsz, list) else [imgsz, imgsz] # expand if int/float + flops = flops * imgsz[0] / stride * imgsz[1] / stride # 640x640 GFLOPs + return 
def copy_attr(a, b, include=(), exclude=()):
    """
    Copy instance attributes from `b` onto `a`.

    Every entry of ``b.__dict__`` is copied with setattr() unless its name starts with an underscore,
    is listed in `exclude`, or `include` is non-empty and does not contain the name.
    """
    for attr_name, attr_value in b.__dict__.items():
        selected = not len(include) or attr_name in include
        if selected and not attr_name.startswith('_') and attr_name not in exclude:
            setattr(a, attr_name, attr_value)
def init_seeds(seed=0, deterministic=False):
    """
    Seed the Python, NumPy and PyTorch random number generators and configure determinism.

    See https://pytorch.org/docs/stable/notes/randomness.html.

    Args:
        seed (int): Value passed to every seeding call. Default is 0.
        deterministic (bool): When True and TORCH_2_0 is set, deterministic algorithms are requested
            (warn-only) and the CUBLAS_WORKSPACE_CONFIG / PYTHONHASHSEED environment variables are
            written; when True on an older torch only a warning is logged. When False, deterministic
            algorithms and cuDNN determinism are switched off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # for Multi-GPU, exception safe
    )
    for seeder in seeders:
        seeder(seed)
    # torch.backends.cudnn.benchmark is intentionally left untouched here (AutoBatch problem,
    # https://github.com/ultralytics/yolov5/issues/9287)

    if not deterministic:
        torch.use_deterministic_algorithms(False)
        torch.backends.cudnn.deterministic = False
        return

    if not TORCH_2_0:
        LOGGER.warning('WARNING ⚠️ Upgrade to torch>=2.0.0 for deterministic training.')
        return

    torch.use_deterministic_algorithms(True, warn_only=True)  # warn if deterministic is not possible
    torch.backends.cudnn.deterministic = True
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    os.environ['PYTHONHASHSEED'] = str(seed)
def strip_optimizer(f: Union[str, Path] = 'best.pt', s: str = '') -> None:
    """
    Strip optimizer from 'f' to finalize training, optionally save as 's'.

    The checkpoint is loaded on CPU, the EMA weights (if present) replace the model, the
    'optimizer', 'best_fitness', 'ema' and 'updates' entries are set to None, 'epoch' is set to -1,
    the model is converted to FP16 with gradients disabled, and 'train_args' is reduced to the keys
    found in DEFAULT_CFG_KEYS before the result is saved.

    Args:
        f (str | Path): file path to model to strip the optimizer from. Default is 'best.pt'.
        s (str): file path to save the model with stripped optimizer to. If not provided, 'f' will be overwritten.

    Returns:
        None

    Usage:
        from pathlib import Path
        from ultralytics.utils.torch_utils import strip_optimizer
        for f in Path('/Users/glennjocher/Downloads/weights').rglob('*.pt'):
            strip_optimizer(f)
    """
    # Use dill (if exists) to serialize the lambda functions where pickle does not do this
    try:
        import dill as pickle
    except ImportError:
        import pickle

    # NOTE: torch.load unpickles the file, so this should only be run on checkpoints from a trusted source
    x = torch.load(f, map_location=torch.device('cpu'))
    if 'model' not in x:
        LOGGER.info(f'Skipping {f}, not a valid Ultralytics model.')
        return

    if hasattr(x['model'], 'args'):
        x['model'].args = dict(x['model'].args)  # convert from IterableSimpleNamespace to dict
    # Fall back to the defaults alone when the checkpoint carries no 'train_args'; previously args was
    # None in that case and the args.items() call further down raised AttributeError.
    args = {**DEFAULT_CFG_DICT, **(x.get('train_args') or {})}  # combine args
    if x.get('ema'):
        x['model'] = x['ema']  # replace model with ema
    for k in 'optimizer', 'best_fitness', 'ema', 'updates':  # keys
        x[k] = None
    x['epoch'] = -1
    x['model'].half()  # to FP16
    for p in x['model'].parameters():
        p.requires_grad = False
    x['train_args'] = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS}  # strip non-default keys
    # x['model'].args = x['train_args']
    target = s or f  # overwrite the input file unless a separate output path was given
    torch.save(x, target, pickle_module=pickle)
    mb = os.path.getsize(target) / 1E6  # filesize
    LOGGER.info(f"Optimizer stripped from {f},{f' saved as {s},' if s else ''} {mb:.1f}MB")
m.to(device) if hasattr(m, 'to') else m # device + m = m.half() if hasattr(m, 'half') and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m + tf, tb, t = 0, 0, [0, 0, 0] # dt forward, backward + try: + flops = thop.profile(m, inputs=[x], verbose=False)[0] / 1E9 * 2 if thop else 0 # GFLOPs + except Exception: + flops = 0 + + try: + for _ in range(n): + t[0] = time_sync() + y = m(x) + t[1] = time_sync() + try: + _ = (sum(yi.sum() for yi in y) if isinstance(y, list) else y).sum().backward() + t[2] = time_sync() + except Exception: # no backward method + # print(e) # for debug + t[2] = float('nan') + tf += (t[1] - t[0]) * 1000 / n # ms per op forward + tb += (t[2] - t[1]) * 1000 / n # ms per op backward + mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0 # (GB) + s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else 'list' for x in (x, y)) # shapes + p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0 # parameters + LOGGER.info(f'{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}') + results.append([p, flops, mem, tf, tb, s_in, s_out]) + except Exception as e: + LOGGER.info(e) + results.append(None) + torch.cuda.empty_cache() + return results + + +class EarlyStopping: + """ + Early stopping class that stops training when a specified number of epochs have passed without improvement. + """ + + def __init__(self, patience=50): + """ + Initialize early stopping object + + Args: + patience (int, optional): Number of epochs to wait after fitness stops improving before stopping. + """ + self.best_fitness = 0.0 # i.e. 
def run_ray_tune(model,
                 space: dict = None,
                 grace_period: int = 10,
                 gpu_per_trial: int = None,
                 max_samples: int = 10,
                 **train_args):
    """
    Runs hyperparameter tuning using Ray Tune.

    Args:
        model (YOLO): Model to run the tuner on.
        space (dict, optional): The hyperparameter search space. Defaults to None, in which case a
            built-in default space is used. The dict passed in is not modified.
        grace_period (int, optional): The grace period in epochs of the ASHA scheduler. Defaults to 10.
        gpu_per_trial (int, optional): The number of GPUs to allocate per trial. Defaults to None (0 GPUs).
        max_samples (int, optional): The maximum number of trials to run. Defaults to 10.
        train_args (dict, optional): Additional arguments to pass to the `train()` method. Defaults to {}.

    Returns:
        The value returned by `tuner.get_results()` after the search has been run.

    Raises:
        ModuleNotFoundError: If Ray Tune is not installed.
    """
    # **train_args always binds to a dict (possibly empty), so no None check is required here.

    try:
        from ray import tune
        from ray.air import RunConfig
        from ray.air.integrations.wandb import WandbLoggerCallback
        from ray.tune.schedulers import ASHAScheduler
    except ImportError as e:
        # Keep the original ImportError as the cause so the failing import is visible in the traceback
        raise ModuleNotFoundError(
            'Tuning hyperparameters requires Ray Tune. Install with: pip install "ray[tune]"') from e

    try:
        import wandb

        assert hasattr(wandb, '__version__')
    except (ImportError, AssertionError):
        wandb = False

    default_space = {
        # 'optimizer': tune.choice(['SGD', 'Adam', 'AdamW', 'NAdam', 'RAdam', 'RMSProp']),
        'lr0': tune.uniform(1e-5, 1e-1),
        'lrf': tune.uniform(0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
        'momentum': tune.uniform(0.6, 0.98),  # SGD momentum/Adam beta1
        'weight_decay': tune.uniform(0.0, 0.001),  # optimizer weight decay 5e-4
        'warmup_epochs': tune.uniform(0.0, 5.0),  # warmup epochs (fractions ok)
        'warmup_momentum': tune.uniform(0.0, 0.95),  # warmup initial momentum
        'box': tune.uniform(0.02, 0.2),  # box loss gain
        'cls': tune.uniform(0.2, 4.0),  # cls loss gain (scale with pixels)
        'hsv_h': tune.uniform(0.0, 0.1),  # image HSV-Hue augmentation (fraction)
        'hsv_s': tune.uniform(0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
        'hsv_v': tune.uniform(0.0, 0.9),  # image HSV-Value augmentation (fraction)
        'degrees': tune.uniform(0.0, 45.0),  # image rotation (+/- deg)
        'translate': tune.uniform(0.0, 0.9),  # image translation (+/- fraction)
        'scale': tune.uniform(0.0, 0.9),  # image scale (+/- gain)
        'shear': tune.uniform(0.0, 10.0),  # image shear (+/- deg)
        'perspective': tune.uniform(0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
        'flipud': tune.uniform(0.0, 1.0),  # image flip up-down (probability)
        'fliplr': tune.uniform(0.0, 1.0),  # image flip left-right (probability)
        'mosaic': tune.uniform(0.0, 1.0),  # image mosaic (probability)
        'mixup': tune.uniform(0.0, 1.0),  # image mixup (probability)
        'copy_paste': tune.uniform(0.0, 1.0)}  # segment copy-paste (probability)

    def _tune(config):
        """
        Trains the YOLO model with the specified hyperparameters and additional arguments.

        Args:
            config (dict): A dictionary of hyperparameters to use for training.

        Returns:
            None.
        """
        model._reset_callbacks()
        config.update(train_args)  # explicit train() arguments take precedence over sampled values
        model.train(**config)

    # Get search space
    if not space:
        space = default_space
        LOGGER.warning('WARNING ⚠️ search space not provided, using default search space.')
    else:
        space = dict(space)  # shallow copy so adding 'data' below does not modify the caller's dict

    # Get dataset
    data = train_args.get('data', TASK2DATA[model.task])
    space['data'] = data
    if 'data' not in train_args:
        LOGGER.warning(f'WARNING ⚠️ data not provided, using default "data={data}".')

    # Define the trainable function with allocated resources
    trainable_with_resources = tune.with_resources(_tune, {'cpu': NUM_THREADS, 'gpu': gpu_per_trial or 0})

    # Define the ASHA scheduler for hyperparameter search
    asha_scheduler = ASHAScheduler(time_attr='epoch',
                                   metric=TASK2METRIC[model.task],
                                   mode='max',
                                   max_t=train_args.get('epochs') or DEFAULT_CFG_DICT['epochs'] or 100,
                                   grace_period=grace_period,
                                   reduction_factor=3)

    # Define the callbacks for the hyperparameter search
    tuner_callbacks = [WandbLoggerCallback(project='YOLOv8-tune')] if wandb else []

    # Create the Ray Tune hyperparameter search tuner
    tuner = tune.Tuner(trainable_with_resources,
                       param_space=space,
                       tune_config=tune.TuneConfig(scheduler=asha_scheduler, num_samples=max_samples),
                       run_config=RunConfig(callbacks=tuner_callbacks, storage_path='./runs/tune'))

    # Run the hyperparameter search
    tuner.fit()

    # Return the results of the hyperparameter search
    return tuner.get_results()
# Deprecated-path shim: executing this module makes 'ultralytics.yolo.engine' resolve to 'ultralytics.engine'.
import importlib
import sys

from ultralytics.utils import LOGGER

# Set modules in sys.modules under their old name: the module object imported from 'ultralytics.engine'
# is registered under the legacy key, so later lookups of the old path return that module.
sys.modules['ultralytics.yolo.engine'] = importlib.import_module('ultralytics.engine')

# Logged every time this shim module is executed, i.e. when the legacy path is first imported
LOGGER.warning("WARNING ⚠️ 'ultralytics.yolo.engine' is deprecated since '8.0.136' and will be removed in '8.1.0'. "
               "Please use 'ultralytics.engine' instead.")
# Deprecated-path shim: executing this module makes 'ultralytics.yolo.v8' resolve to 'ultralytics.models.yolo'.
import importlib
import sys

from ultralytics.utils import LOGGER

# Set modules in sys.modules under their old name: the module object imported from
# 'ultralytics.models.yolo' is registered under the legacy key 'ultralytics.yolo.v8'.
sys.modules['ultralytics.yolo.v8'] = importlib.import_module('ultralytics.models.yolo')

# Logged every time this shim module is executed, i.e. when the legacy path is first imported
LOGGER.warning("WARNING ⚠️ 'ultralytics.yolo.v8' is deprecated since '8.0.136' and will be removed in '8.1.0'. "
               "Please use 'ultralytics.models.yolo' instead.")