LLM-foundry update March 26, 2024 23:50:31

by irenedea - opened Mar 26

base: refs/heads/main

←

from: refs/pr/9

Discussion Files changed

+4942

-203

Files changed (49) hide show

act_ckpt.py +119 -0
async_eval_callback.py +322 -0
attention.py +31 -82
blocks.py +7 -7
builders.py +323 -0
callback_with_config.py +12 -0
callbacks.py +26 -0
checkpoint_conversion_helpers.py +206 -0
collator.py +256 -0
config_utils.py +105 -0
configuration_mpt.py +7 -17
curriculum_learning_callback.py +62 -0
data.py +76 -0
data_prep_utils.py +84 -0
dataloader.py +313 -0
eval_gauntlet_callback.py +141 -0
exceptions.py +162 -0
fdiff_callback.py +44 -0
ffn.py +2 -2
finetuning.py +2 -0
hf.py +3 -0
hf_causal_lm.py +14 -0
hf_checkpointer.py +221 -0
hf_fsdp.py +165 -0
hf_t5.py +8 -0
huggingface_hub_utils.py +102 -0
interfaces.py +1 -0
llmfoundry.py +16 -0
logging_utils.py +24 -0
meta_init_context.py +1 -1
model_download_utils.py +186 -0
model_wrapper.py +36 -0
modeling_mpt.py +66 -87
monolithic_ckpt_callback.py +66 -0
mosaicml_logger_utils.py +69 -0
mpt.py +2 -0
packing.py +272 -0
param_init_fns.py +4 -4
prompt_files.py +46 -0
registry.py +24 -0
registry_utils.py +115 -0
resumption_callbacks.py +64 -0
scheduled_gc_callback.py +57 -0
tasks.py +581 -0
text_data.py +217 -0
tiktoken.py +218 -0
tokenizers.py +1 -0
utils.py +11 -0
warnings.py +52 -3

act_ckpt.py ADDED Viewed

	@@ -0,0 +1,119 @@

+from typing import Any
+import torch
+from .attention import ATTN_CLASS_REGISTRY
+from .blocks import MPTBlock
+from .ffn import FFN_CLASS_REGISTRY
+from .norm import NORM_CLASS_REGISTRY
+def pass_on_block_idx(parent: torch.nn.Module) -> None:
+    """Copy ``block_idx`` and ``max_block_idx`` from ``parent`` to every descendant module.
+    Nothing happens when ``parent`` lacks either attribute.
+    Args:
+        parent: Module whose ``block_idx`` / ``max_block_idx`` are propagated downwards.
+    """
+    if not (hasattr(parent, 'block_idx') and hasattr(parent, 'max_block_idx')):
+        return
+    for child in parent.children():
+        child.block_idx = parent.block_idx
+        child.max_block_idx = parent.max_block_idx
+        # ``children()`` returns a generator, which is always truthy, so a
+        # ``if child.children():`` guard can never skip anything. Recursing
+        # into a leaf module simply iterates over zero children.
+        pass_on_block_idx(child)
+def get_act_ckpt_module(mod_name: str) -> Any:
+    """Resolve a module name to the module type it refers to.
+    ``'mptblock'`` is matched case-insensitively; every other name is looked up
+    as-is in the attention, FFN and norm registries, in that order.
+    Args:
+        mod_name: Name given in ``activation_checkpointing_target``.
+    Returns:
+        The matching module type.
+    Raises:
+        ValueError: If ``mod_name`` matches none of the available options.
+    """
+    if mod_name.lower() == 'mptblock':
+        return MPTBlock
+    registries = (ATTN_CLASS_REGISTRY, FFN_CLASS_REGISTRY, NORM_CLASS_REGISTRY)
+    for registry in registries:
+        if mod_name in registry:
+            return registry[mod_name]
+    # No match: list every accepted name in the error message.
+    options = []
+    for registry in registries:
+        options += list(registry.keys())
+    options += ['MPTBlock']
+    msg = ', '.join(options)
+    raise ValueError(f'{mod_name} (specified in activation_checkpointing_target) is not a recognized option out of available options {msg}.')
+def parse_ele_str(ele: str, max_block_idx: int) -> list:
+    """Translate one ``target_blocks`` string into the block ids it selects.
+    Accepted forms are ``first-n``, ``middle-m``, ``last-k`` and ``range-i-j``,
+    meaning the first n blocks, the middle m blocks, the last k blocks and the
+    half-open range [i, j). Results are clipped to ``0..max_block_idx``.
+    Args:
+        ele: The string to parse.
+        max_block_idx: Largest valid block index.
+    Returns:
+        The selected block ids in increasing order.
+    Raises:
+        ValueError: If ``ele`` starts with none of the supported prefixes.
+    """
+    num_blocks = max_block_idx + 1
+    if ele.startswith('first-'):
+        count = ele[len('first-'):]
+        assert count.isdigit(), f'Invalid target_blocks element {ele}'
+        return list(range(min(int(count), num_blocks)))
+    if ele.startswith('last-'):
+        count = ele[len('last-'):]
+        assert count.isdigit(), f'Invalid target_blocks element {ele}'
+        return list(range(max(num_blocks - int(count), 0), num_blocks))
+    if ele.startswith('middle-'):
+        count = ele[len('middle-'):]
+        assert count.isdigit(), f'Invalid target_blocks element {ele}'
+        width = int(count)
+        # Centre the window on the middle block, then clip to valid indices.
+        lo = max(max_block_idx // 2 - width // 2, 0)
+        hi = min(lo + width, num_blocks)
+        return list(range(lo, hi))
+    if ele.startswith('range-'):
+        bounds = ele[len('range-'):].split('-')
+        assert len(bounds) == 2, f'Invalid target_blocks element {ele}'
+        lo, hi = (int(bounds[0]), int(bounds[1]))
+        return list(range(max(lo, 0), min(hi, num_blocks)))
+    raise ValueError(f'Invalid target_blocks element {ele}')
+def get_target_block_list(target_blocks: Any, max_block_idx: int) -> list:
+    """Parse the user input and return a list of block ids.
+    Args:
+        target_blocks: Either an integer ``n`` (selects blocks ``0..n-1``), a
+            list mixing integers and strings understood by ``parse_ele_str``,
+            or a comma separated string of such strings (spaces are ignored).
+        max_block_idx: Largest valid block index; forwarded to ``parse_ele_str``.
+    Returns:
+        The unique selected block ids, sorted in increasing order.
+    Raises:
+        ValueError: If ``target_blocks`` or one of its list elements has an
+            unsupported type.
+    """
+    candidate_block_ids = []
+    if isinstance(target_blocks, int):
+        candidate_block_ids = list(range(target_blocks))
+    elif isinstance(target_blocks, list):
+        for ele in target_blocks:
+            if isinstance(ele, int):
+                candidate_block_ids.append(ele)
+            elif isinstance(ele, str):
+                candidate_block_ids.extend(parse_ele_str(ele, max_block_idx))
+            else:
+                raise ValueError(f'target_blocks must be a list of integers or "first-n", "middle-m", "last-k", or "range-i-j" where n, m, k, i, j are integers, but got {target_blocks}')
+    elif isinstance(target_blocks, str):
+        for ele in target_blocks.replace(' ', '').split(','):
+            candidate_block_ids.extend(parse_ele_str(ele, max_block_idx))
+    else:
+        raise ValueError(f'target_blocks must be either a single integer, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}')
+    # Deduplicate and sort so the result does not depend on set iteration order.
+    return sorted(set(candidate_block_ids))
+def check_mapping_blocks_overlap(mapping: dict, max_block_idx: int) -> None:
+    """Verify that no block id is claimed by more than one entry of ``mapping``.
+    A value of ``-1`` stands for every block ``0..max_block_idx``. Ids outside
+    that range are ignored.
+    Args:
+        mapping: Maps a key to ``-1`` or to an iterable of block ids.
+        max_block_idx: Largest valid block index.
+    Raises:
+        ValueError: If a block id is claimed twice.
+    """
+    num_blocks = max_block_idx + 1
+    # owners[i] holds the key that claimed block i, or None while unclaimed.
+    owners = [None] * num_blocks
+    for mod, block_ids in mapping.items():
+        if block_ids == -1:
+            block_ids = list(range(num_blocks))
+        for block_id in block_ids:
+            if block_id < 0 or block_id > max_block_idx:
+                continue
+            previous = owners[block_id]
+            if previous is not None:
+                raise ValueError(f'Block {block_id} is assigned to both {mod} and {previous}. Each block can only have one granularity of activation checkpointing. Make sure the target_blocks in activation_checkpointing_target do not overlap. For more details, refer to the docs of activation_checkpointing_fn.')
+            owners[block_id] = mod
+def build_act_ckpt_mod_to_blocks(act_ckpt_target: Any, top_module: Any, max_block_idx: int) -> dict:
+    """Build the mapping from module type to the block ids it applies to.
+    A value of ``-1`` in the result stands for "all blocks".
+    Args:
+        act_ckpt_target: ``None`` or ``[]`` (use ``top_module`` for all blocks),
+            a module name, a list of module names, or a dict mapping module
+            names to a ``target_blocks`` specification.
+        top_module: Module used when no target is given.
+        max_block_idx: Largest valid block index.
+    Returns:
+        Dict mapping each resolved module to ``-1`` or to a list of block ids.
+    Raises:
+        ValueError: If ``act_ckpt_target`` is of an unsupported type.
+    """
+    if act_ckpt_target is None or act_ckpt_target == []:
+        return {top_module: -1}
+    if isinstance(act_ckpt_target, str):
+        return {get_act_ckpt_module(act_ckpt_target): -1}
+    if isinstance(act_ckpt_target, list):
+        return {get_act_ckpt_module(target): -1 for target in act_ckpt_target}
+    if isinstance(act_ckpt_target, dict):
+        return {get_act_ckpt_module(name): get_target_block_list(blocks, max_block_idx) for name, blocks in act_ckpt_target.items()}
+    raise ValueError(f'activation_checkpointing_target must be either a single string or a list or a dict, but got {type(act_ckpt_target)}')

async_eval_callback.py ADDED Viewed

	@@ -0,0 +1,322 @@

+"""Run the eval loop asynchronously as part of a MosaicML platform run.
+This callback is currently experimental. The API may change in the future.
+"""
+import logging
+import os
+import warnings
+from collections import Counter
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+from .interfaces import CallbackWithConfig
+from mcli import Run, RunConfig, create_run, get_run
+log = logging.getLogger(__name__)
+# Training parameters that get_eval_parameters copies into the eval config and
+# refuses to proceed without.
+REQUIRED_PARAMS_FOR_EVAL = {'device_eval_batch_size', 'icl_tasks', 'max_seq_len', 'model', 'tokenizer'}
+# Training parameters that get_eval_parameters copies into the eval config
+# when they are present.
+OPTIONAL_PARAMS_FOR_EVAL = {'dist_timeout', 'eval_gauntlet', 'eval_loader', 'fsdp_config', 'eval_subset_num_batches', 'icl_subset_num_batches', 'loggers', 'precision', 'python_log_level', 'seed'}
+# First component of every eval run name built by get_run_name.
+RUN_NAME_PREFIX = 'eval'
+# Length budget get_run_name applies to prefix + interval + base name plus the
+# two '-' separators; the base name is truncated to fit.
+MAX_RUN_NAME_BASE_LENGTH = 55
+def get_run_name(training_run_name: str, current_interval: str) -> str:
+    """Build the name of the eval run for a given training run and interval.
+    The text after the last ``-`` of the training run name is dropped, and the
+    remainder is truncated (with a warning) when the combined name would
+    exceed ``MAX_RUN_NAME_BASE_LENGTH``.
+    Args:
+        training_run_name: The name of the current training run
+        current_interval: The current interval string of the training run
+    Returns:
+        The new run name, ``<prefix>-<interval>-<base name>``
+    """
+    base_name = training_run_name.rsplit('-', 1)[0]
+    # Two characters are reserved for the '-' separators in the final name.
+    budget = MAX_RUN_NAME_BASE_LENGTH - len(RUN_NAME_PREFIX) - len(current_interval) - 2
+    if len(base_name) > budget:
+        truncated = base_name[:budget]
+        log.warning(f'Training run name {base_name} may be too long, truncating to {truncated}')
+        base_name = truncated
+    return f'{RUN_NAME_PREFIX}-{current_interval}-{base_name}'
+def get_eval_parameters(parameters: Dict[str, Any], checkpoint: str, training_run_name: str) -> Dict[str, Any]:
+    """Get the parameters needed for the eval run.
+    The ``loggers`` mapping and its ``wandb`` entry are copied before being
+    adjusted, so ``parameters`` is left untouched and repeated calls with the
+    same ``parameters`` produce the same wandb ``group``.
+    Args:
+        parameters: The parameters from the training run
+        checkpoint: The path to the latest checkpoint
+        training_run_name: The name of the training run
+    Returns:
+        The parameters needed for the eval run as a dict
+    Raises:
+        Exception: If a required parameter is missing, or if the model config
+            has no ``name``.
+    """
+    subset_keys = {}
+    for key, value in parameters.items():
+        if key in OPTIONAL_PARAMS_FOR_EVAL or key in REQUIRED_PARAMS_FOR_EVAL:
+            subset_keys[key] = value
+    looking_for = REQUIRED_PARAMS_FOR_EVAL.difference(subset_keys)
+    if looking_for:
+        raise Exception(f'Missing the following required parameters for async eval: {looking_for}')
+    loggers = subset_keys.get('loggers')
+    if loggers:
+        # Work on copies: popping 'name' from the caller's config in place
+        # would make a later call fall back to training_run_name as the group.
+        loggers = dict(loggers)
+        if 'wandb' in loggers:
+            wandb_config = dict(loggers['wandb'] or {})
+            # Eval runs are grouped under the training run's wandb name.
+            wandb_config['group'] = wandb_config.pop('name', training_run_name)
+            loggers['wandb'] = wandb_config
+        subset_keys['loggers'] = loggers
+    model = subset_keys.pop('model')
+    model_name = model.get('name', None)
+    if not model_name:
+        raise Exception('Async evaluation requires "name" keys for models')
+    # The eval config takes a list of models, each carrying its checkpoint path.
+    new_models = {'model_name': model_name, 'model': model, 'load_path': checkpoint}
+    tokenizer = subset_keys.pop('tokenizer', None)
+    if tokenizer is not None:
+        new_models['tokenizer'] = tokenizer
+    subset_keys['models'] = [new_models]
+    return subset_keys
+def validate_interval(interval: Union[str, int, Time], save_interval: Union[str, int, Time]) -> Time:
+    """Check the async eval interval against the checkpoint save interval.
+    Args:
+        interval: How often eval runs should be launched.
+        save_interval: How often the training run saves checkpoints.
+    Returns:
+        The eval interval as a ``Time``.
+    Raises:
+        ValueError: If the two intervals use different units, or the eval
+            interval is shorter than, or not a multiple of, the save interval.
+    """
+    save_time = Time.from_input(save_interval, TimeUnit.EPOCH)
+    eval_time = Time.from_input(interval, TimeUnit.EPOCH)
+    if save_time.unit != eval_time.unit:
+        raise ValueError('Save interval and async eval interval must be in the same unit')
+    if eval_time < save_time:
+        raise ValueError('Async eval interval must be equal or greater (less frequent) than save interval')
+    remainder = eval_time.value % save_time.value
+    if remainder != 0:
+        raise ValueError('Async eval interval must be a multiple of save interval')
+    return eval_time
+def validate_eval_run_config(eval_run_config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+    """Return a copy of ``eval_run_config`` after checking its keys.
+    Args:
+        eval_run_config: Optional overrides for the eval run config. Only the
+            keys ``image``, ``command``, ``compute`` and ``scheduling`` are
+            accepted.
+    Returns:
+        A shallow copy of the config, or ``{}`` when none was given.
+    Raises:
+        ValueError: If the config contains any other key.
+    """
+    if not eval_run_config:
+        return {}
+    supported_keys = {'image', 'command', 'compute', 'scheduling'}
+    run_config = eval_run_config.copy()
+    found_unsupported = {key for key in run_config if key not in supported_keys}
+    if found_unsupported:
+        unsupported_text = ', '.join(found_unsupported)
+        raise ValueError(f'Unsupported eval run config keys found: {unsupported_text}' + f'. Supported keys: {supported_keys}')
+    return run_config
+# AsyncEval divides its eval interval by this value (with a minimum of 1) to
+# decide how often it looks for new checkpoints.
+CHECKS_PER_INTERVAL = 4
+class AsyncEval(CallbackWithConfig):
+    """Run the eval loop asynchronously as part of a MosaicML platform run.
+    This callback is currently experimental. The API may change in the future.
+    Args:
+        training_params: Dict[str, Any]: The parameter config from the training run
+        interval: Union[str, int, Time]: The interval describing how often eval runs should be
+            launched. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`.
+            Otherwise, the unit must be either :attr:`.TimeUnit.EPOCH`, :attr:`.TimeUnit.BATCH`,
+            :attr:`.TimeUnit.TOKEN`, or :attr:`.TimeUnit.SAMPLE`.
+        eval_run_config: Optional[Dict[str, Any]]: A subset of mcli run config values to use
+            for the eval run. If not specified, any fields from run config will be created
+            dynamically from the training run config and the interval. The following fields
+            are supported:
+            - ``image``: Image of the eval run. Default: same as training run
+            - ``command``: Command to run for the eval run. Default: calls
+                `composer scripts/eval/eval.py $PARAMETERS`. If custom setup is needed,
+                the command should include calling the eval script with $PARAMETERS
+            - ``compute``: Compute to use for the eval run. Default: same cluster as
+                the training run and a single node (8 GPUs)
+            - ``scheduling``: Scheduling to use for the eval run. Default: same as training run
+            All fields are optional, but if specified, must be valid for a mcli run config. We
+            provide this optional config to give you the most flexibility in customizing the eval
+            run, but it is recommended to use the default values unless you have a specific use case
+    """
+    def __init__(self, training_params: Dict[str, Any], interval: Union[str, int, Time], eval_run_config: Optional[Dict[str, Any]]=None):
+        # Checkpoint settings are needed to locate the checkpoints to evaluate.
+        for required in ('save_interval', 'save_folder'):
+            if required not in training_params:
+                raise ValueError(f'{required} required for async eval')
+        if '/' in training_params.get('save_filename', ''):
+            raise ValueError('AsyncEval not supported for save_filename that includes a path')
+        self.checkpoint_save_folder = training_params['save_folder']
+        self.training_params = training_params
+        self.eval_run_config = validate_eval_run_config(eval_run_config)
+        self.current_run = self._get_current_run()
+        # Dry run with a placeholder checkpoint: the result is discarded, the
+        # call only surfaces missing eval parameters at construction time.
+        get_eval_parameters(parameters=training_params, checkpoint='test', training_run_name=self.current_run.name)
+        self.interval = validate_interval(interval, self.training_params['save_interval'])
+        # Look for new checkpoints CHECKS_PER_INTERVAL times per eval interval,
+        # but never at an interval value below 1.
+        check_interval_value = max(self.interval.value // CHECKS_PER_INTERVAL, 1)
+        self.check_interval = Time(check_interval_value, self.interval.unit)
+        # Maps the checkpoint time to (checkpoint path, eval run name) for
+        # every eval run launched so far.
+        self.checkpoints_evaled: Dict[Time, Tuple[str, str]] = {}
+        self.is_at_check_interval = create_interval_scheduler(self.check_interval, include_end_of_training=False)
+        log.info('Initialized AsyncEval callback. Will generate runs at ' + f'interval {interval}, checking at {self.check_interval}')
+    def state_dict(self) -> Dict[str, Any]:
+        """Return the launched eval runs, with each ``Time`` key flattened to a value/unit dict."""
+        checkpoints_evaled = []
+        for eval_ts, (checkpoint, run_name) in self.checkpoints_evaled.items():
+            eval_ts_dict = {'value': eval_ts.value, 'unit': eval_ts.unit.value}
+            checkpoints_evaled.append((eval_ts_dict, checkpoint, run_name))
+        return {'checkpoints_evaled': checkpoints_evaled}
+    def load_state_dict(self, state_dict: Dict[str, Any]):
+        """Restore the launched eval runs from the format produced by ``state_dict``."""
+        previous_checkpoints_evaled = state_dict.get('checkpoints_evaled', [])
+        if previous_checkpoints_evaled:
+            for eval_ts, checkpoint, run_name in previous_checkpoints_evaled:
+                # Rebuild the Time key from its serialized value/unit pair.
+                eval_ts = Time(eval_ts['value'], TimeUnit(eval_ts['unit']))
+                self.checkpoints_evaled[eval_ts] = (checkpoint, run_name)
+            log.info(f'Loaded previous checkpoints evaled: {self.checkpoints_evaled}')
+    @staticmethod
+    def _get_ready_sharded_checkpoints(checkpointer_checkpoints: Dict[str, Timestamp], remote_files: List[str]) -> Dict[str, Timestamp]:
+        """Identify checkpoints ready to be evaled based on remote files.
+        This has special logic for sharded checkpoints to consider checkpoints composed
+        of multiple shards (one per gpu) and metadata
+        Args:
+            checkpointer_checkpoints: All checkpoints from the checkpointer state
+            remote_files: List of remote files in the save folder
+        Returns:
+            Dict of checkpoints that are complete and ready to be evaled
+        """
+        # Count remote files per checkpoint, keyed by the name of the
+        # directory that directly contains each file.
+        remote_file_group_counts = Counter()
+        for f in remote_files:
+            checkpoint_ts_path = Path(f).parts[-2]
+            remote_file_group_counts[checkpoint_ts_path] += 1
+        checkpoints_to_eval = {}
+        for checkpoint, checkpoint_ts in checkpointer_checkpoints.items():
+            checkpoint_ts_path = Path(checkpoint).parts[-2]
+            # One file per rank plus one more (the metadata mentioned above).
+            expected_shard_count = dist.get_world_size() + 1
+            if remote_file_group_counts[checkpoint_ts_path] != expected_shard_count:
+                log.debug(f'Checkpoint {checkpoint} not fully uploaded (missing shards ' + f'{remote_file_group_counts[checkpoint_ts_path]}/{expected_shard_count}), skipping')
+                continue
+            checkpoints_to_eval[checkpoint_ts_path] = checkpoint_ts
+        return checkpoints_to_eval
+    @staticmethod
+    def _get_ready_single_checkpoints(checkpointer_checkpoints: Dict[str, Timestamp], remote_checkpoints: List[str]) -> Dict[str, Timestamp]:
+        """Identify checkpoints ready to be evaled based on remote checkpoints.
+        This is much simpler than the sharded case, because there is only one file
+        Args:
+            checkpointer_checkpoints: All checkpoints from the checkpointer state
+            remote_checkpoints: List of remote checkpoints in the save folder
+        Returns:
+            Dict of checkpoints that are complete and ready to be evaled
+        """
+        unique_remote_checkpoints = set(remote_checkpoints)
+        checkpoints_to_eval = {}
+        for checkpoint, checkpoint_ts in checkpointer_checkpoints.items():
+            # The result is keyed by file name only, while the upload check
+            # below compares the checkpointer's full checkpoint string.
+            checkpoint_ts_path = Path(checkpoint).parts[-1]
+            if checkpoint not in unique_remote_checkpoints:
+                log.debug(f'Checkpoint {checkpoint} not fully uploaded, skipping')
+                continue
+            checkpoints_to_eval[checkpoint_ts_path] = checkpoint_ts
+        return checkpoints_to_eval
+    def _get_checkpoints_and_launch_runs(self, state: State):
+        """Launch eval runs for uploaded checkpoints that have not been evaled.
+        A checkpoint is evaled only when it is fully present on the remote and
+        its timestamp is a multiple of the eval interval. Every launched run is
+        recorded in ``self.checkpoints_evaled``.
+        Args:
+            state: The current state of the training run
+        Returns:
+            None
+        """
+        # Use the first CheckpointSaver among the callbacks; warn about extras.
+        checkpointer = None
+        for callback in state.callbacks:
+            if isinstance(callback, CheckpointSaver):
+                if checkpointer is None:
+                    checkpointer = callback
+                else:
+                    log.warning('Multiple checkpoint savers found. Using the first one')
+        if not checkpointer:
+            warnings.warn('No checkpoint saver callback found. Skipping eval')
+            return
+        if not checkpointer.all_saved_checkpoints_to_timestamp:
+            log.debug('No saved checkpoints found on the checkpointer. Skipping eval')
+            return
+        log.debug(f'Found {len(checkpointer.all_saved_checkpoints_to_timestamp)} ' + f'checkpoints: {checkpointer.all_saved_checkpoints_to_timestamp}')
+        remote_checkpoints = list_remote_objects(self.checkpoint_save_folder)
+        if not remote_checkpoints:
+            log.debug('No saved checkpoints found yet on remote. Skipping eval')
+            return
+        if state.fsdp_sharded_state_dict_enabled:
+            checkpoints_to_eval = self._get_ready_sharded_checkpoints(checkpointer.all_saved_checkpoints_to_timestamp, remote_checkpoints)
+        else:
+            checkpoints_to_eval = self._get_ready_single_checkpoints(checkpointer.all_saved_checkpoints_to_timestamp, remote_checkpoints)
+        for checkpoint_interval_path, checkpoint_timestamp in checkpoints_to_eval.items():
+            # Express the checkpoint time in the unit of the eval interval.
+            checkpoint_ts = checkpoint_timestamp.get(self.interval.unit)
+            if checkpoint_ts.value % self.interval.value != 0:
+                log.debug(f'Checkpoint {checkpoint_interval_path} ({checkpoint_ts}) is ' + f'not at an eval interval ({self.interval}), skipping')
+                continue
+            # An eval run was already launched for this time.
+            if checkpoint_ts in self.checkpoints_evaled:
+                continue
+            full_checkpoint_path = f'{self.checkpoint_save_folder}/{checkpoint_interval_path}'
+            eval_run = self.launch_run(full_checkpoint_path, checkpoint_ts)
+            self.checkpoints_evaled[checkpoint_ts] = (full_checkpoint_path, eval_run.name)
+    def run_event(self, event: Event, state: State, logger: Logger) -> None:
+        """Check for new checkpoints on global rank 0 whenever the check interval is reached."""
+        del logger
+        should_launch_run = all([state.get_elapsed_duration() is not None, self.is_at_check_interval(state, event), dist.get_global_rank() == 0])
+        if should_launch_run:
+            self._get_checkpoints_and_launch_runs(state)
+    def close(self, state: State, logger: Logger) -> None:
+        """On global rank 0, eval remaining checkpoints and the latest checkpoint, then log a summary."""
+        del logger
+        if dist.get_global_rank() != 0:
+            return
+        self._get_checkpoints_and_launch_runs(state)
+        latest_timestamp = state.timestamp.get(self.interval.unit)
+        # If the current training time has no eval run yet, launch one against
+        # the "latest" checkpoint file in the save folder.
+        if latest_timestamp not in self.checkpoints_evaled:
+            save_latest_filename = self.training_params.get('save_latest_filename', None)
+            if not save_latest_filename:
+                # Only global rank 0 reaches this point (see the early return
+                # above), so ``rank`` is 0 here.
+                rank = dist.get_global_rank()
+                save_latest_filename = f'latest-rank{rank}.pt'
+            checkpoint = f'{self.checkpoint_save_folder}/{save_latest_filename}'
+            eval_run = self.launch_run(checkpoint, latest_timestamp)
+            self.checkpoints_evaled[latest_timestamp] = (checkpoint, eval_run.name)
+        log.info(f'AsyncEval callback finished. Launched {len(self.checkpoints_evaled)} eval runs:')
+        for checkpoint_ts, (checkpoint, run_name) in self.checkpoints_evaled.items():
+            log.info(f'  {checkpoint_ts}: {checkpoint}, {run_name}')
+    def _get_current_run(self) -> Run:
+        """Fetch the run named by the run-name environment variable, with details.
+        Raises:
+            RuntimeError: If the platform environment variable is unset or
+                'false', or if the run-name environment variable is empty.
+        """
+        if os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'false':
+            raise RuntimeError('AsyncEval callback is only supported when running on the MosaicML platform')
+        run_name = os.environ.get(RUN_NAME_ENV_VAR, None)
+        if not run_name:
+            raise RuntimeError('RUN_NAME environment variable must be set to use the AsyncEval callback')
+        return get_run(run_name, include_details=True)
+    def launch_run(self, checkpoint: str, current_interval: Time) -> Run:
+        """Launch a new eval run.
+        Args:
+            checkpoint: The checkpoint to eval
+            current_interval: The interval of the checkpoint
+        Returns:
+            The launched run (mcli.Run type)
+        """
+        log.info(f'Launching eval run for {checkpoint} at {current_interval}')
+        cfg = self.current_run.submitted_config
+        default_compute = {'gpus': 8, 'cluster': self.current_run.cluster}
+        run_name = get_run_name(self.current_run.name, str(current_interval))
+        params = get_eval_parameters(parameters=self.training_params, checkpoint=checkpoint, training_run_name=self.current_run.name)
+        params['run_name'] = run_name
+        integrations = cfg.integrations
+        # Look for a git_repo integration whose repo name ends with
+        # 'llm-foundry'; use its 'path', if set, as the installation directory.
+        found_llm_foundry, installation_path = (False, 'llm-foundry')
+        for i in integrations:
+            if i['integration_type'] != 'git_repo':
+                continue
+            if not i['git_repo'].endswith('llm-foundry'):
+                continue
+            found_llm_foundry = True
+            if i.get('path'):
+                installation_path = i['path']
+        if not found_llm_foundry:
+            from .llmfoundry import __version__ as latest_foundry_version
+            version = f'v{latest_foundry_version}'
+            log.warning('No github integration found for llm-foundry. Adding installation ' + f'to eval run for latest foundry release ({version}). ' + 'To use a fork, custom branch, or custom version, configure ' + 'llm-foundry installation through a github integration')
+            integrations.append({'integration_type': 'git_repo', 'git_repo': 'mosaicml/llm-foundry', 'git_branch': version, 'pip_install': '-e .[gpu]', 'ssh_clone': False})
+        # NOTE(review): ``metadata`` is the object returned by ``cfg.metadata``,
+        # so these writes presumably also change the submitted config - confirm.
+        metadata = cfg.metadata
+        metadata['eval_timestamp'] = current_interval.value
+        metadata['eval_timestamp_unit'] = current_interval.unit.value
+        default_command = f'cd {installation_path}/scripts \n composer eval/eval.py $PARAMETERS'
+        # Values from eval_run_config take precedence over the defaults
+        # derived from the training run.
+        run_config = RunConfig(name=run_name, image=self.eval_run_config.get('image', self.current_run.image), command=self.eval_run_config.get('command', default_command), compute=self.eval_run_config.get('compute', default_compute), scheduling=self.eval_run_config.get('scheduling', self.current_run.submitted_config.scheduling), integrations=integrations, env_variables=cfg.env_variables, metadata=cfg.metadata, parameters=params)
+        log.info(f'Creating new run with config: \n{run_config}')
+        new_run = create_run(run_config)
+        log.info(f'Launched new run {new_run.name} inside eval loop')
+        return new_run

attention.py CHANGED Viewed

@@ -31,9 +31,6 @@ def is_transformers_version_gte(hf_version: str) -> bool:
 def check_alibi_support(attention_impl: str) -> bool:
     return attention_impl != 'flash' or is_flash_v2_installed(v2_version='v2.4.2')
-if is_flash_v1_installed():
-    import transformers
-    transformers.utils.is_flash_attn_available = lambda : False
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
 def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool) -> bool:
@@ -53,7 +50,7 @@ def repeat_kv_for_gqa(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
     if n_rep == 1:
         return hidden
-    (b, s, kv_n_heads, d) = hidden.shape
     hidden = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, d)
     return hidden.reshape(b, s, kv_n_heads * n_rep, d)
@@ -66,7 +63,7 @@ def scaled_multihead_dot_product_attention(query: torch.Tensor, key: torch.Tenso
             k = torch.cat([past_key_value[0], k], dim=3)
             v = torch.cat([past_key_value[1], v], dim=2)
         past_key_value = (k, v)
-    (b, _, s_q, d) = q.shape
     s_k = k.size(-1)
     if kv_n_heads > 1 and kv_n_heads < n_heads:
         k = repeat_kv_for_gqa(k.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
@@ -130,7 +127,7 @@ def flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n
         past_key_value = (key, value)
     if attn_bias is not None:
         raise NotImplementedError(f'attn_bias not implemented for flash attn.')
-    (batch_size, seqlen) = query.shape[:2]
     indices_q = flash_attn_padding_info['indices_q']
     indices_k = flash_attn_padding_info['indices_k']
     indices_v = flash_attn_padding_info['indices_v']
@@ -169,65 +166,17 @@ def flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n
     output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
     return (output, None, past_key_value)
-def triton_flash_attn_fn(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, n_heads: int, kv_n_heads: int, past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]]=None, softmax_scale: Optional[float]=None, attn_bias: Optional[torch.Tensor]=None, key_padding_mask: Optional[torch.Tensor]=None, is_causal: bool=False, dropout_p: float=0.0, training: bool=False, needs_weights: bool=False) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]:
-    try:
-        from .flash_attn_triton import flash_attn_func
-    except:
-        _installed = False
-        if version.parse(torch.__version__) < version.parse('2.0.0'):
-            _installed = True
-            try:
-                from flash_attn.flash_attn_triton import flash_attn_func
-            except:
-                _installed = False
-        if not _installed:
-            raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU ' + 'and `pip install .[gpu]` if installing from llm-foundry source or ' + '`pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` ' + 'if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). ' + 'Note: (1) requires you have CMake and PyTorch already installed.')
-    check_valid_inputs(query, key, value)
-    if past_key_value is not None:
-        if len(past_key_value) != 0:
-            key = torch.cat([past_key_value[0], key], dim=1)
-            value = torch.cat([past_key_value[1], value], dim=1)
-        past_key_value = (key, value)
-    if attn_bias is not None:
-        _s_q = max(0, attn_bias.size(2) - query.size(1))
-        _s_k = max(0, attn_bias.size(3) - key.size(1))
-        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
-    if dropout_p:
-        raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
-    dropout_p = dropout_p if training else 0.0
-    if needs_weights:
-        raise NotImplementedError(f'attn_impl: triton cannot return attn weights.')
-    if key_padding_mask is not None:
-        warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
-        (b_size, s_k) = key_padding_mask.shape[:2]
-        if attn_bias is None:
-            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
-        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
-    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
-    key = rearrange(key, 'b s (h d) -> b s h d', h=kv_n_heads)
-    value = rearrange(value, 'b s (h d) -> b s h d', h=kv_n_heads)
-    if kv_n_heads == 1:
-        key = key.repeat(1, 1, n_heads, 1)
-        value = value.repeat(1, 1, n_heads, 1)
-    elif kv_n_heads < n_heads:
-        key = repeat_kv_for_gqa(key, n_heads // kv_n_heads)
-        value = repeat_kv_for_gqa(value, n_heads // kv_n_heads)
-    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
-    attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
-    output = attn_output.view(*attn_output.shape[:2], -1)
-    return (output, None, past_key_value)
 class GroupedQueryAttention(nn.Module):
     """Grouped Query Attention (GQA) is a generalization of Multi-head (MHA).
     and Multi-query attention (MQA).
     This allows the user to set a variable of number of kv_n_heads, rather than
-    just n_heads or 1, as in MHA and MQA. Using torch or triton attention
     implementation enables user to also use additive bias.
     """
-    def __init__(self, d_model: int, n_heads: int, kv_n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
         super().__init__()
         self.attn_impl = attn_impl
         self.clip_qkv = clip_qkv
@@ -251,8 +200,7 @@ class GroupedQueryAttention(nn.Module):
             self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
         self.attn_dropout_p = attn_pdrop
         fc_kwargs: dict[str, Any] = {'bias': bias}
-        if fc_type != 'te':
-            fc_kwargs['device'] = device
         self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs)
         fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)]
         self.Wqkv._fused = (0, fuse_splits)
@@ -265,8 +213,6 @@ class GroupedQueryAttention(nn.Module):
             self.k_ln = norm_class(norm_size, device=device)
         if self.attn_impl == 'flash':
             self.attn_fn = flash_attn_fn
-        elif self.attn_impl == 'triton':
-            self.attn_fn = triton_flash_attn_fn
         elif self.attn_impl == 'torch':
             self.attn_fn = scaled_multihead_dot_product_attention
         else:
@@ -278,12 +224,12 @@ class GroupedQueryAttention(nn.Module):
         qkv = self.Wqkv(x)
         if self.clip_qkv:
             qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
-        (query, key, value) = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2)
         key_padding_mask = attention_mask
         if self.qk_ln or self.qk_gn:
-            (q_shape, k_shape) = (query.shape, key.shape)
             if self.qk_gn:
-                (b, s) = query.shape[:2]
                 query = query.view(b, s, self.n_heads, -1)
                 key = key.view(b, s, self.kv_n_heads, -1)
             dtype = query.dtype
@@ -293,23 +239,28 @@ class GroupedQueryAttention(nn.Module):
             rotary_emb = rotary_emb_w_meta_info['rotary_emb']
             seq_len = rotary_emb_w_meta_info['seq_len']
             offset_info = rotary_emb_w_meta_info['offset_info']
-            (bsz, seqlen) = query.shape[:2]
             query = query.view(bsz, seqlen, -1, self.head_dim)
             key = key.view(bsz, seqlen, -1, self.head_dim)
             if rotary_emb_w_meta_info['impl'] == 'dail':
                 value = value.view(bsz, seqlen, -1, self.head_dim)
                 kv = torch.stack([key, value], dim=2)
-                (query, kv) = rotary_emb(query, kv, seqlen_offset=offset_info, max_seqlen=seq_len)
                 [key, value] = torch.unbind(kv, dim=2)
                 value = value.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
             elif rotary_emb_w_meta_info['impl'] == 'hf':
-                (cos, sin) = rotary_emb(value, seq_len)
-                if is_transformers_version_gte('4.36'):
-                    (query, key) = apply_rotary_pos_emb(query, key, cos, sin, offset_info, unsqueeze_dim=2)
                 else:
                     query = query.transpose(1, 2)
                     key = key.transpose(1, 2)
-                    (query, key) = apply_rotary_pos_emb(query, key, cos, sin, offset_info)
                     query = query.transpose(1, 2)
                     key = key.transpose(1, 2)
             query = query.view(bsz, seqlen, self.d_model)
@@ -318,38 +269,36 @@ class GroupedQueryAttention(nn.Module):
         if self.attn_impl == 'flash':
             key_padding_mask = None
             extra_attn_kwargs = {'should_repeat_kv_for_gqa': not is_flash_v2_installed(), 'sliding_window_size': self.sliding_window_size, 'alibi_slopes': alibi_slopes, 'flash_attn_padding_info': flash_attn_padding_info}
-        (context, attn_weights, past_key_value) = self.attn_fn(query, key, value, self.n_heads, self.kv_n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, **extra_attn_kwargs)
         return (self.out_proj(context), attn_weights, past_key_value)
 class MultiheadAttention(GroupedQueryAttention):
     """Multi-head self attention.
-    Using torch or triton attention implementation enables user to also use
-    additive bias.
     """
-    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
         super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=n_heads, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
 class MultiQueryAttention(GroupedQueryAttention):
     """Multi-Query self attention.
-    Using torch or triton attention implementation enables user to also use
-    additive bias.
     """
-    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
         super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=1, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
-def attn_bias_shape(attn_impl: str, n_heads: int, seq_len: int, alibi: bool, prefix_lm: bool, causal: bool, use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
     if attn_impl == 'flash':
         return None
-    elif attn_impl in ['torch', 'triton']:
         if alibi:
-            if (prefix_lm or not causal) or use_sequence_id:
                 return (1, n_heads, seq_len, seq_len)
             return (1, n_heads, 1, seq_len)
-        elif prefix_lm or use_sequence_id:
             return (1, 1, seq_len, seq_len)
         return None
     else:
@@ -358,9 +307,9 @@ def attn_bias_shape(attn_impl: str, n_heads: int, seq_len: int, alibi: bool, pre
 def build_attn_bias(attn_impl: str, attn_bias: torch.Tensor, n_heads: int, seq_len: int, causal: bool=False, alibi: bool=False, alibi_bias_max: int=8) -> Optional[torch.Tensor]:
     if attn_impl == 'flash':
         return None
-    elif attn_impl in ['torch', 'triton']:
         if alibi:
-            (device, dtype) = (attn_bias.device, attn_bias.dtype)
             attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
         return attn_bias
     else:

 def check_alibi_support(attention_impl: str) -> bool:
     return attention_impl != 'flash' or is_flash_v2_installed(v2_version='v2.4.2')
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
 def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool) -> bool:
     """
     if n_rep == 1:
         return hidden
+    b, s, kv_n_heads, d = hidden.shape
     hidden = hidden[:, :, :, None, :].expand(b, s, kv_n_heads, n_rep, d)
     return hidden.reshape(b, s, kv_n_heads * n_rep, d)
             k = torch.cat([past_key_value[0], k], dim=3)
             v = torch.cat([past_key_value[1], v], dim=2)
         past_key_value = (k, v)
+    b, _, s_q, d = q.shape
     s_k = k.size(-1)
     if kv_n_heads > 1 and kv_n_heads < n_heads:
         k = repeat_kv_for_gqa(k.transpose(1, 2), n_heads // kv_n_heads).transpose(1, 2)
         past_key_value = (key, value)
     if attn_bias is not None:
         raise NotImplementedError(f'attn_bias not implemented for flash attn.')
+    batch_size, seqlen = query.shape[:2]
     indices_q = flash_attn_padding_info['indices_q']
     indices_k = flash_attn_padding_info['indices_k']
     indices_v = flash_attn_padding_info['indices_v']
     output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
     return (output, None, past_key_value)
 class GroupedQueryAttention(nn.Module):
     """Grouped Query Attention (GQA) is a generalization of Multi-head (MHA).
     and Multi-query attention (MQA).
     This allows the user to set a variable of number of kv_n_heads, rather than
+    just n_heads or 1, as in MHA and MQA. Using torch attention
     implementation enables user to also use additive bias.
     """
+    def __init__(self, d_model: int, n_heads: int, kv_n_heads: int, attn_impl: str='flash', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
         super().__init__()
         self.attn_impl = attn_impl
         self.clip_qkv = clip_qkv
             self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
         self.attn_dropout_p = attn_pdrop
         fc_kwargs: dict[str, Any] = {'bias': bias}
+        fc_kwargs['device'] = device
         self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs)
         fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)]
         self.Wqkv._fused = (0, fuse_splits)
             self.k_ln = norm_class(norm_size, device=device)
         if self.attn_impl == 'flash':
             self.attn_fn = flash_attn_fn
         elif self.attn_impl == 'torch':
             self.attn_fn = scaled_multihead_dot_product_attention
         else:
         qkv = self.Wqkv(x)
         if self.clip_qkv:
             qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+        query, key, value = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2)
         key_padding_mask = attention_mask
         if self.qk_ln or self.qk_gn:
+            q_shape, k_shape = (query.shape, key.shape)
             if self.qk_gn:
+                b, s = query.shape[:2]
                 query = query.view(b, s, self.n_heads, -1)
                 key = key.view(b, s, self.kv_n_heads, -1)
             dtype = query.dtype
             rotary_emb = rotary_emb_w_meta_info['rotary_emb']
             seq_len = rotary_emb_w_meta_info['seq_len']
             offset_info = rotary_emb_w_meta_info['offset_info']
+            bsz, seqlen = query.shape[:2]
             query = query.view(bsz, seqlen, -1, self.head_dim)
             key = key.view(bsz, seqlen, -1, self.head_dim)
             if rotary_emb_w_meta_info['impl'] == 'dail':
                 value = value.view(bsz, seqlen, -1, self.head_dim)
                 kv = torch.stack([key, value], dim=2)
+                query, kv = rotary_emb(query, kv, seqlen_offset=offset_info, max_seqlen=seq_len)
                 [key, value] = torch.unbind(kv, dim=2)
                 value = value.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
             elif rotary_emb_w_meta_info['impl'] == 'hf':
+                if is_transformers_version_gte('4.38'):
+                    cos, sin = rotary_emb(x=value, position_ids=offset_info, seq_len=None)
+                else:
+                    cos, sin = rotary_emb(x=value, seq_len=seq_len)
+                if is_transformers_version_gte('4.38'):
+                    query, key = apply_rotary_pos_emb(q=query, k=key, cos=cos, sin=sin, position_ids=None, unsqueeze_dim=2)
+                elif is_transformers_version_gte('4.36'):
+                    query, key = apply_rotary_pos_emb(q=query, k=key, cos=cos, sin=sin, position_ids=offset_info, unsqueeze_dim=2)
                 else:
                     query = query.transpose(1, 2)
                     key = key.transpose(1, 2)
+                    query, key = apply_rotary_pos_emb(q=query, k=key, cos=cos, sin=sin, position_ids=offset_info)
                     query = query.transpose(1, 2)
                     key = key.transpose(1, 2)
             query = query.view(bsz, seqlen, self.d_model)
         if self.attn_impl == 'flash':
             key_padding_mask = None
             extra_attn_kwargs = {'should_repeat_kv_for_gqa': not is_flash_v2_installed(), 'sliding_window_size': self.sliding_window_size, 'alibi_slopes': alibi_slopes, 'flash_attn_padding_info': flash_attn_padding_info}
+        context, attn_weights, past_key_value = self.attn_fn(query, key, value, self.n_heads, self.kv_n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, **extra_attn_kwargs)
         return (self.out_proj(context), attn_weights, past_key_value)
 class MultiheadAttention(GroupedQueryAttention):
     """Multi-head self attention.
+    Using torch attention implementation enables user to also use additive bias.
     """
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='flash', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
         super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=n_heads, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
 class MultiQueryAttention(GroupedQueryAttention):
     """Multi-Query self attention.
+    Using torch attention implementation enables user to also use additive bias.
     """
+    def __init__(self, d_model: int, n_heads: int, attn_impl: str='flash', clip_qkv: Optional[float]=None, qk_ln: bool=False, qk_gn: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, norm_type: str='low_precision_layernorm', fc_type: str='torch', device: Optional[str]=None, bias: bool=True, sliding_window_size: int=-1):
         super().__init__(d_model=d_model, n_heads=n_heads, kv_n_heads=1, attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, fc_type=fc_type, device=device, bias=bias, sliding_window_size=sliding_window_size)
+def attn_bias_shape(attn_impl: str, n_heads: int, seq_len: int, alibi: bool, causal: bool, use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
     if attn_impl == 'flash':
         return None
+    elif attn_impl == 'torch':
         if alibi:
+            if not causal or use_sequence_id:
                 return (1, n_heads, seq_len, seq_len)
             return (1, n_heads, 1, seq_len)
+        elif use_sequence_id:
             return (1, 1, seq_len, seq_len)
         return None
     else:
 def build_attn_bias(attn_impl: str, attn_bias: torch.Tensor, n_heads: int, seq_len: int, causal: bool=False, alibi: bool=False, alibi_bias_max: int=8) -> Optional[torch.Tensor]:
     if attn_impl == 'flash':
         return None
+    elif attn_impl == 'torch':
         if alibi:
+            device, dtype = (attn_bias.device, attn_bias.dtype)
             attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
         return attn_bias
     else:

blocks.py CHANGED Viewed

@@ -8,8 +8,8 @@ from .norm import NORM_CLASS_REGISTRY
 try:
     from flash_attn.bert_padding import unpad_input, pad_input
 except:
-    (unpad_input, pad_input) = (None, None)
-attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'qk_gn': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'sliding_window_size': -1, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, 'rope_theta': 10000, 'rope_impl': 'dail', 'rope_dail_config': {'type': 'original', 'pos_idx_in_fp32': True, 'xpos_scale_base': 512}, 'rope_hf_config': {'type': 'no_scaling', 'factor': 1.0}}
 class MPTBlock(nn.Module):
@@ -23,8 +23,8 @@ class MPTBlock(nn.Module):
         norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
         assert isinstance(attn_config['attn_type'], str)
         attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
-        args_to_exclude_in_attn_class = {'attn_type', 'prefix_lm', 'alibi', 'attn_uses_sequence_id', 'alibi_bias_max', 'rope', 'rope_theta', 'rope_impl', 'rope_dail_config', 'rope_hf_config'}
-        attn_config_subset_for_attn_class = {k: v for (k, v) in attn_config.items() if k not in args_to_exclude_in_attn_class}
         self.norm_1 = norm_class(d_model, device=device)
         self.attn = attn_class(d_model=d_model, n_heads=n_heads, fc_type=fc_type, device=device, **attn_config_subset_for_attn_class, bias=not no_bias)
         self.norm_2 = None
@@ -37,16 +37,16 @@ class MPTBlock(nn.Module):
     def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[Dict]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True, output_attentions: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, torch.Tensor]]]:
         a = self.norm_1(x)
-        (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
         x = x + self.resid_attn_dropout(b)
         m = x
         if self.norm_2 is not None:
             m = self.norm_2(x)
-        (batch_size, seq_len) = m.size()[:2]
         indices = None
         if not self.use_pad_tok_in_ffn:
             assert unpad_input is not None
-            (m, indices, _, _) = unpad_input(m, attention_mask)
         n = self.ffn(m)
         if not self.use_pad_tok_in_ffn:
             assert pad_input is not None

 try:
     from flash_attn.bert_padding import unpad_input, pad_input
 except:
+    unpad_input, pad_input = (None, None)
+attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'flash', 'qk_ln': False, 'qk_gn': False, 'clip_qkv': None, 'softmax_scale': None, 'attn_uses_sequence_id': False, 'sliding_window_size': -1, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, 'rope_theta': 10000, 'rope_impl': 'dail', 'rope_dail_config': {'type': 'original', 'pos_idx_in_fp32': True, 'xpos_scale_base': 512}, 'rope_hf_config': {'type': 'no_scaling', 'factor': 1.0}}
 class MPTBlock(nn.Module):
         norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
         assert isinstance(attn_config['attn_type'], str)
         attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
+        args_to_exclude_in_attn_class = {'attn_type', 'alibi', 'attn_uses_sequence_id', 'alibi_bias_max', 'rope', 'rope_theta', 'rope_impl', 'rope_dail_config', 'rope_hf_config'}
+        attn_config_subset_for_attn_class = {k: v for k, v in attn_config.items() if k not in args_to_exclude_in_attn_class}
         self.norm_1 = norm_class(d_model, device=device)
         self.attn = attn_class(d_model=d_model, n_heads=n_heads, fc_type=fc_type, device=device, **attn_config_subset_for_attn_class, bias=not no_bias)
         self.norm_2 = None
     def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, rotary_emb_w_meta_info: Optional[Dict]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True, output_attentions: bool=False, alibi_slopes: Optional[torch.Tensor]=None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]]=None) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, torch.Tensor]]]:
         a = self.norm_1(x)
+        b, attn_weights, past_key_value = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
         x = x + self.resid_attn_dropout(b)
         m = x
         if self.norm_2 is not None:
             m = self.norm_2(x)
+        batch_size, seq_len = m.size()[:2]
         indices = None
         if not self.use_pad_tok_in_ffn:
             assert unpad_input is not None
+            m, indices, _, _ = unpad_input(m, attention_mask)
         n = self.ffn(m)
         if not self.use_pad_tok_in_ffn:
             assert pad_input is not None

builders.py ADDED Viewed

	@@ -0,0 +1,323 @@

+import contextlib
+import functools
+import logging
+import os
+import re
+from collections import OrderedDict
+from typing import Any, ContextManager, Dict, Iterable, List, Optional, Tuple, Union
+import torch
+from torch.optim.optimizer import Optimizer
+from torchmetrics import Metric
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+from .llmfoundry import registry
+from .callbacks import EvalGauntlet
+from .dataloader import build_dataloader
+from .tiktoken import TiktokenTokenizerWrapper
+from .registry_utils import construct_from_registry
+log = logging.getLogger(__name__)
+def build_evaluators(eval_loader_config: Optional[Union[DictConfig, ListConfig]], icl_tasks_config: Optional[Union[str, ListConfig]], eval_gauntlet_config: Optional[Union[str, DictConfig]], *, tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, icl_subset_num_batches: Optional[int]) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:
+    """Collect dataloader evaluators, ICL evaluators and the gauntlet callback.
+    Args:
+        eval_loader_config: Config for `build_eval_loaders`. When None, no
+            dataloader-based evaluators are built.
+        icl_tasks_config: Config for `build_icl_data_and_gauntlet`. When None,
+            that builder is not called at all, so `eval_gauntlet_config` is
+            ignored as well.
+        eval_gauntlet_config: Forwarded to `build_icl_data_and_gauntlet`.
+        tokenizer: Forwarded to both builders.
+        device_eval_batch_size: Forwarded to both builders.
+        icl_seq_len: Forwarded to `build_icl_data_and_gauntlet`.
+        icl_subset_num_batches: Forwarded to `build_icl_data_and_gauntlet`.
+    Returns:
+        A tuple `(evaluators, logger_keys, eval_gauntlet_callback)`. The
+        dataloader evaluators come first, followed by the ICL evaluators.
+        Without ICL tasks the last two items are `[]` and `None`.
+    """
+    all_evaluators = [] if eval_loader_config is None else build_eval_loaders(eval_loader_config, tokenizer, device_eval_batch_size)
+    if icl_tasks_config is None:
+        return (all_evaluators, [], None)
+    icl_evaluators, icl_logger_keys, gauntlet_callback = build_icl_data_and_gauntlet(icl_tasks_config, eval_gauntlet_config, tokenizer, device_eval_batch_size, icl_seq_len, icl_subset_num_batches)
+    all_evaluators.extend(icl_evaluators)
+    return (all_evaluators, icl_logger_keys, gauntlet_callback)
+def build_eval_loaders(eval_loader_config: Union[DictConfig, ListConfig], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int) -> List[Evaluator]:
+    """Build one `Evaluator` per eval dataloader config.
+    A `ListConfig` input is treated as several eval sets and each evaluator is
+    labelled `eval/<label>` using the entry's `label` field. Any other config
+    is wrapped in a one-element `ListConfig` and labelled plain `eval`.
+    Args:
+        eval_loader_config: A single dataloader config or a list of them.
+        tokenizer: Passed to `build_dataloader`.
+        device_eval_batch_size: Passed to `build_dataloader`.
+    Returns:
+        The evaluators in config order, each created with `metric_names=[]`.
+    """
+    is_multi_eval = isinstance(eval_loader_config, ListConfig)
+    eval_configs: ListConfig = eval_loader_config if is_multi_eval else ListConfig([eval_loader_config])
+    evaluators: List[Evaluator] = []
+    for single_config in eval_configs:
+        dataloader = build_dataloader(single_config, tokenizer, device_eval_batch_size)
+        label = f'eval/{single_config.label}' if is_multi_eval else 'eval'
+        evaluators.append(Evaluator(label=label, dataloader=dataloader, metric_names=[]))
+    return evaluators
+def add_metrics_to_eval_loaders(evaluators: List[Evaluator], metric_names: List[str]) -> List[Evaluator]:
+    """Attach `metric_names` to evaluators that have none and list them first.
+    An evaluator counts as having no metrics when its `metric_names` equals
+    `[]`. Those evaluators are modified in place and all receive the same
+    `metric_names` list object.
+    Args:
+        evaluators: Evaluators to inspect.
+        metric_names: Metric names assigned to the evaluators without metrics.
+    Returns:
+        A new list: the evaluators that were just given metrics, followed by
+        the ones that already had some. Relative order inside each part is kept.
+    """
+    newly_assigned = []
+    already_configured = []
+    for candidate in evaluators:
+        if candidate.metric_names != []:
+            already_configured.append(candidate)
+            continue
+        candidate.metric_names = metric_names
+        newly_assigned.append(candidate)
+    return [*newly_assigned, *already_configured]
+def build_icl_data_and_gauntlet(icl_tasks_config: Union[str, ListConfig], eval_gauntlet_config: Optional[Union[str, DictConfig]], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, icl_subset_num_batches: Optional[int]=None) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:
+    """Build the ICL evaluators and, when configured, the `EvalGauntlet` callback.
+    Args:
+        icl_tasks_config: Passed to `build_icl_evaluators`.
+        eval_gauntlet_config: None for no gauntlet; a path whose loaded config
+            exposes an `eval_gauntlet` attribute; or a `DictConfig` used as is.
+        tokenizer: Passed to `build_icl_evaluators`.
+        device_eval_batch_size: Passed to `build_icl_evaluators`.
+        icl_seq_len: Passed to `build_icl_evaluators`.
+        icl_subset_num_batches: Passed to `build_icl_evaluators`. Defaults to None.
+    Returns:
+        A tuple `(icl_evaluators, logger_keys, eval_gauntlet_callback)`; the
+        callback is None when `eval_gauntlet_config` is None.
+    Raises:
+        ValueError: If `eval_gauntlet_config` is neither None, a str nor a `DictConfig`.
+    """
+    icl_evaluators, logger_keys = build_icl_evaluators(icl_tasks_config, tokenizer, icl_seq_len, device_eval_batch_size, icl_subset_num_batches=icl_subset_num_batches)
+    if eval_gauntlet_config is None:
+        return (icl_evaluators, logger_keys, None)
+    if isinstance(eval_gauntlet_config, str):
+        with open(eval_gauntlet_config, 'r') as gauntlet_file:
+            loaded_config = om.load(gauntlet_file)
+        gauntlet_settings = loaded_config.eval_gauntlet
+    elif isinstance(eval_gauntlet_config, DictConfig):
+        gauntlet_settings = eval_gauntlet_config
+    else:
+        raise ValueError(f'Got invalid type for eval_gauntlet_config: {type(eval_gauntlet_config)}')
+    # These two fields are written onto the config object itself, so a
+    # DictConfig passed in by the caller is modified.
+    gauntlet_settings.logger_keys = logger_keys
+    benchmark_sizes = {}
+    for icl_evaluator in icl_evaluators:
+        benchmark_sizes[icl_evaluator.label] = icl_evaluator.dataloader.num_samples
+    gauntlet_settings.benchmark_sizes = benchmark_sizes
+    return (icl_evaluators, logger_keys, EvalGauntlet(**gauntlet_settings))
+def build_composer_model(name: str, cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, init_context: Optional[ContextManager]=None, master_weights_dtype: Optional[str]=None) -> ComposerModel:
+    """Build a ComposerModel from the model registry.
+    Args:
+        name (str): Key of the model in `registry.models`.
+        cfg (DictConfig): Model configuration, passed on as `om_model_config`.
+        tokenizer (PreTrainedTokenizerBase): Tokenizer, passed on as `tokenizer`.
+        init_context (Optional[ContextManager], optional): Context manager the
+            model is constructed under. Defaults to None, which uses
+            `contextlib.nullcontext()`.
+        master_weights_dtype (Optional[str], optional): One of `f16`,
+            `float16`, `bf16` or `bfloat16`. When given, the model is cast with
+            `model.to(dtype=...)`. Defaults to None (no cast).
+    Returns:
+        ComposerModel: The constructed model, cast to `master_weights_dtype`
+            when one was requested.
+    Raises:
+        ValueError: If `master_weights_dtype` is not one of the supported names.
+    """
+    construction_context = contextlib.nullcontext() if init_context is None else init_context
+    with construction_context:
+        model = construct_from_registry(name=name, registry=registry.models, pre_validation_function=ComposerModel, post_validation_function=None, kwargs={'om_model_config': cfg, 'tokenizer': tokenizer})
+    if master_weights_dtype is None:
+        return model
+    supported_dtypes = {'f16': torch.float16, 'float16': torch.float16, 'bf16': torch.bfloat16, 'bfloat16': torch.bfloat16}
+    if master_weights_dtype not in supported_dtypes:
+        raise ValueError(f'Invalid master_weights_dtype: {master_weights_dtype}. ' + f'Valid options are: {list(supported_dtypes.keys())}.')
+    return model.to(dtype=supported_dtypes[master_weights_dtype])
+def build_callback(name: str, kwargs: Optional[Dict[str, Any]]=None, config: Any=None) -> Callback:
+    """Build a callback from the registry.
+    Names found in `registry.callbacks_with_config` are constructed from that
+    registry and receive `config` as an extra `config` keyword argument. All
+    other names are constructed from `registry.callbacks` with `kwargs`
+    passed through untouched.
+    Args:
+        name (str): Registered name of the callback.
+        kwargs (Optional[Dict[str, Any]]): Constructor arguments. The dict is
+            never modified; `config` is injected into a copy. Defaults to None.
+        config (Any): Value handed to callbacks registered "with config".
+            Ignored for every other callback. Defaults to None.
+    Returns:
+        Callback: The object produced by `construct_from_registry`.
+    Raises:
+        ValueError: If the callback takes a config and `kwargs` already has a
+            `config` key.
+    """
+    if name not in registry.callbacks_with_config:
+        return construct_from_registry(name=name, registry=registry.callbacks, partial_function=True, pre_validation_function=Callback, post_validation_function=None, kwargs=kwargs)
+    # Copy before injecting `config`: writing into the caller's dict would make
+    # a second call with the same dict hit the reserved-keyword error below.
+    kwargs_with_config = {} if kwargs is None else dict(kwargs)
+    if 'config' in kwargs_with_config:
+        raise ValueError(f'`config` is a reserved keyword for callbacks with config. Please remove it from the kwargs.')
+    kwargs_with_config['config'] = config
+    return construct_from_registry(name=name, registry=registry.callbacks_with_config, partial_function=True, pre_validation_function=Callback, post_validation_function=None, kwargs=kwargs_with_config)
+def build_logger(name: str, kwargs: Optional[Dict[str, Any]]=None) -> LoggerDestination:
+    """Construct the logger destination registered under `name`.
+    Args:
+        name (str): Key looked up in `registry.loggers`.
+        kwargs (Optional[Dict[str, Any]]): Passed unchanged as `kwargs` to
+            `construct_from_registry`. Defaults to None.
+    Returns:
+        LoggerDestination: The object returned by `construct_from_registry`.
+    """
+    logger_registry = registry.loggers
+    logger_destination = construct_from_registry(name=name, registry=logger_registry, partial_function=True, pre_validation_function=LoggerDestination, post_validation_function=None, kwargs=kwargs)
+    return logger_destination
+def build_algorithm(name: str, kwargs: Optional[Dict[str, Any]]=None) -> Algorithm:
+    """Construct the algorithm registered under `name`.
+    Args:
+        name (str): Key looked up in `registry.algorithms`.
+        kwargs (Optional[Dict[str, Any]]): Passed unchanged as `kwargs` to
+            `construct_from_registry`. Defaults to None.
+    Returns:
+        Algorithm: The object returned by `construct_from_registry`.
+    """
+    algorithm_registry = registry.algorithms
+    algorithm = construct_from_registry(name=name, registry=algorithm_registry, partial_function=True, pre_validation_function=Algorithm, post_validation_function=None, kwargs=kwargs)
+    return algorithm
+def build_metric(name: str, kwargs: Optional[Dict[str, Any]]=None) -> Metric:
+    """Construct the metric registered under `name`.
+    Args:
+        name (str): Key looked up in `registry.metrics`.
+        kwargs (Optional[Dict[str, Any]]): Passed unchanged as `kwargs` to
+            `construct_from_registry`. Defaults to None.
+    Returns:
+        Metric: The object returned by `construct_from_registry`.
+    """
+    metric_registry = registry.metrics
+    metric = construct_from_registry(name=name, registry=metric_registry, partial_function=True, pre_validation_function=Metric, post_validation_function=None, kwargs=kwargs)
+    return metric
+def _extract_param_groups(model: torch.nn.Module, optimizer_config: Optional[Dict[str, Any]]=None) -> Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]]:
+    """Work out which parameters the optimizer receives and how they are grouped.
+    Two optional keys are consumed (popped) from `optimizer_config`; whatever
+    remains in the dict afterwards is the plain optimizer arguments.
+    `disable_grad` is a pattern or a list of patterns. Every parameter whose
+    name matches one of them under `re.search` gets `requires_grad = False`,
+    which freezes it.
+    `param_groups` is a list of dicts. Each dict carries `param_str_match`, a
+    pattern applied to parameter names with `re.search`; the matching
+    parameters form one group. The dict's other keys are stored on the group
+    as its optimizer-argument overrides. `param_str_match` is popped from each
+    dict, so the dicts passed in are modified.
+    Note: when patterns overlap, a parameter goes to the first group (in the
+    order listed in `param_groups`) whose pattern matches its name. Parameters
+    claimed by no group are returned as the first entry, without overrides.
+    Usage
+    To disable gradient for all parameters that contain the string "norm" or "bias":
+    ```
+    optimizer_config: {
+        "name": "decoupled_lionw",
+        "lr": 1e-3,
+        "weight_decay": 1e-2,
+        "betas": [0.9, 0.999],
+        "eps": 1e-8,
+        "disable_grad": ["norm", "bias"]
+    }
+    ```
+    To create and modify the optimizer parameters for all parameters that contain
+    the string "norm" and "bias" separately:
+    ```
+    optimizer_config: {
+        "name": "decoupled_lionw",
+        "lr": 1e-3,
+        "weight_decay": 1e-2,
+        "betas": [0.9, 0.999],
+        "eps": 1e-8,
+        "param_groups": [
+            {
+                "param_str_match": "norm",
+                "lr": 1e-4,
+                "weight_decay": 0.0,
+            },
+            {
+                "param_str_match": "bias",
+                "lr": 5e-4,
+                "weight_decay": 0.0,
+            },
+        ],
+    }
+    ```
+    Args:
+        model (torch.nn.Module): Model whose named parameters are inspected.
+        optimizer_config (Dict[str, Any]): Optimizer config; modified in place
+            as described above. Defaults to None.
+    Returns:
+        Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]]: `model.parameters()`
+            when no `param_groups` are configured, otherwise a list of
+            param-group dicts with the ungrouped parameters first.
+    """
+    if optimizer_config is None:
+        return model.parameters()
+    if 'disable_grad' in optimizer_config:
+        frozen_patterns = optimizer_config.pop('disable_grad')
+        if isinstance(frozen_patterns, str):
+            frozen_patterns = [frozen_patterns]
+        for pattern in frozen_patterns:
+            for param_name, param in model.named_parameters():
+                if not re.search(pattern, param_name):
+                    continue
+                param.requires_grad = False
+                log.debug(f'Setting `{param_name}.requires_grad = False`.')
+    group_configs = optimizer_config.pop('param_groups', None)
+    if group_configs is None:
+        return model.parameters()
+    # Parameters are removed from this mapping as groups claim them, so each
+    # parameter ends up in at most one group.
+    unclaimed = OrderedDict(model.named_parameters())
+    log.debug(f'Default optimizer settings: {optimizer_config}.')
+    groups = []
+    for group_config in group_configs:
+        str_match = group_config.pop('param_str_match')
+        param_names = [param_name for param_name in unclaimed if re.search(str_match, param_name)]
+        group = {'params': [unclaimed.pop(param_name) for param_name in param_names]}
+        group.update(group_config)
+        log.debug(f'Creating optimizer param_group with parameters: {param_names} ' + f'(extracted using str_match={str_match!r}). The param_group optimizer ' + f'setting overrides are: {group_config}.')
+        groups.append(group)
+    return [{'params': unclaimed.values()}, *groups]
+def build_optimizer(model: torch.nn.Module, name: str, optimizer_config: Optional[Dict[str, Any]]=None) -> Optimizer:
+    params = _extract_param_groups(model, optimizer_config)
+    kwargs = optimizer_config
+    if kwargs is None:
+        kwargs = {}
+    if 'params' in kwargs:
+        raise ValueError('The `params` will be automatically extracted from the model and ' + 'optimizer config. Please remove it from the optimizer config kwargs.')
+    kwargs['params'] = params
+    return construct_from_registry(name=name, registry=registry.optimizers, partial_function=True, pre_validation_function=Optimizer, post_validation_function=None, kwargs=kwargs)
+def build_scheduler(name: str, scheduler_config: Optional[Dict[str, Any]]=None) -> ComposerScheduler:
+    return construct_from_registry(name=name, registry=registry.schedulers, partial_function=True, pre_validation_function=ComposerScheduler, post_validation_function=None, kwargs=scheduler_config)
+def build_tokenizer(tokenizer_name: str, tokenizer_kwargs: Dict[str, Any]) -> PreTrainedTokenizerBase:
+    """Build a tokenizer, coordinating the setup across local ranks.
+    Args:
+        tokenizer_name (str): names starting with ``'tiktoken'`` select
+            ``TiktokenTokenizerWrapper``; any other name is passed to
+            ``AutoTokenizer.from_pretrained``.
+        tokenizer_kwargs (Dict[str, Any]): keyword arguments forwarded to the
+            tokenizer constructor.
+    Returns:
+        PreTrainedTokenizerBase: the constructed tokenizer.
+    Raises:
+        ValueError: if the tokenizer has no ``eos_token``.
+    """
+    os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
+    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+    # One signal file name per node; local rank 0 writes it further down.
+    signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'
+    if dist.is_available() and dist.is_initialized() and (dist.get_world_size() > 1):
+        # Empty body on purpose. NOTE(review): presumably this holds the other
+        # local ranks until local rank 0 has written the signal file below --
+        # confirm against the `dist` helper.
+        with dist.local_rank_zero_download_and_wait(signal_file_path):
+            pass
+    if tokenizer_name.startswith('tiktoken'):
+        tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, **tokenizer_kwargs)
+        # Use the caller's model_max_length if given, otherwise a very large limit.
+        tokenizer.model_max_length = tokenizer_kwargs.get('model_max_length', int(1e+30))
+    if not hasattr(tokenizer, 'eos_token') or tokenizer.eos_token is None:
+        raise ValueError(f'The tokenizer {tokenizer_name} must have an eos_token.')
+    if dist.is_available() and dist.is_initialized() and (dist.get_world_size() > 1):
+        # Local rank 0 announces completion through the signal file ...
+        if dist.get_local_rank() == 0:
+            with open(signal_file_path, 'wb') as f:
+                f.write(b'local_rank0_completed_tokenizer_setup')
+        # ... every rank synchronizes, then local rank 0 cleans the file up.
+        dist.barrier()
+        if dist.get_local_rank() == 0:
+            os.remove(signal_file_path)
+    return tokenizer
+def build_icl_evaluators(icl_tasks: Union[str, ListConfig], tokenizer: PreTrainedTokenizerBase, default_max_seq_len: int, default_batch_size: int, destination_dir: Optional[str]=None, icl_subset_num_batches: Optional[int]=None) -> Tuple[List[Evaluator], List[str]]:
+    """Build one ``Evaluator`` per ICL task and few-shot setting.
+    Args:
+        icl_tasks (Union[str, ListConfig]): either a path to a config file
+            whose ``icl_tasks`` entry lists the tasks, or the task list itself.
+        tokenizer (PreTrainedTokenizerBase): tokenizer handed to the ICL
+            dataloaders; its ``eos_token_id`` is used for padding when it has
+            no ``pad_token_id``.
+        default_max_seq_len (int): ``max_seq_len`` for tasks that do not set it.
+        default_batch_size (int): ``batch_size`` for tasks that do not set it.
+        destination_dir (Optional[str]): directory for the per-task ``.jsonl``
+            files; defaults to the current working directory.
+        icl_subset_num_batches (Optional[int]): passed as ``subset_num_batches``
+            to evaluators of tasks without categories.
+    Returns:
+        Tuple[List[Evaluator], List[str]]: the evaluators and the matching
+            ``metrics/...`` logger keys.
+    Raises:
+        ValueError: if a task has no ``metric_names`` and an unknown
+            ``icl_task_type``, or sets the unsupported top-level ``num_beams``.
+    """
+    if destination_dir is None:
+        destination_dir = os.getcwd()
+    evaluators = []
+    logger_keys = []
+    icl_tasks_list = None
+    if isinstance(icl_tasks, str):
+        log.info(f'Extracting ICL task config from path: {icl_tasks}')
+        with open(icl_tasks, 'r') as icl_f:
+            icl_task_cfg = om.load(icl_f)
+        icl_tasks_list = icl_task_cfg.icl_tasks
+    else:
+        icl_tasks_list = icl_tasks
+    def _validate_cfg(icl_cfg: DictConfig):
+        # Checks the required keys and fills in defaults for the optional ones (mutates icl_cfg in place).
+        assert 'label' in icl_cfg
+        assert 'dataset_uri' in icl_cfg and icl_cfg.dataset_uri is not None
+        assert 'icl_task_type' in icl_cfg
+        assert 'num_fewshot' in icl_cfg
+        if 'metric_names' not in icl_cfg:
+            # Default metric per task type.
+            if icl_cfg.icl_task_type == 'language_modeling':
+                icl_cfg.metric_names = ['InContextLearningLMAccuracy']
+            elif icl_cfg.icl_task_type == 'multiple_choice':
+                icl_cfg.metric_names = ['InContextLearningMultipleChoiceAccuracy']
+            elif icl_cfg.icl_task_type == 'schema':
+                icl_cfg.metric_names = ['InContextLearningMultipleChoiceAccuracy']
+            elif icl_cfg.icl_task_type == 'question_answering':
+                icl_cfg.metric_names = ['InContextLearningQAAccuracy']
+            elif icl_cfg.icl_task_type == 'code_evaluation':
+                icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
+            else:
+                raise ValueError(f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.')
+        if 'prompt_string' not in icl_cfg:
+            icl_cfg.prompt_string = ''
+        if 'example_delimiter' not in icl_cfg:
+            icl_cfg.example_delimiter = '\n'
+        if 'continuation_delimiter' not in icl_cfg:
+            icl_cfg.continuation_delimiter = ' '
+        if 'max_seq_len' not in icl_cfg:
+            icl_cfg.max_seq_len = default_max_seq_len
+        if 'batch_size' not in icl_cfg:
+            icl_cfg.batch_size = default_batch_size
+        if 'pass_at_k' not in icl_cfg:
+            icl_cfg.pass_at_k = 1
+        if 'fewshot_random_seed' not in icl_cfg:
+            icl_cfg.fewshot_random_seed = 1234
+        if 'generations_per_sample' not in icl_cfg:
+            icl_cfg.generations_per_sample = 1
+        if 'num_beams' in icl_cfg:
+            raise ValueError('num_beams is no longer supported as a top level icl_task parameter.' + 'Please use generation_kwargs.num_beams instead.')
+    for icl_cfg in icl_tasks_list:
+        assert isinstance(icl_cfg, DictConfig)
+        _validate_cfg(icl_cfg)
+        # One evaluator (or one per category) for every few-shot count of the task.
+        for num_fewshot in list(icl_cfg.num_fewshot):
+            if tokenizer.pad_token_id is None:
+                pad_tok_id = tokenizer.eos_token_id
+            else:
+                pad_tok_id = tokenizer.pad_token_id
+            label = f'{icl_cfg.label}/{num_fewshot}-shot'
+            metric_names = list(icl_cfg.metric_names)
+            destination_path = f'{destination_dir}/{icl_cfg.label}-{num_fewshot}.jsonl'
+            # Local rank 0 removes a stale file; all ranks then synchronize before the dataloader is built.
+            if dist.get_local_rank() == 0 and os.path.exists(destination_path):
+                os.remove(destination_path)
+            dist.barrier()
+            hf_parsing_map = icl_cfg.get('hf_parsing_map', {})
+            hf_loading_vars = icl_cfg.get('hf_loading_vars', {})
+            early_stopping_criteria = icl_cfg.get('early_stopping_criteria', None)
+            if isinstance(early_stopping_criteria, ListConfig):
+                early_stopping_criteria = om.to_container(early_stopping_criteria)
+            assert early_stopping_criteria is None or isinstance(early_stopping_criteria, list)
+            dataloaders = get_icl_task_dataloader(icl_cfg.icl_task_type, icl_cfg.dataset_uri, tokenizer, batch_size=icl_cfg.batch_size, max_seq_len=icl_cfg.max_seq_len, pad_tok_id=pad_tok_id, num_fewshot=num_fewshot, prompt_string=icl_cfg.prompt_string, example_delimiter=icl_cfg.example_delimiter, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, continuation_delimiter=icl_cfg.continuation_delimiter, question_prelimiter=icl_cfg.get('question_prelimiter', ''), destination_path=destination_path, fewshot_random_seed=icl_cfg.fewshot_random_seed, pass_at_k=icl_cfg.pass_at_k, generations_per_sample=icl_cfg.generations_per_sample, has_categories=icl_cfg.get('has_categories', False), cot_delimiter=icl_cfg.get('cot_delimiter', ''), generation_kwargs=icl_cfg.get('generation_kwargs', {}), early_stopping_criteria=early_stopping_criteria, do_normalization=icl_cfg.get('do_normalization', True))
+            if hasattr(icl_cfg, 'has_categories') and icl_cfg.has_categories and isinstance(dataloaders, dict):
+                # NOTE(review): subset_num_batches is not passed in this branch -- confirm that is intended.
+                for category in dataloaders.keys():
+                    logger_keys.extend([f'metrics/{label}/{category}/{m}' for m in metric_names])
+                    evaluators.append(Evaluator(label=f'{label}/{category}', dataloader=dataloaders[category], metric_names=metric_names))
+            else:
+                logger_keys.extend([f'metrics/{label}/{m}' for m in metric_names])
+                evaluators.append(Evaluator(label=label, dataloader=dataloaders, metric_names=metric_names, subset_num_batches=icl_subset_num_batches))
+    return (evaluators, logger_keys)

callback_with_config.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import abc
+from typing import Any
+class CallbackWithConfig(Callback, abc.ABC):
+    """A callback that takes a config dictionary as an argument, in addition to.
+    its other kwargs.
+    """
+    def __init__(self, config: dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        del config, args, kwargs
+        pass

callbacks.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from .async_eval_callback import AsyncEval
+from .curriculum_learning_callback import CurriculumLearning
+from .eval_gauntlet_callback import EvalGauntlet
+from .fdiff_callback import FDiffMetrics
+from .hf_checkpointer import HuggingFaceCheckpointer
+from .monolithic_ckpt_callback import MonolithicCheckpointSaver
+from .resumption_callbacks import GlobalLRScaling, LayerFreezing
+from .scheduled_gc_callback import ScheduledGarbageCollector
+from .registry import callbacks, callbacks_with_config
+callbacks.register('lr_monitor', func=LRMonitor)
+callbacks.register('memory_monitor', func=MemoryMonitor)
+callbacks.register('memory_snapshot', func=MemorySnapshot)
+callbacks.register('speed_monitor', func=SpeedMonitor)
+callbacks.register('runtime_estimator', func=RuntimeEstimator)
+callbacks.register('optimizer_monitor', func=OptimizerMonitor)
+callbacks.register('generate_callback', func=Generate)
+callbacks.register('early_stopper', func=EarlyStopper)
+callbacks.register('fdiff_metrics', func=FDiffMetrics)
+callbacks.register('hf_checkpointer', func=HuggingFaceCheckpointer)
+callbacks.register('global_lr_scaling', func=GlobalLRScaling)
+callbacks.register('layer_freezing', func=LayerFreezing)
+callbacks.register('mono_checkpoint_saver', func=MonolithicCheckpointSaver)
+callbacks.register('scheduled_gc', func=ScheduledGarbageCollector)
+callbacks.register('oom_observer', func=OOMObserver)
+callbacks_with_config.register('async_eval', func=AsyncEval)
+callbacks_with_config.register('curriculum_learning', func=CurriculumLearning)

checkpoint_conversion_helpers.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""Helper methods for the checkpoint conversion scripts.
+The checkpoint conversion scripts are located in the
+llmfoundry/scripts/inference/benchmarking/ folder. Users should run those
+scripts directly to convert between checkpoints; this file contains only common
+utility functions that are present in multiple scripts.
+"""
+import json
+import logging
+import os
+import random
+import string
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple, Union
+import numpy as np
+import sentencepiece as spm
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+log = logging.getLogger(__name__)
+def _get_weight_data_type(data_type: str):
+    if data_type == 'fp32':
+        return np.float32
+    elif data_type == 'fp16':
+        return np.float16
+    else:
+        raise RuntimeError('Unsupported data type: {data_type} for conversion.')
+def get_hf_tokenizer_from_composer_state_dict(state_dict: Dict[str, Any], trust_remote_code: bool, tokenizer_save_dir: Optional[str]=None) -> Optional[PreTrainedTokenizer]:
+    if 'state' not in state_dict:
+        raise RuntimeError('Unexpected composer state dictionary. Did you pass in a full composer checkpoint?')
+    if 'integrations' not in state_dict['state'] or 'huggingface' not in state_dict['state']['integrations']:
+        raise RuntimeError('Did not find HuggingFace related state (e.g., tokenizer) in the provided composer checkpoint!')
+    hf_tokenizer_state = state_dict['state']['integrations']['huggingface']['tokenizer']
+    hf_tokenizer = None
+    if hf_tokenizer_state != {}:
+        if tokenizer_save_dir is None:
+            unique_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
+            tokenizer_save_dir = os.path.join(os.getcwd(), f'tokenizer-save-dir-{unique_suffix}')
+        os.makedirs(tokenizer_save_dir, exist_ok=True)
+        for filename, saved_content in hf_tokenizer_state.items():
+            if filename.endswith(saved_content['file_extension']):
+                tokenizer_file_name = filename
+            else:
+                tokenizer_file_name = filename + saved_content['file_extension']
+            tokenizer_file_path = Path(tokenizer_save_dir) / tokenizer_file_name
+            if saved_content['file_extension'] == '.json':
+                with open(tokenizer_file_path, 'w') as _tmp_file:
+                    json.dump(saved_content['content'], _tmp_file)
+            elif saved_content['file_extension'] == '.txt':
+                with open(tokenizer_file_path, 'w') as _tmp_file:
+                    for line in saved_content['content']:
+                        _tmp_file.write(line)
+                        _tmp_file.write('\n')
+            elif saved_content['file_extension'] == '.py':
+                with open(tokenizer_file_path, 'w') as _tmp_file:
+                    _tmp_file.write(saved_content['content'])
+            elif saved_content['file_extension'] == '.model':
+                s = spm.SentencePieceProcessor()
+                s.load_from_serialized_proto(saved_content['content'])
+                with open(tokenizer_file_path, 'wb') as _tmp_file:
+                    _tmp_file.write(s.serialized_model_proto())
+        hf_tokenizer = load_tokenizer(tokenizer_save_dir, trust_remote_code=trust_remote_code)
+        hf_tokenizer.name_or_path = ''
+        hf_tokenizer.init_kwargs['name_or_path'] = ''
+    return hf_tokenizer
+def load_tokenizer(tokenizer_save_dir: str, trust_remote_code: bool) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    try:
+        return AutoTokenizer.from_pretrained(tokenizer_save_dir, trust_remote_code=trust_remote_code)
+    except ValueError as e:
+        raise ValueError(f'Got error while loading tokenizer with trust_remote_code={trust_remote_code}: {e}. ' + 'If accessing a tokenizer defined outside of the transformers module,' + ' please use --trust_remote_code.')
+def _write_zero_bias(weight_name: str, weight_file_path: str, bias_shape: Union[Tuple[int, ...], int], np_data_type: np.dtype) -> None:
+    """Write zeros for bias when converting MPT to FasterTransformer weights.
+    MPT model might not have bias while FT expects bias.
+    Args:
+        weight_name (str): Name of the weight tensor.
+        weight_file_path (str): Output path for storing the weight (NOT zero bias).
+        bias_shape (Union[Tuple[int, ...], int]): Shape of the bias array.
+        np_data_type (np.dtype): The data type for bias.
+    """
+    if 'weight' not in weight_file_path:
+        raise RuntimeError(f'Cannot write zero bias for {weight_name}. Input is not a weight tensor')
+    log.debug(f'zero bias for weight: {weight_name}')
+    bias_file_path = weight_file_path.replace('.weight', '.bias')
+    bias = np.zeros(bias_shape, dtype=np_data_type)
+    bias.tofile(bias_file_path)
+def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int, tensor_name: str, config: Dict[str, Any], data: np.ndarray, np_weight_data_type: np.dtype) -> None:
+    """Convert each MPT weight to a FasterTransformer compatible format.
+    Args:
+        save_dir (str): Path of the directory to save the weight in FT format. The directory must already exist.
+        infer_gpu_num (int): The number of gpus you are planning to use for inference.
+        tensor_name (str): Name of the weight tensor. Used in naming the output file.
+        config (Dict[str, Any]): Configuration for the model. This is used in getting model specific parameters.
+        data (np.ndarray): Tensor data in np.ndarray format.
+    Returns:
+        None: Writes to a file in `save_dir`. File name is based on the `tensor_name`
+    """
+    if tensor_name.find('input_layernorm.weight') != -1 or tensor_name.find('input_layernorm.bias') != -1 or tensor_name.find('attention.dense.bias') != -1 or (tensor_name.find('post_attention_layernorm.weight') != -1) or (tensor_name.find('post_attention_layernorm.bias') != -1) or (tensor_name.find('mlp.dense_4h_to_h.bias') != -1) or (tensor_name.find('final_layernorm.weight') != -1) or (tensor_name.find('final_layernorm.bias') != -1):
+        save_path = os.path.join(save_dir, f'model.{tensor_name}.bin')
+        data.tofile(save_path)
+        if 'weight' in tensor_name and config['no_bias']:
+            _write_zero_bias(tensor_name, save_path, data.shape[-1], np_weight_data_type)
+    elif tensor_name.find('attention.dense.weight') != -1:
+        assert data.shape == (config['d_model'], config['d_model']), f'unexpected dim for {tensor_name}'
+        data = data.T
+        split_vals = np.split(data, infer_gpu_num, axis=0)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+        if config['no_bias']:
+            fake_weight_path = os.path.join(save_dir, f'model.{tensor_name}.bin')
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1], np_weight_data_type)
+    elif tensor_name.find('mlp.dense_4h_to_h.weight') != -1:
+        assert data.shape == (config['d_model'], config['expansion_ratio'] * config['d_model']), f'unexpected dim for {tensor_name}'
+        data = data.T
+        split_vals = np.split(data, infer_gpu_num, axis=0)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+        if config['no_bias']:
+            fake_weight_path = os.path.join(save_dir, f'model.{tensor_name}.bin')
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1], np_weight_data_type)
+    elif tensor_name.find('mlp.dense_h_to_4h.weight') != -1:
+        assert data.shape == (config['expansion_ratio'] * config['d_model'], config['d_model']), f'unexpected dim for {tensor_name}'
+        data = data.T
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+            if config['no_bias']:
+                _write_zero_bias(tensor_name, save_path, split_vals[j].shape[-1], np_weight_data_type)
+    elif tensor_name.find('mlp.dense_h_to_4h.bias') != -1:
+        assert data.shape == (config['expansion_ratio'] * config['d_model'],), f'unexpected dim for {tensor_name}'
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir + f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+    elif tensor_name.find('attention.query_key_value.bias') != -1:
+        assert data.shape == (3 * config['d_model'],), f'unexpected dim for {tensor_name}'
+        data = data.reshape(3, config['d_model'])
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+    elif tensor_name.find('attention.query_key_value.weight') != -1:
+        assert data.shape == (3 * config['d_model'], config['d_model']), f'unexpected dim for {tensor_name}'
+        data = data.T
+        data = data.reshape(config['d_model'], 3, config['d_model'])
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+            if config['no_bias']:
+                _write_zero_bias(tensor_name, save_path, (3, split_vals[j].shape[-1]), np_weight_data_type)
+    else:
+        raise RuntimeError(f'Tensor with name {tensor_name} is not handled')
+def convert_and_save_ft_weights(named_params: dict, config: dict, infer_gpu_num: int=1, weight_data_type: str='fp32', save_dir: str='') -> None:
+    """Convert a Composer MPT checkpoint to a FasterTransformer format.
+    Args:
+        named_params (Dict[str, Parameter]): A dictionary containing the Composer MPT model's parameter names and data.
+        config (Dict[str, Any]): Configuration for the model. This is used in getting model specific parameters.
+        infer_gpu_num (int): The number of gpus you are planning to use for inference.
+        weight_data_type (str): The dtype of the converted FasterTransformer model.
+        save_dir (str): Path of the directory to save the weight in FT format. The directory must already exist.
+    Returns:
+        None: Writes to the `save_dir` folder. File names within this folder are based on the model parameter names.
+    """
+    np_weight_data_type = _get_weight_data_type(weight_data_type)
+    param_remapping = {'norm_1.bias': 'input_layernorm.bias', 'norm_1.weight': 'input_layernorm.weight', 'attn.Wqkv.bias': 'attention.query_key_value.bias', 'attn.Wqkv.weight': 'attention.query_key_value.weight', 'attn.out_proj.bias': 'attention.dense.bias', 'attn.out_proj.weight': 'attention.dense.weight', 'norm_2.bias': 'post_attention_layernorm.bias', 'norm_2.weight': 'post_attention_layernorm.weight', 'ffn.up_proj.bias': 'mlp.dense_h_to_4h.bias', 'ffn.up_proj.weight': 'mlp.dense_h_to_4h.weight', 'ffn.down_proj.bias': 'mlp.dense_4h_to_h.bias', 'ffn.down_proj.weight': 'mlp.dense_4h_to_h.weight'}
+    for name, param in named_params.items():
+        log.debug(f'Working on parameter {name} ...')
+        data = param.detach().cpu().numpy().astype(np_weight_data_type)
+        if name.find('weight') == -1 and name.find('bias') == -1:
+            log.debug(f'found a parameter name that is not handled: {name}')
+            continue
+        if name == 'transformer.wpe.weight':
+            assert data.shape == (config['max_seq_len'], config['d_model']), f'unexpected dim for {name}'
+            data.tofile(os.path.join(save_dir, 'model.wpe.bin'))
+        elif name == 'transformer.wte.weight':
+            assert data.shape == (config['vocab_size'], config['d_model']), f'unexpected dim for {name}'
+            data.tofile(os.path.join(save_dir, 'model.wte.bin'))
+        elif name == 'transformer.norm_f.bias':
+            assert data.shape == (config['d_model'],), f'unexpected dim for {name}'
+            data.tofile(os.path.join(save_dir, 'model.final_layernorm.bias.bin'))
+        elif name == 'transformer.norm_f.weight':
+            assert data.shape == (config['d_model'],), f'unexpected dim for {name}'
+            save_path = os.path.join(save_dir, 'model.final_layernorm.weight.bin')
+            data.tofile(save_path)
+            if config['no_bias']:
+                _write_zero_bias(name, save_path, data.shape[-1], np_weight_data_type)
+        elif name == 'transformer.lm_head.weight':
+            data.tofile(os.path.join(save_dir, 'model.lm_head.weight.bin'))
+        else:
+            for mpt_pattern, ft_pattern in param_remapping.items():
+                if name.find(mpt_pattern) != -1:
+                    new_name = name.replace('transformer.blocks.', 'layers.').replace(mpt_pattern, ft_pattern)
+                    _convert_weight_to_ft_each(save_dir, infer_gpu_num, new_name, config, data, np_weight_data_type)

collator.py ADDED Viewed

	@@ -0,0 +1,256 @@

+import logging
+import warnings
+from typing import Any, Dict, List, Optional, Union
+import torch
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+log = logging.getLogger(__name__)
+# Label value written at positions that are not training targets.
+# NOTE(review): presumably matches the ignore index of the loss used downstream -- confirm.
+_HF_IGNORE_INDEX = -100
+# A tokenized example: maps a key (the collator reads 'turns') to a list of
+# turns, each turn being a dict of token-id lists ('input_ids' and 'labels').
+TokenizedExample = Dict[str, List[Dict[str, List[int]]]]
+def ensure_list(x: Union[List, torch.Tensor]) -> List:
+    if isinstance(x, torch.Tensor):
+        x = list(x.flatten())
+    assert isinstance(x, list)
+    return x
+def validate_target_settings(target_prompts: str, target_responses: str, decoder_only_format: bool):
+    """Raises an error if target settings are invalid."""
+    if not decoder_only_format and (target_prompts != 'none' or target_responses != 'last'):
+        raise ValueError(f'When using encoder_decoder format, you must use target_prompts="none" and target_responses="last".')
+    if target_responses not in {'all', 'last'}:
+        raise ValueError(f'target_responses must be either "last" or "all" but target_responses={target_responses!r}')
+    if target_prompts.startswith('length>='):
+        cutoff = target_prompts[8:]
+        if not cutoff.isdigit():
+            raise ValueError(f'target_prompts starts with "length>=" but the rest of the string is not digits (target_prompts={target_prompts!r}). ' + 'To use this configuration option, set target_prompts "length>=XX" where "XX" is a positive integer indicating ' + 'the length cutoff. Prompts of at least XX tokens in length will be treated as targets.')
+        cutoff = int(cutoff)
+        if cutoff <= 0:
+            raise ValueError(f'You are trying to set the target_prompts length cutoff to a negative number cutoff={cutoff!r}. This is not allowed.')
+    elif target_prompts not in {'all', 'none'}:
+        raise ValueError(f'target_prompts must either be "all", "none" or "length>=XX" where "XX" is a positive integer, but target_prompts={target_prompts!r}')
+def _sequence_to_labels_all(sequence: list[int], is_last_turn: bool, cutoff: Optional[int]=None) -> list[int]:
+    del is_last_turn, cutoff
+    return sequence
+def _sequence_to_labels_none(sequence: list[int], is_last_turn: bool, cutoff: Optional[int]=None) -> list[int]:
+    del is_last_turn, cutoff
+    return [_HF_IGNORE_INDEX] * len(sequence)
+def _sequence_to_labels_last(sequence: list[int], is_last_turn: bool, cutoff: Optional[int]=None) -> list[int]:
+    del cutoff
+    if is_last_turn:
+        return sequence
+    else:
+        return [_HF_IGNORE_INDEX] * len(sequence)
+def _sequence_to_labels_cutoff(sequence: list[int], is_last_turn: bool, cutoff: Optional[int]=None) -> list[int]:
+    del is_last_turn
+    if cutoff is None:
+        raise ValueError('input ``cutoff`` must be provided')
+    if len(sequence) >= cutoff:
+        return sequence
+    else:
+        return [_HF_IGNORE_INDEX] * len(sequence)
+# Maps a target policy name to the function that turns a token sequence into its labels.
+# 'length' is the entry used for target_prompts values of the form "length>=XX".
+_TARGET_POLICY_LOOKUP = {'all': _sequence_to_labels_all, 'none': _sequence_to_labels_none, 'last': _sequence_to_labels_last, 'length': _sequence_to_labels_cutoff}
+def stitch_turns_decoder_only(example_turns: list[dict[str, list[int]]], target_prompts: str, target_responses: str, eos_token_id: Optional[int]=None, validate: bool=False) -> tuple[list[int], list[int]]:
+    target_prompts = target_prompts.lower()
+    target_responses = target_responses.lower()
+    if validate:
+        validate_target_settings(target_prompts, target_responses, decoder_only_format=True)
+    if target_prompts.startswith('length'):
+        prompt_cutoff = int(target_prompts.split('>=')[-1])
+        prompt_to_target = _TARGET_POLICY_LOOKUP['length']
+    else:
+        prompt_cutoff = None
+        prompt_to_target = _TARGET_POLICY_LOOKUP[target_prompts]
+    response_to_target = _TARGET_POLICY_LOOKUP[target_responses]
+    input_ids = []
+    labels = []
+    for idx, turn in enumerate(example_turns):
+        is_last_turn = idx + 1 == len(example_turns)
+        context = ensure_list(turn['input_ids'])
+        target = ensure_list(turn['labels'])
+        if is_last_turn and eos_token_id is not None:
+            if target[-1] != eos_token_id:
+                target = target + [eos_token_id]
+        input_ids += context
+        input_ids += target
+        labels += prompt_to_target(context, is_last_turn, prompt_cutoff)
+        labels += response_to_target(target, is_last_turn)
+    if len(input_ids) != len(labels):
+        raise ValueError(f'input_ids and labels should be the same length, len(input_ids)={len(input_ids)!r}, len(labels)={len(labels)!r}')
+    return (input_ids, labels)
+def stitch_turns_encoder_decoder(example_turns: list[dict[str, list[int]]], eos_token_id: Optional[int]=None) -> tuple[list[int], list[int]]:
+    context = []
+    target = None
+    for idx, turn in enumerate(example_turns):
+        is_last_turn = idx + 1 == len(example_turns)
+        turn_context = ensure_list(turn['input_ids'])
+        turn_target = ensure_list(turn['labels'])
+        context += turn_context
+        if is_last_turn:
+            if eos_token_id is not None and turn_target[-1] != eos_token_id:
+                turn_target = turn_target + [eos_token_id]
+            target = turn_target
+        else:
+            context += turn_target
+    if target is None:
+        raise ValueError('target is still None but should be list[int]')
+    return (context, target)
+class Seq2SeqFinetuningCollator:
+    """A general-purpose collator for sequence-to-sequence training/evaluation.
+    Args:
+        tokenizer: A HuggingFace tokenizer. Must have a pad_token set.
+        max_seq_len (int): The maximum sequence length of the combined
+            context/target sequence (decoder-only format) or of each the
+            context sequence and target sequence (encoder-decoder format).
+        decoder_only_format (bool): Whether to format the batches for a
+            decoder-only model (if True) or an encoder-decoder model (if False).
+        target_responses (str): For multi-turn examples, this controls which
+            responses are treated as training targets (i.e. generate loss).
+            Options are:
+                "last": (Default) Only the final response is used as the training
+                    target; non-terminal responses are only part of the context.
+                "all": All of the responses are used as training targets.
+        target_prompts (str): This controls which prompts are treated as
+            training targets (i.e. generate loss).
+            Options are:
+                "none": (Default) Prompts are never used as training targets.
+                "all": Prompts are always used as training targets.
+                "length>=XX": Prompt sequences are used as training targets when
+                    they have length of at least XX tokens. For instance,
+                    setting "length>=512" instructs the collator to use a prompt
+                    sequence as a training target when it is at least 512 tokens long.
+        allow_pad_trimming (bool, optional): Whether to allow the collator
+            to trim padding, which may result in smaller but inconsistent batch
+            sizes. Default: ``False`` ensures that all sequences are max_seq_len.
+        batch_metadata (dict, optional): A dictionary of metadata which will be added
+            to the batch.
+    """
+    def __init__(self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], max_seq_len: int, decoder_only_format: bool, target_responses: str='last', target_prompts: str='none', allow_pad_trimming: bool=False, batch_metadata: Optional[Dict[str, Any]]=None):
+        self.tokenizer = tokenizer
+        self.max_seq_len = max_seq_len
+        self.decoder_only_format = decoder_only_format
+        self.target_responses = target_responses.lower()
+        self.target_prompts = target_prompts.lower()
+        self.batch_metadata = batch_metadata or {}
+        self._allow_pad_trimming = allow_pad_trimming
+        self._seen_first_batch = False
+        illegal_keys = ['input_ids', 'labels', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask']
+        found_keys = []
+        for illegal_key in illegal_keys:
+            if illegal_key in self.batch_metadata:
+                found_keys.append(illegal_key)
+        if found_keys:
+            raise ValueError(f"The following keys are in batch_metadata but are not allowed: {', '.join(found_keys)}.\n" + f'You cannot use keys that are used directly by the models. The prohibited keys are:\n' + f"{', '.join(illegal_keys)}")
+        if max_seq_len % 8 != 0:
+            log.warning('For performance, a max_seq_len as a multiple of 8 is recommended.')
+        if self.tokenizer.pad_token_id is None:
+            raise ValueError(f'{self.__class__.__name__} requires that the tokenizer has the pad token set, but it is None')
+        validate_target_settings(self.target_prompts, self.target_responses, self.decoder_only_format)
+        if self.target_prompts.startswith('length'):
+            self.prompt_cutoff = int(self.target_prompts.split('>=')[-1])
+            self.prompt_to_target = _TARGET_POLICY_LOOKUP['length']
+        else:
+            self.prompt_cutoff = None
+            self.prompt_to_target = _TARGET_POLICY_LOOKUP[self.target_prompts]
+        self.response_to_target = _TARGET_POLICY_LOOKUP[self.target_responses]
+        self._warned_truncated = False
+        self._warned_context = False
+        self._warned_target = False
+    def __call__(self, examples: List[TokenizedExample]) -> Dict[str, torch.Tensor]:
+        for check_key in ['input_ids', 'labels']:
+            if check_key not in examples[0]['turns'][0]:
+                raise KeyError(f'Examples returned by dataset do not include required key: {check_key}')
+        if self.decoder_only_format:
+            batch = self._process_and_batch_decoder_only(examples)
+        else:
+            batch = self._process_and_batch_encoder_decoder(examples)
+        batch_size = batch['input_ids'].shape[0]
+        batch.update({k: torch.tensor([v] * batch_size) for k, v in self.batch_metadata.items()})
+        return batch
+    def _process_and_batch_decoder_only(self, examples: List[TokenizedExample]) -> Dict[str, torch.Tensor]:
+        """Build a padded decoder-only batch from tokenized examples.
+        Each example's turns are stitched into one ``input_ids``/``labels``
+        pair, truncated to ``max_seq_len`` if needed, and padded to
+        ``max_seq_len``. When pad trimming is enabled, every batch after the
+        first is cut down to a multiple of 8 columns.
+        Raises:
+            ValueError: If truncation leaves no label other than
+                ``_HF_IGNORE_INDEX``.
+        """
+        processed_examples = []
+        for example in examples:
+            input_ids, labels = stitch_turns_decoder_only(example_turns=example['turns'], target_prompts=self.target_prompts, target_responses=self.target_responses, eos_token_id=self.tokenizer.eos_token_id)
+            orig_size = len(input_ids)
+            if orig_size > self.max_seq_len:
+                input_ids = input_ids[:self.max_seq_len]
+                labels = labels[:self.max_seq_len]
+                # A sample whose labels are all ignored after truncation cannot produce loss.
+                if len([l for l in labels if l != _HF_IGNORE_INDEX]) == 0:
+                    raise ValueError(f'Truncating to max_seq_len={self.max_seq_len} has removed all loss-generating tokens. ' + f'Pre-truncation sequence length was {orig_size}. ' + 'This sample should have been filtered out before reaching the collator. If using ' + 'pre-tokenized streaming data, this may have resulted from using different ' + '``target_prompts``, ``target_responses``, or ``max_seq_len`` ' + 'settings when preparing the streaming dataset than what are currently being used.')
+                # Warn about truncation only once per collator instance.
+                if not self._warned_truncated:
+                    warnings.warn(f'Truncating sequence of length={orig_size} to fit max_seq_len={self.max_seq_len}. ' + f'If truncation is a problem, consider increasing max_seq_len.')
+                    self._warned_truncated = True
+            attention_mask = [1] * len(input_ids)
+            n_total = len(input_ids)
+            # Labels are padded here with the ignore index, on the same side the tokenizer pads.
+            i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - n_total)
+            if self.tokenizer.padding_side == 'left':
+                labels = i_pad + labels
+            else:
+                labels = labels + i_pad
+            processed_example = {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}
+            processed_examples.append(processed_example)
+        # presumably pads input_ids / attention_mask up to max_seq_len -- confirm against tokenizer.pad
+        batch = self.tokenizer.pad(processed_examples, padding='max_length', max_length=self.max_seq_len, return_tensors='pt')
+        # sequence_id is attention_mask - 1, i.e. 0 where the mask is 1 and -1 where it is 0.
+        batch['sequence_id'] = batch['attention_mask'] - 1
+        # The untrimmed batch is returned when trimming is disabled, and always for the first batch.
+        if not (self._allow_pad_trimming and self._seen_first_batch):
+            self._seen_first_batch = True
+            return batch
+        self._seen_first_batch = True
+        # Keep the smallest multiple of 8 columns that covers the longest unpadded row.
+        multiple_of = 8
+        n_non_padding = batch['attention_mask'].sum(dim=1).max()
+        keep_tokens = int(multiple_of * torch.ceil(n_non_padding / multiple_of))
+        for k, v in batch.items():
+            # Entries with fewer than two dimensions are left untouched.
+            if len(v.shape) < 2:
+                continue
+            # Keep the columns on the side opposite the padding.
+            if self.tokenizer.padding_side == 'left':
+                batch[k] = v[:, -keep_tokens:].contiguous()
+            else:
+                batch[k] = v[:, :keep_tokens].contiguous()
+        return batch
+    def _process_and_batch_encoder_decoder(self, examples: List[TokenizedExample]) -> Dict[str, torch.Tensor]:
+        """Build a padded encoder-decoder batch from tokenized examples.
+        The context becomes ``input_ids`` and the target becomes ``labels``;
+        ``decoder_input_ids`` and ``decoder_attention_mask`` are derived from
+        the labels. When pad trimming is enabled, every batch after the first
+        has its encoder and decoder tensors trimmed independently to a
+        multiple of 8 columns.
+        """
+        processed_examples = []
+        for example in examples:
+            context, target = stitch_turns_encoder_decoder(example_turns=example['turns'], eos_token_id=self.tokenizer.eos_token_id)
+            if len(target) < self.max_seq_len:
+                # Short targets are right-padded with the ignore index.
+                i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - len(target))
+                target = target + i_pad
+            else:
+                # NOTE(review): a target of exactly max_seq_len also lands here, so it
+                # triggers the warning and has its last token replaced by EOS -- confirm intended.
+                if not self._warned_target:
+                    warnings.warn(f'Truncating TARGET sequence of length={len(target)} ' + f'to max_seq_len={self.max_seq_len}. If truncation is ' + f'a problem, consider increasing max_seq_len.')
+                    self._warned_target = True
+                # Truncate to max_seq_len - 1 tokens and end with EOS.
+                target = target[:self.max_seq_len - 1] + [self.tokenizer.eos_token_id]
+            if len(context) > self.max_seq_len:
+                if not self._warned_context:
+                    warnings.warn(f'Truncating CONTEXT sequence of length={len(context)} ' + f'to max_seq_len={self.max_seq_len}. If truncation is ' + f'a problem, consider increasing max_seq_len.')
+                    self._warned_context = True
+                context = context[:self.max_seq_len - 1] + [self.tokenizer.eos_token_id]
+            processed_example = {'input_ids': context, 'labels': target, 'attention_mask': [1] * len(context)}
+            processed_examples.append(processed_example)
+        # presumably pads input_ids / attention_mask up to max_seq_len -- confirm against tokenizer.pad
+        batch = self.tokenizer.pad(processed_examples, padding='max_length', max_length=self.max_seq_len, return_tensors='pt')
+        # Decoder inputs are the labels shifted right by one, with a pad token in the first column.
+        batch['decoder_input_ids'] = torch.cat([torch.full((len(processed_examples), 1), self.tokenizer.pad_token_id), batch['labels'][:, :-1]], dim=1)
+        # Ignore-index positions are not valid token ids, so they become pad tokens.
+        batch['decoder_input_ids'].masked_fill_(batch['decoder_input_ids'] == _HF_IGNORE_INDEX, self.tokenizer.pad_token_id)
+        batch['decoder_attention_mask'] = torch.not_equal(batch['labels'], _HF_IGNORE_INDEX)
+        # The untrimmed batch is returned when trimming is disabled, and always for the first batch.
+        if not (self._allow_pad_trimming and self._seen_first_batch):
+            self._seen_first_batch = True
+            return batch
+        self._seen_first_batch = True
+        multiple_of = 8
+        # Encoder side: keep the smallest multiple of 8 columns covering the longest context.
+        # NOTE(review): always keeps the leading columns, which looks like it assumes right padding -- confirm.
+        n_non_padding = batch['attention_mask'].sum(dim=1).max()
+        keep_tokens = int(multiple_of * torch.ceil(n_non_padding / multiple_of))
+        for k in ['input_ids', 'attention_mask']:
+            batch[k] = batch[k][:, :keep_tokens].contiguous()
+        # Decoder side: same computation, driven by the decoder attention mask.
+        n_non_padding = batch['decoder_attention_mask'].sum(dim=1).max()
+        keep_tokens = int(multiple_of * torch.ceil(n_non_padding / multiple_of))
+        for k in ['decoder_input_ids', 'decoder_attention_mask', 'labels']:
+            batch[k] = batch[k][:, :keep_tokens].contiguous()
+        return batch

config_utils.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import contextlib
+import logging
+import math
+import warnings
+from typing import Any, Dict, Literal, Mapping, Optional, Tuple, Union
+from .utils import init_empty_weights
+log = logging.getLogger(__name__)
+def pop_config(cfg: DictConfig, key: str, must_exist: bool=True, default_value: Any=None, convert: bool=False) -> Any:
+    """Pop a value from the main config file and return it.
+    If the key does not exist, return the default_value or raise a RuntimeError
+    depending on the must_exist flag. If the convert flag is set to True, then
+    we will convert the value to a python object using OmegaConf.to_container.
+    """
+    value = cfg.pop(key, None)
+    if value is not None and convert:
+        if not isinstance(value, DictConfig) and (not isinstance(value, ListConfig)):
+            raise ValueError(f'The key {key} has a value of type {type(value)} that cannot be                             converted to a dict or list. Please check your yaml.')
+        return om.to_container(value)
+    elif value is not None:
+        return value
+    elif must_exist:
+        raise NameError(f'The {key} parameter is missing and must exist for execution. Please check your yaml.')
+    else:
+        return default_value
+def calculate_batch_size_info(global_batch_size: int, device_microbatch_size: Union[int, Literal['auto']]) -> Tuple[int, Union[int, Literal['auto']], Union[int, Literal['auto']]]:
+    if global_batch_size % dist.get_world_size() != 0:
+        raise ValueError(f'Global batch size {global_batch_size} is not divisible by {dist.get_world_size()} ' + 'as a result, the batch size would be truncated, please adjust `global_batch_size` ' + f'to be divisible by world size, {dist.get_world_size()}.')
+    device_batch_size = global_batch_size // dist.get_world_size()
+    if device_microbatch_size == 'auto':
+        device_grad_accum = 'auto'
+    elif isinstance(device_microbatch_size, int):
+        if device_microbatch_size > device_batch_size:
+            log.warn(f'device_microbatch_size > device_batch_size, ' + f'will be reduced from {device_microbatch_size} -> {device_batch_size}.')
+            device_microbatch_size = device_batch_size
+        device_grad_accum = math.ceil(device_batch_size / device_microbatch_size)
+    else:
+        raise ValueError(f'Not sure how to parse device_microbatch_size={device_microbatch_size!r}')
+    return (device_batch_size, device_microbatch_size, device_grad_accum)
+def update_batch_size_info(cfg: DictConfig) -> DictConfig:
+    device_train_batch_size, device_train_microbatch_size, device_train_grad_accum = calculate_batch_size_info(cfg.global_train_batch_size, cfg.device_train_microbatch_size)
+    cfg.n_gpus = dist.get_world_size()
+    cfg.device_train_batch_size = device_train_batch_size
+    cfg.device_train_microbatch_size = device_train_microbatch_size
+    cfg.device_train_grad_accum = device_train_grad_accum
+    if 'device_eval_batch_size' not in cfg:
+        if cfg.device_train_microbatch_size == 'auto':
+            cfg.device_eval_batch_size = 1
+        else:
+            cfg.device_eval_batch_size = cfg.device_train_microbatch_size
+    return cfg
+def process_init_device(model_cfg: DictConfig, fsdp_config: Optional[Dict]):
+    """Pick the model-initialization context and adjust the FSDP config to match.
+    Both ``model_cfg`` and ``fsdp_config`` may be mutated in place.
+    Args:
+        model_cfg (DictConfig): Model config; ``init_device`` and
+            ``master_weights_dtype`` are read from it.
+        fsdp_config (Optional[Dict]): FSDP config, or None when FSDP is not used.
+    Returns:
+        The context manager to build the model under: ``init_empty_weights()``
+        for ``init_device='meta'``, otherwise a null context.
+    Raises:
+        NotImplementedError: If ``init_device`` is ``'mixed'`` without an FSDP config.
+    """
+    init_context = contextlib.nullcontext()
+    if 'init_device' in model_cfg:
+        # NOTE(review): validation by assert is skipped when Python runs with -O.
+        assert model_cfg.init_device in ['meta', 'cpu', 'mixed']
+        # Without an FSDP config, 'meta' is replaced by 'cpu' in model_cfg.
+        if fsdp_config is None and model_cfg.init_device == 'meta':
+            warnings.warn("Using `cfg.model.init_device='meta'` is only valid when using FSDP! " + "Reverting to `cfg.model.init_device='cpu'`.")
+            model_cfg.init_device = 'cpu'
+        if model_cfg.init_device == 'meta':
+            init_context = init_empty_weights()
+        if model_cfg.init_device == 'mixed':
+            if fsdp_config is None:
+                raise NotImplementedError('Using init_device `mixed` is only supported with FSDP. ' + 'Please add a FSDP config.')
+            # sync_module_states is forced on; the other two keys only get defaults.
+            if not fsdp_config.get('sync_module_states', False):
+                warnings.warn('Setting `sync_module_states = True` for FSDP. This is required when using mixed initialization.')
+                fsdp_config['sync_module_states'] = True
+            fsdp_config.setdefault('use_orig_params', False)
+            fsdp_config.setdefault('load_monolith_rank0_only', True)
+    master_dtype = model_cfg.get('master_weights_dtype')
+    small_dtypes = ('bf16', 'fp16', 'float16', 'bfloat16', 'amp_fp16', 'amp_bf16')
+    # With low-precision master weights under FSDP, mixed_precision is replaced:
+    # param_dtype becomes None, and any existing reduce/buffer dtypes are carried over.
+    if fsdp_config and master_dtype in small_dtypes:
+        reduce_dtype = None
+        buffer_dtype = None
+        mixed_precision = fsdp_config.get('mixed_precision')
+        if isinstance(mixed_precision, Mapping):
+            reduce_dtype = mixed_precision.get('reduce_dtype')
+            buffer_dtype = mixed_precision.get('buffer_dtype')
+        fsdp_config['mixed_precision'] = {'param_dtype': None, 'reduce_dtype': reduce_dtype, 'buffer_dtype': buffer_dtype, 'keep_low_precision_grads': True}
+    return init_context
+def log_config(cfg: DictConfig) -> None:
+    """Logs the current config and updates the wandb and mlflow configs.
+    This function can be called multiple times to update the wandb and MLflow
+    config with different variables.
+    """
+    print(om.to_yaml(cfg))
+    if 'wandb' in cfg.get('loggers', {}):
+        try:
+            import wandb
+        except ImportError as e:
+            raise e
+        if wandb.run:
+            wandb.config.update(om.to_container(cfg, resolve=True))
+    if 'mlflow' in cfg.get('loggers', {}):
+        try:
+            import mlflow
+        except ImportError as e:
+            raise e
+        if mlflow.active_run():
+            mlflow.log_params(params=om.to_container(cfg, resolve=True))

configuration_mpt.py CHANGED Viewed

@@ -2,12 +2,11 @@
 import warnings
 from typing import Any, Dict, Optional, Union
 from transformers import PretrainedConfig
-from .attention import check_alibi_support, is_flash_v1_installed, is_flash_v2_installed
 from .blocks import attn_config_defaults
 from .fc import FC_CLASS_REGISTRY
 from .norm import LPLayerNorm
 from .ffn import FFN_CLASS_REGISTRY
-from .warnings import VersionedDeprecationWarning
 ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
 init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
@@ -30,16 +29,13 @@ class MPTConfig(PretrainedConfig):
             attn_config (Dict): A dictionary used to configure the model's attention module:
                 attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
                 attn_pdrop (float): The dropout probability for the attention layers.
-                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                 qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                 qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer.
                 clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                     this value.
                 softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                     use the default scale of ``1/sqrt(d_keys)``.
-                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
-                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
-                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                 attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                     When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                     which sub-sequence each token belongs to.
@@ -116,7 +112,7 @@ class MPTConfig(PretrainedConfig):
         self._validate_config()
     def _set_config_defaults(self, config: Dict[str, Any], config_defaults: Dict[str, Any]) -> Dict[str, Any]:
-        for (k, v) in config_defaults.items():
             if k not in config:
                 config[k] = v
             elif isinstance(v, dict):
@@ -131,18 +127,12 @@ class MPTConfig(PretrainedConfig):
             raise ValueError('d_model must be divisible by n_heads')
         if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
             raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
-        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
             raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
-        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
-            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
-        if self.attn_config['attn_impl'] == 'flash' and is_flash_v1_installed():
-            warnings.warn(VersionedDeprecationWarning('Support for Flash Attention v1 is deprecated. Please upgrade to Flash Attention v2.4.2. To install Flash Attention v2.4.2, please run `pip install -e ".[gpu-flash2]"` from the root directory of the llm-foundry repository.', remove_version='0.6.0'))
-        if self.attn_config['attn_impl'] == 'triton' and (not self.attn_config['prefix_lm']):
-            warnings.warn(UserWarning('If not using a Prefix Language Model, we recommend setting "attn_impl" to "flash" instead of "triton".'))
         if self.attn_config['alibi'] and (not check_alibi_support(self.attn_config['attn_impl'])):
-            raise NotImplementedError('alibi only implemented with torch, triton, and flash (v2.4.2 or higher) attention.')
-        if self.attn_config['attn_uses_sequence_id'] and (not (self.attn_config['attn_impl'] in ['torch', 'triton'] or (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.1.2')))):
-            raise NotImplementedError('attn_uses_sequence_id only implemented with torch, triton, and flash (v2.1.2 or higher) attention.')
         if self.attn_config['rope'] and self.attn_config['rope_impl'] not in ['dail', 'hf']:
             raise ValueError('If rope is being used then rope_impl should be either "dail", or "hf".')
         if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'hf' and (self.attn_config['rope_hf_config']['type'] not in ['no_scaling', 'linear', 'dynamic']):

 import warnings
 from typing import Any, Dict, Optional, Union
 from transformers import PretrainedConfig
+from .attention import check_alibi_support, is_flash_v2_installed
 from .blocks import attn_config_defaults
 from .fc import FC_CLASS_REGISTRY
 from .norm import LPLayerNorm
 from .ffn import FFN_CLASS_REGISTRY
 ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
 init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
             attn_config (Dict): A dictionary used to configure the model's attention module:
                 attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
                 attn_pdrop (float): The dropout probability for the attention layers.
+                attn_impl (str): The attention implementation to use. One of 'torch' or 'flash'.
                 qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                 qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer.
                 clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                     this value.
                 softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                     use the default scale of ``1/sqrt(d_keys)``.
                 attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                     When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                     which sub-sequence each token belongs to.
         self._validate_config()
     def _set_config_defaults(self, config: Dict[str, Any], config_defaults: Dict[str, Any]) -> Dict[str, Any]:
+        for k, v in config_defaults.items():
             if k not in config:
                 config[k] = v
             elif isinstance(v, dict):
             raise ValueError('d_model must be divisible by n_heads')
         if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
             raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
+        if self.attn_config['attn_impl'] not in ['torch', 'flash']:
             raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
         if self.attn_config['alibi'] and (not check_alibi_support(self.attn_config['attn_impl'])):
+            raise NotImplementedError('alibi only implemented with torch and flash (v2.4.2 or higher) attention.')
+        if self.attn_config['attn_uses_sequence_id'] and (not (self.attn_config['attn_impl'] == 'torch' or (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.1.2')))):
+            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and flash (v2.1.2 or higher) attention.')
         if self.attn_config['rope'] and self.attn_config['rope_impl'] not in ['dail', 'hf']:
             raise ValueError('If rope is being used then rope_impl should be either "dail", or "hf".')
         if self.attn_config['rope'] and self.attn_config['rope_impl'] == 'hf' and (self.attn_config['rope_hf_config']['type'] not in ['no_scaling', 'linear', 'dynamic']):

curriculum_learning_callback.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""Enable curriculum learning by resuming with a different dataset.
+This callback is currently experimental. The API may change without warning in
+the future.
+"""
+import logging
+from typing import Any, Dict
+from streaming import StreamingDataset
+from torch.utils.data import DataLoader
+from .interfaces import CallbackWithConfig
+from .warnings import experimental_class
+log = logging.getLogger(__name__)
+@experimental_class('CurriculumLearning callback')
+class CurriculumLearning(CallbackWithConfig):
+    """Starts an epoch with a different dataset when resuming from a checkpoint.
+    Args:
+        dataset_index (int): The index of the dataset currently being used.
+        current_dataset_config (Dict): The configuration of the dataset currently
+            being used.
+    """
+    def __init__(self, dataset_index: int, train_config: Dict):
+        self.dataset_index = dataset_index
+        self.saved_dataset_index = 0
+        self.all_dataset_configs = []
+        self.current_dataset_state = {}
+        self.current_dataset_config = train_config['dataloader']
+    def before_load(self, state: State, logger: Logger):
+        del logger
+        train_loader = state.train_dataloader
+        if not isinstance(train_loader, DataLoader):
+            raise ValueError(f'CurriculumLearning callback can only be used with a train ', f'dataloader of type DataLoader, but got {type(train_loader)}.')
+        dataset = train_loader.dataset
+        if not isinstance(dataset, StreamingDataset):
+            raise ValueError(f'CurriculumLearning callback only supports StreamingDataset ', f'because it requires loading and saving dataset state. ', f'Instead, got a dataset of type {type(dataset)}')
+        assert isinstance(dataset, StreamingDataset)
+        self.current_dataset_state = dataset.state_dict(num_samples=0, from_beginning=False)
+    def after_load(self, state: State, logger: Logger):
+        del logger
+        train_loader = state._train_dataloader
+        assert isinstance(train_loader, DataLoader), 'CurriculumLearning callback requires a DataLoader.'
+        dataset = train_loader.dataset
+        assert isinstance(dataset, StreamingDataset), 'CurriculumLearning callback requires a StreamingDataset.'
+        if self.saved_dataset_index < self.dataset_index:
+            if self.current_dataset_state['epoch'] < 0:
+                self.current_dataset_state['epoch'] = 0
+            dataset.load_state_dict(self.current_dataset_state)
+            state.timestamp = state.timestamp.to_next_epoch()
+            self.all_dataset_configs.append(self.current_dataset_config)
+        elif self.dataset_index == 0 and len(self.all_dataset_configs) == 0:
+            self.all_dataset_configs.append(self.current_dataset_config)
+    def state_dict(self):
+        return {'dataset_index': self.dataset_index, 'all_dataset_configs': self.all_dataset_configs}
+    def load_state_dict(self, state: Dict[str, Any]):
+        self.saved_dataset_index = state.get('dataset_index', 0)
+        self.all_dataset_configs = state.get('all_dataset_configs', [])

data.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""Datasets for converting to MDS Shards."""
+import os
+import warnings
+from typing import Dict, Iterable, Union
+import datasets as hf_datasets
+import numpy as np
+from torch.utils.data import IterableDataset
+from transformers import PreTrainedTokenizerBase
+class NoConcatDataset(IterableDataset):
+    """An IterableDataset that returns text samples for MDSWriter.
+    Returns dicts of {'text': bytes}
+    """
+    def __init__(self, hf_dataset: Union[hf_datasets.IterableDataset, hf_datasets.Dataset]):
+        self.hf_dataset = hf_dataset
+    def __iter__(self) -> Iterable[Dict[str, bytes]]:
+        for sample in self.hf_dataset:
+            yield {'text': sample['text'].encode('utf-8')}
+class ConcatTokensDataset(IterableDataset):
+    """An IterableDataset that returns token samples for MDSWriter.
+    Returns dicts of {'tokens': bytes}
+    To use data created by this class and written to MDS format:
+    ```python
+        import torch
+        from streaming.base import StreamingDataset
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained('your/tokenizer')
+        ds = StreamingDataset(local='mds-data-folder', split='val')
+        # note, you need to copy the numpy array because the original is non-writeable
+        # and torch does not support non-writeable tensors, so you get a scary warning and
+        # if you do try to write to the tensor you get undefined behavior
+        tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int64).copy())
+        print(tokenizer.decode(tokens))
+    ```
+    """
+    def __init__(self, hf_dataset: Union[hf_datasets.IterableDataset, hf_datasets.Dataset], tokenizer: PreTrainedTokenizerBase, max_length: int, bos_text: str, eos_text: str, no_wrap: bool):
+        self.hf_dataset = hf_dataset
+        self.tokenizer = tokenizer
+        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+        self.max_length = max_length
+        self.bos_text = bos_text
+        self.eos_text = eos_text
+        self.should_wrap = not no_wrap
+        self.bos_tokens = self.tokenizer(self.bos_text, truncation=False, padding=False, add_special_tokens=False)['input_ids']
+        if len(self.bos_tokens) > 1:
+            warnings.warn(f'You specified --concat_tokens with --bos_text, but your BOS text is not tokenizing to one token                , instead we got {self.bos_tokens}. Quit if this was in error.')
+        self.eos_tokens = self.tokenizer(self.eos_text, truncation=False, padding=False, add_special_tokens=False)['input_ids']
+        if len(self.eos_tokens) > 1:
+            warnings.warn(f'You specified --concat_tokens with --eos_text, but your EOS text is not tokenizing to one token                , instead we got {self.eos_tokens}. Quit if this was in error.')
+        eos_text_provided = self.eos_text != ''
+        bos_text_provided = self.bos_text != ''
+        test_text = self.tokenizer('')
+        if len(test_text['input_ids']) > 0 and (eos_text_provided or bos_text_provided):
+            message = 'both eos and bos' if eos_text_provided and bos_text_provided else 'eos_text' if eos_text_provided else 'bos_text'
+            warnings.warn(f'The provided tokenizer adds special tokens, but you also specified {message}. This may result ' + 'in duplicated special tokens. Please be sure this is what you intend.')
+    def __iter__(self) -> Iterable[Dict[str, bytes]]:
+        buffer = []
+        for sample in self.hf_dataset:
+            encoded = self.tokenizer(sample['text'], truncation=False, padding=False)
+            iids = encoded['input_ids']
+            buffer = buffer + self.bos_tokens + iids + self.eos_tokens
+            while len(buffer) >= self.max_length:
+                concat_sample = buffer[:self.max_length]
+                buffer = buffer[self.max_length:] if self.should_wrap else []
+                yield {'tokens': np.asarray(concat_sample).tobytes()}

data_prep_utils.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import json
+import os
+from glob import glob
+from typing import List, Optional
+def with_id(basename: str, shard_id: int) -> str:
+    """Get a new basename with the given shard_id.
+    From https://github.com/mosaicml/streaming/blob/main/examples/multiprocess_dataset_conversion.ipynb.
+    Args:
+        basename (str): Old basename of file.
+        shard_id (int): New shard ID.
+    Returns:
+        str: New basename of file.
+    """
+    parts = basename.split('.')
+    parts[1] = f'{shard_id:05}'
+    return '.'.join(parts)
+def merge_shard_groups(root: str) -> None:
+    """Merge ephemeral sub-datasets created in parallel into one dataset.
+    From https://github.com/mosaicml/streaming/blob/main/examples/multiprocess_dataset
+    _conversion.ipynb.
+    Args:
+        root (str): Root directory.
+    """
+    pattern = os.path.join(root, '*')
+    subdirs = sorted(glob(pattern))
+    shard_id = 0
+    infos = []
+    for subdir in subdirs:
+        index_filename = os.path.join(subdir, 'index.json')
+        with open(index_filename) as index_file:
+            obj = json.load(index_file)
+        for info in obj['shards']:
+            old_basename = info['raw_data']['basename']
+            new_basename = with_id(old_basename, shard_id)
+            info['raw_data']['basename'] = new_basename
+            if info['zip_data'] is not None:
+                old_basename = info['zip_data']['basename']
+                new_basename = with_id(old_basename, shard_id)
+                info['zip_data']['basename'] = new_basename
+            old_filename = os.path.join(subdir, old_basename)
+            new_filename = os.path.join(root, new_basename)
+            os.rename(old_filename, new_filename)
+            shard_id += 1
+            infos.append(info)
+        os.remove(index_filename)
+        os.rmdir(subdir)
+    index_filename = os.path.join(root, 'index.json')
+    obj = {'version': 2, 'shards': infos}
+    text = json.dumps(obj, sort_keys=True)
+    with open(index_filename, 'w') as out:
+        out.write(text)
+class DownloadingIterable:
+    """Iterable of text documents, optionally fetched from an object store first.
+    Iterating yields one ``{'text': <file contents>}`` dict per entry of
+    ``object_names``, in order.
+    """
+    def __init__(self, object_names: List[str], output_folder: str, object_store: Optional[ObjectStore]):
+        """Record what to iterate over; nothing is downloaded or read here.
+        If object_store is None, each object name is opened directly as a local path.
+        Args:
+            object_names (List[str]): Names of objects to download
+            output_folder (str): Local folder to write downloaded files to
+            object_store (Optional[ObjectStore]): Object store to download from
+        """
+        self.output_folder = output_folder
+        self.object_store = object_store
+        self.object_names = object_names
+    def __iter__(self):
+        """Yield the text of every object, downloading it first when a store is set."""
+        for name in self.object_names:
+            if self.object_store is None:
+                # No store: the name itself is the path to read.
+                local_path = name
+            else:
+                # Surrounding slashes are stripped from the name before joining it to the output folder.
+                local_path = os.path.join(self.output_folder, name.strip('/'))
+                self.object_store.download_object(object_name=name, filename=local_path, overwrite=True)
+            with open(local_path) as handle:
+                contents = handle.read()
+            yield {'text': contents}

dataloader.py ADDED Viewed

	@@ -0,0 +1,313 @@

+import logging
+import os
+from typing import Tuple, Union
+import torch
+from torch.utils.data import DataLoader
+from transformers import PreTrainedTokenizerBase
+from .collator import Seq2SeqFinetuningCollator, validate_target_settings
+from .tasks import DOWNLOADED_FT_DATASETS_DIRPATH, SUPPORTED_EXTENSIONS, dataset_constructor
+from .packing import BinPackCollator, auto_packing_ratio
+from .text_data import build_streams, get_tokens_per_batch_func
+from .exceptions import MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+# Label value that marks a position as "not a target": the demo at the bottom
+# of this file prints only the tokens whose label differs from it.
+# NOTE(review): presumably the HuggingFace loss-ignore label — confirm.
+_HF_IGNORE_INDEX = -100
+# Fallback for `cfg.dataset.target_responses` when the config leaves it unset.
+_DEFAULT_TARGET_RESPONSES = 'last'
+# Fallback for `cfg.dataset.target_prompts` when the config leaves it unset.
+_DEFAULT_TARGET_PROMPTS = 'none'
+def build_finetuning_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int) -> DataSpec:
+    """Construct the dataloader used to finetune or evaluate a model.
+    The dataset behind the loader comes from one of two code paths:
+        1. A HuggingFace dataset, via `datasets.load_dataset(...)`
+        2. A streaming dataset
+    Each path reads slightly different `cfg.dataset` fields, described below.
+    Args:
+        cfg (DictConfig): An omegaconf dictionary used to configure the loader:
+            cfg.name (str): The type of dataloader to build. Must = "finetuning".
+            ---
+            *** HuggingFace dataset config fields ***
+            cfg.dataset.hf_name (str, optional): Name of the HuggingFace dataset
+                to use. May also be a remote http(s) directory or object store
+                bucket containing the file {split}.jsonl in the format
+                (prompt, response); the builder then creates a HuggingFace
+                dataset from it.
+            cfg.dataset.hf_kwargs (DictConfig, optional): Extra kwargs passed to
+                `datasets.load_dataset`; these can be used to load a dataset
+                from local files.
+            cfg.dataset.preprocessing_fn (str, optional): Name/import path of the
+                preprocessing function that formats the data examples.
+                If ``None`` (default), the preprocessing function registered
+                    under `hf_name` (see `tasks.py`) is used when one exists;
+                    otherwise preprocessing is skipped.
+                If `preprocessing_fn` names a preprocessing function registered
+                    in `tasks.py`, that function is used.
+                Otherwise `preprocessing_fn` is read as an
+                    "import.path:function_name" import path; e.g., the builder
+                    runs `from import.path import function_name` and uses the
+                    imported function for preprocessing.
+                A dict/DictConfig value is also accepted and is resolved through
+                    `dataset_constructor.get_preprocessing_fn_from_dict`.
+            *** Streaming dataset config fields ***
+            cfg.dataset.remote (str, optional): Location of a MDS-formatted
+                streaming dataset. Setting it makes the builder create a
+                streaming dataset instead of a HuggingFace dataset.
+            cfg.dataset.local (str, optional): Local path that remote data is
+                streamed to. Only valid if `cfg.dataset.remote` is also set.
+            *** Shared dataset configs fields ***
+            cfg.dataset.max_seq_len (int): Maximum length of sequences in the
+                batch. See :class:`Seq2SeqFinetuningCollator` docstring for
+                details.
+            cfg.dataset.decoder_only_format (bool): Whether to format the
+                examples for a decoder-only model. See
+                :class:`Seq2SeqFinetuningCollator` docstring for details.
+            cfg.dataset.target_responses (str): Which responses are used as training targets.
+                Defaults to "last", meaning only the final response in multi-turn examples
+                serves as a training target. See :class:`Seq2SeqFinetuningCollator`
+                docstring for details.
+            cfg.dataset.target_prompts (str): Which prompts are used as training targets.
+                Defaults to "none", meaning prompts are never used as training targets.
+                See :class:`Seq2SeqFinetuningCollator` docstring for details.
+            cfg.dataset.allow_pad_trimming (bool, optional): Whether the collator
+                may trim padding. See :class:`Seq2SeqFinetuningCollator`
+                docstring for details. Default: ``False``.
+            cfg.dataset.packing_ratio (Optional[Union[float, Literal['auto']]]): If provided,
+                a collator wrapper is used that packs device_batch_size*packing_ratio
+                raw examples into device_batch_size packed examples. This helps
+                minimize padding while preserving sequence integrity.
+                It adds `sequence_id` to the batch, which indicates which unique
+                sequence each token belongs to.
+                If set to 'auto', packing_ratio is profiled and the highest observed
+                packing ratio with zero waste is selected.
+                In practice, this may result in > 0 waste because profiling is done
+                on only a portion of the dataset.
+                Note: Using this feature does not change device_batch_size, but it
+                    determines how many raw examples the dataloader consumes
+                    per batch. Some examples may be discarded if they do not fit
+                    when packing.
+                    Select packing_ratio **carefully** based on the dataset
+                    statistics, max_seq_len, and tolerance for discarding samples!
+                    The script `scripts/misc/profile_packing.py` can help
+                    you choose the best packing_ratio.
+            cfg.dataset.shuffle (bool): Whether to shuffle the dataset.
+            ___
+            See :class:`StreamingFinetuningDataset` for info on other standard config
+                options within `cfg.dataset` that are passed as kwargs on the
+                streaming codepath.
+            ---
+            See :class:`DataLoader` for standard argument options to the pytorch
+                dataloader, such as `cfg.drop_last`, `cfg.num_workers`, etc.
+        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to prepare
+            the data from raw text. Any missing sentinel tokens will be added
+            by the collator. If it has no pad token, its EOS token is assigned
+            as the pad token (the tokenizer is modified in place).
+        device_batch_size (int): The size of the batches (number of examples)
+            that the dataloader will produce.
+    Returns:
+        A ``DataSpec`` wrapping the pytorch dataloader together with a
+        token-counting function.
+    Raises:
+        MissingHuggingFaceURLSplitError: On the HuggingFace path when
+            `cfg.dataset.split` is not set.
+        NotEnoughDatasetSamplesError: On the HuggingFace path when
+            `cfg.drop_last` is set and the dataset has fewer samples than
+            world_size * dataloader batch size.
+    Note:
+        You can run the script inside `scripts/misc/profile_packing.py` to quickly test the
+        padding/waste rates for different `cfg.dataset.packing_ratio` choices,
+        given a starting workload YAML.
+    """
+    ds_cfg = cfg.dataset
+    _validate_config(ds_cfg)
+    # Fall back to EOS for padding when the tokenizer defines no pad token.
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    collate_fn, dataloader_batch_size = _build_collate_fn(cfg, tokenizer, device_batch_size)
+    use_streaming = ds_cfg.get('remote') is not None or ds_cfg.get('streams') is not None
+    if use_streaming:
+        # Streaming path: no sampler is passed to the DataLoader.
+        sampler = None
+        streams = build_streams(ds_cfg)
+        dataset = dataset_constructor.build_from_streaming(
+            tokenizer=tokenizer,
+            streams=streams,
+            local=ds_cfg.get('local', None),
+            remote=ds_cfg.get('remote', None),
+            split=ds_cfg.get('split', None),
+            download_retry=ds_cfg.get('download_retry', 2),
+            download_timeout=ds_cfg.get('download_timeout', 60),
+            validate_hash=ds_cfg.get('validate_hash', None),
+            keep_zip=ds_cfg.get('keep_zip', False),
+            epoch_size=ds_cfg.get('epoch_size', None),
+            predownload=ds_cfg.get('predownload', None),
+            cache_limit=ds_cfg.get('cache_limit', None),
+            partition_algo=ds_cfg.get('partition_algo', 'relaxed'),
+            num_canonical_nodes=ds_cfg.get('num_canonical_nodes', None),
+            batch_size=device_batch_size,
+            shuffle=ds_cfg.get('shuffle', False),
+            shuffle_algo=ds_cfg.get('shuffle_algo', 'py1e'),
+            shuffle_seed=ds_cfg.get('shuffle_seed', 9176),
+            shuffle_block_size=ds_cfg.get('shuffle_block_size', None),
+            sampling_method=ds_cfg.get('sampling_method', 'balanced'),
+            sampling_granularity=ds_cfg.get('sampling_granularity', 1),
+            batching_method=ds_cfg.get('batching_method', 'random'),
+            max_seq_len=ds_cfg.max_seq_len,
+        )
+    else:
+        # HuggingFace path.
+        dataset_name_or_path = ds_cfg.hf_name
+        split = ds_cfg.get('split')
+        if split is None:
+            raise MissingHuggingFaceURLSplitError()
+        backend, _, _ = parse_uri(dataset_name_or_path)
+        if backend not in ('', None):
+            # A URI backend means the data lives remotely: download it and
+            # load from the local copy, whose files use the underscore form
+            # of the split name.
+            dataset_name_or_path = _download_remote_hf_dataset(remote_path=dataset_name_or_path, split=split)
+            split = split.replace('-', '_')
+        fn_spec = ds_cfg.get('preprocessing_fn')
+        if isinstance(fn_spec, (dict, DictConfig)):
+            preprocessing_fn = dataset_constructor.get_preprocessing_fn_from_dict(dict(fn_spec))
+        else:
+            preprocessing_fn = dataset_constructor.get_preprocessing_fn_from_str(fn_spec, dataset_name_or_path)
+        dataset = dataset_constructor.build_from_hf(
+            dataset_name=dataset_name_or_path,
+            split=split,
+            safe_load=ds_cfg.get('safe_load', False),
+            max_seq_len=ds_cfg.max_seq_len,
+            preprocessing_fn=preprocessing_fn,
+            tokenizer=tokenizer,
+            target_prompts=ds_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS),
+            target_responses=ds_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES),
+            decoder_only_format=ds_cfg.decoder_only_format,
+            hf_kwargs=ds_cfg.get('hf_kwargs', {}),
+        )
+        if cfg.drop_last:
+            # With drop_last, a dataset smaller than one batch per rank would
+            # leave nothing to train on, so fail early with a clear error.
+            world_size = dist.get_world_size()
+            minimum_dataset_size = world_size * dataloader_batch_size
+            if hasattr(dataset, '__len__'):
+                full_dataset_size = len(dataset)
+                if full_dataset_size < minimum_dataset_size:
+                    raise NotEnoughDatasetSamplesError(
+                        dataset_name=ds_cfg.hf_name,
+                        split=split,
+                        dataloader_batch_size=dataloader_batch_size,
+                        world_size=world_size,
+                        full_dataset_size=full_dataset_size,
+                        minimum_dataset_size=minimum_dataset_size,
+                    )
+        sampler = dist.get_sampler(dataset, drop_last=cfg.drop_last, shuffle=ds_cfg.shuffle)
+    assert dataset is not None
+    loader = DataLoader(
+        dataset,
+        collate_fn=collate_fn,
+        batch_size=dataloader_batch_size,
+        drop_last=cfg.drop_last,
+        sampler=sampler,
+        num_workers=cfg.num_workers,
+        pin_memory=cfg.get('pin_memory', True),
+        prefetch_factor=cfg.get('prefetch_factor', 2),
+        persistent_workers=cfg.get('persistent_workers', True),
+        timeout=cfg.get('timeout', 0),
+    )
+    return DataSpec(dataloader=loader, get_num_tokens_in_batch=get_tokens_per_batch_func())
+def _validate_config(dataset_cfg: DictConfig) -> None:
+    """Check that the dataset config describes exactly one dataset source.
+    The config must be valid for a HuggingFace dataset (`hf_name`), a single
+    streaming dataset (`remote` + `local`), or multiple streams (`streams`),
+    and must not mix keys belonging to different sources. `max_seq_len` and
+    the target settings are validated for every source.
+    Args:
+        dataset_cfg (DictConfig): The dataset configuration to be validated.
+    Raises:
+        ValueError: If the dataset configuration does not meet the requirements.
+    """
+    def _set_keys(candidates):
+        # Backtick-quoted names of the candidate keys that have a non-None value.
+        return ['`' + key + '`' for key in candidates if dataset_cfg.get(key) is not None]
+    hf_only_keys = ['hf_name', 'hf_kwargs', 'preprocessing_fn', 'safe_load']
+    if dataset_cfg.get('hf_name') is not None:
+        clashing = _set_keys(['local', 'remote'])
+        if clashing:
+            listed = ', '.join(clashing)
+            raise ValueError('The dataset config sets a value for `hf_name` as well as the ' f'following keys: {listed}.\n' 'Those keys are used when building from a streaming dataset, but ' 'setting `hf_name` instructs the dataset to build from a HuggingFace dataset.')
+    elif dataset_cfg.get('remote') is not None:
+        clashing = _set_keys(hf_only_keys)
+        if clashing:
+            listed = ', '.join(clashing)
+            raise ValueError('The dataset config sets a value for `remote` as well as the ' f'following keys: {listed}.\n' 'Those keys are used when building from a HuggingFace dataset, but ' 'setting `remote` instructs the dataset to build from a streaming dataset.')
+        if dataset_cfg.get('local') is None:
+            raise ValueError('Using a streaming dataset requires setting both `remote` and `local`, ' 'but dataset.local is None.')
+    elif dataset_cfg.get('streams') is not None:
+        clashing = _set_keys(hf_only_keys)
+        if clashing:
+            listed = ', '.join(clashing)
+            raise ValueError('The dataset config sets a value for `streams` as well as the ' f'following keys: {listed}.\n' 'Those keys are used when building from a HuggingFace dataset, but ' 'setting `streams` instructs the dataset to build from a streaming dataset.')
+        # With `streams`, remote/local belong inside each stream, not at the top level.
+        clashing = _set_keys(['remote', 'local'])
+        if clashing:
+            listed = ', '.join(clashing)
+            raise ValueError('The dataset config sets a value for `streams` as well as the ' f'following keys: {listed}.\n' 'Please either use single stream (set remote/local only) ' 'or put remote/local under streams')
+    else:
+        raise ValueError('In the dataset config, you must set `hf_name` to use a HuggingFace ' 'dataset, or set `remote` to use a streaming dataset, or set ' '`streams` to use multiple streaming datasets,  but all were None.')
+    if dataset_cfg.get('max_seq_len') is None:
+        raise ValueError('In the dataset config, you must set the `max_seq_len`')
+    # Target settings are lower-cased strings before being checked.
+    responses_setting = str(dataset_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES)).lower()
+    prompts_setting = str(dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS)).lower()
+    validate_target_settings(prompts_setting, responses_setting, dataset_cfg.decoder_only_format)
+def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
+    """Downloads a dataset split file from a remote location to a local directory.
+    The file `{remote_path}/{split}{extension}` is looked up for each entry of
+    ``SUPPORTED_EXTENSIONS`` in order; the first one found is downloaded. The
+    download lands under a per-split directory inside
+    ``DOWNLOADED_FT_DATASETS_DIRPATH``, and that directory is returned.
+    Only local rank 0 downloads. It then writes a per-node signal file; all
+    local ranks synchronize on that file, after which local rank 0 removes it
+    again.
+    Args:
+        remote_path (str): Remote directory (URI) that contains the split file.
+        split (str): The dataset split to download (e.g., 'train', 'validation', 'test').
+    Returns:
+        A local directory path where the dataset files are stored.
+    Raises:
+        FileNotFoundError: Raised if the dataset file cannot be found with any of the supported extensions.
+    """
+    # Local file and directory names use underscores in place of dashes.
+    hf_formatted_split = split.replace('-', '_')
+    # A split called 'data' is stored under 'data_not' instead.
+    # NOTE(review): presumably to avoid a clash with the 'data' subfolder used below — confirm.
+    finetune_dir = os.path.join(DOWNLOADED_FT_DATASETS_DIRPATH, hf_formatted_split if hf_formatted_split != 'data' else 'data_not')
+    os.makedirs(finetune_dir, exist_ok=True)
+    remote_dir = remote_path.strip('/')
+    for extension in SUPPORTED_EXTENSIONS:
+        name = f'{remote_dir}/{split}{extension}'
+        destination = str(os.path.abspath(os.path.join(finetune_dir, 'data', f'{hf_formatted_split}-00000-of-00001{extension}')))
+        signal_file_path = os.path.join(finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed')
+        if dist.get_local_rank() == 0:
+            try:
+                get_file(path=name, destination=destination, overwrite=True)
+            except FileNotFoundError as e:
+                if extension == SUPPORTED_EXTENSIONS[-1]:
+                    # Every extension has been tried. Build the report from this
+                    # function's own arguments; no `cfg` object exists here.
+                    files_searched = [f'{remote_dir}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS]
+                    raise FileNotFoundError(f'Could not find a file with any of ' + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + f'at {files_searched}') from e
+                else:
+                    log.debug(f'Could not find {name}, looking for another extension')
+                continue
+            # Download succeeded: publish the signal file for the other local ranks.
+            os.makedirs(os.path.dirname(signal_file_path), exist_ok=True)
+            with open(signal_file_path, 'wb') as f:
+                f.write(b'local_rank0_completed_download')
+        with dist.local_rank_zero_download_and_wait(signal_file_path):
+            dist.barrier()
+        if dist.get_local_rank() == 0:
+            os.remove(signal_file_path)
+        dist.barrier()
+        break
+    return finetune_dir
+def _build_collate_fn(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]:
+    """Build the collate function and the batch size the DataLoader should use.
+    Without packing (no `packing_ratio`, or a ratio of exactly 1.0) this is a
+    plain ``Seq2SeqFinetuningCollator`` with `device_batch_size`. Otherwise the
+    collator is wrapped in a ``BinPackCollator`` and the DataLoader batch size
+    becomes ``int(device_batch_size * packing_ratio)``.
+    Args:
+        dataloader_cfg (DictConfig): Dataloader config; its `dataset` section is read.
+        tokenizer (PreTrainedTokenizerBase): Tokenizer handed to the collator.
+        device_batch_size (int): Number of examples per batch after collation.
+    Returns:
+        A ``(collate_fn, dataloader_batch_size)`` tuple.
+    Raises:
+        ValueError: If `max_leftover_bins_to_keep` is set without `packing_ratio`,
+            if `packing_ratio` is a string other than 'auto', or if it is < 1.
+        NotImplementedError: If packing is requested for a non-decoder-only format.
+    """
+    dataset_cfg = dataloader_cfg.dataset
+    max_seq_len = dataset_cfg.max_seq_len
+    base_collator = Seq2SeqFinetuningCollator(
+        tokenizer=tokenizer,
+        max_seq_len=max_seq_len,
+        decoder_only_format=dataset_cfg.decoder_only_format,
+        target_responses=dataset_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES),
+        target_prompts=dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS),
+        allow_pad_trimming=dataset_cfg.get('allow_pad_trimming', False),
+    )
+    ratio = dataset_cfg.get('packing_ratio')
+    leftover_bins = dataset_cfg.get('max_leftover_bins_to_keep')
+    if ratio is None:
+        # Packing is off; a leftover-bin setting would be silently unused, so reject it.
+        if leftover_bins is None:
+            return (base_collator, device_batch_size)
+        raise ValueError('dataset.max_leftover_bins_to_keep has been defined, ' 'but dataset.packing_ratio has not been set. Please set ' 'the latter to turn on packing or remove the former from the config.')
+    if ratio == 'auto':
+        ratio = auto_packing_ratio(dataloader_cfg, tokenizer, device_batch_size)
+    if isinstance(ratio, str):
+        raise ValueError('dataset.packing_ratio must be a float or "auto", but it was set to ' f'{ratio}.')
+    log.info(f'Using packing ratio {ratio}')
+    if ratio == 1.0:
+        # A ratio of one packs nothing, so the plain collator is enough.
+        return (base_collator, device_batch_size)
+    if ratio < 1.0:
+        raise ValueError('packing_ratio must be >= 1, if supplied')
+    if not dataset_cfg.decoder_only_format:
+        raise NotImplementedError('On-the-fly packing is currently only supported for decoder-only formats.')
+    packing_collator = BinPackCollator(
+        collator=base_collator,
+        target_batch_size=device_batch_size,
+        max_seq_len=max_seq_len,
+        pad_token_id=tokenizer.pad_token_id,
+        padding_side=tokenizer.padding_side,
+        max_leftover_bins_to_keep=leftover_bins,
+    )
+    # The DataLoader must hand over this many raw examples per packed batch.
+    return (packing_collator, int(device_batch_size * ratio))
+if __name__ == '__main__':
+    # Manual smoke test: build a finetuning dataloader for an example config
+    # and print the first five batches, decoded back to text.
+    # NOTE(review): `om` is not imported in the visible part of this file — confirm it is in scope.
+    import torch
+    from .utils import build_tokenizer
+    cfg = om.create({
+        'dataset': {
+            'hf_name': 'tatsu-lab/alpaca',
+            'preprocessing_fn': 'llmfoundry.data.finetuning.tasks:alpaca_preprocessing_function',
+            'split': 'train',
+            'packing_ratio': 18.0,
+            'max_seq_len': 2048,
+            'decoder_only_format': True,
+            'allow_pad_trimming': False,
+            'num_canonical_nodes': 472,
+            'shuffle': True,
+            'target_responses': 'last',
+            'target_prompts': 'none',
+        },
+        'drop_last': False,
+        'num_workers': 0,
+        'pin_memory': False,
+        'prefetch_factor': None,
+        'persistent_workers': False,
+        'timeout': 0,
+    })
+    tokenizer_name = 'EleutherAI/gpt-neox-20b'
+    tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len}
+    tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
+    device_batch_size = 1
+    dataloader = build_finetuning_dataloader(cfg, tokenizer, device_batch_size).dataloader
+    packing = cfg.dataset.get('packing_ratio') is not None
+    for i, batch in enumerate(dataloader):
+        if i >= 5:
+            break
+        print(f'-----Batch {i}-----')
+        # Tensors are summarized by shape; anything else is printed as-is.
+        for k, v in batch.items():
+            print(k, v.shape if isinstance(v, torch.Tensor) else v)
+        for j in range(device_batch_size):
+            print(f'--- Sample {j} ---')
+            if not cfg.dataset.decoder_only_format:
+                # Encoder-decoder layout: the context is in input_ids, the target in labels.
+                context_ids = batch['input_ids'][j, batch['attention_mask'][j] == 1]
+                print('\x1b[92m{}\x1b[00m\n'.format('CONTEXT:  '), tokenizer.decode(context_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True))
+                target_ids = batch['labels'][j, batch['decoder_attention_mask'][j] == 1]
+                print('\x1b[91m{}\x1b[00m\n'.format('TARGET:   '), tokenizer.decode(target_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True))
+            elif packing:
+                # Packed rows hold several sequences; print each sub-sequence separately.
+                n_subseqs = int(batch['sequence_id'][j].max()) + 1
+                for subseq in range(n_subseqs):
+                    is_subseq = batch['sequence_id'][j] == subseq
+                    input_mask = torch.logical_and(is_subseq, batch['attention_mask'][j] == 1)
+                    print('\x1b[93m{}\x1b[00m\n'.format('INPUT IDS:'), tokenizer.decode(batch['input_ids'][j, input_mask], skip_special_tokens=False, clean_up_tokenization_spaces=True))
+                    target_mask = torch.logical_and(is_subseq, batch['labels'][j] != _HF_IGNORE_INDEX)
+                    print('\x1b[91m{}\x1b[00m\n'.format('TARGET:   '), tokenizer.decode(batch['input_ids'][j, target_mask], skip_special_tokens=False, clean_up_tokenization_spaces=True))
+            else:
+                input_ids = batch['input_ids'][j, batch['attention_mask'][j] == 1]
+                print('\x1b[93m{}\x1b[00m\n'.format('INPUT IDS:'), tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True))
+                target_ids = batch['input_ids'][j, batch['labels'][j] != _HF_IGNORE_INDEX]
+                print('\x1b[91m{}\x1b[00m\n'.format('TARGET:   '), tokenizer.decode(target_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True))
+        print('   ')

eval_gauntlet_callback.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""Aggregate ICL evals into composite scores."""
+import logging
+import math
+from enum import Enum
+from typing import Dict, Optional
+log = logging.getLogger(__name__)
+class Weighting(Enum):
+    """Schemes for weighting the benchmarks inside a category (see ``EvalGauntlet.__init__``)."""
+    # Every benchmark gets weight 1.
+    EQUAL = 1
+    # Weight is the benchmark's cumulative sample count (at least 1).
+    SAMPLE_SZ = 2
+    # Weight is log2 of the cumulative sample count, floored at 1.
+    LOG_SAMPLE_SZ = 3
+def calculate_named_averages(average_names: Dict[str, list], category_scores: Dict[str, float]):
+    """Compute each named average as the plain mean of its member category scores.
+    Categories listed for an average but absent from ``category_scores`` are
+    ignored; an average with no scored categories is reported as 0.
+    Args:
+        average_names (dict[str, list]): Maps each average name to the categories it is made of.
+        category_scores (dict[str, float]): Raw score of each category.
+    Returns:
+        dict: Maps each average name to its computed value.
+    """
+    averages = {}
+    for name, members in average_names.items():
+        # Walk category_scores (not members) so the summation order matches the scores dict.
+        picked = [value for category, value in category_scores.items() if category in members]
+        averages[name] = sum(picked) / len(picked) if picked else 0
+    return averages
+class EvalGauntlet(Callback):
+    """The EvalGauntlet aggregates ICL eval results.
+    After `eval_end`, this callback inspects the logger for different ICL metrics and aggregates the scores according to the aggregation
+    specification provided in the constructor.
+    Args:
+        logger_keys (list): These are the exact keys that the individual benchmark metrics will be
+                            logged under in the logger after eval
+        categories (dict): The category configs. This is iterated as a sequence of category entries, each
+                      with a `name` and a list of `benchmarks`; every benchmark gives its `name`, the
+                      random baseline accuracy (`random_baseline`), and the number of fewshot examples
+                      (`num_fewshot`). See `llmfoundry/scripts/eval/yamls/eval_gauntlet_v0.2.yaml` to see the structure.
+                      Each benchmark entry is updated in place with a computed `weighting` key.
+        weighting (str): The weighting scheme used to balance different tasks within each category.
+                         Either assign them all equal weight, assign them weight proportional
+                         to the dataset size, or assign them weight proportional to the log2 of the dataset size.
+                         Options are 'EQUAL', 'SAMPLE_SZ', and 'LOG_SAMPLE_SZ'. A `Weighting` member is also accepted.
+        subtract_random_baseline (bool): Flag determining whether to subtract random baseline accuracy
+                                          from the performance on each individual benchmark before aggregating.
+        rescale_accuracy (bool): Flag determining whether to rescale the accuracy on each benchmark
+                                 by (1-random_baseline_accuracy) before aggregating. Using this ensures that all benchmarks max out at 1.0.
+        benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used when not relying on equal weighting.
+        averages (Optional[dict]): Optional dictionary specifying a mapping from average names to lists of categories used to produce each named average.
+    Raises:
+        ValueError: If `logger_keys` is a dict, or an average name collides with a category name.
+        KeyError: If `weighting` is not the name of a `Weighting` member.
+        Exception: If a non-equal weighting is requested without `benchmark_sizes`, or
+            `rescale_accuracy` is set without `subtract_random_baseline`.
+    """
+    def __init__(self, logger_keys: list, categories: dict, weighting: str='EQUAL', subtract_random_baseline: bool=True, rescale_accuracy: bool=True, benchmark_sizes: Optional[dict]=None, averages: Optional[dict]=None):
+        if isinstance(logger_keys, dict):
+            raise ValueError('logger_keys now requires a list type as input, not a dict')
+        # Resolve the scheme name to its enum member BEFORE comparing. Comparing
+        # the raw string against `Weighting.EQUAL` is always unequal, which would
+        # reject the default 'EQUAL' weighting whenever benchmark_sizes is None.
+        resolved_weighting = weighting if isinstance(weighting, Weighting) else Weighting[weighting]
+        if resolved_weighting != Weighting.EQUAL and benchmark_sizes is None:
+            raise Exception('When not using equal weighting, you must provide the benchmark sizes.')
+        if rescale_accuracy and (not subtract_random_baseline):
+            raise Exception('Only use accuracy rescaling in conjunction with subtracting random baseline accuracy.')
+        self.categories = categories
+        self.category_names = [conf.get('name') for conf in self.categories]
+        self.weighting = resolved_weighting
+        self.subtract_random_baseline = subtract_random_baseline
+        self.rescale_accuracy = rescale_accuracy
+        self.logger_keys = logger_keys
+        # Precompute each benchmark's weight and store it on the benchmark entry.
+        for category in self.categories:
+            for benchmark in category['benchmarks']:
+                bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
+                if self.weighting != Weighting.EQUAL:
+                    assert benchmark_sizes is not None
+                    # Sum the sizes of every entry whose name starts with this
+                    # benchmark's name; floor at 1 so log2 below is defined.
+                    cumulative_samples = max(sum((count for name, count in benchmark_sizes.items() if name.startswith(bench_name))), 1)
+                else:
+                    cumulative_samples = -1
+                weight = None
+                if self.weighting == Weighting.EQUAL:
+                    weight = 1
+                elif self.weighting == Weighting.SAMPLE_SZ:
+                    weight = cumulative_samples
+                elif self.weighting == Weighting.LOG_SAMPLE_SZ:
+                    weight = max(math.log2(cumulative_samples), 1)
+                assert weight is not None
+                benchmark['weighting'] = weight
+        self.averages = {}
+        if averages is not None:
+            self.averages = averages
+        else:
+            # Without explicit averages, report one average over all categories.
+            self.averages['default_average'] = self.category_names
+        for avg_name in self.averages:
+            if avg_name in self.category_names:
+                raise ValueError(f'Found average name `{avg_name}` used as category name. Average names and category names must be non-overlapping.')
+    def extract_metrics_from_state(self, state: State) -> Dict[str, float]:
+        """Collect the accuracy metrics named by `logger_keys` from `state.eval_metrics`.
+        Each logger key is split on '/': the first segment is dropped, the last is the
+        metric name, and the segments in between name the dataloader. Only metric names
+        containing 'Accuracy' are used. Values are grouped by the first two dataloader
+        segments and averaged per group.
+        Args:
+            state (State): Provides `eval_metrics`, a nested mapping from dataloader name to metric name to metric.
+        Returns:
+            Mapping from group key to the mean of the computed metric values.
+        """
+        results = {}
+        for key in self.logger_keys:
+            dl_name, metric_name = (key.split('/')[1:-1], key.split('/')[-1])
+            if 'Accuracy' not in metric_name:
+                continue
+            metric = state.eval_metrics.get('/'.join(dl_name), {}).get(metric_name, None)
+            if metric is None:
+                continue
+            val = metric.compute().item()
+            key = '/'.join(dl_name[0:2])
+            if key not in results:
+                results[key] = []
+            results[key].append(val)
+        return {k: sum(v) / len(v) for k, v in results.items()}
+    def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
+        """Aggregate benchmark accuracies into category scores and named averages.
+        A category is dropped (with a warning) when any of its benchmarks has no
+        computed metric. Results are logged through `logger` when it is not None.
+        Args:
+            state (State): Trainer state holding the evaluation metrics.
+            logger (Logger): Logger that receives the aggregated metrics.
+        Returns:
+            Mapping from `icl/metrics/eval_gauntlet/<name>` to score, or an empty
+            dict when no metrics could be extracted.
+        """
+        computed_metrics = self.extract_metrics_from_state(state)
+        if len(computed_metrics) == 0:
+            return {}
+        category_scores = {}
+        for category in self.categories:
+            missing_metrics = []
+            category_scores[category['name']] = []
+            for benchmark in category['benchmarks']:
+                key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
+                if key not in computed_metrics:
+                    log.warning(f'Could not find results for benchmark: {benchmark}.')
+                    missing_metrics.append(key)
+                else:
+                    score = computed_metrics[key]
+                    if self.subtract_random_baseline:
+                        score -= benchmark['random_baseline']
+                    if self.rescale_accuracy and self.subtract_random_baseline:
+                        score /= 1.0 - benchmark['random_baseline']
+                    category_scores[category['name']].append({'name': benchmark['name'], 'score': score, 'weighting': benchmark['weighting']})
+            if len(missing_metrics) > 0:
+                log.warning(f"Removing category `{category['name']}` from scores because benchmarks were missing: {missing_metrics}")
+                del category_scores[category['name']]
+                continue
+            # Collapse the per-benchmark entries into one weighted mean for the category.
+            total_weight = sum((k['weighting'] for k in category_scores[category['name']]))
+            category_scores[category['name']] = sum((k['score'] * (k['weighting'] / total_weight) for k in category_scores[category['name']]))
+        named_averages = calculate_named_averages(self.averages, category_scores)
+        category_scores.update(named_averages)
+        category_scores = {f'icl/metrics/eval_gauntlet/{k}': v for k, v in category_scores.items()}
+        if logger is not None:
+            logger.log_metrics(category_scores)
+        return category_scores

exceptions.py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""Custom exceptions for the LLMFoundry."""
+from collections.abc import Mapping
+from typing import Any, Dict, List
+class MissingHuggingFaceURLSplitError(ValueError):
+    """Raised when a HuggingFace dataset config does not set a `split`."""
+    def __init__(self) -> None:
+        super().__init__('When using a HuggingFace dataset from a URL, you must set the `split` key in the dataset config.')
+class NotEnoughDatasetSamplesError(ValueError):
+    """Raised when the dataset has fewer samples than the minimum required size.
+    All constructor arguments are kept as attributes of the same name.
+    """
+    def __init__(self, dataset_name: str, split: str, dataloader_batch_size: int, world_size: int, full_dataset_size: int, minimum_dataset_size: int) -> None:
+        self.dataset_name, self.split = dataset_name, split
+        self.dataloader_batch_size, self.world_size = dataloader_batch_size, world_size
+        self.full_dataset_size, self.minimum_dataset_size = full_dataset_size, minimum_dataset_size
+        parts = [
+            f'Your dataset (name={dataset_name}, split={split}) ',
+            f'has {full_dataset_size} samples, but your minimum batch size ',
+            f'is {minimum_dataset_size} because you are running on {world_size} gpus and ',
+            f'your per device batch size is {dataloader_batch_size}. Please increase the number ',
+            f'of samples in your dataset to at least {minimum_dataset_size}.',
+        ]
+        super().__init__(''.join(parts))
+class UnknownExampleTypeError(KeyError):
+    """Error thrown when an unknown example type is used in a task.
+    Args:
+        example (Mapping): The offending example; kept on ``self.example`` so
+            handlers can inspect it, like the other exceptions in this module.
+    """
+    def __init__(self, example: Mapping) -> None:
+        self.example = example
+        message = f'Unknown example type example={example!r}'
+        super().__init__(message)
+class TooManyKeysInExampleError(ValueError):
+    """Error thrown when a data sample has too many keys.
+    Args:
+        desired_keys (set[str]): The allowed keys, reported in the message.
+        keys (set[str]): The keys that were actually provided.
+    Both arguments are kept as attributes of the same name.
+    """
+    def __init__(self, desired_keys: set[str], keys: set[str]) -> None:
+        self.desired_keys = desired_keys
+        self.keys = keys
+        message = f'Data sample has {len(keys)} keys in `allowed_keys`: {desired_keys} Please specify exactly one. Provided keys: {keys}'
+        super().__init__(message)
+class NotEnoughChatDataError(ValueError):
+    """Raised when a chat example contains fewer than two messages."""
+    def __init__(self) -> None:
+        super().__init__('Chat example must have at least two messages')
+class ConsecutiveRepeatedChatRolesError(ValueError):
+    """Raised when the same chat role appears twice in a row.
+    The repeated role is kept on ``self.repeated_role``.
+    """
+    def __init__(self, repeated_role: str) -> None:
+        super().__init__(f'Conversation roles must alternate but found {repeated_role} repeated consecutively.')
+        self.repeated_role = repeated_role
+class InvalidLastChatMessageRoleError(ValueError):
+    """Error thrown when the last message role in a chat example is invalid.
+    Args:
+        last_role (str): The role found on the final message.
+        expected_roles (set[str]): The roles accepted for the final message.
+    Both arguments are kept as attributes of the same name, matching
+    ``InvalidRoleError``.
+    """
+    def __init__(self, last_role: str, expected_roles: set[str]) -> None:
+        self.last_role = last_role
+        self.expected_roles = expected_roles
+        message = f'Invalid last message role: {last_role}. Expected one of: {expected_roles}'
+        super().__init__(message)
+class IncorrectMessageKeyQuantityError(ValueError):
+    """Raised when a message does not have exactly two keys.
+    The offending key list is kept on ``self.keys``.
+    """
+    def __init__(self, keys: List[str]) -> None:
+        key_count = len(keys)
+        super().__init__(f'Expected 2 keys in message, but found {key_count}')
+        self.keys = keys
+class InvalidRoleError(ValueError):
+    """Raised when a message role is not among the valid roles.
+    ``role`` and ``valid_roles`` are kept as attributes of the same name.
+    """
+    def __init__(self, role: str, valid_roles: set[str]) -> None:
+        super().__init__(f'Expected role to be one of {valid_roles} but found: {role}')
+        self.role, self.valid_roles = role, valid_roles
+class InvalidContentTypeError(TypeError):
+    """Raised when message content is not a string.
+    The type that was found is kept on ``self.content_type``.
+    """
+    def __init__(self, content_type: type) -> None:
+        super().__init__(f'Expected content to be a string, but found {content_type}')
+        self.content_type = content_type
+class InvalidPromptTypeError(TypeError):
+    """Raised when a prompt is not a string.
+    The type that was found is kept on ``self.prompt_type``.
+    """
+    def __init__(self, prompt_type: type) -> None:
+        super().__init__(f'Expected prompt to be a string, but found {prompt_type}')
+        self.prompt_type = prompt_type
+class InvalidResponseTypeError(TypeError):
+    """Raised when a response is not a string.
+    The type that was found is kept on ``self.response_type``.
+    """
+    def __init__(self, response_type: type) -> None:
+        super().__init__(f'Expected response to be a string, but found {response_type}')
+        self.response_type = response_type
+class InvalidPromptResponseKeysError(ValueError):
+    """Error thrown when missing expected prompt and response keys.
+    Args:
+        mapping (Dict[str, str]): The key mapping that lacks "prompt" and/or
+            "response"; reported in the message and kept on ``self.mapping``.
+        example (Dict[str, Any]): The example being processed; kept on
+            ``self.example`` and not included in the message.
+    """
+    def __init__(self, mapping: Dict[str, str], example: Dict[str, Any]) -> None:
+        self.mapping = mapping
+        self.example = example
+        message = f'Expected mapping={mapping!r} to have keys "prompt" and "response".'
+        super().__init__(message)
+class InvalidFileExtensionError(FileNotFoundError):
+    """Raised when no data file with a safe extension is found for a local dataset.
+    ``dataset_name`` and ``valid_extensions`` are kept as attributes.
+    """
+    def __init__(self, dataset_name: str, valid_extensions: List[str]) -> None:
+        super().__init__(f'safe_load is set to True. No data files with safe extensions {valid_extensions} found for dataset at local path {dataset_name}.')
+        self.dataset_name = dataset_name
+        self.valid_extensions = valid_extensions
+class UnableToProcessPromptResponseError(ValueError):
+    """Error thrown when a prompt and response cannot be processed.
+    Args:
+        input (Dict): The example from which no prompt/response could be
+            extracted; kept on ``self.input`` so handlers can inspect it.
+    """
+    def __init__(self, input: Dict) -> None:
+        # `input` shadows the builtin, but the parameter name is part of the
+        # public signature and is therefore left unchanged.
+        self.input = input
+        message = f'Unable to extract prompt/response from {input}'
+        super().__init__(message)
+class ClusterDoesNotExistError(ValueError):
+    """Raised when no cluster exists for the given id.
+    The id is kept on ``self.cluster_id``.
+    """
+    def __init__(self, cluster_id: str) -> None:
+        super().__init__(f'Cluster with id {cluster_id} does not exist. Check cluster id and try again!')
+        self.cluster_id = cluster_id
+class FailedToCreateSQLConnectionError(RuntimeError):
+    """Raised when a SQL connection to the Databricks workspace cannot be created."""
+    def __init__(self) -> None:
+        super().__init__('Failed to create sql connection to db workspace. To use sql connect, you need to provide http_path and cluster_id!')
+class FailedToConnectToDatabricksError(RuntimeError):
+    """Raised when a Databricks connection cannot be created."""
+    def __init__(self) -> None:
+        super().__init__('Failed to create databricks connection. Check hostname and access token!')
+class InputFolderMissingDataError(ValueError):
+    """Raised when no text files are found in the input folder.
+    The folder is kept on ``self.input_folder``.
+    """
+    def __init__(self, input_folder: str) -> None:
+        super().__init__(f'No text files were found at {input_folder}.')
+        self.input_folder = input_folder
+class OutputFolderNotEmptyError(FileExistsError):
+    """Raised when the output folder already contains files.
+    The folder is kept on ``self.output_folder``.
+    """
+    def __init__(self, output_folder: str) -> None:
+        super().__init__(f'{output_folder} is not empty. Please remove or empty it and retry.')
+        self.output_folder = output_folder

fdiff_callback.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""Monitor rate of change of loss."""
+from __future__ import annotations
+import torch
+class FDiffMetrics(Callback):
+    """Rate of change of metrics.
+    Logs ``current - previous`` for the training loss, the training metrics and
+    the evaluation metrics, i.e. a finite-difference estimate of their
+    derivative between consecutive calls.
+    Args:
+        diff_train_metrics (bool): If ``True``, log differences of the loss and
+            of ``state.train_metric_values`` at every ``batch_end``.
+        diff_eval_metrics (bool): If ``True``, log differences of the evaluation
+            metrics at every ``eval_end``.
+    """
+    def __init__(self, diff_train_metrics: bool=False, diff_eval_metrics: bool=True):
+        self.diff_train_metrics = diff_train_metrics
+        self.diff_eval_metrics = diff_eval_metrics
+        # `None` means "no loss seen yet"; a float is the loss of the previous batch.
+        self.train_prev_loss = None
+        self.train_prev_metric = {}
+        self.eval_prev_metric = {}
+    def batch_end(self, state: State, logger: Logger) -> None:
+        """Log the change in training loss and metrics since the previous batch.
+        Raises:
+            NotImplementedError: If ``state.loss`` is not a single tensor.
+        """
+        if not self.diff_train_metrics:
+            return
+        if not isinstance(state.loss, torch.Tensor):
+            raise NotImplementedError('Multiple losses not supported yet')
+        loss = state.loss.item()
+        # Compare against None explicitly: a previous loss of exactly 0.0 is
+        # falsy and would otherwise suppress the next difference.
+        if self.train_prev_loss is not None:
+            logger.log_metrics({'loss/train/total_fdiff': loss - self.train_prev_loss})
+        self.train_prev_loss = loss
+        # Only metrics seen on an earlier batch have a previous value to diff against.
+        for k, previous in self.train_prev_metric.items():
+            logger.log_metrics({f'metrics/train/{k}_fdiff': state.train_metric_values[k] - previous})
+        for k, value in state.train_metric_values.items():
+            self.train_prev_metric[k] = value
+    def eval_end(self, state: State, logger: Logger) -> None:
+        """Log the change in each evaluation metric since the previous evaluation."""
+        if not self.diff_eval_metrics:
+            return
+        evaluator = state.dataloader_label
+        assert evaluator is not None, 'dataloader should have been set'
+        for k in list(state.eval_metrics[evaluator].keys()):
+            # Previous values are keyed per evaluator so different evaluators do not mix.
+            mkey = '/'.join(['metrics', evaluator, k])
+            current = state.eval_metric_values[k]
+            if mkey in self.eval_prev_metric:
+                logger.log_metrics({f'{mkey}_fdiff': current - self.eval_prev_metric[mkey]})
+            self.eval_prev_metric[mkey] = current

ffn.py CHANGED Viewed

@@ -59,8 +59,7 @@ class MPTMLP(nn.Module):
         super().__init__()
         ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
         self.fc_kwargs: dict[str, Any] = {'bias': bias}
-        if fc_type != 'te':
-            self.fc_kwargs['device'] = device
         self.up_proj = FC_CLASS_REGISTRY[fc_type](d_model, ffn_hidden_size, **self.fc_kwargs)
         self.act = act_fn
         self.down_proj = FC_CLASS_REGISTRY[fc_type](ffn_hidden_size, d_model, **self.fc_kwargs)
@@ -75,6 +74,7 @@ class MPTGLU(MPTMLP):
         super().__init__(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, ffn_hidden_size=ffn_hidden_size, act_fn=act_fn, device=device, bias=bias)
         self.gate_proj = FC_CLASS_REGISTRY[fc_type](d_model, self.up_proj.out_features, **self.fc_kwargs)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
 FFN_CLASS_REGISTRY = {'mptmlp': MPTMLP, 'mptglu': MPTGLU}

         super().__init__()
         ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size)
         self.fc_kwargs: dict[str, Any] = {'bias': bias}
+        self.fc_kwargs['device'] = device
         self.up_proj = FC_CLASS_REGISTRY[fc_type](d_model, ffn_hidden_size, **self.fc_kwargs)
         self.act = act_fn
         self.down_proj = FC_CLASS_REGISTRY[fc_type](ffn_hidden_size, d_model, **self.fc_kwargs)
         super().__init__(d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, ffn_hidden_size=ffn_hidden_size, act_fn=act_fn, device=device, bias=bias)
         self.gate_proj = FC_CLASS_REGISTRY[fc_type](d_model, self.up_proj.out_features, **self.fc_kwargs)
+    @torch.compile
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
 FFN_CLASS_REGISTRY = {'mptmlp': MPTMLP, 'mptglu': MPTGLU}

finetuning.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .collator import Seq2SeqFinetuningCollator
2	+ from .dataloader import build_finetuning_dataloader

hf.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .hf_causal_lm import ComposerHFCausalLM
+from .hf_fsdp import prepare_hf_causal_lm_model_for_fsdp, prepare_hf_enc_dec_model_for_fsdp, prepare_hf_model_for_fsdp
+from .hf_t5 import ComposerHFT5

hf_causal_lm.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""Implements a Hugging Causal LM wrapped inside a :class:`.ComposerModel`."""
+import logging
+import os
+import warnings
+from typing import TYPE_CHECKING, Any, Dict, Mapping
+from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizerBase
+from .hf_fsdp import hf_get_init_device
+from .model_wrapper import HuggingFaceModelWithFSDP
+from .attention import is_flash_v2_installed
+from .utils import init_empty_weights
+from .config_utils import pop_config
+if TYPE_CHECKING:
+    from peft import PeftConfig
+log = logging.getLogger(__name__)

hf_checkpointer.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import contextlib
+import copy
+import logging
+import math
+import os
+import re
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Optional, Sequence, Union
+import torch
+from mlflow.transformers import _fetch_model_card, _write_license_information
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from .mpt import MPTConfig, MPTForCausalLM
+from .utils import init_empty_weights
+from .huggingface_hub_utils import edit_files_for_hf_compatibility
+log = logging.getLogger(__name__)
+_LICENSE_FILE_PATTERN = re.compile('license(\\.[a-z]+|$)', re.IGNORECASE)
+def _maybe_get_license_filename(local_dir: str, pretrained_model_name: Optional[str]=None) -> Optional[str]:
+    """Return the name of the license file inside ``local_dir``, if any.
+    Note: This is intended to be consistent with the code in MLflow.
+    https://github.com/mlflow/mlflow/blob/5d13d6ec620a02de9a5e31201bf1becdb9722ea5/mlflow/transformers/__init__.py#L1152
+    LLM Foundry supports local model files, for which MLflow's own license
+    lookup would search for a Hugging Face repo named after the local path.
+    When ``pretrained_model_name`` is given, the license file found in
+    ``local_dir`` is removed and rewritten from that model's card, and the
+    directory is searched again.
+    Returns ``None`` when no file name matches the license pattern.
+    """
+    def _first_license_file() -> str:
+        # Raises StopIteration when nothing in local_dir matches the pattern.
+        return next((entry for entry in os.listdir(local_dir) if _LICENSE_FILE_PATTERN.search(entry)))
+    try:
+        found = _first_license_file()
+        if pretrained_model_name is None:
+            return found
+        log.info(f'Overwriting license file {found} with license info for model {pretrained_model_name} from Hugging Face Hub')
+        os.remove(os.path.join(local_dir, found))
+        card = _fetch_model_card(pretrained_model_name)
+        _write_license_information(pretrained_model_name, card, Path(local_dir).absolute())
+        return _first_license_file()
+    except StopIteration:
+        return None
+class HuggingFaceCheckpointer(Callback):
+    """Save a huggingface formatted checkpoint during training.
+    Args:
+        save_folder (str): Top level folder to save checkpoints to (can be a
+            URI). It is likely that this would be the same as your save_folder.
+        save_interval: Union[str, int, Time]: The interval describing how often
+            checkpoints should be saved. If an integer, it will be assumed to be
+            in :attr:`.TimeUnit.EPOCH`. Otherwise, the unit must be either
+            :attr:`.TimeUnit.EPOCH`, :attr:`.TimeUnit.BATCH`,
+            :attr:`.TimeUnit.TOKEN`, or :attr:`.TimeUnit.SAMPLE`.
+        huggingface_folder_name (str): Folder to save each checkpoint under (can
+            be a format string). Default is ``ba{batch}``.
+        precision: The precision to save the model in. Default is ``float32``.
+            Options are ``bfloat16``, ``float16``, or ``float32``.
+        overwrite (bool): Whether to overwrite previous checkpoints.
+        mlflow_registered_model_name (Optional[str]): The name to register the
+            model under in the MLflow model registry. If ``None``, the model
+            will not be registered. Default is ``None``.
+        mlflow_logging_config (Optional[dict]): A dictionary of config arguments
+            that will get passed along to the MLflow ``save_model`` call. When
+            ``mlflow_registered_model_name`` is set, missing ``metadata`` and
+            ``task`` entries default to ``{}`` and ``'llm/v1/completions'``
+            respectively, and a default ``input_example`` (text generation, or
+            chat when the task ends with ``chat``) is filled in. The passed
+            dictionary is copied and never modified.
+        flatten_imports (Sequence[str]): A sequence of import prefixes that will
+            be flattened when editing MPT files.
+    """
+    def __init__(self, save_folder: str, save_interval: Union[str, int, Time], huggingface_folder_name: str='ba{batch}', precision: str='float32', overwrite: bool=True, mlflow_registered_model_name: Optional[str]=None, mlflow_logging_config: Optional[dict]=None, flatten_imports: Sequence[str]=('llmfoundry',)):
+        _, _, self.save_dir_format_str = parse_uri(save_folder)
+        self.overwrite = overwrite
+        self.precision = precision
+        # An unsupported precision string raises KeyError here.
+        self.dtype = {'float32': torch.float32, 'float16': torch.float16, 'bfloat16': torch.bfloat16}[precision]
+        self.flatten_imports = flatten_imports
+        self.mlflow_registered_model_name = mlflow_registered_model_name
+        # Work on a shallow copy so the defaults filled in below never leak
+        # into the dictionary owned by the caller.
+        mlflow_logging_config = {} if mlflow_logging_config is None else dict(mlflow_logging_config)
+        if self.mlflow_registered_model_name is not None:
+            import numpy as np
+            passed_metadata = mlflow_logging_config.get('metadata', {})
+            mlflow_logging_config['metadata'] = passed_metadata
+            mlflow_logging_config.setdefault('task', 'llm/v1/completions')
+            default_input_example = {'prompt': np.array(['What is Machine Learning?'])}
+            # A chat task can be declared either at the top level or inside the metadata.
+            is_chat = mlflow_logging_config['task'].endswith('chat') or mlflow_logging_config['metadata'].get('task', '').endswith('chat')
+            if is_chat:
+                default_input_example = {'messages': np.array([{'role': 'user', 'content': 'What is Machine Learning?'}])}
+                mlflow_logging_config.setdefault('example_no_conversion', True)
+            mlflow_logging_config.setdefault('input_example', default_input_example)
+        self.mlflow_logging_config = mlflow_logging_config
+        self.huggingface_folder_name_fstr = os.path.join('huggingface', huggingface_folder_name)
+        self.save_interval: Time = Time.from_input(save_interval, TimeUnit.EPOCH)
+        self.check_interval = create_interval_scheduler(self.save_interval, include_end_of_training=True)
+        self.remote_ud = maybe_create_remote_uploader_downloader_from_uri(save_folder, loggers=[])
+        if self.remote_ud is not None:
+            self.remote_ud._num_concurrent_uploads = 4
+        # Batch timestamp of the most recent save; used to avoid saving twice for the same batch.
+        self.last_checkpoint_batch: Optional[Time] = None
+        self.mlflow_loggers = []
+    def run_event(self, event: Event, state: State, logger: Logger) -> None:
+        """Save a checkpoint when the interval fires; validate and set up on ``Event.INIT``.
+        Raises:
+            ValueError: On ``Event.INIT`` if the model is not a
+                ``HuggingFaceModel``, or if a registered model name was given
+                but no ``MLFlowLogger`` is among the logger destinations.
+        """
+        if state.get_elapsed_duration() is not None and self.check_interval(state, event) and (self.last_checkpoint_batch != state.timestamp.batch):
+            self._save_checkpoint(state, logger)
+        elif event == Event.INIT:
+            if not isinstance(state.model, HuggingFaceModel):
+                raise ValueError(f'`HuggingFaceCheckpointer` is only compatible with `HuggingFaceModel`s. ' + f'Got {type(state.model)} instead.')
+            if self.remote_ud is not None:
+                self.remote_ud.init(state, logger)
+                state.callbacks.append(self.remote_ud)
+            if self.mlflow_registered_model_name is not None:
+                self.mlflow_loggers = [logger_destination for logger_destination in logger.destinations if isinstance(logger_destination, MLFlowLogger)]
+                if len(self.mlflow_loggers) == 0:
+                    raise ValueError(f'`mlflow_registered_model_name` was set, but no `MLFlowLogger` was found in the `logger.destinations` list. ' + 'Please add an `MLFlowLogger` or set `mlflow_registered_model_name` to `None`.')
+                import mlflow
+                mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set('5GB')
+    def _is_last_batch(self, state: State):
+        """Return whether the current batch is treated as the final one of training."""
+        elapsed_duration = state.get_elapsed_duration()
+        if elapsed_duration is not None and elapsed_duration >= 1.0:
+            return True
+        assert state.max_duration is not None
+        # Special case: a save interval of exactly one full duration combined with
+        # an epoch-based max duration is resolved through the batch count.
+        if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and (state.max_duration.unit == TimeUnit.EPOCH):
+            assert state.dataloader_len is not None
+            return int(state.timestamp.batch) % math.ceil(state.max_duration.value * state.dataloader_len) == 0
+        return False
+    def _save_checkpoint(self, state: State, logger: Logger):
+        """Gather the state dict and, on global rank 0, write, upload and optionally register the checkpoint."""
+        del logger
+        self.last_checkpoint_batch = state.timestamp.batch
+        log.info('Saving HuggingFace formatted checkpoint')
+        # Register the MPT classes with the transformers auto classes before saving.
+        from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+        CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
+        MPTConfig.register_for_auto_class()
+        MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
+        save_dir = format_name_with_dist_and_time(str(Path(self.save_dir_format_str) / self.huggingface_folder_name_fstr), state.run_name, state.timestamp)
+        # With a remote uploader the files are staged in a temporary directory;
+        # otherwise they are written straight into save_dir.
+        dir_context_mgr = tempfile.TemporaryDirectory() if self.remote_ud is not None else contextlib.nullcontext(enter_result=save_dir)
+        with dir_context_mgr as temp_save_dir:
+            assert isinstance(temp_save_dir, str)
+            log.debug('Gathering state dict')
+            from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            # Locate the composer model, the HF model, the module whose state
+            # dict is read, and the tokenizer for each wrapping scheme.
+            if state.is_model_ddp:
+                composer_model = state.model.module
+                original_model: PreTrainedModel = state.model.module.model
+                state_dict_model = state.model.module.model
+                original_tokenizer = state.model.module.tokenizer
+            elif isinstance(state.model.model, FSDP):
+                composer_model = state.model
+                original_model: PreTrainedModel = state.model.model.module
+                state_dict_model = state.model.model
+                original_tokenizer = state.model.tokenizer
+            else:
+                composer_model = state.model
+                original_model: PreTrainedModel = state.model.model
+                state_dict_model = state.model.model
+                original_tokenizer = state.model.tokenizer
+            state_dict_context = fsdp_state_dict_type_context(original_model, state_dict_type='full') if not state.is_model_ddp and isinstance(state_dict_model, FSDP) else contextlib.nullcontext()
+            with state_dict_context:
+                state_dict = state_dict_model.state_dict()
+                # Cast every tensor to the configured save precision.
+                for k, v in state_dict.items():
+                    if isinstance(v, torch.Tensor):
+                        state_dict[k] = v.to(dtype=self.dtype)
+            # The state dict is gathered on every rank; only global rank 0 writes.
+            if dist.get_global_rank() == 0:
+                log.debug('Saving Hugging Face checkpoint in global rank 0')
+                copied_config = copy.deepcopy(original_model.config)
+                if copied_config.model_type == 'mpt':
+                    copied_config.attn_config['attn_impl'] = 'torch'
+                    copied_config.init_device = 'cpu'
+                log.debug('Creating new model instance')
+                if composer_model.using_peft:
+                    active_adapter = original_model.active_adapter
+                    base_model = original_model.get_base_model()
+                    new_base_model_instance = type(base_model)(copied_config)
+                    new_model_instance = type(original_model)(new_base_model_instance, original_model.peft_config[active_adapter])
+                    new_model_instance.to(dtype=self.dtype)
+                else:
+                    with init_empty_weights():
+                        new_model_instance = type(original_model)(copied_config)
+                new_model_instance.load_state_dict(state_dict, assign=True)
+                del state_dict
+                log.debug('Saving Hugging Face checkpoint to disk')
+                new_model_instance.save_pretrained(temp_save_dir)
+                if original_tokenizer is not None:
+                    assert isinstance(original_tokenizer, PreTrainedTokenizerBase)
+                    original_tokenizer.save_pretrained(temp_save_dir)
+                if original_model.config.model_type == 'mpt':
+                    log.debug('Editing MPT files for HuggingFace compatibility')
+                    edit_files_for_hf_compatibility(temp_save_dir, self.flatten_imports)
+                if self.remote_ud is not None:
+                    for filename in os.listdir(temp_save_dir):
+                        remote_file_name = os.path.join(save_dir, filename)
+                        remote_file_uri = self.remote_ud.remote_backend.get_uri(remote_file_name)
+                        log.info(f'Uploading HuggingFace formatted checkpoint to {remote_file_uri}')
+                        self.remote_ud.upload_file(state=state, remote_file_name=remote_file_name, file_path=Path(os.path.join(temp_save_dir, filename)), overwrite=self.overwrite)
+                # MLflow registration happens only for the final checkpoint.
+                if self.mlflow_registered_model_name and self._is_last_batch(state):
+                    components = {'model': new_model_instance}
+                    if original_tokenizer is not None:
+                        components['tokenizer'] = original_tokenizer
+                    log.debug('Logging Hugging Face model to MLFlow')
+                    for i, mlflow_logger in enumerate(self.mlflow_loggers):
+                        log.debug(f'Registering model to UC at {mlflow_logger.model_registry_prefix}.{self.mlflow_registered_model_name}')
+                        local_save_path = str(Path(temp_save_dir) / f'mlflow_save_{i}')
+                        import mlflow
+                        # NOTE(review): this replaces a private MLflow function with a stub
+                        # returning ''; the reason is not visible here, confirm before touching.
+                        mlflow.store._unity_catalog.registry.rest_store.get_feature_dependencies = lambda *args, **kwargs: ''
+                        model_saving_kwargs: Dict[str, Any] = {'path': local_save_path}
+                        if composer_model.using_peft:
+                            model_saving_kwargs['flavor'] = 'peft'
+                            model_saving_kwargs['save_pretrained_dir'] = temp_save_dir
+                            model_saving_kwargs['metadata'] = self.mlflow_logging_config['metadata']
+                        else:
+                            model_saving_kwargs['flavor'] = 'transformers'
+                            model_saving_kwargs['transformers_model'] = components
+                            model_saving_kwargs.update(self.mlflow_logging_config)
+                        mlflow_logger.save_model(**model_saving_kwargs)
+                        license_filename = _maybe_get_license_filename(local_save_path, self.mlflow_logging_config['metadata'].get('pretrained_model_name', None))
+                        if license_filename is not None:
+                            mlflow_logger._mlflow_client.log_artifact(mlflow_logger._run_id, os.path.join(local_save_path, license_filename))
+                        mlflow_logger.register_model_with_run_id(model_uri=local_save_path, name=self.mlflow_registered_model_name, await_creation_for=3600)

hf_fsdp.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import functools
+from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Union
+from transformers import PreTrainedModel
+from transformers.models.opt.modeling_opt import OPTDecoder
+if TYPE_CHECKING:
+    from peft import PeftModel
+def rhasattr(obj: Any, attr: str) -> bool:
+    """Dotted-path version of ``hasattr``.
+    ``rhasattr(obj, "foo.bar.baz")`` is true only when ``obj.foo``,
+    ``obj.foo.bar`` and ``obj.foo.bar.baz`` all exist.
+    Reference: https://stackoverflow.com/a/67303315
+    """
+    *parents, leaf = attr.split('.')
+    target = obj
+    for name in parents:
+        if not hasattr(target, name):
+            return False
+        target = getattr(target, name)
+    return hasattr(target, leaf)
+def rgetattr(obj: Any, attr: str, *args: Any) -> Any:
+    """A chain-able attribute version of getattr.
+    For example, to get the attribute `foo.bar.baz` from `obj`, you can use:
+        `rgetattr(obj, "foo.bar.baz")`
+    Args:
+        obj: The object the dotted path starts from.
+        attr: Dotted attribute path.
+        *args: Optional default, forwarded to ``getattr`` at every step of the
+            path.
+    Raises:
+        AttributeError: If a step of the path is missing and no default is given.
+    Reference: https://stackoverflow.com/a/31174427
+    """
+    def _getattr(obj: Any, attr: str):
+        return getattr(obj, attr, *args)
+    # Fold over the path, starting from obj and resolving one name at a time.
+    return functools.reduce(_getattr, [obj, *attr.split('.')])
+def findattr(obj: Any, attrs: Iterable[str]) -> Optional[Any]:
+    """Return the value of the first dotted path in ``attrs`` that exists on ``obj``, else ``None``."""
+    # The generator is lazy, so paths after the first hit are never resolved.
+    matches = (rgetattr(obj, path) for path in attrs if rhasattr(obj, path))
+    return next(matches, None)
+def hf_get_causal_base_model(model: PreTrainedModel) -> Any:
+    """Return the causal decoder backbone of a HuggingFace model.
+    Models exposing ``get_decoder()`` are asked directly. Otherwise the
+    backbone is looked up under the known attribute names:
+        - transformer: (GPT2LMHeadModel, GPTJConfig)
+        - model.decoder: (OPTConfig, BloomConfig)
+        - gpt_neox: (GPTNeoXConfig)
+        - model.transformer
+    Raises:
+        ValueError: If none of the known attributes exists on ``model``.
+    """
+    if not hasattr(model, 'get_decoder'):
+        candidate_paths = ('transformer', 'model.decoder', 'gpt_neox', 'model.transformer')
+        backbone = findattr(model, candidate_paths)
+        if backbone is None:
+            raise ValueError(f'Unable to FSDP-wrap model {model}. Please open a github issue to add support.')
+        return backbone
+    return model.get_decoder()
+def hf_get_hidden_layers(model: PreTrainedModel) -> Any:
+    """Return the hidden layers of the given model.
+    Expects the causal decoder backbone, not the XXForCausalLM wrapper.
+    The layers are looked up under the first of these attribute names that exists:
+        - h: (BloomForCausalLM, GPT2LMHeadModel, GPTJForCausalLM)
+        - decoder.layers: (OPTForCausalLM)
+        - layers: (GPTNeoXForCausalLM, LlaMaForCausalLM)
+        - block
+        - blocks: (MPTForCausalLM)
+    Raises:
+        ValueError: If none of the attribute names exists on ``model``.
+    """
+    hidden_layers_attrs = ('h', 'decoder.layers', 'layers', 'block', 'blocks')
+    found = findattr(model, hidden_layers_attrs)
+    if found is not None:
+        return found
+    raise ValueError(f'Unable to find hidden layer for {model}. Model must have one of the following attributes: {hidden_layers_attrs}')
+def hf_get_init_device(init_device: Optional[str]) -> Optional[str]:
+    """Resolve the device used to initialize a model.
+    ``'mixed'`` becomes ``'cpu'`` on local rank 0 and ``'meta'`` on every other
+    local rank; any other value is returned unchanged.
+    """
+    if init_device != 'mixed':
+        return init_device
+    return 'cpu' if dist.get_local_rank() == 0 else 'meta'
+def prepare_hf_model_for_fsdp(model: PreTrainedModel, init_device: Optional[str]) -> None:
+    """Prepare a HuggingFace model for FSDP wrapping.
+    Dispatches on ``model.config.is_encoder_decoder`` to the encoder/decoder
+    or the causal LM preparation function.
+    """
+    prepare = prepare_hf_enc_dec_model_for_fsdp if model.config.is_encoder_decoder else prepare_hf_causal_lm_model_for_fsdp
+    prepare(model, init_device)
+def prepare_hf_causal_lm_model_for_fsdp(model: Union[PreTrainedModel, 'PeftModel'], init_device: Optional[str]) -> None:
+    """FSDP wrap a HuggingFace decoder.
+    Sets ``_fsdp_wrap`` flags on selected submodules and attaches
+    ``fsdp_wrap_fn`` and ``activation_checkpointing_fn`` to ``model``; both
+    select modules whose type matches the first hidden layer.
+    Args:
+        model: The decoder-only model (optionally PEFT-wrapped), modified in place.
+        init_device: Not used by this function.
+    Raises:
+        ValueError: If the decoder backbone, the hidden layers, the output
+            embeddings or the input embeddings cannot be located.
+    """
+    causal_base_model = hf_get_causal_base_model(model)
+    # OPT decoders and OLMo models get wrapping disabled on the underlying model's `.model` attribute.
+    if isinstance(causal_base_model, OPTDecoder) or model.config.model_type == 'olmo':
+        underlying_model = maybe_get_underlying_model(model)
+        underlying_model.model._fsdp_wrap = False
+    model_block = hf_get_hidden_layers(causal_base_model)
+    lm_head = model.get_output_embeddings()
+    # Prefer the backbone's input embeddings and fall back to the top-level
+    # model. Catch Exception rather than using a bare except so that
+    # KeyboardInterrupt and SystemExit still propagate.
+    try:
+        tied_embeddings = causal_base_model.get_input_embeddings()
+    except Exception:
+        tied_embeddings = model.get_input_embeddings()
+    modules = {'base_model': causal_base_model, 'model_block': model_block, 'lm_head': lm_head, 'tied_embeddings': tied_embeddings}
+    for mod_name, module in modules.items():
+        if module is None:
+            raise ValueError(f'Unable to FSDP-wrap this model! `{mod_name}` does not ' + 'follow common layer/weight naming conventions.')
+    block_type = type(model_block[0])
+    # With tied word embeddings, the backbone, embeddings and head are excluded from separate wrapping.
+    if model.config.tie_word_embeddings:
+        causal_base_model._fsdp_wrap = False
+        tied_embeddings._fsdp_wrap = False
+        lm_head._fsdp_wrap = False
+    if hasattr(model, 'peft_type') and model.peft_type is not None:
+        peft_type = model.peft_type.lower()
+        active_adapters = [adapter.lower() for adapter in model.active_adapters]
+        for name, module in model.named_modules():
+            # Select modules whose name mentions both the PEFT type and an active adapter.
+            if peft_type in name.lower() and any((adapter in name.lower() for adapter in active_adapters)):
+                has_parameters = next(module.parameters(), None) is not None
+                has_buffers = next(module.buffers(), None) is not None
+                if has_parameters or has_buffers:
+                    module._fsdp_wrap = True
+    model.fsdp_wrap_fn = lambda module: isinstance(module, block_type)
+    model.activation_checkpointing_fn = lambda module: isinstance(module, block_type)
def prepare_hf_enc_dec_model_for_fsdp(model: PreTrainedModel, init_device: Optional[str]) -> None:
    """Wrap an encoder/decoder HF model.

    This works for T5, BART, Pegasus, PegasusX, but not all enc/dec (ProphetNet)
    You have model.shared, model.encoder, model.decoder and model.lm_head, where
    model.shared are the embeddings which are tied to model.lm_head, and
    model.shared == model.encoder.embed_tokens and model.shared ==
    model.decoder.embed_tokens

    The attached ``fsdp_wrap_fn`` and ``activation_checkpointing_fn`` match
    both the encoder block type and the decoder block type, so models whose
    encoder and decoder use different block classes get both wrapped.

    Args:
        model: The HuggingFace encoder/decoder model; modified in place.
        init_device: Not read by this function.

    Raises:
        ValueError: If the encoder, decoder, their hidden layers, the LM head
            or the input embeddings resolve to ``None``.
    """
    tied_embeddings = model.get_input_embeddings()
    encoder = model.get_encoder()
    decoder = model.get_decoder()
    lm_head = model.get_output_embeddings()
    encoder_block = hf_get_hidden_layers(encoder)
    decoder_block = hf_get_hidden_layers(decoder)
    modules = {'encoder': encoder, 'decoder': decoder, 'encoder_block': encoder_block, 'decoder_block': decoder_block, 'lm_head': lm_head, 'tied_embeddings': tied_embeddings}
    for mod_name, module in modules.items():
        if module is None:
            raise ValueError(f'Unable to FSDP-wrap this model! `{mod_name}` does not ' + 'follow common layer/weight naming conventions.')
    decoder_block_type = type(decoder_block[0])
    encoder_block_type = type(encoder_block[0])
    if model.config.tie_word_embeddings:
        # With tied weights, the modules sharing them are not wrapped individually.
        tied_embeddings._fsdp_wrap = False
        encoder._fsdp_wrap = False
        decoder._fsdp_wrap = False
        lm_head._fsdp_wrap = False
    # Match both block types in a single predicate. Assigning a decoder-only
    # predicate and then overwriting it with an encoder-only one would leave
    # the decoder blocks unwrapped whenever the two types differ.
    block_types = (decoder_block_type, encoder_block_type)
    model.fsdp_wrap_fn = lambda module: isinstance(module, block_types)
    model.activation_checkpointing_fn = lambda module: isinstance(module, block_types)

hf_t5.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""Implements a Hugging Face T5 wrapped inside a :class:`.ComposerModel`."""
+from __future__ import annotations
+from typing import Mapping
+from transformers import AutoConfig, PreTrainedTokenizerBase, T5ForConditionalGeneration
+from .hf_fsdp import hf_get_init_device
+from .model_wrapper import HuggingFaceModelWithFSDP
+from .utils import init_empty_weights
+from .warnings import experimental_class

huggingface_hub_utils.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import ast
+import importlib
+import os
+from typing import Optional, Sequence
class DeleteSpecificNodes(ast.NodeTransformer):
    """AST transformer that drops every node listed in ``nodes_to_remove``."""

    def __init__(self, nodes_to_remove: list[ast.AST]):
        """Remember which nodes should be removed during traversal."""
        self.nodes_to_remove = nodes_to_remove

    def visit(self, node: ast.AST) -> Optional[ast.AST]:
        """Return ``None`` for a listed node, otherwise visit it as usual."""
        should_drop = node in self.nodes_to_remove
        return None if should_drop else super().visit(node)
def convert_to_relative_import(module_name: str, original_parent_module_name: Optional[str]) -> str:
    """Turn a dotted module path into a single-level relative import target.

    Only the last component of ``module_name`` is kept. If that component
    equals ``original_parent_module_name`` the result is ``'.'``; otherwise it
    is ``'.'`` followed by the component.
    """
    leaf = module_name.rpartition('.')[2]
    return '.' if leaf == original_parent_module_name else '.' + leaf
def find_module_file(module_name: str) -> str:
    """Import ``module_name`` and return the path of the file it was loaded from.

    Args:
        module_name: Dotted name of the module to import.

    Returns:
        The module's ``__file__``.

    Raises:
        ValueError: If ``module_name`` is empty, or the imported module has no
            backing file (``__file__`` is missing or ``None``).
    """
    if not module_name:
        raise ValueError(f'Invalid input: module_name={module_name!r}')
    module = importlib.import_module(module_name)
    # Built-in modules have no `__file__` attribute at all, so use getattr to
    # report them through the same ValueError as a `None` file.
    module_file = getattr(module, '__file__', None)
    if module_file is None:
        raise ValueError(f'Could not find file for module: {module_name}')
    return module_file
def _flatten_import(node: ast.ImportFrom, flatten_imports_prefix: Sequence[str]) -> bool:
    """Decide whether a ``from ... import`` node should be flattened.

    Returns True when ``node.module`` is set and starts with at least one of
    the prefixes in ``flatten_imports_prefix``; False otherwise.
    """
    imported_module = node.module
    if imported_module is None:
        return False
    return any(imported_module.startswith(prefix) for prefix in flatten_imports_prefix)
def _remove_import(node: ast.ImportFrom, remove_imports_prefix: Sequence[str]) -> bool:
    """Decide whether a ``from ... import`` node should be removed.

    Returns True when ``node.module`` is set and starts with at least one of
    the prefixes in ``remove_imports_prefix``; False otherwise.
    """
    imported_module = node.module
    if imported_module is None:
        return False
    return any(imported_module.startswith(prefix) for prefix in remove_imports_prefix)
def process_file(file_path: str, folder_path: str, flatten_imports_prefix: Sequence[str], remove_imports_prefix: Sequence[str]) -> list[str]:
    """Rewrite one Python file into ``folder_path`` and report its dependencies.

    The file is parsed and the following edits are applied to its AST:
      * ``from X import ...`` nodes whose module matches a remove prefix are deleted;
      * ``from X import ...`` nodes whose module matches a flatten prefix are
        rewritten to a relative import, and the imported module's file is
        queued for processing;
      * classes whose name starts with ``Composer`` are deleted;
      * single-target ``__all__ = ...`` assignments are deleted.
    The result is written to ``folder_path``. An ``__init__.py`` is written as
    ``<parent directory name>.py``; any other file keeps its base name.

    Args:
        file_path: Path of the Python file to process.
        folder_path: Directory the rewritten file is written to.
        flatten_imports_prefix: Module prefixes whose imports are flattened.
        remove_imports_prefix: Module prefixes whose imports are removed.
            Checked before the flatten prefixes.

    Returns:
        Paths of the module files referenced by the flattened imports.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        source = f.read()
    parent_module_name = None
    if os.path.basename(file_path) == '__init__.py':
        parent_module_name = os.path.basename(os.path.dirname(file_path))
    tree = ast.parse(source)
    new_files_to_process = []
    nodes_to_remove = []
    for node in ast.walk(tree):
        if isinstance(node, ast.ImportFrom) and node.module is not None and _remove_import(node, remove_imports_prefix):
            nodes_to_remove.append(node)
        elif isinstance(node, ast.ImportFrom) and node.module is not None and _flatten_import(node, flatten_imports_prefix):
            # Resolve the file before rewriting node.module to its relative form.
            module_path = find_module_file(node.module)
            node.module = convert_to_relative_import(node.module, parent_module_name)
            new_files_to_process.append(module_path)
        elif isinstance(node, ast.ClassDef) and node.name.startswith('Composer'):
            nodes_to_remove.append(node)
        elif isinstance(node, ast.Assign) and len(node.targets) == 1 and isinstance(node.targets[0], ast.Name) and (node.targets[0].id == '__all__'):
            nodes_to_remove.append(node)
    transformer = DeleteSpecificNodes(nodes_to_remove)
    new_tree = transformer.visit(tree)
    # Check before opening the output so a failure never leaves a truncated file.
    assert new_tree is not None
    if parent_module_name is not None:
        # Reuse the os.path-derived directory name rather than splitting on '/',
        # which breaks on paths that use a different separator.
        new_filename = parent_module_name + '.py'
    else:
        new_filename = os.path.basename(file_path)
    new_file_path = os.path.join(folder_path, new_filename)
    with open(new_file_path, 'w', encoding='utf-8') as f:
        f.write(ast.unparse(new_tree))
    return new_files_to_process
def edit_files_for_hf_compatibility(folder: str, flatten_imports_prefix: Sequence[str]=('llmfoundry',), remove_imports_prefix: Sequence[str]=('composer', 'omegaconf', 'llmfoundry.metrics')) -> None:
    """Edit files to be compatible with Hugging Face Hub.

    Starts from every ``.py`` entry directly inside ``folder`` and keeps
    processing the files that ``process_file`` reports, visiting each path at
    most once.

    Args:
        folder (str): The folder to process.
        flatten_imports_prefix (Sequence[str], optional): Sequence of prefixes to flatten. Defaults to ('llmfoundry',).
        remove_imports_prefix (Sequence[str], optional): Sequence of prefixes to remove. Takes precedence over flattening.
            Defaults to ('composer', 'omegaconf', 'llmfoundry.metrics').
    """
    pending = [os.path.join(folder, name) for name in os.listdir(folder) if name.endswith('.py')]
    seen = set(pending)
    while pending:
        current = pending.pop()
        # Skip anything that is not a regular Python file.
        if not (os.path.isfile(current) and current.endswith('.py')):
            continue
        discovered = process_file(current, folder, flatten_imports_prefix, remove_imports_prefix)
        for path in discovered:
            if path in seen:
                continue
            seen.add(path)
            pending.append(path)

interfaces.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .callback_with_config import CallbackWithConfig

llmfoundry.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import warnings
+warnings.filterwarnings('ignore', category=UserWarning, module='bitsandbytes')
+import logging
+from .logging_utils import SpecificWarningFilter
+hf_dynamic_modules_logger = logging.getLogger('transformers.dynamic_module_utils')
+new_files_warning_filter = SpecificWarningFilter('A new version of the following files was downloaded from')
+hf_dynamic_modules_logger.addFilter(new_files_warning_filter)
+from . import algorithms, callbacks, loggers, optim, registry, utils
+from .data import ConcatTokensDataset, NoConcatDataset, Seq2SeqFinetuningCollator, build_finetuning_dataloader
+from .hf import ComposerHFCausalLM, ComposerHFT5
+from .attention import MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias, flash_attn_fn, scaled_multihead_dot_product_attention
+from .blocks import MPTBlock
+from .ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
+from .mpt import ComposerMPTCausalLM, MPTConfig, MPTForCausalLM, MPTModel, MPTPreTrainedModel
+from .tokenizers import TiktokenTokenizerWrapper
+__version__ = '0.7.0'

logging_utils.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import logging
+import os
class SpecificWarningFilter(logging.Filter):
    """Logging filter that drops records whose message contains a given text.

    This can be useful for filtering out specific warning messages from third
    party packages.
    """

    def __init__(self, message_to_suppress: str):
        """Create the filter.

        Args:
            message_to_suppress (str): The warning message to suppress.
        """
        super().__init__()
        self.message_to_suppress = message_to_suppress

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False when the formatted record message contains the suppressed text."""
        is_suppressed = self.message_to_suppress in record.getMessage()
        return not is_suppressed
def get_mosaicml_logger():
    """Return a ``MosaicMLLogger`` when the MosaicML platform environment is set up.

    A logger is created only if the environment variable named by
    ``MOSAICML_PLATFORM_ENV_VAR`` equals ``'true'`` (case-insensitive) and the
    one named by ``MOSAICML_ACCESS_TOKEN_ENV_VAR`` is set to a non-empty value.
    Otherwise ``None`` is returned.
    """
    on_platform = os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true'
    if not on_platform:
        return None
    if not os.environ.get(MOSAICML_ACCESS_TOKEN_ENV_VAR):
        return None
    return MosaicMLLogger()

meta_init_context.py CHANGED Viewed

@@ -95,5 +95,5 @@ def init_on_device(device: torch.device, include_buffers: bool=False):
         nn.Module.register_parameter = old_register_parameter
         if include_buffers:
             nn.Module.register_buffer = old_register_buffer
-        for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
             setattr(torch, torch_function_name, old_torch_function)

         nn.Module.register_parameter = old_register_parameter
         if include_buffers:
             nn.Module.register_buffer = old_register_buffer
+        for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
             setattr(torch, torch_function_name, old_torch_function)

model_download_utils.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""Utility functions for downloading models."""
+import copy
+import logging
+import os
+import shutil
+import subprocess
+import time
+import warnings
+from http import HTTPStatus
+from typing import Optional
+from urllib.parse import urljoin
+import huggingface_hub as hf_hub
+import requests
+import tenacity
+import yaml
+from bs4 import BeautifulSoup
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME
+from transformers.utils import WEIGHTS_INDEX_NAME as PYTORCH_WEIGHTS_INDEX_NAME
+from transformers.utils import WEIGHTS_NAME as PYTORCH_WEIGHTS_NAME
+DEFAULT_IGNORE_PATTERNS = ['*.ckpt', '*.h5', '*.msgpack']
+PYTORCH_WEIGHTS_PATTERN = 'pytorch_model*.bin*'
+SAFE_WEIGHTS_PATTERN = 'model*.safetensors*'
+TOKENIZER_FILES = ['special_tokens_map.json', 'tokenizer.json', 'tokenizer.model', 'tokenizer_config.json']
+ORAS_PASSWD_PLACEHOLDER = '<placeholder_for_passwd>'
+ORAS_CLI = 'oras'
+log = logging.getLogger(__name__)
@tenacity.retry(retry=tenacity.retry_if_not_exception_type((ValueError, hf_hub.utils.RepositoryNotFoundError)), stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(min=1, max=10))
def download_from_hf_hub(model: str, save_dir: str, prefer_safetensors: bool=True, tokenizer_only: bool=False, token: Optional[str]=None):
    """Downloads model files from a Hugging Face Hub model repo.

    Only supports models stored in Safetensors and PyTorch formats for now. If both formats are available, only the
    Safetensors weights will be downloaded unless `prefer_safetensors` is set to False.

    The call is retried (up to 3 attempts, exponential backoff) on any error other than ``ValueError`` and
    ``RepositoryNotFoundError``.

    Args:
        model (str): The Hugging Face Hub repo ID.
        save_dir (str): The local path to the directory where the model files will be downloaded.
        prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are
            available. Defaults to True.
        tokenizer_only (bool): If true, only download tokenizer files.
        token (str, optional): The HuggingFace API token. If not provided, the token will be read from the
            `HUGGING_FACE_HUB_TOKEN` environment variable.

    Raises:
        RepositoryNotFoundError: If the model repo doesn't exist or the token is unauthorized.
        ValueError: If the model repo doesn't contain any supported model weights.
    """
    # Use the caller's token for the listing as well; otherwise a private or
    # gated repo fails here even though credentials were supplied.
    repo_files = set(hf_hub.list_repo_files(model, token=token))
    # Copy so the module-level default list is never mutated.
    ignore_patterns = copy.deepcopy(DEFAULT_IGNORE_PATTERNS)
    safetensors_available = SAFE_WEIGHTS_NAME in repo_files or SAFE_WEIGHTS_INDEX_NAME in repo_files
    pytorch_available = PYTORCH_WEIGHTS_NAME in repo_files or PYTORCH_WEIGHTS_INDEX_NAME in repo_files
    if safetensors_available and pytorch_available:
        if prefer_safetensors:
            log.info('Safetensors available and preferred. Excluding pytorch weights.')
            ignore_patterns.append(PYTORCH_WEIGHTS_PATTERN)
        else:
            log.info('Pytorch available and preferred. Excluding safetensors weights.')
            ignore_patterns.append(SAFE_WEIGHTS_PATTERN)
    elif safetensors_available:
        log.info('Only safetensors available. Ignoring weights preference.')
    elif pytorch_available:
        log.info('Only pytorch available. Ignoring weights preference.')
    else:
        raise ValueError(f'No supported model weights found in repo {model}.' + ' Please make sure the repo contains either safetensors or pytorch weights.')
    # Restrict the download to tokenizer files when requested; None means no allow-list.
    allow_patterns = TOKENIZER_FILES if tokenizer_only else None
    download_start = time.time()
    hf_hub.snapshot_download(model, local_dir=save_dir, local_dir_use_symlinks=False, ignore_patterns=ignore_patterns, allow_patterns=allow_patterns, token=token)
    download_duration = time.time() - download_start
    log.info(f'Downloaded model {model} from Hugging Face Hub in {download_duration} seconds')
def _extract_links_from_html(html: str) -> list[str]:
    """Extracts links from HTML content.

    Anchors without an ``href`` attribute are skipped.

    Args:
        html (str): The HTML content

    Returns:
        list[str]: A list of links to download.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # `href=True` keeps only anchors that carry an href, so indexing cannot
    # raise KeyError on tags such as `<a name="top">`.
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]
def _recursive_download(session: requests.Session, base_url: str, path: str, save_dir: str, ignore_cert: bool=False):
    """Downloads all files/subdirectories from a directory on a remote server.

    A URL that ends with ``/`` is treated as a directory listing: its HTML is
    scanned for links and each link is downloaded recursively. Any other URL
    is treated as a file and written under ``save_dir`` at ``path``.

    Args:
        session: A requests.Session through which to make requests to the remote server.
        base_url (str): The base URL where the files are located.
        path (str): The path from the base URL to the files to download. The full URL for the download is equal to
            '<base_url>/<path>'.
        save_dir (str): The directory to save downloaded files to.
        ignore_cert (bool): Whether or not to ignore the validity of the SSL certificate of the remote server.
            Defaults to False.
            WARNING: Setting this to true is *not* secure, as no certificate verification will be performed.

    Raises:
        PermissionError: If the remote server returns a 401 Unauthorized status code.
        ValueError: If the remote server returns a 404 Not Found status code.
        RuntimeError: If the remote server returns any other status code that is not 200 OK.
    """
    url = urljoin(base_url, path)
    log.debug(f'Requesting {url}')
    response = session.get(url, verify=not ignore_cert)
    if response.status_code == HTTPStatus.UNAUTHORIZED:
        raise PermissionError(f'Not authorized to download file from {url}. Received status code {response.status_code}. ')
    elif response.status_code == HTTPStatus.NOT_FOUND:
        raise ValueError(f'Could not find file at {url}. Received status code {response.status_code}')
    elif response.status_code != HTTPStatus.OK:
        raise RuntimeError(f'Could not download file from {url}. Received unexpected status code {response.status_code}')
    if not url.endswith('/'):
        # NOTE(review): `path` is built from links served by the remote host and
        # is not checked for escaping `save_dir` (e.g. `../`) -- confirm the
        # server is trusted or add a containment check.
        save_path = os.path.join(save_dir, path)
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(parent_dir, exist_ok=True)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        log.info(f'Downloaded file {save_path}')
        return
    child_links = _extract_links_from_html(response.content.decode())
    log.debug(f'Found child links under {url}: {child_links}')
    for child_link in child_links:
        _recursive_download(session, base_url, urljoin(path, child_link), save_dir, ignore_cert=ignore_cert)
@tenacity.retry(retry=tenacity.retry_if_not_exception_type((PermissionError, ValueError)), stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(min=1, max=10))
def download_from_http_fileserver(url: str, save_dir: str, ignore_cert: bool=False):
    """Downloads files from a remote HTTP file server.

    The whole download is retried (up to 3 attempts, exponential backoff) on
    any error other than ``PermissionError`` and ``ValueError``.

    Args:
        url (str): The base URL where the files are located.
        save_dir (str): The directory to save downloaded files to.
        ignore_cert (bool): Whether or not to ignore the validity of the SSL certificate of the remote server.
            Defaults to False.
            WARNING: Setting this to true is *not* secure, as no certificate verification will be performed.
    """
    with requests.Session() as session, warnings.catch_warnings():
        # Silence the insecure-request warning only when the caller opted out
        # of certificate verification; the filter is undone on exit.
        if ignore_cert:
            warnings.simplefilter('ignore', category=InsecureRequestWarning)
        _recursive_download(session, url, '', save_dir, ignore_cert=ignore_cert)
def download_from_oras(model: str, config_file: str, credentials_dir: str, save_dir: str, tokenizer_only: bool=False, concurrency: int=10):
    """Download from an OCI-compliant registry using oras.

    Args:
        model (str): The name of the model to download.
        config_file (str): Path to a YAML config file that maps model and tokenizer names to registry paths.
        credentials_dir (str): Path to a directory containing credentials for the registry. It is expected to contain three
            files: `username`, `password`, and `registry`, each of which contains the corresponding credential.
        save_dir (str): Path to the directory where files will be downloaded.
        tokenizer_only (bool): If true, only download the tokenizer files.
        concurrency (int): The number of concurrent downloads to run.

    Raises:
        Exception: If the oras CLI is not found on the PATH.
        ValueError: If one of the credential files cannot be read.
        subprocess.CalledProcessError: If the oras command exits with a non-zero status. The reported command
            omits the credentials.
    """
    if shutil.which(ORAS_CLI) is None:
        raise Exception(f'oras cli command `{ORAS_CLI}` is not found. Please install oras: https://oras.land/docs/installation ')

    def _read_secrets_file(secret_file_path: str):
        """Return the stripped contents of a credential file."""
        try:
            with open(secret_file_path, encoding='utf-8') as f:
                return f.read().strip()
        except Exception as error:
            raise ValueError(f'secrets file {secret_file_path} failed to be read') from error

    secrets = {}
    for secret in ['username', 'password', 'registry']:
        secrets[secret] = _read_secrets_file(os.path.join(credentials_dir, secret))
    with open(config_file, 'r', encoding='utf-8') as f:
        configs = yaml.safe_load(f.read())
    config_type = 'tokenizers' if tokenizer_only else 'models'
    path = configs[config_type][model]
    registry = secrets['registry']

    def get_oras_cmd(username: Optional[str]=None, password: Optional[str]=None):
        """Build the `oras pull` argument list, adding credentials only when given."""
        cmd = [ORAS_CLI, 'pull', f'{registry}/{path}', '-o', save_dir, '--verbose', '--concurrency', str(concurrency)]
        if username is not None:
            cmd.extend(['--username', username])
        if password is not None:
            cmd.extend(['--password', password])
        return cmd

    # Only the credential-free form of the command is ever logged or reported.
    cmd_without_creds = get_oras_cmd()
    log.info(f"CMD for oras cli to run: {' '.join(cmd_without_creds)}")
    # NOTE(review): the password is passed as a command-line argument and may be
    # visible to other local users through the process list -- consider a
    # stdin-based option if the installed oras version supports one.
    cmd_to_run = get_oras_cmd(username=secrets['username'], password=secrets['password'])
    try:
        subprocess.run(cmd_to_run, check=True)
    except subprocess.CalledProcessError as e:
        # `from None` suppresses the implicit exception context. Without it the
        # traceback would also print the original error, whose command line
        # contains the username and password.
        raise subprocess.CalledProcessError(e.returncode, cmd_without_creds, e.output, e.stderr) from None

model_wrapper.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""Re-usable :class:`.ComposerModel` for LLM HF Models."""
+from __future__ import annotations
+from collections import UserDict
+from typing import TYPE_CHECKING, List, Mapping, Optional
+import transformers
+from torchmetrics import Metric
+from transformers import PreTrainedTokenizerBase
+from transformers.utils.generic import ModelOutput
+from .hf_fsdp import prepare_hf_model_for_fsdp
+if TYPE_CHECKING:
+    from peft import PeftConfig
+_HF_IGNORE_INDEX = -100
class HuggingFaceModelWithFSDP(HuggingFaceModel):
    """HuggingFaceModel subclass that also prepares the wrapped model for FSDP."""

    def __init__(self, model: transformers.PreTrainedModel, tokenizer: Optional[PreTrainedTokenizerBase]=None, metrics: Optional[List[Metric]]=None, eval_metrics: Optional[List[Metric]]=None, shift_labels: bool=False, init_device: Optional[str]=None, peft_config: Optional['PeftConfig']=None):
        """Build the base wrapper, then attach the FSDP hooks and param init function."""
        super().__init__(model, tokenizer, use_logits=True, metrics=metrics, eval_metrics=eval_metrics, shift_labels=shift_labels, peft_config=peft_config, should_save_peft_only=True)
        prepare_hf_model_for_fsdp(self.model, init_device)
        # Delegate parameter initialization to the wrapped model's own _init_weights.
        self.model.param_init_fn = lambda module: self.model._init_weights(module)

    def forward(self, batch: Mapping):
        """Run the wrapped model on the batch entries it accepts.

        Raises:
            ValueError: If ``batch`` is neither a ``dict`` nor a ``UserDict``.
        """
        if not isinstance(batch, (dict, UserDict)):
            raise ValueError('Unexpected batch type. Expected a dictionary with keys corresponding to the inputs to the forward function of the Huggingface model')
        # Keep only the keys listed in self.model_forward_args.
        model_inputs = {key: value for key, value in batch.items() if key in self.model_forward_args}
        return self.model(**model_inputs)

    def loss(self, outputs: ModelOutput, batch: Mapping):
        """Return ``outputs['loss']`` when return dicts are enabled, else ``outputs[:2]``."""
        return outputs['loss'] if self.config.use_return_dict else outputs[:2]

modeling_mpt.py CHANGED Viewed

@@ -9,40 +9,27 @@ from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Tuple, Un
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from .attention import is_flash_v1_installed, is_flash_v2_installed
 if is_flash_v2_installed():
     try:
         from flash_attn import bert_padding
         from flash_attn.layers.rotary import RotaryEmbedding as DAILRotaryEmbedding
     except Exception as e:
         raise e
-if is_flash_v1_installed():
-    try:
-        from flash_attn import bert_padding
-    except Exception as e:
-        raise e
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.llama.modeling_llama import LlamaDynamicNTKScalingRotaryEmbedding as HFDynamicNTKScalingRotaryEmbedding
 from transformers.models.llama.modeling_llama import LlamaLinearScalingRotaryEmbedding as HFLinearScalingRotaryEmbedding
 from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding as HFRotaryEmbedding
-from .attention import ATTN_CLASS_REGISTRY, attn_bias_shape, build_attn_bias, gen_slopes
 from .blocks import MPTBlock
 from .custom_embedding import SharedEmbedding
-from .fc import FC_CLASS_REGISTRY as FC_CLASS_REGISTRY
-from .ffn import FFN_CLASS_REGISTRY as FFN_CLASS_REGISTRY
-from .ffn import MPTMLP as MPTMLP
 from .ffn import build_ffn as build_ffn
-from .norm import NORM_CLASS_REGISTRY
 from .configuration_mpt import MPTConfig
-from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
-from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
 from .meta_init_context import init_empty_weights
 from .param_init_fns import generic_param_init_fn_, MODEL_INIT_REGISTRY
-try:
-    from .flash_attn_triton import flash_attn_func as flash_attn_func
-except:
-    pass
 import logging
 log = logging.getLogger(__name__)
@@ -140,9 +127,9 @@ def gen_flash_attn_padding_info(bsz: int, S: int, past_key_len: int, device: tor
         key_padding_mask = attention_mask_in_length
         query_padding_mask = attention_mask_in_length
         unpadding_function = bert_padding.unpad_input_for_concatenated_sequences
-    (_, indices_q, cu_seqlens_q, max_seqlen_q) = unpadding_function(torch.empty(bsz, S, 1, device=device), query_padding_mask)
-    (_, indices_k, cu_seqlens_k, max_seqlen_k) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
-    (_, indices_v, _, _) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
     flash_attn_padding_info['indices_q'] = indices_q
     flash_attn_padding_info['indices_k'] = indices_k
     flash_attn_padding_info['indices_v'] = indices_v
@@ -176,7 +163,6 @@ class MPTModel(MPTPreTrainedModel):
         config._validate_config()
         super().__init__(config)
         self.attn_impl = config.attn_config['attn_impl']
-        self.prefix_lm = config.attn_config['prefix_lm']
         self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
         self.alibi = config.attn_config['alibi']
         self.alibi_bias_max = config.attn_config['alibi_bias_max']
@@ -196,6 +182,10 @@ class MPTModel(MPTPreTrainedModel):
             self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
         self.emb_drop = nn.Dropout(config.emb_pdrop)
         self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
         self.norm_f = norm_class(config.d_model, device=config.init_device)
         self.rope = config.attn_config['rope']
         self.rope_impl = None
@@ -205,10 +195,10 @@ class MPTModel(MPTPreTrainedModel):
         if config.init_device != 'meta':
             log.info(f'We recommend using config.init_device="meta" with Composer + FSDP for faster initialization.')
             self.apply(self.param_init_fn)
-        self.is_causal = not self.prefix_lm
         self._attn_bias_initialized = False
         self.attn_bias = None
-        self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
         if config.no_bias:
             for module in self.modules():
                 if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
@@ -227,7 +217,7 @@ class MPTModel(MPTPreTrainedModel):
         self.wte = value
     @torch.no_grad()
-    def _attn_bias(self, device: torch.device, dtype: torch.dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None) -> Tuple[Optional[torch.Tensor], Optional[torch.ByteTensor]]:
         if not self._attn_bias_initialized:
             if self.attn_bias_shape:
                 self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
@@ -238,10 +228,6 @@ class MPTModel(MPTPreTrainedModel):
         if self.attn_bias is not None:
             self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
         attn_bias = self.attn_bias
-        if self.prefix_lm:
-            assert isinstance(attn_bias, torch.Tensor)
-            assert isinstance(prefix_mask, torch.Tensor)
-            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
         if self.attn_uses_sequence_id and sequence_id is not None:
             assert isinstance(attn_bias, torch.Tensor)
             attn_bias = apply_sequence_id(attn_bias, sequence_id, self.config.max_seq_len)
@@ -252,43 +238,22 @@ class MPTModel(MPTPreTrainedModel):
             else:
                 _s_k = max(0, attn_bias.size(-1) - s_k)
                 attn_bias = attn_bias[:, :, :, _s_k:]
-            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
-                raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
             min_val = torch.finfo(attn_bias.dtype).min
             attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
         return (attn_bias, attention_mask)
-    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor) -> torch.Tensor:
-        (s_k, s_q) = attn_bias.shape[-2:]
-        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
-            raise ValueError('attn_bias does not match the expected shape. ' + f'The last two dimensions should both be {self.config.max_length} ' + f'but are {s_k} and {s_q}.')
-        seq_len = prefix_mask.shape[-1]
-        if seq_len > self.config.max_seq_len:
-            raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
-        attn_bias = attn_bias[..., :seq_len, :seq_len]
-        causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
-        prefix = prefix_mask.view(-1, 1, 1, seq_len)
-        cannot_attend = ~torch.logical_or(causal, prefix.bool())
-        min_val = torch.finfo(attn_bias.dtype).min
-        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
-        return attn_bias
-    def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.Tensor]=None) -> BaseModelOutputWithPast:
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         if attention_mask is not None:
             attention_mask = attention_mask.bool()
-        if prefix_mask is not None:
-            prefix_mask = prefix_mask.bool()
         if not return_dict:
             raise NotImplementedError('return_dict False is not implemented yet for MPT')
         if output_attentions:
             if self.attn_impl != 'torch':
-                raise NotImplementedError('output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.')
         if self.training and attention_mask is not None and (attention_mask[:, 0].sum() != attention_mask.shape[0]):
             raise NotImplementedError('MPT does not support training with left padding.')
-        if self.prefix_lm and prefix_mask is None:
-            raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
         if self.training:
             if self.attn_uses_sequence_id and sequence_id is None:
                 raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
@@ -336,7 +301,7 @@ class MPTModel(MPTPreTrainedModel):
             x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
             assert isinstance(self.emb_drop, nn.Module)
             x = self.emb_drop(x_shrunk)
-        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
         attention_mask_in_length = gen_attention_mask_in_length(sequence_id=sequence_id, S=S, attn_uses_sequence_id=self.attn_uses_sequence_id, attn_impl=self.attn_impl, attention_mask=attention_mask)
         alibi_slopes = None
         if self.alibi and self.attn_impl == 'flash':
@@ -349,12 +314,12 @@ class MPTModel(MPTPreTrainedModel):
         flash_attn_padding_info = {}
         if self.attn_impl == 'flash':
             flash_attn_padding_info = gen_flash_attn_padding_info(bsz, S, past_position, x.device, attention_mask_in_length, attention_mask)
-        for (b_idx, block) in enumerate(self.blocks):
             if output_hidden_states:
                 assert all_hidden_states is not None
                 all_hidden_states = all_hidden_states + (x,)
             past_key_value = past_key_values[b_idx] if past_key_values is not None else None
-            (x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
             if presents is not None:
                 presents += (present,)
             if output_attentions:
@@ -422,7 +387,8 @@ class MPTForCausalLM(MPTPreTrainedModel):
             self.transformer.set_input_embeddings(new_embeddings)
     def tie_weights(self) -> None:
-        self.lm_head = None
     def set_decoder(self, decoder: MPTModel) -> None:
         self.transformer = decoder
@@ -430,10 +396,10 @@ class MPTForCausalLM(MPTPreTrainedModel):
     def get_decoder(self) -> MPTModel:
         return self.transformer
-    def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor]=None) -> CausalLMOutputWithPast:
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds)
         if self.lm_head is not None:
             logits = self.lm_head(outputs.last_hidden_state)
         else:
@@ -459,29 +425,48 @@ class MPTForCausalLM(MPTPreTrainedModel):
         return _fsdp_wrap_fn(self, module)
     def activation_checkpointing_fn(self, module: nn.Module) -> bool:
-        act_ckpt_list = getattr(self.config, 'activation_checkpointing_target', None) or ['MPTBlock']
-        if isinstance(act_ckpt_list, str):
-            act_ckpt_list = [act_ckpt_list]
-        elif not isinstance(act_ckpt_list, list):
-            raise ValueError(f'activation_checkpointing_target must be either a single string or a list, but got {type(act_ckpt_list)}')
-        if 'MPTBlock' in act_ckpt_list or 'mptblock' in act_ckpt_list:
-            if len(act_ckpt_list) > 1:
-                log.info('Activation checkpointing MPTBlock only (ignoring other sub-block modules specified in activation_checkpointing_target).')
-            return isinstance(module, MPTBlock)
-        mod_types = ()
-        for mod_name in act_ckpt_list:
-            if mod_name.lower() == 'mptblock':
-                mod_types += (MPTBlock,)
-            elif mod_name in ATTN_CLASS_REGISTRY:
-                mod_types += (ATTN_CLASS_REGISTRY[mod_name],)
-            elif mod_name in FFN_CLASS_REGISTRY:
-                mod_types += (FFN_CLASS_REGISTRY[mod_name],)
-            elif mod_name in NORM_CLASS_REGISTRY:
-                mod_types += (NORM_CLASS_REGISTRY[mod_name],)
-            else:
-                msg = ', '.join(list(ATTN_CLASS_REGISTRY.keys()) + list(FFN_CLASS_REGISTRY.keys()) + list(NORM_CLASS_REGISTRY.keys()) + ['MPTBlock'])
-                raise ValueError(f'{mod_name} (specified in activation_checkpointing_target) is not a recognized option out of available options {msg}.')
-        return isinstance(module, mod_types)
     def prepare_inputs_for_generation(self, input_ids: torch.Tensor, past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]]=None, inputs_embeds: Optional[torch.Tensor]=None, **kwargs: Any) -> Dict[str, Any]:
         attention_mask = kwargs['attention_mask'].bool()
@@ -493,17 +478,11 @@ class MPTForCausalLM(MPTPreTrainedModel):
             sequence_id = None
         if past_key_values is not None:
             input_ids = input_ids[:, -1].unsqueeze(-1)
-        if self.transformer.prefix_lm:
-            prefix_mask = torch.ones_like(attention_mask)
-            if kwargs.get('use_cache') == False:
-                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
-        else:
-            prefix_mask = None
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {'inputs_embeds': inputs_embeds}
         else:
             model_inputs = {'input_ids': input_ids}
-        model_inputs.update({'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)})
         return model_inputs
     @staticmethod

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from .attention import is_flash_v2_installed
+from .norm import NORM_CLASS_REGISTRY
 if is_flash_v2_installed():
     try:
         from flash_attn import bert_padding
         from flash_attn.layers.rotary import RotaryEmbedding as DAILRotaryEmbedding
     except Exception as e:
         raise e
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.llama.modeling_llama import LlamaDynamicNTKScalingRotaryEmbedding as HFDynamicNTKScalingRotaryEmbedding
 from transformers.models.llama.modeling_llama import LlamaLinearScalingRotaryEmbedding as HFLinearScalingRotaryEmbedding
 from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding as HFRotaryEmbedding
+from .attention import attn_bias_shape, build_attn_bias, gen_slopes
 from .blocks import MPTBlock
 from .custom_embedding import SharedEmbedding
 from .ffn import build_ffn as build_ffn
 from .configuration_mpt import MPTConfig
 from .meta_init_context import init_empty_weights
 from .param_init_fns import generic_param_init_fn_, MODEL_INIT_REGISTRY
+from .act_ckpt import pass_on_block_idx, build_act_ckpt_mod_to_blocks, check_mapping_blocks_overlap
 import logging
 log = logging.getLogger(__name__)
         key_padding_mask = attention_mask_in_length
         query_padding_mask = attention_mask_in_length
         unpadding_function = bert_padding.unpad_input_for_concatenated_sequences
+    _, indices_q, cu_seqlens_q, max_seqlen_q = unpadding_function(torch.empty(bsz, S, 1, device=device), query_padding_mask)
+    _, indices_k, cu_seqlens_k, max_seqlen_k = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
+    _, indices_v, _, _ = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
     flash_attn_padding_info['indices_q'] = indices_q
     flash_attn_padding_info['indices_k'] = indices_k
     flash_attn_padding_info['indices_v'] = indices_v
         config._validate_config()
         super().__init__(config)
         self.attn_impl = config.attn_config['attn_impl']
         self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
         self.alibi = config.attn_config['alibi']
         self.alibi_bias_max = config.attn_config['alibi_bias_max']
             self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
         self.emb_drop = nn.Dropout(config.emb_pdrop)
         self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
+        for i, block in enumerate(self.blocks):
+            block.block_idx = i
+            block.max_block_idx = config.n_layers - 1
+            pass_on_block_idx(block)
         self.norm_f = norm_class(config.d_model, device=config.init_device)
         self.rope = config.attn_config['rope']
         self.rope_impl = None
         if config.init_device != 'meta':
             log.info(f'We recommend using config.init_device="meta" with Composer + FSDP for faster initialization.')
             self.apply(self.param_init_fn)
+        self.is_causal = True
         self._attn_bias_initialized = False
         self.attn_bias = None
+        self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
         if config.no_bias:
             for module in self.modules():
                 if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
         self.wte = value
     @torch.no_grad()
+    def _attn_bias(self, device: torch.device, dtype: torch.dtype, attention_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None) -> Tuple[Optional[torch.Tensor], Optional[torch.ByteTensor]]:
         if not self._attn_bias_initialized:
             if self.attn_bias_shape:
                 self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
         if self.attn_bias is not None:
             self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
         attn_bias = self.attn_bias
         if self.attn_uses_sequence_id and sequence_id is not None:
             assert isinstance(attn_bias, torch.Tensor)
             attn_bias = apply_sequence_id(attn_bias, sequence_id, self.config.max_seq_len)
             else:
                 _s_k = max(0, attn_bias.size(-1) - s_k)
                 attn_bias = attn_bias[:, :, :, _s_k:]
             min_val = torch.finfo(attn_bias.dtype).min
             attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
         return (attn_bias, attention_mask)
+    def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.Tensor]=None) -> BaseModelOutputWithPast:
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         if attention_mask is not None:
             attention_mask = attention_mask.bool()
         if not return_dict:
             raise NotImplementedError('return_dict False is not implemented yet for MPT')
         if output_attentions:
             if self.attn_impl != 'torch':
+                raise NotImplementedError('output_attentions is not implemented for MPT when using attn_impl `flash`.')
         if self.training and attention_mask is not None and (attention_mask[:, 0].sum() != attention_mask.shape[0]):
             raise NotImplementedError('MPT does not support training with left padding.')
         if self.training:
             if self.attn_uses_sequence_id and sequence_id is None:
                 raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
             x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
             assert isinstance(self.emb_drop, nn.Module)
             x = self.emb_drop(x_shrunk)
+        attn_bias, attention_mask = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, sequence_id=sequence_id)
         attention_mask_in_length = gen_attention_mask_in_length(sequence_id=sequence_id, S=S, attn_uses_sequence_id=self.attn_uses_sequence_id, attn_impl=self.attn_impl, attention_mask=attention_mask)
         alibi_slopes = None
         if self.alibi and self.attn_impl == 'flash':
         flash_attn_padding_info = {}
         if self.attn_impl == 'flash':
             flash_attn_padding_info = gen_flash_attn_padding_info(bsz, S, past_position, x.device, attention_mask_in_length, attention_mask)
+        for b_idx, block in enumerate(self.blocks):
             if output_hidden_states:
                 assert all_hidden_states is not None
                 all_hidden_states = all_hidden_states + (x,)
             past_key_value = past_key_values[b_idx] if past_key_values is not None else None
+            x, attn_weights, present = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
             if presents is not None:
                 presents += (present,)
             if output_attentions:
             self.transformer.set_input_embeddings(new_embeddings)
     def tie_weights(self) -> None:
+        """Drop the separate output head when word embeddings are tied.
+        When ``config.tie_word_embeddings`` is truthy (or the attribute is absent, in which case it defaults to True), ``self.lm_head`` is set to None; ``forward`` checks ``self.lm_head is not None`` before using it. When the flag is falsy, ``self.lm_head`` is left untouched.
+        """
+        if getattr(self.config, 'tie_word_embeddings', True):
+            self.lm_head = None
     def set_decoder(self, decoder: MPTModel) -> None:
         self.transformer = decoder
     def get_decoder(self) -> MPTModel:
         return self.transformer
+    def forward(self, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor]=None) -> CausalLMOutputWithPast:
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         use_cache = use_cache if use_cache is not None else self.config.use_cache
+        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds)
         if self.lm_head is not None:
             logits = self.lm_head(outputs.last_hidden_state)
         else:
         return _fsdp_wrap_fn(self, module)
     def activation_checkpointing_fn(self, module: nn.Module) -> bool:
+        """The MPT activation checkpointing (act ckpt) function.
+        When `activation_checkpointing` in fsdp_config is set to true, this function will be called on all the modules in the FSDP wrapped model and determine whether a given module should be activation checkpointed. It checks the checkpointing target (`activation_checkpointing_target` in `model`) which can be specified as below:
+            1. null (or no such field): The whole MPTBlock will be activation checkpointed on all layers
+            2. a list of modules to act ckpt on all layers, e.g.,
+                activation_checkpointing_target:
+                    - grouped_query_attention
+                    - mptmlp
+            3. a dictionary of module name with target_blocks, e.g.,
+                activation_checkpointing_target:
+                    {
+                            "mptblock": target_blocks_1,
+                            "grouped_query_attention": target_blocks_2
+                    }
+                target_blocks (target_blocks_1, target_blocks_2 above) can be:
+                - a single integer n: the first n transformer blocks will be activation checkpointed
+                - a string of first-n, middle-m, last-k, range-i-j: the first n, the middle m, the last k, or the range [i, j) layers will be activation checkpointed. E.g., 'first-2, last-2' means the first 2 and last 2 transformer blocks will be activation checkpointed
+                    middle-m is range [start, end) where ``start = max(max_block_idx // 2 - m // 2, 0), end = min(start + m, max_block_idx + 1)``
+                - a list of integers corresponds to the list of transformer block ids, e.g., [2] means the second transformer block will be activation checkpointed. [2, 3] means the second and third transformer blocks will be activation checkpointed
+                - a list of mixed integers and strings of first-n, middle-m, last-k, range-i-j
+            An example in yaml config file:
+                fsdp_config:
+                    activation_checkpointing: true
+                model:
+                    activation_checkpointing_target:
+                        {
+                            "mptblock": 'first-5',
+                            "grouped_query_attention": 'last-35'
+                        }
+        Args:
+            module (nn.Module): The candidate module to decide on.
+        Returns:
+            bool: True if ``module`` should be activation checkpointed, False otherwise (including for any module that carries no ``block_idx`` attribute).
+        """
+        # NOTE(review): the block ids are compared against ``module.block_idx`` below; whether "[2]" in the docstring is zero- or one-based depends on how block_idx is assigned and on build_act_ckpt_mod_to_blocks -- confirm.
+        # Modules without a block_idx are rejected outright; the debug message states that only transformer blocks and their submodules are eligible.
+        if not hasattr(module, 'block_idx'):
+            log.debug(f'{module.__class__.__name__} cannot be activation checkpointed. Only transformer block or its submodules are eligible for activation checkpointing.')
+            return False
+        act_ckpt_target = getattr(self.config, 'activation_checkpointing_target', None)
+        # The mapping is rebuilt and re-validated on every call; its keys are used as classes in the isinstance check below.
+        act_ckpt_mod_to_blocks = build_act_ckpt_mod_to_blocks(act_ckpt_target, MPTBlock, module.max_block_idx)
+        check_mapping_blocks_overlap(act_ckpt_mod_to_blocks, module.max_block_idx)
+        for k in act_ckpt_mod_to_blocks.keys():
+            if isinstance(module, k):
+                blocks = act_ckpt_mod_to_blocks[k]
+                # A value of -1 selects the module regardless of its block index; otherwise the module's block index must be among the listed blocks.
+                return True if blocks == -1 else module.block_idx in blocks
+        # The module's type matches none of the configured targets.
+        return False
     def prepare_inputs_for_generation(self, input_ids: torch.Tensor, past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]]=None, inputs_embeds: Optional[torch.Tensor]=None, **kwargs: Any) -> Dict[str, Any]:
         attention_mask = kwargs['attention_mask'].bool()
             sequence_id = None
         if past_key_values is not None:
             input_ids = input_ids[:, -1].unsqueeze(-1)
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {'inputs_embeds': inputs_embeds}
         else:
             model_inputs = {'input_ids': input_ids}
+        model_inputs.update({'attention_mask': attention_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)})
         return model_inputs
     @staticmethod

monolithic_ckpt_callback.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import contextlib
+import os
+import tempfile
+from pathlib import Path
+import torch
+class MonolithicCheckpointSaver(Callback):
+    """Save a monolithic checkpoint every N batches.
+    The checkpoint holds the trainer state, the RNG state and a ``'full'`` model state dict (optionally also the first optimizer's state). It is written locally by global rank 0 and, when ``save_folder`` parses to a non-empty backend, uploaded through a ``RemoteUploaderDownloader``.
+    Args:
+        save_folder (str): Folder to save checkpoints to (can be a URI)
+        batch_interval (int): Number of batches between checkpoints.
+        filename (str): Filename to save checkpoints to.
+        overwrite (bool): Whether to overwrite previous checkpoints. Only forwarded to the remote upload; it is not consulted for the local ``torch.save``.
+        keep_optimizers (bool): Whether to save the optimizer state in the monolithic checkpoint.
+    """
+    def __init__(self, save_folder: str, batch_interval: int, filename: str='ep{epoch}-ba{batch}.pt', overwrite: bool=False, keep_optimizers: bool=False):
+        # Split the destination into backend, bucket name and the folder format string.
+        self.backend, self.bucket_name, self.save_dir_format_str = parse_uri(save_folder)
+        self.filename_format_str = filename
+        self.batch_interval = batch_interval
+        # An empty backend means a plain local path: no upload is needed.
+        self.upload_to_object_store = self.backend != ''
+        self.overwrite = overwrite
+        self.keep_optimizers = keep_optimizers
+        if self.upload_to_object_store:
+            self.remote_ud = RemoteUploaderDownloader(bucket_uri=f'{self.backend}://{self.bucket_name}')
+        else:
+            self.remote_ud = None
+    def init(self, state: State, logger: Logger) -> None:
+        """Initialize the uploader (if any) and register it in ``state.callbacks``."""
+        if self.upload_to_object_store and self.remote_ud is not None:
+            self.remote_ud.init(state, logger)
+            state.callbacks.append(self.remote_ud)
+    def batch_checkpoint(self, state: State, logger: Logger) -> None:
+        """Save a checkpoint when the batch count is a multiple of ``batch_interval``."""
+        if state.timestamp.batch.value % self.batch_interval == 0:
+            self._save_checkpoint(state, logger)
+    def fit_end(self, state: State, logger: Logger) -> None:
+        """Save a final checkpoint unless the last batch already fell on an interval boundary."""
+        if state.timestamp.batch.value % self.batch_interval != 0:
+            self._save_checkpoint(state, logger)
+    def _save_checkpoint(self, state: State, logger: Logger) -> None:
+        """Build the checkpoint dict, write it on global rank 0 and optionally upload it.
+        Args:
+            state (State): Training state providing the model, optimizers, run name and timestamp.
+            logger (Logger): Unused; deleted immediately.
+        """
+        del logger
+        filename = format_name_with_dist_and_time(self.filename_format_str, state.run_name, state.timestamp)
+        save_dir = format_name_with_dist_and_time(self.save_dir_format_str, state.run_name, state.timestamp)
+        # When uploading, stage the file in a temporary directory that is removed on exit; otherwise write straight into save_dir.
+        dir_context_mgr = tempfile.TemporaryDirectory() if self.upload_to_object_store else contextlib.nullcontext(enter_result=save_dir)
+        with dir_context_mgr as temp_save_dir:
+            assert isinstance(temp_save_dir, str)
+            save_path = str(Path(temp_save_dir) / Path(filename))
+            dirname = os.path.dirname(save_path)
+            if dirname:
+                os.makedirs(dirname, exist_ok=True)
+            state_dict = {'state': state.state_dict(), 'rng': reproducibility.get_rng_state()}
+            # Discard the optimizer and model entries produced by state.state_dict(); the model is re-added below from a 'full' state dict.
+            state_dict['state'].pop('optimizers')
+            state_dict['state'].pop('model')
+            # The state-dict calls run on every rank (presumably because gathering the full state is collective -- confirm); only the write below is rank-gated.
+            with fsdp_state_dict_type_context(state.model, state_dict_type='full'):
+                state_dict['state']['model'] = state.model.state_dict()
+            if self.keep_optimizers:
+                # Only the first optimizer is saved, keyed by its class qualname.
+                optimizer = state.optimizers[0]
+                state_dict['state']['optimizers'] = {type(optimizer).__qualname__: fsdp_get_optim_state_dict(state.model, optimizer, state_dict_type='full')}
+            if dist.get_global_rank() == 0:
+                torch.save(state_dict, save_path)
+            if self.upload_to_object_store and self.remote_ud is not None and (dist.get_global_rank() == 0):
+                # The remote name uses the formatted save_dir, not the temporary staging directory.
+                remote_file_name = str(Path(save_dir) / Path(filename))
+                self.remote_ud.upload_file(state=state, remote_file_name=remote_file_name, file_path=Path(save_path), overwrite=self.overwrite)

mosaicml_logger_utils.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import json
+import os
+from typing import Any, Dict, List, Optional, Union
+_MODEL_KEYS_TO_LOG = ['pretrained_model_name_or_path', 'pretrained', 'vocab_size', 'd_model', 'n_heads', 'n_layers', 'expansion_ratio', 'max_seq_len']
+def maybe_create_mosaicml_logger() -> Optional[MosaicMLLogger]:
+    """Creates a MosaicMLLogger if the run was sent from the Mosaic platform."""
+    if os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true' and os.environ.get(MOSAICML_ACCESS_TOKEN_ENV_VAR):
+        return MosaicMLLogger()
+def find_mosaicml_logger(loggers: List[LoggerDestination]) -> Optional[MosaicMLLogger]:
+    """Returns the first MosaicMLLogger from a list, and None otherwise."""
+    return next((logger for logger in loggers if isinstance(logger, MosaicMLLogger)), None)
+def log_eval_analytics(mosaicml_logger: MosaicMLLogger, model_configs: ListConfig, icl_tasks: Union[str, ListConfig], eval_gauntlet_config: Optional[Union[str, DictConfig]]):
+    """Logs analytics for runs using the `eval.py` script."""
+    metrics: Dict[str, Any] = {'llmfoundry/script': 'eval'}
+    metrics['llmfoundry/gauntlet_configured'] = eval_gauntlet_config is not None
+    metrics['llmfoundry/icl_configured'] = isinstance(icl_tasks, str) or len(icl_tasks) > 0
+    metrics['llmfoundry/model_configs'] = []
+    for model_config in model_configs:
+        nested_model_config = model_config.get('model', {})
+        model_config_data = {}
+        for key in _MODEL_KEYS_TO_LOG:
+            if nested_model_config.get(key, None) is not None:
+                model_config_data[key] = nested_model_config.get(key)
+        if len(model_config_data) > 0:
+            metrics['llmfoundry/model_configs'].append(json.dumps(model_config_data, sort_keys=True))
+    mosaicml_logger.log_metrics(metrics)
+    mosaicml_logger._flush_metadata(force_flush=True)
+def log_train_analytics(mosaicml_logger: MosaicMLLogger, model_config: DictConfig, train_loader_config: DictConfig, eval_loader_config: Optional[Union[DictConfig, ListConfig]], callback_configs: Optional[DictConfig], tokenizer_name: str, load_path: Optional[str], icl_tasks_config: Optional[Union[ListConfig, str]], eval_gauntlet: Optional[Union[DictConfig, str]]):
+    """Logs analytics for runs using the `train.py` script.
+    Args:
+        mosaicml_logger (MosaicMLLogger): Logger that receives the metrics and is force-flushed afterwards.
+        model_config (DictConfig): Model config; the keys in ``_MODEL_KEYS_TO_LOG`` that are set are logged under a ``llmfoundry/`` prefix.
+        train_loader_config (DictConfig): Train loader config; its ``name`` and ``dataset.hf_name`` are read.
+        eval_loader_config (Optional[Union[DictConfig, ListConfig]]): One eval loader config or a list of them; skipped when None.
+        callback_configs (Optional[DictConfig]): Callback configs; only the names (keys) are logged.
+        tokenizer_name (str): Tokenizer name, logged as-is.
+        load_path (Optional[str]): Checkpoint path; a non-None value marks a ``text`` run as continued pretraining.
+        icl_tasks_config (Optional[Union[ListConfig, str]]): ICL tasks; counted as configured when it is a string or a non-empty list.
+        eval_gauntlet (Optional[Union[DictConfig, str]]): Gauntlet config; counted as configured when not None.
+    """
+    train_loader_dataset = train_loader_config.get('dataset', {})
+    metrics: Dict[str, Any] = {'llmfoundry/tokenizer_name': tokenizer_name, 'llmfoundry/script': 'train', 'llmfoundry/train_loader_name': train_loader_config.get('name')}
+    if callback_configs is not None:
+        metrics['llmfoundry/callbacks'] = [name for name, _ in callback_configs.items()]
+    metrics['llmfoundry/gauntlet_configured'] = eval_gauntlet is not None
+    metrics['llmfoundry/icl_configured'] = icl_tasks_config is not None and (isinstance(icl_tasks_config, str) or len(icl_tasks_config) > 0)
+    if train_loader_dataset.get('hf_name', None) is not None:
+        metrics['llmfoundry/train_dataset_hf_name'] = train_loader_dataset.get('hf_name', None)
+    # The task type is derived from the loader name; loaders other than 'finetuning' and 'text' get no train_task_type entry.
+    if train_loader_config.get('name') == 'finetuning':
+        metrics['llmfoundry/train_task_type'] = 'INSTRUCTION_FINETUNE'
+    elif train_loader_config.get('name') == 'text':
+        # A 'text' run counts as continued pretraining when it starts from a checkpoint path or from a model configured with pretrained == True.
+        if load_path is not None or model_config.get('pretrained') == True:
+            metrics['llmfoundry/train_task_type'] = 'CONTINUED_PRETRAIN'
+        else:
+            metrics['llmfoundry/train_task_type'] = 'PRETRAIN'
+    if eval_loader_config is not None:
+        metrics['llmfoundry/eval_loaders'] = []
+        # Normalize a single loader config into a one-element list so both shapes share the loop below.
+        if isinstance(eval_loader_config, ListConfig):
+            eval_loader_configs: ListConfig = eval_loader_config
+        else:
+            eval_loader_configs = ListConfig([eval_loader_config])
+        for loader_config in eval_loader_configs:
+            eval_loader_info = {}
+            eval_loader_dataset = loader_config.get('dataset', {})
+            eval_loader_info['name'] = loader_config.get('name')
+            if eval_loader_dataset.get('hf_name', None) is not None:
+                eval_loader_info['dataset_hf_name'] = eval_loader_dataset.get('hf_name')
+            # Each eval loader is logged as one JSON string with sorted keys.
+            metrics['llmfoundry/eval_loaders'].append(json.dumps(eval_loader_info, sort_keys=True))
+    # Whitelisted model keys are merged into the metrics as individual 'llmfoundry/<key>' entries.
+    model_config_data = {}
+    for key in _MODEL_KEYS_TO_LOG:
+        if model_config.get(key, None) is not None:
+            model_config_data[f'llmfoundry/{key}'] = model_config.get(key)
+    if len(model_config_data) > 0:
+        metrics.update(model_config_data)
+    mosaicml_logger.log_metrics(metrics)
+    mosaicml_logger._flush_metadata(force_flush=True)

mpt.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .configuration_mpt import MPTConfig
2	+ from .modeling_mpt import ComposerMPTCausalLM, MPTForCausalLM, MPTModel, MPTPreTrainedModel

packing.py ADDED Viewed

	@@ -0,0 +1,272 @@

+import logging
+import tempfile
+from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple
+import numpy as np
+import torch
+from transformers import PreTrainedTokenizerBase
+log = logging.getLogger(__name__)
+class BinPackCollator:
+    """Utility collator for packing to reduce padding.
+    Wraps a base collator: the collated batch is trimmed of padding, the trimmed examples are bin-packed into ``target_batch_size`` bins of at most ``max_seq_len`` tokens, and the result is re-padded. Bins left over from one call are carried into the next.
+    Args:
+        collator (Callable): Base collator applied to the raw examples before packing.
+        target_batch_size (int): Number of bins requested from the packer per call; must be > 0.
+        max_seq_len (int): Maximum bin size passed to the packer and to re-padding; must be > 0.
+        pad_token_id (int): Pad token id forwarded to re-padding; must be >= 0.
+        padding_side (Literal['left', 'right']): Padding side forwarded to re-padding.
+        max_leftover_bins_to_keep (Optional[int]): Upper bound on leftover bins retained between calls; None keeps all of them. Must be >= 0 when given.
+    Raises:
+        ValueError: If any of the numeric arguments violates the bounds above.
+    """
+    def __init__(self, collator: Callable, target_batch_size: int, max_seq_len: int, pad_token_id: int, padding_side: Literal['left', 'right'], max_leftover_bins_to_keep: Optional[int]=None):
+        self.base_collator = collator
+        self.out_size = int(target_batch_size)
+        self.max_seq_len = int(max_seq_len)
+        self.pad_token_id = int(pad_token_id)
+        self.padding_side = padding_side
+        # Validation runs on the int-converted values; the messages echo the raw arguments.
+        if self.out_size <= 0:
+            raise ValueError(f'target_batch_size={target_batch_size!r} must be >0.')
+        if self.max_seq_len <= 0:
+            raise ValueError(f'max_seq_len={max_seq_len!r} must be >0.')
+        if self.pad_token_id < 0:
+            raise ValueError(f'pad_token_id={pad_token_id!r} must be >=0.')
+        if max_leftover_bins_to_keep is not None and max_leftover_bins_to_keep < 0:
+            raise ValueError(f'max_leftover_bins_to_keep={max_leftover_bins_to_keep!r} must be >=0 or None.')
+        self.max_leftover_bins_to_keep = max_leftover_bins_to_keep
+        # Running totals accumulated across calls; they feed the waste/efficiency properties.
+        self.n_packed_tokens = 0
+        self.n_total_tokens = 0
+        self.n_packed_examples = 0
+        # (size, example) bins carried over from the previous call.
+        self._leftover_bins: List[Tuple[int, Dict[str, torch.Tensor]]] = []
+    @property
+    def waste(self) -> float:
+        # One minus the ratio of packed tokens to total tokens. Raises ZeroDivisionError while n_total_tokens is still 0 (i.e. before any tokens have been counted).
+        return 1 - self.n_packed_tokens / self.n_total_tokens
+    @property
+    def efficiency(self) -> float:
+        # Packed tokens divided by the capacity of all emitted examples. Raises ZeroDivisionError while n_packed_examples is still 0.
+        return self.n_packed_tokens / (self.max_seq_len * self.n_packed_examples)
+    def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
+        """Collate ``examples`` with the base collator, then pack the resulting batch."""
+        batch = self.base_collator(examples)
+        return self.pack(batch)
+    def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """Trim padding from ``batch`` and pack it into fixed-size bins.
+        The batch must contain ``attention_mask`` and ``input_ids`` and may only contain the keys ``input_ids``, ``labels``, ``attention_mask`` and ``sequence_id``; these are checked with ``assert``.
+        """
+        assert 'attention_mask' in batch
+        assert 'input_ids' in batch
+        for key in batch.keys():
+            assert key in ['input_ids', 'labels', 'attention_mask', 'sequence_id']
+        sizes, trimmed_examples = _trim_batch(batch)
+        return self._pack_trimmed_examples(trimmed_examples, sizes)
+    def _pack_trimmed_examples(self, trimmed_examples: List[Dict[str, torch.Tensor]], sizes: List[int]) -> Dict[str, torch.Tensor]:
+        """Packs trimmed examples into fixed-size bins and repads them.
+        Args:
+            trimmed_examples (List[Dict[str, torch.Tensor]]): A list of trimmed examples.
+            sizes (List[int]): The sizes of the trimmed examples.
+        Returns:
+            Dict[str, torch.Tensor]: A batch of repadded examples ready for processing
+        """
+        # Leftover bins from the previous call are handed to the packer as existing bins.
+        packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = _first_fit_bin_packing(sizes=sizes, examples=trimmed_examples, num_bins=self.out_size, max_bin_size=self.max_seq_len, existing_bins=self._leftover_bins)
+        self.n_packed_tokens += n_packed_tokens
+        self.n_total_tokens += n_total_tokens
+        # The example counter always advances by out_size, independent of len(packed_examples).
+        self.n_packed_examples += self.out_size
+        # Slicing with None (the default) keeps every leftover bin.
+        self._leftover_bins = leftover_bins[:self.max_leftover_bins_to_keep]
+        batch = _repad(packed_examples, max_seq_len=self.max_seq_len, pad_token_id=self.pad_token_id, padding_side=self.padding_side)
+        return batch
+def _trim_batch(batch: Dict[str, torch.Tensor]) -> Tuple[List[int], List[Dict[str, torch.Tensor]]]:
+    """Trims padding off all examples in batch.
+    Args:
+        batch (Dict[str, torch.Tensor]): Batch of padded data with tensors as values.
+    Returns:
+        A tuple with unpadded lengths of examples and a list of each trimmed example from the batch.
+    """
+    sizes, trimmed_examples = ([], [])
+    for idx in range(batch['attention_mask'].shape[0]):
+        size, trimmed_example = _extract_trim_batch_idx(batch, idx)
+        sizes.append(size)
+        trimmed_examples.append(trimmed_example)
+    return (sizes, trimmed_examples)
+def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor], idx: int) -> Tuple[int, Dict[str, torch.Tensor]]:
+    example = {k: v[idx] for k, v in batch.items()}
+    keep = example['attention_mask'] == 1
+    size = int(keep.sum())
+    trim_example = {k: v[keep] for k, v in example.items()}
+    trim_example['sequence_id'] = torch.zeros_like(trim_example['input_ids'])
+    return (size, trim_example)
+def _combine_in_place(example: Dict[str, torch.Tensor], add_on: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    if 'labels' in add_on:
+        add_on['labels'][0] = -100
+    for k in example.keys():
+        if k == 'sequence_id':
+            example[k] = torch.cat([example[k], add_on[k] + 1 + torch.max(example[k])])
+        else:
+            example[k] = torch.cat([example[k], add_on[k]])
+    return example
+def _first_fit_bin_packing(sizes: List[int], examples: List[Dict[str, torch.Tensor]], num_bins: int, max_bin_size: int, existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]]) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[str, torch.Tensor]]]]:
+    """Pack examples into bins of at most ``max_bin_size`` tokens using first-fit on size-sorted examples.
+    Args:
+        sizes (List[int]): Length of each example.
+        examples (List[Dict[str, torch.Tensor]]): The examples, in the same order as ``sizes``.
+        num_bins (int): Number of bins (packed examples) to return.
+        max_bin_size (int): Maximum total size allowed in one bin.
+        existing_bins (List[Tuple[int, Dict[str, torch.Tensor]]]): ``(size, packed_example)`` bins to start from.
+            This list is used directly (not copied), so it is appended to and popped from in place.
+    Returns:
+        A tuple of: the ``num_bins`` largest packed examples, the total size of those bins,
+        ``sum(sizes)``, and the remaining ``(size, packed_example)`` bins sorted from largest to smallest.
+    Raises:
+        AssertionError: If the size added to the bins does not equal ``sum(sizes)``.
+    """
+    # Alias, not a copy: every append/pop below also changes the caller's list.
+    bins: List[Tuple[int, Dict[str, torch.Tensor]]] = existing_bins
+    starting_total_bin_sizes = sum([bin_size for bin_size, _ in bins])
+    sizes_and_examples = [(size, example) for size, example in zip(sizes, examples)]
+    # Largest examples are placed first.
+    sorted_sizes_and_examples = sorted(sizes_and_examples, key=lambda x: x[0], reverse=True)
+    required_num_examples = max(0, num_bins - len(bins))
+    num_examples = len(sizes)
+    # Too few examples to fill the missing bins: give each example its own bin, no combining.
+    if num_examples < required_num_examples:
+        for size, example in sorted_sizes_and_examples:
+            bins.append((size, example))
+        total_bin_sizes = sum([bin_size for bin_size, _ in bins])
+        total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes
+        total_example_sizes = sum(sizes)
+        if total_new_bin_sizes != total_example_sizes:
+            raise AssertionError(f'Error in packing. total_example_sizes={total_example_sizes!r} does not equal total_new_bin_sizes={total_new_bin_sizes!r}.')
+        sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True)
+        bin_sizes, packed_examples = ([], [])
+        for bin_size, packed_example in sorted_bins:
+            bin_sizes.append(bin_size)
+            packed_examples.append(packed_example)
+        return (packed_examples[:num_bins], sum(bin_sizes[:num_bins]), sum(sizes), sorted_bins[num_bins:])
+    for i, (size, example) in enumerate(sorted_sizes_and_examples):
+        required_num_examples = max(0, num_bins - len(bins))
+        n_remaining = num_examples - i
+        assert n_remaining >= required_num_examples
+        # Exactly as many examples left as bins still missing: each must open a new bin.
+        if n_remaining == required_num_examples:
+            bins.append((size, example))
+            continue
+        added = False
+        # First fit: use the first bin (in current list order) that still has room.
+        for bidx in range(len(bins)):
+            if bins[bidx][0] + size <= max_bin_size:
+                bin_size, packed_example = bins.pop(bidx)
+                bin_size = bin_size + size
+                packed_example = _combine_in_place(packed_example, example)
+                # The grown bin is re-added at the end of the list, not at its old position.
+                bins.append((bin_size, packed_example))
+                added = True
+                break
+        if not added:
+            bins.append((size, example))
+    # Sanity check: the size added to the bins must equal the size of the incoming examples.
+    total_bin_sizes = sum([bin_size for bin_size, _ in bins])
+    total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes
+    total_example_sizes = sum(sizes)
+    if total_new_bin_sizes != total_example_sizes:
+        raise AssertionError(f'Error in packing. total_example_sizes={total_example_sizes!r} does not equal total_new_bin_sizes={total_new_bin_sizes!r}.')
+    sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True)
+    bin_sizes, packed_examples = ([], [])
+    for bin_size, packed_example in sorted_bins:
+        bin_sizes.append(bin_size)
+        packed_examples.append(packed_example)
+    # The fullest num_bins bins are returned; everything after them is handed back as leftovers.
+    return (packed_examples[:num_bins], sum(bin_sizes[:num_bins]), sum(sizes), sorted_bins[num_bins:])
+def _repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int, pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]:
+    def pad_tensor(tensor: torch.Tensor, pad_value: int):
+        if len(tensor) == max_seq_len:
+            return tensor
+        t = torch.full((max_seq_len,), pad_value, dtype=tensor.dtype, device=tensor.device)
+        if padding_side == 'left':
+            t[-len(tensor):] = tensor
+        elif padding_side == 'right':
+            t[:len(tensor)] = tensor
+        else:
+            raise ValueError(f'Unknown padding_side={padding_side!r}')
+        return t
+    pad_vals = {'input_ids': pad_token_id, 'labels': -100, 'attention_mask': 0, 'sequence_id': -1}
+    keys = packed_examples[0].keys()
+    batch = {}
+    for key in keys:
+        batch[key] = torch.stack([pad_tensor(example[key], pad_vals[key]) for example in packed_examples])
+    return batch
+def auto_packing_ratio(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int, num_packing_ratios: int=20) -> float:
+    """Find a packing ratio that minimizes padding with zero waste.
+    By packing examples, we can increase training efficiency, training on more data with less batches.
+    However, in practice, the selected packing_ratio may produce some waste because profiling is done on only
+    a subset of the dataset.
+    We select a min_ratio of 1 and a max_ratio that is the max_seq_len / 100, and profile up to
+    num_packing_ratios packing ratios between min_ratio and max_ratio, inclusive.
+    When a packing_ratio with non-zero waste is found, we stop and select the previous ratio,
+    which has zero waste.
+    Args:
+        dataloader_cfg (DictConfig): The dataloader configuration for profiling.
+        tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
+        device_batch_size (int): The size of the batches (number of examples) per device.
+        num_packing_ratio (int): The number of packing ratios to try.
+    Returns:
+        A packing ratio that minimizes padding while maintaining zero waste.
+    """
+    rng_state = reproducibility.get_rng_state()
+    reproducibility.seed_all(0)
+    max_seq_len = dataloader_cfg.dataset.max_seq_len
+    if max_seq_len <= 100:
+        return 1
+    min_ratio = 1
+    max_ratio = max_seq_len / 100
+    profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio, max_ratio, num_packing_ratios, device_batch_size)
+    packing_ratio = 1
+    for packing_ratio_candidate, _, waste in profiling_results:
+        if waste is None or waste > 0:
+            break
+        packing_ratio = packing_ratio_candidate
+    if dist.is_available() and dist.is_initialized():
+        device = get_device(None)
+        packing_ratio_tensor = device.tensor_to_device(torch.tensor(packing_ratio))
+        dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN')
+        packing_ratio = packing_ratio_tensor.item()
+    reproducibility.load_rng_state(rng_state)
+    return packing_ratio
+def profile_packing(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, min_ratio: float, max_ratio: float, num_packing_ratios: int, device_batch_size: int) -> Iterable[Tuple[float, Optional[float], Optional[float]]]:
+    """Generator function that profiles example packing across packing ratios.
+    Args:
+        dataloader_cfg (DictConfig): The dataloader configuration for profiling.
+        tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
+        min_ratio (float): Smallest packing_ratio to test. Must be >=1.
+        max_ratio (float): Largest packing_ratio to test. Must be larger than `min_ratio`.
+        num_packing_ratios (int): Number of packing_ratio values (spaced between `min_ratio` and `max_ratio`) to try.
+        device_batch_size (int): The size of the batches (number of examples) per device.
+    Returns:
+        An iterable of tuples of packing ratio, padding, and waste, sorted by smallest to largest packing ratio.
+        Padding and waste are percentages; both are None when no batch could be packed.
+    """
+    import copy
+    from .dataloader import build_dataloader
+    max_seq_len = dataloader_cfg.dataset.get('max_seq_len')
+    max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', None)
+    # Work on a private copy so the caller's config is left untouched; packing is
+    # turned off (ratio 1.0) and the loader is made single-process for profiling.
+    dataloader_cfg = copy.deepcopy(dataloader_cfg)
+    dataloader_cfg.dataset.packing_ratio = 1.0
+    dataloader_cfg.drop_last = False
+    dataloader_cfg.num_workers = 0
+    dataloader_cfg.prefetch_factor = None
+    dataloader_cfg.persistent_workers = False
+    if dataloader_cfg.dataset.get('remote') is not None:
+        # NOTE(review): only the path string is kept; the TemporaryDirectory object itself is
+        # discarded immediately — confirm that using just the name is intended.
+        dataloader_cfg.dataset.local = tempfile.TemporaryDirectory().name
+    packing_ratios, raw_batch_sizes = ([], [])
+    for packing_ratio in np.linspace(min_ratio, max_ratio, num_packing_ratios, endpoint=True):
+        # Round to one decimal place and skip ratios that lead to an already-seen raw batch size.
+        packing_ratio = np.round(10 * packing_ratio) / 10
+        raw_batch_size = int(packing_ratio * device_batch_size)
+        if raw_batch_size not in raw_batch_sizes:
+            packing_ratios.append(packing_ratio)
+            raw_batch_sizes.append(raw_batch_size)
+    # One large batch (100x the largest raw batch size) is loaded once and reused for every ratio.
+    n_profile_examples = max(raw_batch_sizes) * 100
+    train_dataspec = build_dataloader(dataloader_cfg, tokenizer, n_profile_examples)
+    train_dataloader = train_dataspec.dataloader
+    big_batch = next(iter(train_dataloader))
+    sizes, trimmed_examples = _trim_batch(big_batch)
+    def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]:
+        # Shallow copy per run: each run gets its own dicts, but the tensors inside are shared.
+        trimmed_examples_copy = [te.copy() for te in trimmed_examples]
+        packer = BinPackCollator(collator=lambda x: x, target_batch_size=device_batch_size, max_seq_len=max_seq_len, pad_token_id=0, padding_side='left', max_leftover_bins_to_keep=max_leftovers_to_keep)
+        for idx in range(0, len(trimmed_examples_copy), raw_batch_size):
+            batch = trimmed_examples_copy[idx:idx + raw_batch_size]
+            # A trailing chunk smaller than the device batch size is skipped.
+            if len(batch) < device_batch_size:
+                continue
+            packer._pack_trimmed_examples(batch, sizes[idx:idx + raw_batch_size])
+        if packer.n_packed_examples == 0:
+            log.debug('No examples packed during profiling. Dataset is smaller than device batch size.')
+            return (None, None)
+        padding_percent = 100 * (1 - packer.efficiency)
+        waste_percent = 100 * packer.waste
+        return (padding_percent, waste_percent)
+    # Results are produced lazily, one packing ratio per iteration.
+    for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes):
+        padding, waste = profile(raw_batch_size)
+        yield (packing_ratio, padding, waste)

param_init_fns.py CHANGED Viewed

@@ -22,9 +22,9 @@ def fused_init_helper_(module: nn.Module, init_fn_: Callable) -> None:
     if _fused is None:
         raise RuntimeError(f'Internal logic error')
     assert isinstance(module.weight, torch.Tensor)
-    (dim, splits) = _fused
     splits = (0, *splits, module.weight.size(dim))
-    for (s, e) in zip(splits[:-1], splits[1:]):
         slice_indices = [slice(None)] * module.weight.ndim
         slice_indices[dim] = slice(s, e)
         init_fn_(module.weight[slice_indices])
@@ -71,7 +71,7 @@ def generic_param_init_fn_(module: nn.Module, init_fn_: Callable, n_layers: int,
                 if lim == 0:
                     warnings.warn(f'Embedding layer initialized to 0.')
                 lim = [-lim, lim]
-            (a, b) = lim
             emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
         else:
             emb_init_fn_ = init_fn_
@@ -88,7 +88,7 @@ def generic_param_init_fn_(module: nn.Module, init_fn_: Callable, n_layers: int,
             assert d_model is not None
             _d = d_model
             splits = (0, _d, 2 * _d, 3 * _d)
-            for (s, e) in zip(splits[:-1], splits[1:]):
                 init_fn_(module.in_proj_weight[s:e])
         else:
             assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)

     if _fused is None:
         raise RuntimeError(f'Internal logic error')
     assert isinstance(module.weight, torch.Tensor)
+    dim, splits = _fused
     splits = (0, *splits, module.weight.size(dim))
+    for s, e in zip(splits[:-1], splits[1:]):
         slice_indices = [slice(None)] * module.weight.ndim
         slice_indices[dim] = slice(s, e)
         init_fn_(module.weight[slice_indices])
                 if lim == 0:
                     warnings.warn(f'Embedding layer initialized to 0.')
                 lim = [-lim, lim]
+            a, b = lim
             emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
         else:
             emb_init_fn_ = init_fn_
             assert d_model is not None
             _d = d_model
             splits = (0, _d, 2 * _d, 3 * _d)
+            for s, e in zip(splits[:-1], splits[1:]):
                 init_fn_(module.in_proj_weight[s:e])
         else:
             assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)

prompt_files.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import os
+from typing import List, Optional
+# Prefix marking a prompt entry as a path to a prompt file rather than literal prompt text.
+PROMPTFILE_PREFIX = 'file::'
+def load_prompts(prompts: List[str], prompt_delimiter: Optional[str]=None) -> List[str]:
+    """Loads a set of prompts, both free text and from file.
+    Args:
+        prompts (List[str]): List of free text prompts and prompt files
+        prompt_delimiter (Optional str): Delimiter for text file
+            If not provided, assumes the prompt file is a single prompt (non-delimited)
+    Returns:
+        List of prompt string(s)
+    """
+    prompt_strings = []
+    for prompt in prompts:
+        if prompt.startswith(PROMPTFILE_PREFIX):
+            prompts = load_prompts_from_file(prompt, prompt_delimiter)
+            prompt_strings.extend(prompts)
+        else:
+            prompt_strings.append(prompt)
+    return prompt_strings
+def load_prompts_from_file(prompt_path: str, prompt_delimiter: Optional[str]=None) -> List[str]:
+    """Load a set of prompts from a text fie.
+    Args:
+        prompt_path (str): Path for text file
+        prompt_delimiter (Optional str): Delimiter for text file
+            If not provided, assumes the prompt file is a single prompt (non-delimited)
+    Returns:
+        List of prompt string(s)
+    """
+    if not prompt_path.startswith(PROMPTFILE_PREFIX):
+        raise ValueError(f'prompt_path_str must start with {PROMPTFILE_PREFIX}')
+    _, prompt_file_path = prompt_path.split(PROMPTFILE_PREFIX, maxsplit=1)
+    prompt_file_path = os.path.expanduser(prompt_file_path)
+    if not os.path.isfile(prompt_file_path):
+        raise FileNotFoundError(f'prompt_file_path={prompt_file_path!r} does not match any existing files.')
+    with open(prompt_file_path, 'r') as f:
+        prompt_string = f.read()
+    if prompt_delimiter is None:
+        return [prompt_string]
+    return [i for i in prompt_string.split(prompt_delimiter) if i]

registry.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from typing import Callable, Type
+from torch.optim import Optimizer
+from torchmetrics import Metric
+from transformers import PreTrainedTokenizerBase
+from .interfaces import CallbackWithConfig
+from .registry_utils import create_registry
+# Each registry below is created with create_registry together with a human-readable description.
+_loggers_description = 'The loggers registry is used to register classes that implement the LoggerDestination interface. ' + 'These classes are used to log data from the training loop, and will be passed to the loggers arg of the Trainer. The loggers ' + 'will be constructed by directly passing along the specified kwargs to the constructor.'
+loggers = create_registry('llmfoundry', 'loggers', generic_type=Type[LoggerDestination], entry_points=True, description=_loggers_description)
+_callbacks_description = 'The callbacks registry is used to register classes that implement the Callback interface. ' + 'These classes are used to interact with the Composer event system, and will be passed to the callbacks arg of the Trainer. ' + 'The callbacks will be constructed by directly passing along the specified kwargs to the constructor.'
+callbacks = create_registry('llmfoundry', 'callbacks', generic_type=Type[Callback], entry_points=True, description=_callbacks_description)
+_callbacks_with_config_description = 'The callbacks_with_config registry is used to register classes that implement the CallbackWithConfig interface. ' + 'These are the same as the callbacks registry, except that they additionally take the full training config as an argument to their constructor.'
+# NOTE(review): unlike every other registry here, this one passes a single dotted namespace
+# string ('llm_foundry.callbacks_with_config') instead of ('llmfoundry', '<name>') — confirm
+# whether this is intentional before relying on the namespace.
+callbacks_with_config = create_registry('llm_foundry.callbacks_with_config', generic_type=Type[CallbackWithConfig], entry_points=True, description=_callbacks_with_config_description)
+_optimizers_description = 'The optimizers registry is used to register classes that implement the Optimizer interface. ' + 'The optimizer will be passed to the optimizers arg of the Trainer. The optimizer will be constructed by directly passing along the ' + 'specified kwargs to the constructor, along with the model parameters.'
+optimizers = create_registry('llmfoundry', 'optimizers', generic_type=Type[Optimizer], entry_points=True, description=_optimizers_description)
+_algorithms_description = 'The algorithms registry is used to register classes that implement the Algorithm interface. ' + 'The algorithm will be passed to the algorithms arg of the Trainer. The algorithm will be constructed by directly passing along the ' + 'specified kwargs to the constructor.'
+algorithms = create_registry('llmfoundry', 'algorithms', generic_type=Type[Algorithm], entry_points=True, description=_algorithms_description)
+_schedulers_description = 'The schedulers registry is used to register classes that implement the ComposerScheduler interface. ' + 'The scheduler will be passed to the schedulers arg of the Trainer. The scheduler will be constructed by directly passing along the ' + 'specified kwargs to the constructor.'
+schedulers = create_registry('llmfoundry', 'schedulers', generic_type=Type[ComposerScheduler], entry_points=True, description=_schedulers_description)
+_models_description = 'The models registry is used to register classes that implement the ComposerModel interface. The model\nconstructor should accept two arguments: an omegaconf DictConfig named `om_model_config` and a PreTrainedTokenizerBase named `tokenizer`.\nNote: This will soon be updated to take in named kwargs instead of a config directly.'
+models = create_registry('llmfoundry', 'models', generic_type=Type[ComposerModel], entry_points=True, description=_models_description)
+_dataloaders_description = 'The dataloaders registry is used to register functions that create a DataSpec. The function should take\na DictConfig, a PreTrainedTokenizerBase, and an int as arguments, and return a DataSpec.'
+dataloaders = create_registry('llmfoundry', 'dataloaders', generic_type=Callable[[DictConfig, PreTrainedTokenizerBase, int], DataSpec], entry_points=True, description=_dataloaders_description)
+_metrics_description = 'The metrics registry is used to register classes that implement the torchmetrics.Metric interface.'
+metrics = create_registry('llmfoundry', 'metrics', generic_type=Type[Metric], entry_points=True, description=_metrics_description)

registry_utils.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import functools
+import importlib.util
+import os
+from pathlib import Path
+from types import ModuleType
+from typing import Any, Callable, Dict, Generic, Optional, Sequence, Type, TypeVar, Union
+import catalogue
+T = TypeVar('T')
+class TypedRegistry(catalogue.Registry, Generic[T]):
+    """A thin wrapper around catalogue.Registry to add static typing and descriptions.
+    Every method forwards to the catalogue.Registry implementation with the same arguments;
+    the overrides only narrow the annotated types to ``T``.
+    """
+    def __init__(self, namespace: Sequence[str], entry_points: bool=False, description: str='') -> None:
+        """Create the registry for ``namespace`` and store ``description`` on the instance."""
+        super().__init__(namespace, entry_points=entry_points)
+        self.description = description
+    def __call__(self, name: str, func: Optional[T]=None) -> Callable[[T], T]:
+        """Forward to catalogue.Registry.__call__ with ``name`` and ``func``."""
+        return super().__call__(name, func)
+    def register(self, name: str, *, func: Optional[T]=None) -> T:
+        """Forward to catalogue.Registry.register with ``name`` and ``func``."""
+        return super().register(name, func=func)
+    def get(self, name: str) -> T:
+        """Forward to catalogue.Registry.get for ``name``."""
+        return super().get(name)
+    def get_all(self) -> Dict[str, T]:
+        """Forward to catalogue.Registry.get_all."""
+        return super().get_all()
+    def get_entry_point(self, name: str, default: Optional[T]=None) -> T:
+        """Forward to catalogue.Registry.get_entry_point with ``name`` and ``default``."""
+        return super().get_entry_point(name, default=default)
+    def get_entry_points(self) -> Dict[str, T]:
+        """Forward to catalogue.Registry.get_entry_points."""
+        return super().get_entry_points()
+S = TypeVar('S')
+def create_registry(*namespace: str, generic_type: Type[S], entry_points: bool=False, description: str='') -> 'TypedRegistry[S]':
+    """Create a new registry.
+    Args:
+        namespace (str): The namespace, e.g. "llmfoundry.loggers"
+        generic_type (Type[S]): The type of the registry.
+        entry_points (bool): Accept registered functions from entry points.
+        description (str): A description of the registry.
+    Returns:
+        The TypedRegistry object.
+    """
+    if catalogue.check_exists(*namespace):
+        raise catalogue.RegistryError(f'Namespace already exists: {namespace}')
+    return TypedRegistry[generic_type](namespace, entry_points=entry_points, description=description)
+def construct_from_registry(name: str, registry: TypedRegistry, partial_function: bool=True, pre_validation_function: Optional[Union[Callable[[Any], None], type]]=None, post_validation_function: Optional[Callable[[Any], None]]=None, kwargs: Optional[Dict[str, Any]]=None) -> Any:
+    """Helper function to build an item from the registry.
+    Args:
+        name (str): The name of the registered item
+        registry (catalogue.Registry): The registry to fetch the item from
+        partial_function (bool, optional): Whether to return a partial function for registered callables. Defaults to True.
+        pre_validation_function (Optional[Union[Callable[[Any], None], type]], optional): An optional validation function called
+            before constructing the item to return. This should throw an exception if validation fails. Defaults to None.
+        post_validation_function (Optional[Callable[[Any], None]], optional): An optional validation function called after
+            constructing the item to return. This should throw an exception if validation fails. Defaults to None.
+    Raises:
+        ValueError: If the validation functions failed or the registered item is invalid
+    Returns:
+        Any: The constructed item from the registry
+    """
+    if kwargs is None:
+        kwargs = {}
+    registered_constructor = registry.get(name)
+    if pre_validation_function is not None:
+        if isinstance(pre_validation_function, type):
+            if not issubclass(registered_constructor, pre_validation_function):
+                raise ValueError(f'Expected {name} to be of type {pre_validation_function}, but got {type(registered_constructor)}')
+        elif isinstance(pre_validation_function, Callable):
+            pre_validation_function(registered_constructor)
+        else:
+            raise ValueError(f'Expected pre_validation_function to be a callable or a type, but got {type(pre_validation_function)}')
+    if isinstance(registered_constructor, type) or (callable(registered_constructor) and (not partial_function)):
+        constructed_item = registered_constructor(**kwargs)
+    elif callable(registered_constructor):
+        constructed_item = functools.partial(registered_constructor, **kwargs)
+    else:
+        raise ValueError(f'Expected {name} to be a class or function, but got {type(registered_constructor)}')
+    if post_validation_function is not None:
+        post_validation_function(registered_constructor)
+    return constructed_item
+def import_file(loc: Union[str, Path]) -> ModuleType:
+    """Import module from a file.
+    Used to run arbitrary python code.
+    Args:
+        name (str): Name of module to load.
+        loc (str / Path): Path to the file.
+    Returns:
+        ModuleType: The module object.
+    """
+    if not os.path.exists(loc):
+        raise FileNotFoundError(f'File {loc} does not exist.')
+    spec = importlib.util.spec_from_file_location('python_code', str(loc))
+    assert spec is not None
+    assert spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    try:
+        spec.loader.exec_module(module)
+    except Exception as e:
+        raise RuntimeError(f'Error executing {loc}') from e
+    return module

resumption_callbacks.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import logging
+from typing import List
+log = logging.getLogger(__name__)
+class GlobalLRScaling(Callback):
+    """GlobalLRScaling.
+    This callback can be applied upon resuming a model checkpoint. Upon
+    fit_start it will multiply the base LR by `lr_scale` and set the WD to be.
+    `wd_pct` * `lr`.
+    Args:
+        lr_scale (float): Multiplicative factor to scale LR by
+        wd_pct (float): Percentage of LR to set weight decay to.
+    """
+    def __init__(self, lr_scale: float, wd_pct: float=0.0):
+        self.lr_scale = lr_scale
+        self.wd_pct = wd_pct
+    def fit_start(self, state: State, logger: Logger) -> None:
+        del logger
+        if hasattr(state, 'optimizer') and state.optimizers is None:
+            raise Exception('No optimizers defined')
+        for optimizer in state.optimizers:
+            for group in optimizer.param_groups:
+                group['lr'] *= self.lr_scale
+                group['weight_decay'] = group['lr'] * self.wd_pct
+                if 'initial_lr' in group:
+                    group['initial_lr'] *= self.lr_scale
+                log.info(f"Set LR and WD to {group['lr']}, {group['weight_decay']}")
+        for scheduler in state.schedulers:
+            scheduler.base_lrs = [self.lr_scale * lr for lr in scheduler.base_lrs]
+class LayerFreezing(Callback):
+    """LayerFreezing.
+    This callback can be applied upon resuming a model checkpoint. Upon
+    fit_start it freeze the layers specified in `layer_names`. If using
+    activation checkpointing, please set the
+    `activation_checkpointing_reentrant` flag in `fsdp_config` to false.
+    Args:
+        layer_names (float): Names of layers to freeze.
+    """
+    def __init__(self, layer_names: List[str]):
+        self.layer_names = set(layer_names)
+    def fit_start(self, state: State, logger: Logger) -> None:
+        del logger
+        model_layers = set((name for name, _ in state.model.named_parameters()))
+        for layer in self.layer_names:
+            if layer not in model_layers:
+                raise Exception(f'Attempted to freeze layer not found in model: {layer}\nAvailable layers: {model_layers}')
+        successful_freeze = False
+        for name, p in state.model.named_parameters():
+            if p.requires_grad and name in self.layer_names:
+                p.requires_grad = False
+                log.debug(f'Froze layer: {name}\nParam: {p}')
+                successful_freeze = True
+        if not successful_freeze:
+            raise Exception(f"Tried to run LayerFreezing but didn't freeze any layers")

scheduled_gc_callback.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import gc
+from typing import Optional
+import torch
+def gc_cuda():
+    """Garbage collect Torch (CUDA) memory."""
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+class ScheduledGarbageCollector(Callback):
+    """Disable automatic garbage collection and collect garbage at interval.
+    Args:
+        batch_interval (int): Number of batches between calls to gc.collect()
+        gen_1_batch_interval(int, optional): Number of batches between calls to gc.collect(1)
+        eval_keep_disabled (bool): keep gc disabled during eval (default: False)
+    """
+    def __init__(self, batch_interval: int, gen_1_batch_interval: Optional[int]=None, eval_keep_disabled: bool=False):
+        self.batch_interval = batch_interval
+        self.gen_1_batch_interval = gen_1_batch_interval
+        self.eval_keep_disabled = eval_keep_disabled
+        # Whether gc was enabled when fit started; set in fit_start, used in fit_end.
+        self.gc_init_state = None
+    def fit_start(self, state: State, logger: Logger) -> None:
+        """Remember the current gc setting, disable automatic gc, and collect once."""
+        del state, logger
+        self.gc_init_state = gc.isenabled()
+        gc.disable()
+        gc_cuda()
+    def fit_end(self, state: State, logger: Logger) -> None:
+        """Collect once more and put automatic gc back to the setting seen at fit_start."""
+        del state, logger
+        gc_cuda()
+        if self.gc_init_state:
+            gc.enable()
+        else:
+            gc.disable()
+    def before_dataloader(self, state: State, logger: Logger) -> None:
+        """Collect on batches whose index is a multiple of the configured intervals."""
+        del logger
+        # Optional gc.collect(1) on its own interval.
+        if self.gen_1_batch_interval is not None and state.timestamp.batch.value % self.gen_1_batch_interval == 0:
+            gc.collect(1)
+        # Full collection plus CUDA cache cleanup on the main interval.
+        if state.timestamp.batch.value % self.batch_interval == 0:
+            gc_cuda()
+    def eval_start(self, state: State, logger: Logger) -> None:
+        """Collect, then re-enable automatic gc for eval unless eval_keep_disabled is set."""
+        del state, logger
+        gc_cuda()
+        if not self.eval_keep_disabled:
+            gc.enable()
+    def eval_end(self, state: State, logger: Logger) -> None:
+        """Disable automatic gc again after eval (if it was re-enabled), then collect."""
+        del state, logger
+        if not self.eval_keep_disabled:
+            gc.disable()
+        gc_cuda()

tasks.py ADDED Viewed

	@@ -0,0 +1,581 @@

+"""Includes code for task-specific seq-to-seq data formatting.
+This file provides some templates/examples of preprocessing functions
+that format examples for use in seq-to-seq finetuning tasks.
+These preprocessing functions take individual examples that contain raw
+text and process them into formatted examples.
+These functions have this basic structure:
+    def preprocessing_fn(example: Dict) -> Dict[str, str]:
+        # code to extract prompt/response from `example`
+        ...
+        return {
+            'prompt': <prompt>,
+            'response': <response>,
+        }
+where `<prompt>` is a placeholder for the prompt text string that you
+extracted from the input example, and '<response>' is a placeholder for
+the response text string.
+Just to be clear, "prompt" represents the text you would give the model
+at inference time, and "response" represents the text you are training
+it to produce given the prompt.
+The key requirement of these functions is that they return a dictionary
+with "prompt" and "response" keys, and that the values associated with
+those keys are strings (i.e. text).
+"""
+import importlib
+import logging
+import os
+import warnings
+from collections.abc import Mapping
+from functools import partial
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union, cast
+import datasets as hf_datasets
+import huggingface_hub as hf_hub
+import numpy as np
+from streaming import Stream, StreamingDataset
+from transformers import PreTrainedTokenizerBase
+from .collator import _HF_IGNORE_INDEX, stitch_turns_decoder_only, stitch_turns_encoder_decoder
+from .exceptions import ConsecutiveRepeatedChatRolesError, IncorrectMessageKeyQuantityError, InvalidContentTypeError, InvalidFileExtensionError, InvalidLastChatMessageRoleError, InvalidPromptResponseKeysError, InvalidPromptTypeError, InvalidResponseTypeError, InvalidRoleError, NotEnoughChatDataError, TooManyKeysInExampleError, UnableToProcessPromptResponseError, UnknownExampleTypeError
+from .logging_utils import SpecificWarningFilter
+log = logging.getLogger(__name__)
+# Key names accepted in examples. An example is treated as chat-formatted when it has a
+# messages key, and as prompt/response when it has both a prompt key and a response key.
+_ALLOWED_RESPONSE_KEYS = {'response', 'completion'}
+_ALLOWED_PROMPT_KEYS = {'prompt'}
+_ALLOWED_MESSAGES_KEYS = {'messages'}
+_ALLOWED_ROLE_KEYS = {'role'}
+_ALLOWED_CONTENT_KEYS = {'content'}
+_ALLOWED_ROLES = {'user', 'assistant', 'system'}
+_ALLOWED_LAST_MESSAGE_ROLES = {'assistant'}
+# '.downloaded_finetuning' located two directory levels above the directory containing this file.
+DOWNLOADED_FT_DATASETS_DIRPATH = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir, os.pardir, os.pardir, '.downloaded_finetuning'))
+SUPPORTED_EXTENSIONS = ['.csv', '.jsonl', '.parquet']
+# Type aliases for the example formats handled in this module.
+PromptResponseDict = Mapping[str, str]
+ChatFormattedDict = Mapping[str, List[Dict[str, str]]]
+Example = Union[PromptResponseDict, ChatFormattedDict]
+ExampleType = Literal['prompt_response', 'chat']
+TokenizedExample = Dict[str, List[Dict[str, List[int]]]]
+def _get_example_type(example: Example) -> ExampleType:
+    """Determines the type of the input example.
+    Args:
+        example (Example): The input example, which can be a multi-way chat formatted conversation or an instruction-response pair.
+    Returns:
+        ExampleType: The type of the input example, which can be either 'chat' for multi-way chat formatted conversation or 'prompt_response' for instruction-response pair.
+    Raises:
+        KeyError: If the example type is unknown.
+    """
+    if not isinstance(example, Mapping):
+        raise TypeError(f'Expected example to be a Mapping, but found {type(example)}')
+    if any((allowed_message_key in example for allowed_message_key in _ALLOWED_MESSAGES_KEYS)):
+        return 'chat'
+    elif any((p in example for p in _ALLOWED_PROMPT_KEYS)) and any((r in example for r in _ALLOWED_RESPONSE_KEYS)):
+        return 'prompt_response'
+    else:
+        raise UnknownExampleTypeError(example)
+def _is_empty_or_nonexistent(dirpath: str) -> bool:
+    """Check if a directory is empty or non-existent.
+    Args:
+        dirpath (str): Directory path to check.
+    Returns
+        True if directory is empty or non-existent. False otherwise.
+    """
+    return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0
+def _get_key(dictionary: Mapping[str, Any], allowed_keys: set[str]):
+    if not isinstance(dictionary, Mapping):
+        raise TypeError(f'Expected dictionary to be a mapping, but found {type(dictionary)}')
+    desired_keys = allowed_keys.intersection(dictionary.keys())
+    if len(desired_keys) != 1:
+        raise TooManyKeysInExampleError(allowed_keys, desired_keys)
+    return list(desired_keys)[0]
+def _validate_chat_formatted_example(example: ChatFormattedDict):
+    if not isinstance(example, Mapping):
+        raise TypeError(f'Expected example to be a mapping, but found {type(example)}')
+    messages = example[_get_key(example, _ALLOWED_MESSAGES_KEYS)]
+    if not isinstance(messages, List):
+        raise TypeError(f'Expected messages to be an iterable, but found {type(messages)}')
+    if len(messages) <= 1:
+        raise NotEnoughChatDataError()
+    last_message = messages[-1]
+    role_key = _get_key(last_message, _ALLOWED_ROLE_KEYS)
+    last_role = last_message[role_key]
+    if last_role not in _ALLOWED_LAST_MESSAGE_ROLES:
+        raise InvalidLastChatMessageRoleError(last_role, _ALLOWED_LAST_MESSAGE_ROLES)
+    last_message_role = None
+    for message in messages:
+        role_key, content_key = (_get_key(message, _ALLOWED_ROLE_KEYS), _get_key(message, _ALLOWED_CONTENT_KEYS))
+        if len(message.keys()) != 2:
+            raise IncorrectMessageKeyQuantityError(list(message.keys()))
+        if message[role_key] not in _ALLOWED_ROLES:
+            raise InvalidRoleError(message[role_key], _ALLOWED_ROLES)
+        if not isinstance(message[content_key], str):
+            raise InvalidContentTypeError(type(message[content_key]))
+        if last_message_role is not None and last_message_role == message[role_key]:
+            raise ConsecutiveRepeatedChatRolesError(last_message_role)
+        last_message_role = message[role_key]
+def _slice_chat_formatted_example(example: ChatFormattedDict, tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, str]]:
+    """Slices chat example into a list of templated prompt, response turns.
+    Note: Assistant messages mark the end of chat turns. So there are as many turns as there are
+        assistant messages in the chat example.
+    Args:
+        example (ChatFormattedDict): The chat example containing the messages.
+        tokenizer (PreTrainedTokenizerBase): The tokenizer to apply the chat template.
+    Returns:
+        List[Tuple[str, str]]: A list of templated prompt and response string pairs, one pair per chat turn.
+    Raises:
+        ValueError: If any chat turn in the example has less than two messages or if the last message is not from the assistant.
+        KeyError: If a message does not have a role or content.
+    """
+    _validate_chat_formatted_example(example)
+    messages = example[_get_key(example, _ALLOWED_MESSAGES_KEYS)]
+    last_message = messages[-1]
+    if last_message['role'] != 'assistant':
+        raise InvalidLastChatMessageRoleError(last_message['role'], set(['assistant']))
+    def slice_out_last_turn(messages_through_current_turn: List[Dict[str, str]], conversation_through_previous_turn: str) -> Tuple[str, str]:
+        full_conversation = tokenizer.apply_chat_template(messages_through_current_turn, tokenize=False)
+        prompt_with_history = tokenizer.apply_chat_template(messages_through_current_turn[:-1], tokenize=False, add_generation_prompt=True)
+        if conversation_through_previous_turn != full_conversation[:len(conversation_through_previous_turn)]:
+            raise ValueError(f'The full conversation must start with the conversation through the previous turn. conversation_through_previous_turn={conversation_through_previous_turn!r}, full_conversation={full_conversation!r}')
+        if conversation_through_previous_turn != prompt_with_history[:len(conversation_through_previous_turn)]:
+            raise ValueError(f'The prompt_with_histry must start with the conversation through the previous turn. conversation_through_previous_turn={conversation_through_previous_turn!r}, prompt_with_history={prompt_with_history!r}')
+        if prompt_with_history != full_conversation[:len(prompt_with_history)]:
+            raise ValueError(f'prompt_with_history must be the first part of the full conversation. prompt_with_history={prompt_with_history!r}, full_conversation={full_conversation!r}')
+        prompt = prompt_with_history[len(conversation_through_previous_turn):]
+        response = full_conversation[len(prompt_with_history):]
+        return (prompt, response)
+    templated_prompt_response_turns: List[Tuple[str, str]] = []
+    conversation_through_previous_turn = ''
+    for idx, message in enumerate(messages):
+        if message['role'] == 'assistant':
+            prompt, response = slice_out_last_turn(messages[:idx + 1], conversation_through_previous_turn)
+            templated_prompt_response_turns.append((prompt, response))
+            conversation_through_previous_turn += prompt
+            conversation_through_previous_turn += response
+    return templated_prompt_response_turns
+def _tokenize_with_bos_removal(tokenizer: PreTrainedTokenizerBase, text: str, text_target: str) -> Dict[str, List[int]]:
+    """Tokenizes the prompt and response using the provided tokenizer.
+    Args:
+        tokenizer (PreTrainedTokenizerBase): The tokenizer to use for tokenization.
+        text (str): The prompt to tokenize.
+        text_target (str): The response to tokenize.
+    Returns:
+        Dict[str, List[int]]: The tokenized text and text_target.
+    """
+    tokenized_sample = tokenizer(text=text, text_target=text_target, padding=False, truncation=False)
+    if hasattr(tokenizer, 'add_bos_token') and tokenizer.add_bos_token:
+        if tokenizer.bos_token_id is not None and tokenized_sample['labels'][0] == tokenizer.bos_token_id:
+            tokenized_sample['labels'] = tokenized_sample['labels'][1:]
+    return tokenized_sample
+def _tokenize_chat_formatted_example(example: ChatFormattedDict, tokenizer: PreTrainedTokenizerBase) -> TokenizedExample:
+    """Tokenizes a chat-formatted example using the provided tokenizer.
+    Args:
+        example (ChatFormattedDict): The chat-formatted example to tokenize.
+        tokenizer (PreTrainedTokenizerBase): The tokenizer to use for tokenization.
+    Returns:
+        TokenizedExample: The tokenized example.
+    """
+    return {'turns': [tokenizer(text=prompt, text_target=response, add_special_tokens=False, padding=False, truncation=False) for prompt, response in _slice_chat_formatted_example(example, tokenizer)]}
+def _tokenize_prompt_response_formatted_example(example: PromptResponseDict, tokenizer: PreTrainedTokenizerBase) -> TokenizedExample:
+    """Tokenize a formatted example and validate expected keys."""
+    example_keys = set(example.keys())
+    prompt_keys = example_keys.intersection(_ALLOWED_PROMPT_KEYS)
+    response_keys = example_keys.intersection(_ALLOWED_RESPONSE_KEYS)
+    if len(prompt_keys) != 1:
+        raise TooManyKeysInExampleError(_ALLOWED_PROMPT_KEYS, prompt_keys)
+    if len(response_keys) != 1:
+        raise TooManyKeysInExampleError(_ALLOWED_RESPONSE_KEYS, response_keys)
+    prompt_key = prompt_keys.pop()
+    response_key = response_keys.pop()
+    prompt = example[prompt_key]
+    response = example[response_key]
+    if not isinstance(prompt, str):
+        raise InvalidPromptTypeError(type(prompt))
+    if not isinstance(response, str):
+        raise InvalidResponseTypeError(type(response))
+    return {'turns': [_tokenize_with_bos_removal(tokenizer=tokenizer, text=prompt, text_target=response)]}
+def tokenize_formatted_example(example: Example, tokenizer: PreTrainedTokenizerBase) -> TokenizedExample:
+    """Tokenizes a formatted example using the provided tokenizer.
+    Args:
+        example (Example): The input example to be tokenized.
+        tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenization.
+    Returns:
+        TokenizedExample: The tokenized example.
+    Raises:
+        ValueError: If the example format is unknown.
+    """
+    example_format = _get_example_type(example)
+    if example_format == 'chat':
+        chat_example = cast(ChatFormattedDict, example)
+        return _tokenize_chat_formatted_example(chat_example, tokenizer)
+    elif example_format == 'prompt_response':
+        prompt_response_example: PromptResponseDict = cast(PromptResponseDict, example)
+        return _tokenize_prompt_response_formatted_example(prompt_response_example, tokenizer)
+    else:
+        raise UnknownExampleTypeError(example)
+def is_valid_ift_example(max_seq_len: int, target_prompts: str, target_responses: str, decoder_only_format: bool, example: TokenizedExample) -> bool:
+    """Check if the example is a valid ift example.
+    This function confirms that none of the ``input_ids`` and ``labels`` fields
+    are empty in any of the turns within the example.
+    This function also prepares the final input_ids and labels
+    of the (potentially multi-turn) example, using the target settings
+    and format, and checks whether they are suitable for training at max_seq_len.
+    The example is not valid if (1) after truncation (if necessary),
+    the training targets contain no loss-generating tokens, or (2) either the
+    input_ids and labels are empty.
+    The token sequences in ``example`` are assumed to not have had
+    any padding or truncation applied already.
+    Args:
+        max_seq_len (int): Maximum sequence length.
+        target_prompts (str): The prompts that are used as targets.
+        target_responses (str): The responses that are used as targets.
+        decoder_only_format (bool): Whether the data will be formatted
+            for a decoder-only model.
+        example (Dict): The input example after tokenization, which has
+            a list of dicts, each with ``input_ids`` and ``labels`` fields.
+    Returns:
+        bool: Indicator of whether the input example is valid
+    """
+    for turn in example['turns']:
+        if len(turn['input_ids']) == 0:
+            return False
+        if len(turn['labels']) == 0:
+            return False
+    if decoder_only_format:
+        input_ids, labels = stitch_turns_decoder_only(example_turns=example['turns'], target_prompts=target_prompts, target_responses=target_responses)
+    else:
+        input_ids, labels = stitch_turns_encoder_decoder(example_turns=example['turns'])
+    input_ids = input_ids[:max_seq_len]
+    labels = labels[:max_seq_len]
+    if len(input_ids) == 0:
+        return False
+    if len([label for label in labels if label != _HF_IGNORE_INDEX]) == 0:
+        return False
+    return True
+def _stream_remote_local_validate(remote: Optional[str], local: Optional[str], split: Optional[str]):
+    if remote is None or local == remote:
+        if local is not None and os.path.isdir(local):
+            contents = set(os.listdir(local))
+            if split is not None and split not in contents:
+                raise ValueError(f'Local directory {local} does not contain split {split}')
+class StreamingFinetuningDataset(StreamingDataset):
+    """Finetuning dataset with flexible tokenization using StreamingDataset.
+    Args:
+        tokenizer (Tokenizer): The name of the HuggingFace tokenizer to use to
+            tokenize samples.
+        streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from,
+            which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or
+            ``remote``/``local``. Defaults to ``None``.
+        local (str): Local dataset directory where shards are cached by split.
+        remote (str, optional): Remote path or directory to download the dataset from. If ``None``,
+            its data must exist locally. StreamingDataset uses either ``streams`` or
+            ``remote``/``local``. Defaults to ``None``.
+        split (str, optional): Which dataset split to use, if any. If provided, we stream from/to
+            the ``split`` subdirs of  ``remote`` and ``local``. Defaults to ``None``.
+        download_retry (int): Number of download re-attempts before giving up. Defaults to ``2``.
+        download_timeout (float): Number of seconds to wait for a shard to download before raising
+            an exception. Defaults to ``60``.
+        validate_hash (str, optional): Optional hash or checksum algorithm to use to validate
+            shards. Defaults to ``None``.
+        keep_zip (bool): Whether to keep or delete the compressed form when decompressing
+            downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to
+            `False``.
+        epoch_size (Union[int, str], optional): Number of samples to draw per epoch balanced across all
+            streams. If ``None``, takes its value from the total number of underlying samples.
+            Provide this field if you are weighting streams relatively to target a larger or
+            smaller epoch size. Defaults to ``None``.
+        predownload (int, optional): Target number of samples ahead to download the shards of while
+            iterating. If ``None``, its value is set to ``8 * batch_size``. Defaults to ``None``.
+        cache_limit (Union[int, str], optional) - Maximum size in bytes of this StreamingDataset's
+            shard cache. Before downloading a shard, the least recently used resident shard(s) may
+            be evicted (deleted from the local cache) in order to stay under the limit. Set to None
+            to disable shard eviction. Supports integer bytes as well as string human-readable
+            bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None.
+        partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``.
+        num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with
+            resumption. If ``None``, this is interpreted as 64 times the number of physical
+            nodes of the initial run if ``shuffle_algo`` is ``py1s`` or ``py2s``, and simply the
+            number of physical nodes of the initial run otherwise. Defaults to ``None``.
+        batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is
+            partitioned over the workers. Defaults to ``None``.
+        shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to
+            ``False``.
+        shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1e``.
+        shuffle_seed (int): Seed for Deterministic data shuffling. Defaults to ``9176``.
+        shuffle_block_size (int): Unit of shuffle. If ``None``, its value is calculated as
+            ``max(4_000_000 // num_canonical_nodes), 1 << 18)``. Defaults to ``None``.
+        sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``.
+            Defaults to ``balanced``.
+        sampling_granularity (int): When picking samples for a stream's final partial repeat,
+            how many samples to pick from the same shard at a time (``1`` for evenly balanced
+            across shards, ``1000`` to pick 1000 samples from the same shard at a time, etc).
+            Defaults to ``1``.
+        batching_method (str): Which batching method to use, either ``random``, ``stratified``, or
+            ``per_stream``. Defaults to ``random``.
+    """
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, streams: Optional[Sequence[Stream]]=None, local: Optional[str]=None, remote: Optional[str]=None, split: Optional[str]=None, download_retry: int=2, download_timeout: float=60, validate_hash: Optional[str]=None, keep_zip: bool=False, epoch_size: Optional[Union[int, str]]=None, predownload: Optional[int]=None, cache_limit: Optional[Union[int, str]]=None, partition_algo: str='relaxed', num_canonical_nodes: Optional[int]=None, batch_size: Optional[int]=None, shuffle: bool=False, shuffle_algo: str='py1e', shuffle_seed: int=9176, shuffle_block_size: Optional[int]=None, sampling_method: str='balanced', sampling_granularity: int=1, batching_method: str='random', max_seq_len: int=2048, **kwargs: Any):
+        if len(kwargs) > 0:
+            raise ValueError(f'StreamingFinetuningDataset() got an unexpected keyword argument: {kwargs}')
+        if streams is None:
+            _stream_remote_local_validate(remote, local, split)
+        else:
+            for stream in streams:
+                _stream_remote_local_validate(stream.remote, stream.local, split)
+        super().__init__(streams=streams, local=local, remote=remote, split=split, download_retry=download_retry, download_timeout=download_timeout, validate_hash=validate_hash, keep_zip=keep_zip, epoch_size=epoch_size, predownload=predownload, cache_limit=cache_limit, partition_algo=partition_algo, num_canonical_nodes=num_canonical_nodes, batch_size=batch_size, shuffle=shuffle, shuffle_algo=shuffle_algo, shuffle_seed=shuffle_seed, shuffle_block_size=shuffle_block_size, sampling_method=sampling_method, sampling_granularity=sampling_granularity, batching_method=batching_method)
+        self.tokenizer = tokenizer
+        self.max_seq_len = max_seq_len
+    def __getitem__(self, idx: int) -> Dict[str, Any]:
+        sample = super().__getitem__(idx)
+        if 'turns' in sample:
+            return sample
+        if 'input_ids' in sample:
+            if isinstance(sample['input_ids'], bytes):
+                sample['input_ids'] = np.frombuffer(sample['input_ids'], dtype=np.int64)[:self.max_seq_len].tolist().copy()
+                sample['labels'] = np.frombuffer(sample['labels'], dtype=np.int64)[:self.max_seq_len].tolist().copy()
+            elif isinstance(sample['input_ids'], np.ndarray):
+                sample['input_ids'] = sample['input_ids'][:self.max_seq_len].tolist().copy()
+                sample['labels'] = sample['labels'][:self.max_seq_len].tolist().copy()
+            else:
+                raise ValueError(f"Expect input_ids to be bytes or numpy.ndarray type, but got {type(sample['input_ids'])}")
+            return {'turns': [sample]}
+        return tokenize_formatted_example(sample, tokenizer=self.tokenizer)
+class DatasetConstructor:
+    def __init__(self):
+        self._task_preprocessing_registry: Dict[str, Callable] = {}
+    def register(self, *names: str) -> Callable[[Callable], Callable]:
+        """Decorator for registering preprocessing functions."""
+        def _register_func(name: str, func: Callable) -> None:
+            if name in self._task_preprocessing_registry:
+                raise ValueError(f'A tokenization function has already been registered with name={name!r}.')
+            self._task_preprocessing_registry[name] = func
+            return
+        def wrapper(func: Callable) -> Callable:
+            for name in names:
+                _register_func(name, func)
+            return func
+        return wrapper
+    def print_registered_tasks(self) -> None:
+        tasks = sorted(self._task_preprocessing_registry.keys())
+        log.info('\n'.join(tasks))
+    def get_preprocessing_fn_from_dict(self, mapping: Dict[str, str]) -> Callable[[Dict[str, Any]], Dict[str, str]]:
+        """Get a preprocessing function from a dictionary.
+        The dictionary maps column names in the dataset to "prompt" and "response".
+        For example,
+            ```yaml
+            preprocessing_fn:
+                prompt: text
+                response: summary
+            ```
+        would map the `text` column as to prompt and the `summary` column as the response.
+        Args:
+            mapping (dict): A dictionary mapping column names to "prompt" and "response".
+        Returns:
+            Callable: The preprocessing function.
+        Raises:
+            ValueError: If the mapping does not have keys "prompt" and "response".
+        """
+        def _preprocessor(example: Dict[str, Any]) -> Dict[str, str]:
+            if list(mapping.keys()) != ['prompt', 'response']:
+                raise InvalidPromptResponseKeysError(mapping, example)
+            return {'prompt': example[mapping['prompt']], 'response': example[mapping['response']]}
+        return _preprocessor
+    def get_preprocessing_fn_from_str(self, preprocessor: Optional[str], dataset_name: Optional[str]=None) -> Optional[Callable[[Dict[str, Any]], Dict[str, str]]]:
+        """Get a preprocessing function from a string.
+        String can be either a registered function or an import path.
+        Args:
+            preprocessor (Optional[str]): The name of the preprocessing function, or an import path.
+            dataset_name (Optional[str]): The dataset name to look up in the registry.
+        Returns:
+            Callable: The preprocessing function or None if not found.
+        Raises:
+            ValueError: If the preprocessing function import from the provided string fails.
+        """
+        if preprocessor is None:
+            if dataset_name is None:
+                return None
+            if dataset_name in self._task_preprocessing_registry:
+                log.info(f'Re-formatting dataset with "{dataset_name}" preprocessing function.')
+                return self._task_preprocessing_registry[dataset_name]
+            else:
+                log.info('No preprocessor was supplied and no preprocessing function ' + f'is registered for dataset name "{dataset_name}". No additional ' + 'preprocessing will be applied. If the dataset is already formatted ' + 'correctly, you can ignore this message.')
+                return None
+        if preprocessor in self._task_preprocessing_registry:
+            log.info(f'Re-formatting dataset with "{preprocessor}" preprocessing function.')
+            return self._task_preprocessing_registry[preprocessor]
+        try:
+            import_path, function_name = preprocessor.split(':', maxsplit=1)
+            module = importlib.import_module(import_path)
+            preprocessing_fn = getattr(module, function_name)
+        except Exception as e:
+            raise ValueError(f'Failed to import preprocessing function from string = {preprocessor}.') from e
+        return preprocessing_fn
+    def build_from_hf(self, dataset_name: str, split: str, safe_load: bool, max_seq_len: int, preprocessing_fn: Optional[Callable[[dict[str, Any]], dict[str, str]]], tokenizer: PreTrainedTokenizerBase, target_prompts: str, target_responses: str, decoder_only_format: bool, hf_kwargs: Dict[str, Any]) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset, hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]:
+        """Load a HuggingFace Datasets, preprocess, and tokenize.
+        Note: This function will drop examples where the prompt is longer than the max_seq_len
+        Args:
+            cfg (DictConfig): The dataset configuration.
+            max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
+            tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.
+        Returns:
+            Dataset: The tokenized dataset.
+        """
+        signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed'
+        if dist.get_local_rank() != 0:
+            log.debug('Waiting for local_rank 0 to finish data prep')
+            with dist.local_rank_zero_download_and_wait(signal_file_path):
+                pass
+        hf_tokenization_logger = logging.getLogger('transformers.tokenization_utils_base')
+        sequence_length_warning_filter = SpecificWarningFilter('Token indices sequence length is longer than the specified maximum sequence length')
+        hf_tokenization_logger.addFilter(sequence_length_warning_filter)
+        error: Optional[Exception] = None
+        filtered_dataset = None
+        try:
+            if safe_load:
+                if not os.path.isdir(dataset_name):
+                    local_dataset_dir = os.path.join(DOWNLOADED_FT_DATASETS_DIRPATH, dataset_name)
+                    if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+                        hf_hub.snapshot_download(dataset_name, repo_type='dataset', allow_patterns=['*' + ext for ext in SUPPORTED_EXTENSIONS], token=hf_kwargs.get('token', None), revision=hf_kwargs.get('revision', None), local_dir_use_symlinks=False, local_dir=local_dataset_dir)
+                        if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+                            raise InvalidFileExtensionError(dataset_name, SUPPORTED_EXTENSIONS)
+                    dataset_name = local_dataset_dir
+                dataset_name = os.path.abspath(dataset_name)
+                dataset_files = [f for _, _, files in os.walk(dataset_name) for f in files]
+                if not all((Path(f).suffix in SUPPORTED_EXTENSIONS for f in dataset_files)):
+                    raise InvalidFileExtensionError(dataset_name, SUPPORTED_EXTENSIONS)
+            dataset = hf_datasets.load_dataset(dataset_name, split=split, **hf_kwargs)
+            def dataset_mapper(example: Dict):
+                if preprocessing_fn is not None:
+                    example = preprocessing_fn(example)
+                return tokenize_formatted_example(example, tokenizer)
+            detected_cpu_count = os.cpu_count() or 1
+            detected_cpus_with_margin = detected_cpu_count - 8
+            num_cpus_to_use = max(1, detected_cpus_with_margin)
+            columns_to_remove = list(dataset[0].keys())
+            tokenized_dataset = dataset.map(dataset_mapper, batched=False, remove_columns=columns_to_remove, num_proc=num_cpus_to_use, desc='Tokenizing dataset')
+            filtered_dataset = tokenized_dataset.filter(partial(is_valid_ift_example, max_seq_len, target_prompts, target_responses, decoder_only_format), num_proc=num_cpus_to_use, desc='Filtering out long prompts')
+            examples_removed = len(tokenized_dataset) - len(filtered_dataset)
+            if examples_removed > 0:
+                warnings.warn(f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}, ' + 'the prompt or response was empty, or the response was all padding tokens.')
+        except Exception as e:
+            error = e
+        if dist.get_local_rank() == 0:
+            log.debug('Local rank 0 finished data prep')
+            with open(signal_file_path, 'wb') as f:
+                f.write(b'local_rank0_completed_data_prep')
+        dist.barrier()
+        if dist.get_local_rank() == 0:
+            os.remove(signal_file_path)
+        if error is not None:
+            log.error('Error during data prep')
+            raise error
+        log.debug('All ranks finished data prep')
+        hf_tokenization_logger.removeFilter(sequence_length_warning_filter)
+        assert filtered_dataset is not None
+        return filtered_dataset
+    def build_from_streaming(self, *args: Any, **kwargs: Any) -> StreamingFinetuningDataset:
+        return StreamingFinetuningDataset(*args, **kwargs)
+# Module-level registry instance; the ``register`` decorators below attach preprocessing functions to it.
+dataset_constructor = DatasetConstructor()
+@dataset_constructor.register('tatsu-lab/alpaca')
+def alpaca_preprocessing_function(inp: Dict) -> Dict[str, str]:
+    """Split out prompt/response from text."""
+    try:
+        prompt, response = inp['text'].split('### Response:')
+        prompt += '### Response:'
+    except Exception as e:
+        raise UnableToProcessPromptResponseError(inp) from e
+    return {'prompt': prompt, 'response': response}
+@dataset_constructor.register('HuggingFaceH4/databricks_dolly_15k')
+def dolly_preprocessing_function(inp: Dict) -> Dict[str, str]:
+    """Format the text string."""
+    PROMPT_FORMAT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n'
+    try:
+        if inp['input'] != '':
+            instruction = inp['instruction'] + '\n' + inp['input']
+        else:
+            instruction = inp['instruction']
+        prompt = PROMPT_FORMAT.format(instruction=instruction)
+        response = inp['output']
+    except Exception as e:
+        raise UnableToProcessPromptResponseError(inp) from e
+    return {'prompt': prompt, 'response': response}
+@dataset_constructor.register('bigscience/P3')
+def p3_preprocessing_function(inp: Dict) -> Dict[str, str]:
+    """Format the already-split example."""
+    return {'prompt': inp['inputs'] + ':', 'response': inp['targets']}
+@dataset_constructor.register('Muennighoff/P3', 'Muennighoff/flan')
+def muennighoff_tokenize_function(inp: Dict) -> Dict[str, str]:
+    """Format the already-split example."""
+    try:
+        prompt: str = inp['inputs']
+        response: str = inp['targets']
+        transitions = (' ', '\n', '\t')
+        if not (prompt.endswith(transitions) or response.startswith(transitions)):
+            response = ' ' + response
+    except Exception as e:
+        raise UnableToProcessPromptResponseError(inp) from e
+    return {'prompt': prompt, 'response': response}

text_data.py ADDED Viewed

	@@ -0,0 +1,217 @@

+"""Build a StreamingTextDataset dataset and dataloader for training."""
+import os
+from itertools import islice
+from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Union, cast
+import numpy as np
+import torch
+import transformers
+from streaming import Stream, StreamingDataset
+from torch.utils.data import DataLoader
+from transformers import PreTrainedTokenizerBase
+class StreamingTextDataset(StreamingDataset):
+    """Generic text dataset using MosaicML's StreamingDataset.
+    Samples must carry either a ``text`` column (tokenized on the fly) or a
+    ``tokens`` column (raw int64 token bytes, truncated to ``max_seq_len``).
+    Args:
+        tokenizer (PreTrainedTokenizerBase): HuggingFace tokenizer to
+            tokenize samples.
+        max_seq_len (int): The max sequence length of each sample.
+        streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from,
+            which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or
+            ``remote``/``local``. Defaults to ``None``.
+        remote (str, optional): Remote path or directory to download the dataset from. If ``None``,
+            its data must exist locally. StreamingDataset uses either ``streams`` or
+            ``remote``/``local``. Defaults to ``None``.
+        local (str, optional): Local working directory to download shards to. This is where shards
+            are cached while they are being used. Uses a temp directory if not set.
+            StreamingDataset uses either ``streams`` or ``remote``/``local``. Defaults to ``None``.
+        split (str, optional): Which dataset split to use, if any. If provided, we stream from/to
+            the ``split`` subdirs of ``remote`` and ``local``. Defaults to ``None``.
+        download_retry (int): Number of download re-attempts before giving up. Defaults to ``2``.
+        download_timeout (float): Number of seconds to wait for a shard to download before raising
+            an exception. Defaults to ``60``.
+        validate_hash (str, optional): Optional hash or checksum algorithm to use to validate
+            shards. Defaults to ``None``.
+        keep_zip (bool): Whether to keep or delete the compressed form when decompressing
+            downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to
+            ``False``.
+        epoch_size (Union[int, str], optional): Number of samples to draw per epoch balanced across all
+            streams. If ``None``, takes its value from the total number of underlying samples.
+            Provide this field if you are weighting streams relatively to target a larger or
+            smaller epoch size. Defaults to ``None``.
+        predownload (int, optional): Target number of samples ahead to download the shards of while
+            iterating. If ``None``, its value is set to ``8 * batch_size``. Defaults to ``None``.
+        cache_limit (Union[int, str], optional): Maximum size in bytes of this StreamingDataset's
+            shard cache. Before downloading a shard, the least recently used resident shard(s) may
+            be evicted (deleted from the local cache) in order to stay under the limit. Set to ``None``
+            to disable shard eviction. Supports integer bytes as well as string human-readable
+            bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to ``None``.
+        partition_algo (str): Which partitioning algorithm to use. Defaults to ``relaxed``.
+        num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with
+            resumption. If ``None``, this is interpreted as 64 times the number of physical
+            nodes of the initial run if ``shuffle_algo`` is ``py1s`` or ``py2s``, and simply the
+            number of physical nodes of the initial run otherwise. Defaults to ``None``.
+        batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is
+            partitioned over the workers. Defaults to ``None``.
+        shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to
+            ``False``.
+        shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1e``.
+        shuffle_seed (int): Seed for Deterministic data shuffling. Defaults to ``9176``.
+        shuffle_block_size (int, optional): Unit of shuffle. A canonical node's samples are split
+            into blocks of this size, and samples within each block are shuffled. If ``None``, its
+            value is calculated as ``max(4_000_000 // num_canonical_nodes, 1 << 18)``. Defaults to
+            ``None``.
+        sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``.
+            Defaults to ``balanced``.
+        sampling_granularity (int): When picking samples for a stream's final partial repeat,
+            how many samples to pick from the same shard at a time (``1`` for evenly balanced
+            across shards, ``1000`` to pick 1000 samples from the same shard at a time, etc).
+            Defaults to ``1``.
+        batching_method (str): Which batching method to use, either ``random``, ``stratified``, or
+            ``per_stream``. Defaults to ``random``.
+    Raises:
+        ValueError: If unexpected keyword arguments are passed, or if a purely local dataset
+            directory exists but does not contain the requested ``split``.
+    """
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        max_seq_len: int,
+        streams: Optional[Sequence[Stream]]=None,
+        remote: Optional[str]=None,
+        local: Optional[str]=None,
+        split: Optional[str]=None,
+        download_retry: int=2,
+        download_timeout: float=60,
+        validate_hash: Optional[str]=None,
+        keep_zip: bool=False,
+        epoch_size: Optional[Union[int, str]]=None,
+        predownload: Optional[int]=None,
+        cache_limit: Optional[Union[int, str]]=None,
+        partition_algo: str='relaxed',
+        num_canonical_nodes: Optional[int]=None,
+        batch_size: Optional[int]=None,
+        shuffle: bool=False,
+        shuffle_algo: str='py1e',
+        shuffle_seed: int=9176,
+        shuffle_block_size: Optional[int]=None,
+        sampling_method: str='balanced',
+        sampling_granularity: int=1,
+        batching_method: str='random',
+        **kwargs: Any,
+    ):
+        # Reject unknown options explicitly instead of silently ignoring them.
+        if len(kwargs) > 0:
+            raise ValueError(f'StreamingTextDataset() got an unexpected keyword argument: {kwargs}')
+        # When the data is purely local, fail early if the requested split
+        # directory is missing. A ``None`` split means "no split subdirectory",
+        # so there is nothing to look for in that case.
+        if local is not None and (remote is None or local == remote):
+            if os.path.isdir(local):
+                contents = set(os.listdir(local))
+                if split is not None and split not in contents:
+                    raise ValueError(f'local directory {local} does not contain split {split}')
+        # Configs may deliver the block size as a float (e.g. ``1e6``).
+        if isinstance(shuffle_block_size, float):
+            shuffle_block_size = int(shuffle_block_size)
+        super().__init__(
+            streams=streams,
+            remote=remote,
+            local=local,
+            split=split,
+            download_retry=download_retry,
+            download_timeout=download_timeout,
+            validate_hash=validate_hash,
+            keep_zip=keep_zip,
+            epoch_size=epoch_size,
+            predownload=predownload,
+            cache_limit=cache_limit,
+            partition_algo=partition_algo,
+            num_canonical_nodes=num_canonical_nodes,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            shuffle_algo=shuffle_algo,
+            shuffle_seed=shuffle_seed,
+            shuffle_block_size=shuffle_block_size,
+            sampling_method=sampling_method,
+            sampling_granularity=sampling_granularity,
+            batching_method=batching_method,
+        )
+        self.tokenizer = tokenizer
+        self.max_seq_len = max_seq_len
+    def _tokenize(self, text_sample: Mapping) -> Dict[str, List[int]]:
+        """Tokenize ``text_sample['text']``, padded/truncated to ``max_seq_len``.
+        Raises:
+            RuntimeError: If the tokenizer has no pad token, since padding to
+                ``max_length`` is requested.
+        """
+        if self.tokenizer._pad_token is None:
+            raise RuntimeError('If tokenizing on-the-fly, tokenizer must have a pad_token_id')
+        return self.tokenizer(text_sample['text'], truncation=True, padding='max_length', max_length=self.max_seq_len)
+    def _read_binary_tokenized_sample(self, sample: Dict[str, Any]) -> torch.Tensor:
+        """Decode ``sample['tokens']`` as int64 ids, keeping at most ``max_seq_len``."""
+        # ``.copy()`` detaches the slice from the read-only source buffer before
+        # it is handed to torch.
+        return torch.from_numpy(np.frombuffer(sample['tokens'], dtype=np.int64)[:self.max_seq_len].copy())
+    def __getitem__(self, idx: int) -> Union[Dict[str, List[int]], torch.Tensor]:
+        """Return sample ``idx`` as tokenizer output (text) or a token tensor (binary).
+        Raises:
+            RuntimeError: If the sample has neither a ``text`` nor a ``tokens`` column.
+        """
+        sample = super().__getitem__(idx)
+        if 'text' in sample:
+            token_sample = self._tokenize(sample)
+        elif 'tokens' in sample:
+            token_sample = self._read_binary_tokenized_sample(sample)
+        else:
+            raise RuntimeError('StreamingTextDataset needs samples to have a `text` or `tokens` column')
+        return token_sample
+class ConcatenatedSequenceCollatorWrapper:
+    """Wrap a collator so every batch also carries a ``sequence_id`` tensor.
+    Exactly one of ``eos_token_id`` / ``bos_token_id`` must be supplied; that
+    token marks the boundaries between sequences concatenated into one row.
+    """
+    def __init__(self, base_collator: Callable, eos_token_id: Optional[int]=None, bos_token_id: Optional[int]=None):
+        self.base_collator = base_collator
+        have_eos = eos_token_id is not None
+        have_bos = bos_token_id is not None
+        if not (have_eos or have_bos):
+            raise ValueError('Must supply a value for either eos_token_id or bos_token_id, but got None for both.')
+        if have_eos and have_bos:
+            raise ValueError('Cannot use *both* EOS and BOS tokens for detecting sequence boundaries. Please supply `eos_token_id` if sequences end with an EOS token, or use `bos_token_id` if sequences start with a BOS token.')
+        # Only one id is set at this point; remember which kind it is.
+        self.bos_mode = not have_eos
+        self.split_token_id = cast(int, bos_token_id) if self.bos_mode else eos_token_id
+    def __call__(self, examples: List[Any]) -> Dict[str, torch.Tensor]:
+        """Collate ``examples`` with the base collator and attach ``sequence_id``."""
+        collated = self.base_collator(examples)
+        collated['sequence_id'] = self.get_sequence_id_from_batch(collated)
+        return collated
+    def get_sequence_id_from_batch(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
+        """Number the sequences in each row of ``batch['input_ids']``.
+        In BOS mode the running count of separators is returned directly. In EOS
+        mode the count is shifted right by one position so that each EOS token
+        keeps the id of the sequence it terminates.
+        """
+        token_ids = batch['input_ids']
+        boundary_count = (token_ids == self.split_token_id).cumsum(dim=1).to(token_ids.dtype)
+        if self.bos_mode:
+            return boundary_count
+        first_column = boundary_count.new_zeros((boundary_count.shape[0], 1))
+        return torch.cat([first_column, boundary_count[:, :-1]], dim=1)
+def build_streams(dataset_cfg: DictConfig):
+    """Pop the ``streams`` entry from ``dataset_cfg`` and build ``Stream`` objects.
+    Returns ``None`` when the config has no ``streams`` entry; otherwise a list
+    with one ``Stream`` per configured entry, built from that entry's keyword
+    arguments. The entry names themselves are not used.
+    """
+    stream_specs = dataset_cfg.pop('streams', None)
+    if stream_specs is None:
+        return None
+    return [Stream(**spec) for spec in stream_specs.values()]
+def build_text_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int) -> DataSpec:
+    """Build a ``DataSpec`` around a ``StreamingTextDataset`` dataloader.
+    ``mlm_probability``, ``eos_token_id``, ``bos_token_id`` and ``streams`` are
+    popped from ``cfg.dataset``; every remaining key is forwarded to
+    ``StreamingTextDataset``. When an EOS or BOS id is configured the collator is
+    wrapped so batches also carry ``sequence_id``. Token counting is only enabled
+    when the tokenizer defines a pad token id.
+    """
+    assert cfg.name == 'text', f'Tried to build text dataloader with cfg.name={cfg.name}'
+    dataset_cfg = cfg.dataset
+    mlm_probability = dataset_cfg.pop('mlm_probability', None)
+    eos_token_id = dataset_cfg.pop('eos_token_id', None)
+    bos_token_id = dataset_cfg.pop('bos_token_id', None)
+    streams = build_streams(dataset_cfg)
+    dataset = StreamingTextDataset(tokenizer=tokenizer, streams=streams, batch_size=device_batch_size, **dataset_cfg)
+    use_mlm = mlm_probability is not None
+    collate_fn = transformers.DataCollatorForLanguageModeling(tokenizer=dataset.tokenizer, mlm=use_mlm, mlm_probability=mlm_probability)
+    needs_sequence_id = not (eos_token_id is None and bos_token_id is None)
+    if needs_sequence_id:
+        collate_fn = ConcatenatedSequenceCollatorWrapper(base_collator=collate_fn, eos_token_id=eos_token_id, bos_token_id=bos_token_id)
+    loader_kwargs = {
+        'collate_fn': collate_fn,
+        'batch_size': device_batch_size,
+        'drop_last': cfg.drop_last,
+        'num_workers': cfg.num_workers,
+        'pin_memory': cfg.get('pin_memory', True),
+        'prefetch_factor': cfg.get('prefetch_factor', 2),
+        'persistent_workers': cfg.get('persistent_workers', True),
+        'timeout': cfg.get('timeout', 0),
+    }
+    dl = DataLoader(dataset, **loader_kwargs)
+    token_counting_func = get_tokens_per_batch_func() if tokenizer.pad_token_id is not None else None
+    return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func)
+def get_tokens_per_batch_func(decoder_only: bool=True) -> Callable[[Batch], int]:
+    """Returns a callable that counts the number of tokens in a batch.
+    The count is the sum of ``attention_mask`` when the batch has one, and the
+    total number of ``input_ids`` elements otherwise. For encoder-decoder
+    batches the sum of ``decoder_attention_mask`` is added on top.
+    Args:
+        decoder_only (bool, optional): Whether to expect the batch to just contain ``input_ids`` (decoder only)
+            or to also contain ``decoder_input_ids`` (encoder decoder). Defaults to ``True``.
+    Returns:
+        Callable[[Batch], int]: A callable that counts the number of tokens in a batch.
+            It raises ``ValueError`` when the batch is not a mapping, has neither
+            ``attention_mask`` nor ``input_ids``, or (for encoder decoder) lacks
+            ``decoder_attention_mask``.
+    """
+    def get_num_tokens_in_batch(batch: Batch) -> int:
+        if not isinstance(batch, Mapping) or ('attention_mask' not in batch and 'input_ids' not in batch):
+            raise ValueError('get_tokens_per_batch_func() requires a batch with an attention_mask key or an input_ids key')
+        if not decoder_only and 'decoder_attention_mask' not in batch:
+            raise ValueError('get_tokens_per_batch_func() for encoder decoder requires a batch with a decoder_attention_mask key')
+        if 'attention_mask' in batch:
+            # Summing the mask counts every position the mask marks with 1.
+            input_ids_tokens = int(torch.sum(batch['attention_mask']).item())
+        else:
+            # Without a mask every position is counted.
+            input_ids_tokens = batch['input_ids'].numel()
+        decoder_input_ids_tokens = 0
+        if not decoder_only:
+            decoder_input_ids_tokens = int(torch.sum(batch['decoder_attention_mask']).item())
+        return input_ids_tokens + decoder_input_ids_tokens
+    return get_num_tokens_in_batch
+if __name__ == '__main__':
+    # Manual smoke test: build a text dataloader and print the first batches.
+    import argparse
+    from .builders import build_tokenizer
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--tokenizer', type=str, default='EleutherAI/gpt-neox-20b', help='the name of the tokenizer to use')
+    parser.add_argument('--local_path', type=str, required=True, help='the path to the local copy of the dataset')
+    parser.add_argument('--remote_path', type=str, default=None, help='the path to the remote copy to stream from (optional)')
+    parser.add_argument('--split', type=str, default='val', help='which split of the dataset to use')
+    parser.add_argument('--max_seq_len', type=int, default=32, help='max sequence length to test')
+    args = parser.parse_args()
+    banner = f'Reading {args.split} split from {args.local_path}'
+    if args.remote_path is not None:
+        banner += f' <- streamed from <- {args.remote_path}'
+    print(banner)
+    dataset_settings = {
+        'local': args.local_path,
+        'remote': args.remote_path,
+        'split': args.split,
+        'shuffle': False,
+        'max_seq_len': args.max_seq_len,
+        'keep_zip': True,
+    }
+    cfg = om.create({'name': 'text', 'dataset': dataset_settings, 'drop_last': False, 'num_workers': 4})
+    device_batch_size = 2
+    tokenizer_name = args.tokenizer
+    tokenizer_kwargs = {'model_max_length': args.max_seq_len}
+    tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
+    loader = build_text_dataloader(cfg, tokenizer, device_batch_size).dataloader
+    assert isinstance(loader, DataLoader)
+    assert isinstance(loader.dataset, StreamingTextDataset)
+    tokenizer = loader.dataset.tokenizer
+    rule = '#' * 20
+    dashes = '-' * 20
+    # Only the first five batches are inspected.
+    for batch_ix, batch in enumerate(islice(loader, 5)):
+        print('\n')
+        print(rule, f'Batch {batch_ix}', rule)
+        for k, v in batch.items():
+            print(k, v.shape, v.dtype)
+        for sample_ix, token_sample in enumerate(batch['input_ids']):
+            print(dashes, f' Sample {sample_ix} ', dashes)
+            print(tokenizer.decode(token_sample))

tiktoken.py ADDED Viewed

	@@ -0,0 +1,218 @@

+from functools import lru_cache
+from typing import Any, Dict, List, Optional, Tuple
+from transformers import PreTrainedTokenizer
+DEFAULT_SYSTEM_PROMPT = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.'
+@lru_cache()
+def bytes_to_unicode():
+    """Return a reversible mapping from every byte value to one unicode character.
+    Bytes in the ranges ``!``-``~``, ``¡``-``¬`` and ``®``-``ÿ`` map to the
+    character with the same code point. Every other byte is assigned, in
+    ascending order, the next unused code point starting at 256, so that no
+    byte maps to one of the skipped whitespace/control characters and all 256
+    bytes have distinct images.
+    """
+    kept = [*range(ord('!'), ord('~') + 1), *range(ord('¡'), ord('¬') + 1), *range(ord('®'), ord('ÿ') + 1)]
+    table = {byte: chr(byte) for byte in kept}
+    shifted = 0
+    for byte in range(256):
+        if byte in table:
+            continue
+        table[byte] = chr(256 + shifted)
+        shifted += 1
+    return table
+class TiktokenTokenizerWrapper(PreTrainedTokenizer):
+    """A thin wrapper around tiktoken to make it compatible with Hugging Face tokenizers.
+    Token strings are the byte-level unicode representation produced by
+    ``bytes_to_unicode``, so every tiktoken byte sequence has a printable,
+    reversible string form.
+    See HuggingFace for further documentation on general tokenizer methods.
+    """
+    model_input_names = ['input_ids', 'attention_mask']
+    def __init__(
+        self,
+        model_name: Optional[str]=None,
+        encoding_name: Optional[str]=None,
+        add_bos_token: bool=False,
+        add_eos_token: bool=False,
+        use_default_system_prompt: bool=False,
+        unk_token: Optional[str]='<|endoftext|>',
+        eos_token: Optional[str]='<|endoftext|>',
+        bos_token: Optional[str]='<|endoftext|>',
+        pad_token: Optional[str]=None,
+        errors: str='replace',
+        **kwargs: Any,
+    ):
+        """Constructor creates a tiktoken tokenizer to use as the underlying tokenizer.
+        Args:
+            model_name (Optional[str], optional): The name of the model to load from tiktoken. Defaults to None.
+                Either model_name or encoding_name must be set, but not both.
+            encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None.
+                Either model_name or encoding_name must be set, but not both.
+            add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
+            add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
+            use_default_system_prompt (bool, optional): Use the default system prompt or not. Defaults to False.
+            unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
+            eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
+            bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
+            pad_token (Optional[str], optional): The pad token. Defaults to None.
+            errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See
+                [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+                Defaults to `"replace"`.
+        Raises:
+            ImportError: If tiktoken is not installed.
+            ValueError: If both or neither of ``model_name`` and ``encoding_name`` are given.
+        """
+        try:
+            import tiktoken
+        except ImportError as e:
+            # Only a failed import means "not installed"; any other error propagates unchanged.
+            raise ImportError('You need to install tiktoken to use TiktokenTokenizerWrapper.') from e
+        import copyreg
+        import functools
+        from tiktoken import Encoding
+        # Register a reducer so ``Encoding`` objects can be pickled by rebuilding
+        # them from their name, pattern, merge ranks and special tokens.
+        def pickle_Encoding(enc: Encoding):
+            return (functools.partial(Encoding, enc.name, pat_str=enc._pat_str, mergeable_ranks=enc._mergeable_ranks, special_tokens=enc._special_tokens), ())
+        copyreg.pickle(Encoding, pickle_Encoding)
+        if model_name is not None and encoding_name is not None:
+            raise ValueError('You need to specify either model_name or encoding_name, not both.')
+        self.model_name = model_name
+        self.encoding_name = encoding_name
+        if self.model_name is not None:
+            self.encoding = tiktoken.encoding_for_model(self.model_name)
+        elif self.encoding_name is not None:
+            self.encoding = tiktoken.get_encoding(self.encoding_name)
+        else:
+            raise ValueError('You need to specify either model_name or encoding_name.')
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        self.errors = errors
+        # id -> token string. Ids for which tiktoken raises KeyError are left out.
+        self.decoder: Dict[int, str] = {}
+        for i in range(self.encoding.n_vocab):
+            try:
+                token_bytes = self.encoding.decode_single_token_bytes(i)
+            except KeyError:
+                continue
+            # Map each raw byte to its printable unicode stand-in.
+            self.decoder[i] = ''.join(self.byte_encoder[b] for b in token_bytes)
+        # token string -> id, filled in ascending id order.
+        self.encoder: Dict[str, int] = {}
+        for i, token in self.decoder.items():
+            self.encoder[token] = i
+        # NOTE(review): the vocab attributes above are set before the base
+        # constructor runs; presumably it relies on them - confirm before reordering.
+        super().__init__(model_name=model_name, encoding_name=encoding_name, add_bos_token=add_bos_token, add_eos_token=add_eos_token, use_default_system_prompt=use_default_system_prompt, unk_token=unk_token, eos_token=eos_token, bos_token=bos_token, pad_token=pad_token, errors=errors, **kwargs)
+    @property
+    def vocab_size(self) -> int:
+        """Returns vocab size."""
+        return self.encoding.n_vocab
+    @property
+    def is_fast(self) -> bool:
+        """Always ``False`` for this wrapper."""
+        return False
+    @property
+    def default_chat_template(self):
+        """Chat ML Template for User/Assistant.
+        Pinning default Chat ML template in case defaults change.
+        """
+        template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}"
+        # Fill in the two placeholders embedded in the template text.
+        template = template.replace('USE_DEFAULT_PROMPT', 'true' if self.use_default_system_prompt else 'false')
+        template = template.replace('DEFAULT_SYSTEM_PROMPT', DEFAULT_SYSTEM_PROMPT)
+        return template
+    def get_vocab(self) -> Dict[str, int]:
+        """Returns vocab as a dict.
+        Ids in ``range(vocab_size)`` that have no token string are given unused
+        ``<extra_id_N>`` placeholder names, so the result has one entry per id.
+        """
+        vocab_clone = self.encoder.copy()
+        extra_id_index = 0
+        candidate_extra_id = f'<extra_id_{extra_id_index}>'
+        indices_to_fill_in = {i for i in range(self.vocab_size)} - set(vocab_clone.values())
+        for index_to_add in indices_to_fill_in:
+            # Skip placeholder names that are already taken.
+            while candidate_extra_id in vocab_clone:
+                extra_id_index += 1
+                candidate_extra_id = f'<extra_id_{extra_id_index}>'
+            vocab_clone[candidate_extra_id] = index_to_add
+        return vocab_clone
+    def _tokenize(self, text: str) -> List[str]:
+        """Returns a tokenized string.
+        Raises:
+            ValueError: If ``text`` is not a ``str``.
+        """
+        if not isinstance(text, str):
+            raise ValueError(f'Expected a string input to _tokenize but got {type(text)}.')
+        tokens = [self.decoder[t] for t in self.encoding.encode(text, allowed_special='all')]
+        return tokens
+    def _convert_token_to_id(self, token: str) -> Optional[int]:
+        """Converts a token (str) to an id using the vocab, falling back to the unk token's id."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+    def _convert_id_to_token(self, index: int) -> Optional[str]:
+        """Converts an index (integer) to a token (str) using the vocab; unknown ids give ''."""
+        return self.decoder.get(index, '')
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Converts a sequence of tokens (string) into a single string."""
+        text = ''.join(tokens)
+        # Undo the byte-to-unicode mapping, then decode the raw bytes as UTF-8.
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+        return text
+    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]]=None) -> List[int]:
+        """Wrap each sequence with BOS/EOS ids according to ``add_bos_token``/``add_eos_token``."""
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = bos_token_id + token_ids_0 + eos_token_id
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+        return output
+    def get_special_tokens_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]]=None, already_has_special_tokens: bool=False) -> List[int]:
+        """Retrieves sequence ids from a token list that has no special tokens added.
+        Function copied from
+        https://github.com/huggingface/transformers/blob/e3a4bd2bee212a2d0fd9f03b27fe7bfc1debe42d/src/transformers/models/gpt2/tokenization_gpt2.py#L265-L295
+        This method is called when adding special tokens using the
+        tokenizer `prepare_for_model` or `encode_plus` methods.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True)
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+        if token_ids_1 is None:
+            return bos_token_id + [0] * len(token_ids_0) + eos_token_id
+        return bos_token_id + [0] * len(token_ids_0) + eos_token_id + bos_token_id + [0] * len(token_ids_1) + eos_token_id
+    def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Optional[List[int]]=None) -> List[int]:
+        """Return 0s for the first sequence and 1s for the second, each one entry longer than its sequence."""
+        sep = [self.sep_token_id]
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep) * [0]
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str]=None) -> Tuple[str]:
+        """Write nothing and return ``(None, None)``."""
+        return (None, None)
+    def sanitize_special_tokens(self) -> int:
+        """Make sure that all the special tokens attributes of the tokenizer
+        (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the
+        vocabulary.
+        Add the missing ones to the vocabulary if needed.
+        Return:
+            `int`: The number of tokens added in the vocabulary during the operation.
+        """
+        actual_new_tokens = []
+        for token in self.all_special_tokens_extended:
+            encoded = self.encoding.encode(token, allowed_special='all')
+            # More than one id means tiktoken does not treat it as a single token.
+            if len(encoded) > 1:
+                actual_new_tokens.append(token)
+        return self.add_tokens(actual_new_tokens, special_tokens=True)
+TiktokenTokenizerWrapper.register_for_auto_class()

tokenizers.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .tiktoken import TiktokenTokenizerWrapper

utils.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from .builders import build_algorithm, build_callback, build_logger, build_optimizer, build_scheduler, build_tokenizer
+from .checkpoint_conversion_helpers import convert_and_save_ft_weights, get_hf_tokenizer_from_composer_state_dict, load_tokenizer
+from .config_utils import calculate_batch_size_info, log_config, pop_config, process_init_device, update_batch_size_info
+from .data_prep_utils import DownloadingIterable, merge_shard_groups, with_id
+from .huggingface_hub_utils import edit_files_for_hf_compatibility
+from .logging_utils import SpecificWarningFilter
+from .model_download_utils import download_from_hf_hub, download_from_http_fileserver, download_from_oras
+from .mosaicml_logger_utils import find_mosaicml_logger, log_eval_analytics, log_train_analytics, maybe_create_mosaicml_logger
+from .prompt_files import load_prompts, load_prompts_from_file
+from .registry_utils import TypedRegistry, construct_from_registry, create_registry
+from .warnings import VersionedDeprecationWarning

warnings.py CHANGED Viewed

@@ -1,4 +1,8 @@
-class VersionedDeprecationWarning(DeprecationWarning):
     """A custom deprecation warning class that includes version information.
     Attributes:
@@ -10,7 +14,7 @@ class VersionedDeprecationWarning(DeprecationWarning):
         ...     warnings.warn(
         ...         VersionedDeprecationWarning(
         ...             "Function XYZ is deprecated.",
-        ...             after_version="2.0.0"
         ...         )
         ...     )
         ...
@@ -19,4 +23,49 @@ class VersionedDeprecationWarning(DeprecationWarning):
     """
     def __init__(self, message: str, remove_version: str) -> None:
-        super().__init__(message + f' It will be removed in version {remove_version}.')

+import functools
+import warnings
+from typing import Any, Callable, Type, TypeVar, cast
+class VersionedDeprecationWarning(UserWarning):
     """A custom deprecation warning class that includes version information.
     Attributes:
         ...     warnings.warn(
         ...         VersionedDeprecationWarning(
         ...             "Function XYZ is deprecated.",
+        ...             remove_version="2.0.0"
         ...         )
         ...     )
         ...
     """
     def __init__(self, message: str, remove_version: str) -> None:
+        super().__init__(message + f' It will be removed in version {remove_version}.')
+class ExperimentalWarning(Warning):
+    """A warning for experimental features.
+    Attributes:
+        feature_name (str): The name of the experimental feature.
+    """
+    def __init__(self, feature_name: str) -> None:
+        super().__init__(f'{feature_name} is experimental and may change with future versions.')
+        # Store the name so the documented attribute actually exists.
+        self.feature_name = feature_name
+F = TypeVar('F', bound=Callable[..., Any])
+def experimental_function(feature_name: str) -> Callable[[F], F]:
+    """Decorator to mark a function as experimental.
+    The message displayed will be {feature_name} is experimental and may change with future versions.
+    The warning is emitted on every call and is attributed to the caller of the
+    decorated function.
+    Args:
+        feature_name (str): The name of the experimental feature.
+    Returns:
+        The decorated function.
+    """
+    def decorator(func: F) -> F:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any):
+            # stacklevel=2 points the warning at the call site, not at this wrapper.
+            warnings.warn(ExperimentalWarning(feature_name), stacklevel=2)
+            return func(*args, **kwargs)
+        return cast(F, wrapper)
+    return decorator
+def experimental_class(feature_name: str) -> Callable[[Type], Type]:
+    """Class decorator to mark a class as experimental.
+    The class's ``__init__`` is replaced, in place, by a version that emits an
+    ``ExperimentalWarning`` attributed to the instantiation site and then runs
+    the original initializer.
+    Args:
+        feature_name (str): The name of the experimental feature.
+    Returns:
+        A decorator that patches and returns the same class object.
+    """
+    def class_decorator(cls: Type):
+        original_init = cls.__init__
+        # Keep the original initializer's name and docstring on the replacement.
+        @functools.wraps(original_init)
+        def new_init(self: Any, *args: Any, **kwargs: Any):
+            warnings.warn(ExperimentalWarning(feature_name), stacklevel=2)
+            original_init(self, *args, **kwargs)
+        cls.__init__ = new_init
+        return cls
+    return class_decorator