Shteyman committed on
Commit 558c5b7
1 Parent(s): d7537dd
cmd.txt DELETED
@@ -1 +0,0 @@
- /var/spool/slurmd/job117535/slurm_script 05-06_00-42
 
 
commit.txt DELETED
@@ -1,5 +0,0 @@
- commit c4fe47d125efdcc428a5dd46500d754dc07f4a94
- Author: Shteyman <[email protected]>
- Date: Sun Jun 2 08:25:22 2024 -0700
-
- clean version of run_clm.py
 
 
config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "_name_or_path": "JackFram/llama-68m",
-   "architectures": [
-     "LlamaForCausalLM"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "bos_token_id": 0,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 768,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "max_position_embeddings": 2048,
-   "model_type": "llama",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 2,
-   "num_key_value_heads": 12,
-   "pad_token_id": 1,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "tie_word_embeddings": false,
-   "torch_dtype": "float32",
-   "transformers_version": "4.41.0.dev0",
-   "use_cache": true,
-   "vocab_size": 32000
- }
 
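The deleted config.json pins the JackFram/llama-68m architecture: a 2-layer, 768-hidden, 12-head LLaMA with a 32,000-token vocabulary (roughly 68M parameters). A minimal sketch, not part of the commit and assuming only the public transformers API, of rebuilding the same model from that config:

from transformers import AutoConfig, AutoModelForCausalLM

# Same architecture the deleted config.json describes: 2 layers, hidden size 768,
# 12 attention heads, vocab 32000, float32.
config = AutoConfig.from_pretrained("JackFram/llama-68m")
model = AutoModelForCausalLM.from_config(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")  # roughly 68M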
 
experiment_code/config/config1.yaml DELETED
@@ -1,28 +0,0 @@
- config_name: "JackFram/llama-68m"
- tokenizer_name: "JackFram/llama-68m"
- validation_split_percentage: 2
- train_file: "/home/dshteyma/shareGPT_data/ShareGPT_V3_unfiltered_cleaned_split.json"
- dataset_name_hub: "anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json"
- dataset_name_local: "ShareGPT"
- # max_train_samples: 1000
- # max_eval_samples: 10
- do_train: True
- do_eval: True
- output_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- overwrite_output_dir: True
- per_device_train_batch_size: 4
- gradient_accumulation_steps: 1
- report_to: "tensorboard"
- logging_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- logging_steps: 500
- save_steps: 1000
- eval_strategy: "steps"
- eval_steps: 1000
- learning_rate: 0.0001
- gradient_accumulation_steps: 1
- weight_decay: 0.01
- warmup_ratio: 0.05
- push_to_hub: True
- hub_model_id: "DorinSht/ShareGPT_llama2_68M"
- hub_strategy: "checkpoint"
-
 
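run_clm.py (deleted below) consumes these YAML files through HfArgumentParser, so every key must map onto a field of its ModelArguments, DataTrainingArguments, or TrainingArguments dataclasses. A minimal sketch of that round trip, assuming the same transformers API the script imports; it parses TrainingArguments alone, so the unrelated keys are tolerated via allow_extra_keys:

from transformers import HfArgumentParser, TrainingArguments

# run_clm.py parses the full (ModelArguments, DataTrainingArguments,
# TrainingArguments) triple; this sketch pulls out TrainingArguments only.
parser = HfArgumentParser(TrainingArguments)
(training_args,) = parser.parse_yaml_file("config/config1.yaml", allow_extra_keys=True)
print(training_args.learning_rate, training_args.save_steps)  # 0.0001 1000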
 
experiment_code/config/config_redpajama.yaml DELETED
@@ -1,27 +0,0 @@
- config_name: "JackFram/llama-68m"
- tokenizer_name: "JackFram/llama-68m"
- validation_split_percentage: 2
- train_file: "/home/dshteyma/target_draft_coupling_code/dataset_dict.json"
- dataset_name_local: "RedPajama"
- dataset_name: "togethercomputer/RedPajama-Data-1T-Sample"
- dataset_name_hub: "togethercomputer/RedPajama-Data-1T-Sample"
- # max_train_samples: 1000
- # max_eval_samples: 10
- do_train: True
- do_eval: True
- output_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- overwrite_output_dir: True
- per_device_train_batch_size: 4
- gradient_accumulation_steps: 3
- report_to: "tensorboard"
- logging_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- logging_steps: 10000
- save_steps: 10000
- eval_strategy: "steps"
- eval_steps: 10000
- learning_rate: 0.0001
- weight_decay: 0.01
- warmup_ratio: 0.05
- push_to_hub: False
- hub_model_id: "DorinSht/llama_68M_redpajama"
- hub_strategy: "all_checkpoints"
 
 
experiment_code/prepare_sharegpt.py DELETED
@@ -1,44 +0,0 @@
- """
- This script is largely copied from the Vicuna repo: https://github.com/lm-sys/FastChat/blob/main/fastchat/data/split_long_conversation.py
- We fixed a bug in `split_one_sample`, which previously included long conversations in the processed data. Now we skip these long conversations.
- """
- import argparse
- from concurrent.futures import ProcessPoolExecutor
- import json
- import transformers
- from tqdm import tqdm
-
- def shareGPT_pipeline(tokenizer, raw_datasets, overwrite_cache):
-
-     def preprocess_conversation(convo):
-         key_mapping = {"role": "from", "content": "value"}
-         value_mapping = {"user": "user", "human": "user", "gpt": "assistant", "system": "assistant", "bing": "assistant", "chatgpt": "assistant", "bard": "assistant"}
-         # mapping = {"human": "user", "gpt": "assistant"}
-         if value_mapping[convo[0][key_mapping['role']]] != 'user':
-             convo = convo[1:]
-         preproc_convos_user = [{"role": 'user', "content": convo_elem[key_mapping['content']]} for i, convo_elem in enumerate(convo) if (i % 2 == 0 and value_mapping[convo_elem[key_mapping['role']]] == 'user')]
-         preproc_convos_assistant = [{"role": 'assistant', "content": convo_elem[key_mapping['content']]} for i, convo_elem in enumerate(convo) if (i % 2 == 1 and value_mapping[convo_elem[key_mapping['role']]] == 'assistant')]
-         if len(preproc_convos_user) != len(preproc_convos_assistant):
-             return []
-         preproc_convos = [conv_elem for pair in zip(preproc_convos_user, preproc_convos_assistant) for conv_elem in pair]
-         return preproc_convos
-
-     def filter_incorrect_conversations(examples):
-         convos = examples["conversations"]
-         ids_to_remove = [preprocess_conversation(convo) == [] for convo in convos]
-         return {"ids_to_remove": ids_to_remove}
-
-     def formatting_prompts_func(examples):
-         convos = examples["conversations"]
-         # preproc_convos = [convo for convo in convos if (convo[0]['from'] == 'human' or convo[0]['from'] == 'user')]
-         preproc_convos = [preprocess_conversation(convo) for convo in convos]
-         # preproc_convos2 = [preproc_convo for preproc_convo in preproc_convos if preproc_convo[0]['role'] == 'user']
-         texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in preproc_convos]
-         return {"text": texts}
-
-     filtered_datasets = raw_datasets.filter(lambda example: example['conversations'] != [], load_from_cache_file=not overwrite_cache)
-     dataset = filtered_datasets.map(filter_incorrect_conversations, batched=True, load_from_cache_file=not overwrite_cache)
-     filtered_datasets2 = dataset.filter(lambda example: not example['ids_to_remove'], load_from_cache_file=not overwrite_cache)
-     raw_datasets_preprocessed = filtered_datasets2.map(formatting_prompts_func, batched=True, load_from_cache_file=not overwrite_cache)
-
-     return raw_datasets_preprocessed
 
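shareGPT_pipeline normalizes ShareGPT's from/value records into role/content chat messages, drops conversations whose user/assistant turns do not alternate cleanly, and renders the rest with the tokenizer's chat template. An illustrative sketch of that normalization on a single record; the mappings mirror the ones hard-coded above, and the sample conversation is invented:

# Illustrative only: mirrors preprocess_conversation's key/role mapping.
convo = [
    {"from": "human", "value": "Hi, who are you?"},
    {"from": "gpt", "value": "A small assistant model trained on ShareGPT."},
]
value_mapping = {"human": "user", "gpt": "assistant"}
messages = [{"role": value_mapping[t["from"]], "content": t["value"]} for t in convo]
print(messages)
# [{'role': 'user', 'content': 'Hi, who are you?'},
#  {'role': 'assistant', 'content': 'A small assistant model trained on ShareGPT.'}]
# The real pipeline then feeds this into tokenizer.apply_chat_template(..., tokenize=False).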
 
experiment_code/requirements.txt DELETED
@@ -1,2 +0,0 @@
- huggingface-hub==0.22.2
- -e git+https://github.com/huggingface/transformers.git@bbaa8ceff696c479aecdb4575b2deb1349efd3aa#egg=transformers
 
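The editable VCS install pins transformers to the exact commit that satisfies the check_min_version("4.41.0.dev0") guard at the top of run_clm.py. A quick sanity check after installing (a sketch; the exact version string reported by that pinned commit is an assumption):

import transformers
from transformers.utils import check_min_version

print(transformers.__version__)   # expected: 4.41.0.dev0 from the pinned commit
check_min_version("4.41.0.dev0")  # the same guard run_clm.py runs at import time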
 
 
experiment_code/run_clm.py DELETED
@@ -1,754 +0,0 @@
- #!/usr/bin/env python
- # coding=utf-8
- # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
-
- Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
- https://huggingface.co/models?filter=text-generation
- """
- # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
- import random
- import logging
- import math
- import os
- from datetime import datetime
- import sys
- import warnings
- from dataclasses import dataclass, field
- from itertools import chain
- from typing import Optional
- import datasets
- import evaluate
- import torch
- from datasets import load_dataset
- import argparse
- import transformers
- from prepare_sharegpt import shareGPT_pipeline
- from transformers import (
-     CONFIG_MAPPING,
-     MODEL_FOR_CAUSAL_LM_MAPPING,
-     AutoConfig,
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     HfArgumentParser,
-     Trainer,
-     TrainingArguments,
-     default_data_collator,
-     set_seed,
- )
- from transformers.testing_utils import CaptureLogger
- from transformers.trainer_utils import get_last_checkpoint
- from transformers.utils import check_min_version, send_example_telemetry
- from transformers.utils.versions import require_version
- from functools import partial
-
- from omegaconf import DictConfig, OmegaConf
- import hydra
-
- # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
- check_min_version("4.41.0.dev0")
-
- require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
-
- logger = logging.getLogger(__name__)
-
- MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
- MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
- random.seed(42)
-
- @dataclass
- class ModelArguments:
-     """
-     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
-     """
-
-     model_name_or_path: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
-             )
-         },
-     )
-     model_type: Optional[str] = field(
-         default=None,
-         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
-     )
-     padding_side: str = field(
-         default="right", metadata={"help": "The padding side in tokenizer"}
-     )
-     config_overrides: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "Override some existing default config settings when a model is trained from scratch. Example: "
-                 "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
-             )
-         },
-     )
-     config_name: Optional[str] = field(
-         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-     )
-     tokenizer_name: Optional[str] = field(
-         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-     )
-     cache_dir: Optional[str] = field(
-         default=None,
-         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-     )
-     use_fast_tokenizer: bool = field(
-         default=True,
-         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
-     )
-     model_revision: str = field(
-         default="main",
-         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
-     )
-     token: str = field(
-         default=None,
-         metadata={
-             "help": (
-                 "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
-                 "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
-             )
-         },
-     )
-     use_auth_token: bool = field(
-         default=None,
-         metadata={
-             "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
-         },
-     )
-     trust_remote_code: bool = field(
-         default=True,
-         metadata={
-             "help": (
-                 "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                 "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                 "execute code present on the Hub on your local machine."
-             )
-         },
-     )
-     torch_dtype: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
-                 "dtype will be automatically derived from the model's weights."
-             ),
-             "choices": ["auto", "bfloat16", "float16", "float32"],
-         },
-     )
-     low_cpu_mem_usage: bool = field(
-         default=False,
-         metadata={
-             "help": (
-                 "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-                 "set True will benefit LLM loading time and RAM consumption."
-             )
-         },
-     )
-
-     def __post_init__(self):
-         if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
-             raise ValueError(
-                 "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
-             )
-
-
-
- @dataclass
- class DataTrainingArguments:
-     """
-     Arguments pertaining to what data we are going to input our model for training and eval.
-     """
-     dataset_name: Optional[str] = field(
-         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-     )
-     dataset_name_hub: Optional[str] = field(
-         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-     )
-     dataset_name_local: Optional[str] = field(
-         default=None, metadata={"help": "The name of the dataset for identification."}
-     )
-     dataset_config_name: Optional[str] = field(
-         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-     )
-     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
-     validation_file: Optional[str] = field(
-         default=None,
-         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
-     )
-     max_train_samples: Optional[int] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "For debugging purposes or quicker training, truncate the number of training examples to this "
-                 "value if set."
-             )
-         },
-     )
-     max_eval_samples: Optional[int] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
-                 "value if set."
-             )
-         },
-     )
-     streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
-     block_size: Optional[int] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "Optional input sequence length after tokenization. "
-                 "The training dataset will be truncated in block of this size for training. "
-                 "Default to the model max input length for single sentence inputs (take into account special tokens)."
-             )
-         },
-     )
-     overwrite_cache: bool = field(
-         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-     )
-     validation_split_percentage: Optional[int] = field(
-         default=5,
-         metadata={
-             "help": "The percentage of the train set used as validation set in case there's no validation split"
-         },
-     )
-     preprocessing_num_workers: Optional[int] = field(
-         default=None,
-         metadata={"help": "The number of processes to use for the preprocessing."},
-     )
-     keep_linebreaks: bool = field(
-         default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
-     )
-     lazy_preprocess: bool = False
-
-     def __post_init__(self):
-         if self.streaming:
-             require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
-
-         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-             raise ValueError("Need either a dataset name or a training/validation file.")
-         else:
-             if self.train_file is not None:
-                 extension = self.train_file.split(".")[-1]
-                 assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
-             if self.validation_file is not None:
-                 extension = self.validation_file.split(".")[-1]
-                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
-
- # @dataclass
- # class TrainingArguments(transformers.TrainingArguments):
- #     cache_dir: Optional[str] = field(default=None)
- #     optim: str = field(default="adamw_torch")
- #     model_max_length: int = field(
- #         default=2048,
- #         metadata={
- #             "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
- #         },
- #     )
-
- def create_output_directory(dir_root_path):
-     # Get the current date and time
-     current_time = datetime.now()
-     # Format the date and time as a string
-     # Example format: YYYYMMDD_HHMMSS
-     formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
-     # Define the directory name with the formatted time
-     directory_full_path = os.path.join(dir_root_path, f"training_outputs_{formatted_time}")
-     # Create the directory
-     os.makedirs(directory_full_path)
-     print(f"Directory '{directory_full_path}' created successfully.")
-     return directory_full_path
-
- def main():
-     # See all possible arguments in src/transformers/training_args.py
-     # or by passing the --help flag to this script.
-     # We now keep distinct sets of args, for a cleaner separation of concerns.
-     parser = argparse.ArgumentParser(description="parser for arguments from .py script call")
-     parser.add_argument('--output_dir', type=str, help='Path for training_args.output_dir')
-     parser.add_argument('--logging_dir', type=str, help='Path for training_args.logging_dir')
-     parser.add_argument('--config_file', type=str, help='An additional required option.')
-     args = parser.parse_args()
-
-     parser_hf = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-     if args.config_file is not None and args.output_dir is not None and args.logging_dir is not None:
-         # If we pass only one argument to the script and it's the path to a json file,
-         # let's parse it to get our arguments.
-         model_args, data_args, training_args = parser_hf.parse_yaml_file(args.config_file)
-         training_args.output_dir = args.output_dir
-         training_args.logging_dir = args.logging_dir
-     else:
-         # use the preset config file defined in the slurm .sh script
-         # model_args, data_args, training_args = parser_hf.parse_yaml_file(os.getenv("DEFAULT_CONFIG_FILE"))
-         model_args, data_args, training_args = parser_hf.parse_yaml_file('./config/config1.yaml')
-
-
-     if model_args.use_auth_token is not None:
-         warnings.warn(
-             "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
-             FutureWarning,
-         )
-         if model_args.token is not None:
-             raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
-         model_args.token = model_args.use_auth_token
-
-     # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
-     # information sent is the one passed as arguments along with your Python/PyTorch versions.
-     send_example_telemetry("run_clm", model_args, data_args)
-
-     # Setup logging
-     logging.basicConfig(
-         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-         datefmt="%m/%d/%Y %H:%M:%S",
-         handlers=[logging.StreamHandler(sys.stdout)],
-     )
-
-     if training_args.should_log:
-         # The default of training_args.log_level is passive, so we set log level at info here to have that default.
-         transformers.utils.logging.set_verbosity_info()
-
-     log_level = training_args.get_process_log_level()
-     logger.setLevel(log_level)
-     datasets.utils.logging.set_verbosity(log_level)
-     transformers.utils.logging.set_verbosity(log_level)
-     transformers.utils.logging.enable_default_handler()
-     transformers.utils.logging.enable_explicit_format()
-
-     # Log on each process the small summary:
-     logger.warning(
-         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
-         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
-     )
-     logger.info(f"Training/evaluation parameters {training_args}")
-
-     # Detecting last checkpoint.
-     last_checkpoint = None
-     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
-         last_checkpoint = get_last_checkpoint(training_args.output_dir)
-         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
-             raise ValueError(
-                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
-                 "Use --overwrite_output_dir to overcome."
-             )
-         elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
-             logger.info(
-                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-             )
-
-     # Set seed before initializing model.
-     set_seed(training_args.seed)
-
-     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
-     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
-     # (the dataset will be downloaded automatically from the datasets Hub).
-     #
-     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
-     # 'text' is found. You can easily tweak this behavior (see below).
-     #
-     # In distributed training, the load_dataset function guarantee that only one local process can concurrently
-     # download the dataset.
-     if data_args.dataset_name is not None:
-         # Downloading and loading a dataset from the hub.
-         raw_datasets = load_dataset(
-             data_args.dataset_name,
-             data_args.dataset_config_name,
-             cache_dir=model_args.cache_dir,
-             token=model_args.token,
-             streaming=data_args.streaming,
-         )
-         if "validation" not in raw_datasets.keys():
-             raw_datasets["validation"] = load_dataset(
-                 data_args.dataset_name,
-                 data_args.dataset_config_name,
-                 split=f"train[:{data_args.validation_split_percentage}%]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 streaming=data_args.streaming,
-             )
-             raw_datasets["train"] = load_dataset(
-                 data_args.dataset_name,
-                 data_args.dataset_config_name,
-                 split=f"train[{data_args.validation_split_percentage}%:]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 streaming=data_args.streaming,
-             )
-     else:
-         data_files = {}
-         dataset_args = {}
-         if data_args.train_file is not None:
-             data_files["train"] = data_args.train_file
-         if data_args.validation_file is not None:
-             data_files["validation"] = data_args.validation_file
-         extension = (
-             data_args.train_file.split(".")[-1]
-             if data_args.train_file is not None
-             else data_args.validation_file.split(".")[-1]
-         )
-         if extension == "txt":
-             extension = "text"
-             dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
-         raw_datasets = load_dataset(
-             extension,
-             data_files=data_files,
-             cache_dir=model_args.cache_dir,
-             token=model_args.token,
-             **dataset_args,
-         )
-         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-         if "validation" not in raw_datasets.keys():
-             raw_datasets["validation"] = load_dataset(
-                 extension,
-                 data_files=data_files,
-                 split=f"train[:{data_args.validation_split_percentage}%]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 **dataset_args,
-             )
-             raw_datasets["train"] = load_dataset(
-                 extension,
-                 data_files=data_files,
-                 split=f"train[{data_args.validation_split_percentage}%:]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 **dataset_args,
-             )
-
-     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-     # https://huggingface.co/docs/datasets/loading_datasets.
-
-     # Load pretrained model and tokenizer
-     #
-     # Distributed training:
-     # The .from_pretrained methods guarantee that only one local process can concurrently
-     # download model & vocab.
-
-     config_kwargs = {
-         "cache_dir": model_args.cache_dir,
-         "revision": model_args.model_revision,
-         "token": model_args.token,
-         "trust_remote_code": model_args.trust_remote_code,
-     }
-     if model_args.config_name:
-         config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
-     elif model_args.model_name_or_path:
-         config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
-     else:
-         config = CONFIG_MAPPING[model_args.model_type]()
-         logger.warning("You are instantiating a new config instance from scratch.")
-         if model_args.config_overrides is not None:
-             logger.info(f"Overriding config: {model_args.config_overrides}")
-             config.update_from_string(model_args.config_overrides)
-             logger.info(f"New config: {config}")
-
-     tokenizer_kwargs = {
-         "cache_dir": model_args.cache_dir,
-         "use_fast": model_args.use_fast_tokenizer,
-         "revision": model_args.model_revision,
-         "token": model_args.token,
-         "padding": 'max_length',
-         "trust_remote_code": model_args.trust_remote_code,
-         "model_max_length": config.max_position_embeddings,
-         "return_tensors": 'pt'
-     }
-     if model_args.tokenizer_name:
-         tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
-     elif model_args.model_name_or_path:
-         tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
-     else:
-         raise ValueError(
-             "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
-             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
-         )
-     if tokenizer.pad_token != tokenizer.unk_token:
-         tokenizer.pad_token = tokenizer.unk_token
-
-     if model_args.model_name_or_path:
-         torch_dtype = (
-             model_args.torch_dtype
-             if model_args.torch_dtype in ["auto", None]
-             else getattr(torch, model_args.torch_dtype)
-         )
-         model = AutoModelForCausalLM.from_pretrained(
-             model_args.model_name_or_path,
-             from_tf=bool(".ckpt" in model_args.model_name_or_path),
-             config=config,
-             cache_dir=model_args.cache_dir,
-             revision=model_args.model_revision,
-             token=model_args.token,
-             trust_remote_code=model_args.trust_remote_code,
-             torch_dtype=torch_dtype,
-             low_cpu_mem_usage=model_args.low_cpu_mem_usage,
-         )
-     else:
-         model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
-         n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
-         logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
-
-     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
-     # on a small vocab and want a smaller embedding size, remove this test.
-     embedding_size = model.get_input_embeddings().weight.shape[0]
-     if len(tokenizer) > embedding_size:
-         model.resize_token_embeddings(len(tokenizer))
-
-     if "ShareGPT" == data_args.dataset_name_local:
-         raw_datasets_preprocessed = shareGPT_pipeline(tokenizer=tokenizer, raw_datasets=raw_datasets, overwrite_cache=data_args.overwrite_cache)
-     if "RedPajama" == data_args.dataset_name_local:
-         raw_datasets_preprocessed = raw_datasets
-
-
-     # Preprocessing the datasets.
-     # First we tokenize all the texts.
-     if training_args.do_train:
-         column_names = list(raw_datasets_preprocessed["train"].features)
-     else:
-         column_names = list(raw_datasets_preprocessed["validation"].features)
-     text_column_name = "text"
-
-
-     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
-     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
-
-     def tokenize_function(examples):
-         with CaptureLogger(tok_logger) as cl:
-             # print(tokenizer(examples[text_column_name]))
-             # output = tokenizer(examples[text_column_name])
-             output = tokenizer(
-                 examples[text_column_name],
-                 return_tensors="pt",
-                 padding="max_length",
-                 max_length=tokenizer.model_max_length,
-                 truncation=True,
-             )
-         # output = input_ids.clone()
-         # clm input could be much much longer than block_size
-         if "Token indices sequence length is longer than the" in cl.out:
-             tok_logger.warning(
-                 "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
-                 " before being passed to the model."
-             )
-         return output
-
-     with training_args.main_process_first(desc="dataset map tokenization"):
-         if not data_args.streaming:
-             tokenized_datasets = raw_datasets_preprocessed.map(
-                 tokenize_function,
-                 batched=True,
-                 num_proc=data_args.preprocessing_num_workers,
-                 remove_columns=column_names,
-                 load_from_cache_file=not data_args.overwrite_cache,
-                 desc="Running tokenizer on dataset",
-             )
-         else:
-             tokenized_datasets = raw_datasets_preprocessed.map(
-                 tokenize_function,
-                 batched=True,
-                 remove_columns=column_names,
-                 load_from_cache_file=not data_args.overwrite_cache,
-             )
-     if hasattr(config, "max_position_embeddings"):
-         max_pos_embeddings = config.max_position_embeddings
-     else:
-         # Define a default value if the attribute is missing in the config.
-         max_pos_embeddings = 1024
-
-     if data_args.block_size is None:
-         block_size = tokenizer.model_max_length
-         if block_size > max_pos_embeddings:
-             logger.warning(
-                 f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                 f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
-             )
-             if max_pos_embeddings > 0:
-                 block_size = min(1024, max_pos_embeddings)
-             else:
-                 block_size = 1024
-     else:
-         if data_args.block_size > tokenizer.model_max_length:
-             logger.warning(
-                 f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
-                 f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
-             )
-         block_size = min(data_args.block_size, tokenizer.model_max_length)
-
-     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
-     def group_texts(examples):
-         # Concatenate all texts.
-         concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
-         total_length = len(concatenated_examples[list(examples.keys())[0]])
-         # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
-         # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
-         total_length = (total_length // block_size) * block_size
-         # Split by chunks of max_len.
-         result = {
-             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
-             for k, t in concatenated_examples.items()
-         }
-         result["labels"] = result["input_ids"].copy()
-         return result
-
-     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
-     # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
-     # to preprocess.
-     #
-     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
-     # https://huggingface.co/docs/datasets/process#map
-
-     with training_args.main_process_first(desc="grouping texts together"):
-         if not data_args.streaming:
-             lm_datasets = tokenized_datasets.map(
-                 group_texts,
-                 batched=True,
-                 num_proc=data_args.preprocessing_num_workers,
-                 load_from_cache_file=not data_args.overwrite_cache,
-                 desc=f"Grouping texts in chunks of {block_size}",
-             )
-         else:
-             lm_datasets = tokenized_datasets.map(
-                 group_texts,
-                 batched=True,
-                 load_from_cache_file=not data_args.overwrite_cache,
-             )
-
-     if training_args.do_train:
-         if "train" not in tokenized_datasets:
-             raise ValueError("--do_train requires a train dataset")
-         train_dataset = lm_datasets["train"]
-         if data_args.max_train_samples is not None:
-             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
-             train_dataset = train_dataset.select(range(max_train_samples))
-
-     if training_args.do_eval:
-         if "validation" not in tokenized_datasets:
-             raise ValueError("--do_eval requires a validation dataset")
-         eval_dataset = lm_datasets["validation"]
-         if data_args.max_eval_samples is not None:
-             max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
-             eval_dataset = eval_dataset.select(range(max_eval_samples))
-
-     def preprocess_logits_for_metrics(logits, labels):
-         if isinstance(logits, tuple):
-             # Depending on the model and config, logits may contain extra tensors,
-             # like past_key_values, but logits always come first
-             logits = logits[0]
-         return logits.argmax(dim=-1)
-
-
-     def compute_metrics(eval_preds):
-         accuracy = evaluate.load("accuracy", cache_dir=model_args.cache_dir)
-         perplexity = evaluate.load("perplexity", module_type="metric")
-         preds, labels = eval_preds
-         # preds have the same shape as the labels, after the argmax(-1) has been calculated
-         # by preprocess_logits_for_metrics but we need to shift the labels
-         labels = labels[:, 1:].reshape(-1)
-         preds = preds[:, :-1].reshape(-1)
-         accuracy = accuracy.compute(predictions=preds, references=labels)
-         # perplexity = perplexity.compute(predictions=preds, model_id='llama')
-         return accuracy
-
-     # Initialize the optimizer
-     optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
-     # Calculate the number of training steps
-     train_steps = (len(train_dataset) // (training_args.per_device_train_batch_size * training_args._n_gpu)) * training_args.num_train_epochs
-
-     # Initialize the scheduler
-     linear_scheduler = transformers.get_linear_schedule_with_warmup(
-         optimizer,
-         num_warmup_steps=train_steps * training_args.warmup_ratio,
-         num_training_steps=train_steps
-     )
-
-     # Initialize our Trainer
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=train_dataset if training_args.do_train else None,
-         eval_dataset=eval_dataset if training_args.do_eval else None,
-         tokenizer=tokenizer,
-         optimizers=(optimizer, linear_scheduler),
-         # Data collator will default to DataCollatorWithPadding, so we change it.
-         data_collator=default_data_collator,
-         compute_metrics=compute_metrics if training_args.do_eval else None,
-         preprocess_logits_for_metrics=preprocess_logits_for_metrics
-         if training_args.do_eval else None,
-     )
-
-     # Training
-     if training_args.do_train:
-         checkpoint = None
-         if training_args.resume_from_checkpoint is not None:
-             checkpoint = training_args.resume_from_checkpoint
-         elif last_checkpoint is not None:
-             checkpoint = last_checkpoint
-         train_result = trainer.train(resume_from_checkpoint=checkpoint)
-         trainer.save_model()  # Saves the tokenizer too for easy upload
-
-         metrics = train_result.metrics
-
-         max_train_samples = (
-             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-         )
-         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-
-         trainer.log_metrics("train", metrics)
-         trainer.save_metrics("train", metrics)
-         trainer.save_state()
-         try:
-             torch.save([vars(a) for a in [training_args, data_args, model_args]], os.path.join(training_args.output_dir, "args.bin"))
-         except Exception:
-             logger.info("Failed to save arguments")
-
-     # Evaluation
-     if training_args.do_eval:
-         logger.info("*** Evaluate ***")
-
-         metrics = trainer.evaluate()
-
-         max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
-         metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
-         try:
-             perplexity = math.exp(metrics["eval_loss"])
-         except OverflowError:
-             perplexity = float("inf")
-         metrics["perplexity"] = perplexity
-
-         trainer.log_metrics("eval", metrics)
-         trainer.save_metrics("eval", metrics)
-
-     kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
-     if data_args.dataset_name is not None:
-         kwargs["dataset_tags"] = data_args.dataset_name
-         if data_args.dataset_config_name is not None:
-             kwargs["dataset_args"] = data_args.dataset_config_name
-             kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-         else:
-             kwargs["dataset"] = data_args.dataset_name
-     elif data_args.dataset_name_hub is not None:
-         kwargs["dataset"] = data_args.dataset_name_hub
-
-     if training_args.push_to_hub:
-         trainer.push_to_hub(**kwargs)
-     else:
-         trainer.create_model_card(**kwargs)
-
- if __name__ == "__main__":
-     main()
 
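The packing step in run_clm.py, group_texts, concatenates the tokenized examples in a batch and re-slices them into fixed block_size chunks, copying input_ids into labels (the model shifts them internally). A self-contained sketch of that logic on toy data; block_size here is illustrative, while the script derives it from the tokenizer and config:

from itertools import chain

block_size = 4  # illustrative; run_clm.py uses min(1024, max_position_embeddings) by default

def group_texts(examples):
    # Concatenate every column, drop the tail shorter than block_size, re-chunk.
    concatenated = {k: list(chain(*examples[k])) for k in examples}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

batch = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}
print(group_texts(batch))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]], 'labels': [[1, 2, 3, 4], [5, 6, 7, 8]]}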
 
experiment_code/submit_job.sh DELETED
@@ -1,91 +0,0 @@
- #!/bin/bash
- #SBATCH -p g24
- #SBATCH --job-name=myjob_shareGPT
- #SBATCH --qos=normal
- #SBATCH --nodes=1 # Number of nodes
- #SBATCH --ntasks=1 # Number of tasks (one for each script)
- #SBATCH --cpus-per-task=60
- #SBATCH --gres=gpu:6
- #SBATCH --array=1-1 # Array range
- # #SBATCH --output=./slurm_outputs/run_clm_job_%A_task_%a.out # Standard output
- #SBATCH --output=/dev/null # Discard standard output; everything is written to log.txt instead
-
- # Get the current date and time
- current_time=$(date +"%d-%m_%H-%M")
- OUTPUT_DIR="./training_outputs_job_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}_${current_time}"
-
- while test $# -gt 0; do
-     echo $1
-     case "$1" in
-         --output_dir)
-             shift
-             OUTPUT_DIR=$1
-             shift
-             ;;
-     esac
- done
-
- mkdir_is_exists() {
-     if [ -d "$1" ]; then
-         echo "Directory '$1' already exists."
-     else
-         mkdir -p "$1"
-         echo "Directory '$1' created."
-     fi
- }
-
-
- mkdir_is_exists $OUTPUT_DIR
- mkdir_is_exists $OUTPUT_DIR/experiment_code
- git log -n 1 > $OUTPUT_DIR/commit.txt
- pip freeze > $OUTPUT_DIR/pip_freeze.txt
- echo $0 $ARGS $current_time > $OUTPUT_DIR/cmd.txt
- cp -r ./run_clm.py $OUTPUT_DIR/experiment_code
- cp -r ./prepare_sharegpt.py $OUTPUT_DIR/experiment_code
- cp -r config $OUTPUT_DIR/experiment_code
- cp -r ./submit_job.sh $OUTPUT_DIR/experiment_code
- cp -r ./requirements.txt $OUTPUT_DIR/experiment_code
-
- # Define the Python scripts and their corresponding input files
- declare -A scripts_and_inputs=(
-     ["1"]="./config/config1.yaml"
-     # ["2"]="./config/config_redpajama.yaml"
-     # ["3"]="./config/config1.yaml"
-     # ["4"]="./config/config1.yaml"
-     # ["5"]="./config/config1.yaml"
-     # ["6"]="./config/config1.yaml"
-     # ["7"]="./config/config1.yaml"
-     # ["8"]="./config/config1.yaml"
-     # ["9"]="./config/config1.yaml"
-     # ["10"]="./config/config1.yaml"
-     # ["11"]="./config/config1.yaml"
-     # ["12"]="./config/config1.yaml"
-     # ["13"]="./config/config1.yaml"
-     # ["14"]="./config/config1.yaml"
-     # ["15"]="./config/config1.yaml"
-     # ["16"]="./config/config1.yaml"
-     # ["17"]="./config/config1.yaml"
-     # ["18"]="./config/config1.yaml"
-     # ["19"]="./config/config1.yaml"
-     # ["20"]="./config/config1.yaml"
- )
-
- # Launch each script with its corresponding input file as a separate task
- echo "Starting job array task: $SLURM_ARRAY_TASK_ID"
-
- INPUT_DIR="${scripts_and_inputs[$SLURM_ARRAY_TASK_ID]}"
- export DEFAULT_CONFIG_FILE="./config/config1.yaml"
- srun --exclusive python run_clm.py --output_dir $OUTPUT_DIR --logging_dir $OUTPUT_DIR --config_file $INPUT_DIR 2>&1 | tee $OUTPUT_DIR/log.txt
-
-
- # Wait for all background jobs to complete
- wait
-
- # Print a message indicating completion
- echo "All Python scripts have been executed."
-
-
- # mv ./slurm_outputs/run_clm_job_$SLURM_ARRAY_JOB_ID*$SLURM_ARRAY_TASK_ID* "$output_dir/"
-
-
- # python -m torch.distributed.launch ~/target_draft_coupling_code/target_draft_training/run_clm.py --multirun task=1,2
 
 
last-checkpoint/config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "_name_or_path": "JackFram/llama-68m",
-   "architectures": [
-     "LlamaForCausalLM"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "bos_token_id": 0,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 768,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "max_position_embeddings": 2048,
-   "model_type": "llama",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 2,
-   "num_key_value_heads": 12,
-   "pad_token_id": 1,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "tie_word_embeddings": false,
-   "torch_dtype": "float32",
-   "transformers_version": "4.41.0.dev0",
-   "use_cache": true,
-   "vocab_size": 32000
- }
 
 
last-checkpoint/generation_config.json DELETED
@@ -1,7 +0,0 @@
- {
-   "_from_model_config": true,
-   "bos_token_id": 0,
-   "eos_token_id": 2,
-   "pad_token_id": 1,
-   "transformers_version": "4.41.0.dev0"
- }
 
 
last-checkpoint/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:baf7620b0c51ef17a030b63cfe26c514df5d88602a1b8140fb12c4968dfa6ff4
- size 272123144
 
 
 
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:908d9d7ed41d479f7f47a9fd0646de3f7800df94e052115c9815ea463d99e70d
- size 544259743
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
- size 14244
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:394be853393fcf0db07e5bdfe4c0d7e15ce8f5fac5bdbb2ad1b413385499af51
- size 1000
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": "<unk>",
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
- size 499723
 
 
 
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,45 +0,0 @@
- {
-   "add_bos_token": true,
-   "add_eos_token": false,
-   "add_prefix_space": true,
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "</s>",
-   "legacy": true,
-   "model_max_length": 2048,
-   "pad_token": "<unk>",
-   "padding": "max_length",
-   "return_tensors": "pt",
-   "sp_model_kwargs": {},
-   "spaces_between_special_tokens": false,
-   "tokenizer_class": "LlamaTokenizer",
-   "unk_token": "<unk>",
-   "use_default_system_prompt": false,
-   "use_fast": true
- }
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,125 +0,0 @@
- {
-   "best_metric": null,
-   "best_model_checkpoint": null,
-   "epoch": 1.0576414595452142,
-   "eval_steps": 1000,
-   "global_step": 4000,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 0.13220518244315177,
-       "grad_norm": 0.8546391725540161,
-       "learning_rate": 8.816009873931059e-05,
-       "loss": 5.1118,
-       "step": 500
-     },
-     {
-       "epoch": 0.26441036488630354,
-       "grad_norm": 0.8593688607215881,
-       "learning_rate": 9.59831475011252e-05,
-       "loss": 3.406,
-       "step": 1000
-     },
-     {
-       "epoch": 0.26441036488630354,
-       "eval_accuracy": 0.5035306174465283,
-       "eval_loss": 3.23445987701416,
-       "eval_runtime": 73.8676,
-       "eval_samples_per_second": 24.909,
-       "eval_steps_per_second": 0.528,
-       "step": 1000
-     },
-     {
-       "epoch": 0.3966155473294553,
-       "grad_norm": 0.9617258906364441,
-       "learning_rate": 9.134314230431938e-05,
-       "loss": 3.0005,
-       "step": 1500
-     },
-     {
-       "epoch": 0.5288207297726071,
-       "grad_norm": 0.8953185677528381,
-       "learning_rate": 8.670313710751356e-05,
-       "loss": 2.8119,
-       "step": 2000
-     },
-     {
-       "epoch": 0.5288207297726071,
-       "eval_accuracy": 0.5365118094348038,
-       "eval_loss": 2.821384906768799,
-       "eval_runtime": 72.909,
-       "eval_samples_per_second": 25.237,
-       "eval_steps_per_second": 0.535,
-       "step": 2000
-     },
-     {
-       "epoch": 0.6610259122157589,
-       "grad_norm": 1.4154396057128906,
-       "learning_rate": 8.206313191070773e-05,
-       "loss": 2.686,
-       "step": 2500
-     },
-     {
-       "epoch": 0.7932310946589106,
-       "grad_norm": 1.821349024772644,
-       "learning_rate": 7.742312671390191e-05,
-       "loss": 2.607,
-       "step": 3000
-     },
-     {
-       "epoch": 0.7932310946589106,
-       "eval_accuracy": 0.5497897240925214,
-       "eval_loss": 2.657219886779785,
-       "eval_runtime": 73.4297,
-       "eval_samples_per_second": 25.058,
-       "eval_steps_per_second": 0.531,
-       "step": 3000
-     },
-     {
-       "epoch": 0.9254362771020624,
-       "grad_norm": 2.0297396183013916,
-       "learning_rate": 7.278312151709609e-05,
-       "loss": 2.5642,
-       "step": 3500
-     },
-     {
-       "epoch": 1.0576414595452142,
-       "grad_norm": 2.8318285942077637,
-       "learning_rate": 6.814311632029027e-05,
-       "loss": 2.4734,
-       "step": 4000
-     },
-     {
-       "epoch": 1.0576414595452142,
-       "eval_accuracy": 0.5582058048894458,
-       "eval_loss": 2.5735702514648438,
-       "eval_runtime": 73.4679,
-       "eval_samples_per_second": 25.045,
-       "eval_steps_per_second": 0.531,
-       "step": 4000
-     }
-   ],
-   "logging_steps": 500,
-   "max_steps": 11346,
-   "num_input_tokens_seen": 0,
-   "num_train_epochs": 3,
-   "save_steps": 1000,
-   "stateful_callbacks": {
-     "TrainerControl": {
-       "args": {
-         "should_epoch_stop": false,
-         "should_evaluate": false,
-         "should_log": false,
-         "should_save": true,
-         "should_training_stop": false
-       },
-       "attributes": {}
-     }
-   },
-   "total_flos": 5.124838835670221e+16,
-   "train_batch_size": 24,
-   "trial_name": null,
-   "trial_params": null
- }
 
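run_clm.py reports perplexity as exp(eval_loss), so the eval losses logged in this checkpoint state translate directly. A quick check, with the loss values copied from the log history above:

import math

eval_losses = {1000: 3.23445987701416, 2000: 2.821384906768799,
               3000: 2.657219886779785, 4000: 2.5735702514648438}
for step, loss in sorted(eval_losses.items()):
    print(step, round(math.exp(loss), 2))
# 1000 25.39, 2000 16.8, 3000 14.26, 4000 13.11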
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9ce5f4c1939d798f9579c06cb7c41ca4f80497b830ef82299a5b5b802ba651a2
- size 5176
 
 
 
 
log.txt DELETED
The diff for this file is too large to render. See raw diff
 
model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:baf7620b0c51ef17a030b63cfe26c514df5d88602a1b8140fb12c4968dfa6ff4
- size 272123144
 
 
 
 
pip_freeze.txt DELETED
@@ -1,330 +0,0 @@
- absl-py==2.1.0
- accelerate==0.26.1
- aiofiles==23.2.1
- aiohttp==3.8.6
- aiosignal==1.3.1
- altair==5.3.0
- annotated-types==0.6.0
- antlr4-python3-runtime==4.9.3
- anyio==4.0.0
- argon2-cffi==23.1.0
- argon2-cffi-bindings==21.2.0
- arrow==1.3.0
- asttokens==2.4.0
- astunparse==1.6.3
- async-lru==2.0.4
- async-timeout==4.0.3
- attrs==23.1.0
- auto-gptq==0.6.0
- Babel==2.13.0
- backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
- beartype==0.17.2
- beautifulsoup4==4.12.2
- bitsandbytes==0.43.1
- bleach==6.1.0
- blis==0.7.11
- brotlipy==0.7.0
- cachetools==5.3.2
- catalogue==2.0.10
- certifi==2023.7.22
- cffi==1.16.0
- chardet==5.2.0
- charset-normalizer==3.3.0
- click==8.1.7
- cloudpathlib==0.16.0
- cloudpickle==3.0.0
- colorama @ file:///tmp/build/80754af9/colorama_1607707115595/work
- coloredlogs==15.0.1
- comm==0.1.4
- conda==4.12.0
- conda-content-trust @ file:///tmp/build/80754af9/conda-content-trust_1617045594566/work
- conda-package-handling @ file:///tmp/build/80754af9/conda-package-handling_1649105784853/work
- confection==0.1.4
- contextlib2==21.6.0
- contexttimer==0.3.3
- contourpy==1.1.1
- cryptography @ file:///tmp/build/80754af9/cryptography_1639414572950/work
- cycler==0.12.1
- cymem==2.0.8
- dataclasses-json==0.6.4
- DataProperty==1.0.1
- datasets==2.19.1
- debugpy==1.8.0
- decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
- defusedxml==0.7.1
- dill==0.3.7
- dnspython==2.6.1
- docstring_parser==0.16
- dos2unix==1
- einops==0.8.0
- eval_type_backport==0.2.0
- evaluate==0.4.1
- exceptiongroup==1.1.3
- executing==2.0.0
- fastapi==0.111.0
- fastapi-cli==0.0.2
- fastchat==0.1.0
- fastjsonschema==2.18.1
- ffmpy==0.3.2
- filelock==3.12.4
- fire==0.5.0
- flash-attn==2.5.8
- flatbuffers==23.5.26
- fonttools==4.43.1
- fqdn==1.5.1
- frozenlist==1.4.0
- fschat==0.2.36
- fsspec==2023.6.0
- gast==0.5.4
- gekko==1.0.6
- globals==0.3.36
- google-auth==2.27.0
- google-auth-oauthlib==1.2.0
- google-pasta==0.2.0
- gradio==4.29.0
- gradio_client==0.16.1
- greenlet==3.0.3
- grpcio==1.60.1
- h11==0.14.0
- h5py==3.10.0
- httpcore==1.0.5
- httptools==0.6.1
- httpx==0.27.0
- huggingface-hub==0.22.2
- humanfriendly==10.0
- hydra-core==1.3.2
- hydra-joblib-launcher==1.2.0
- hydra-submitit-launcher==1.2.0
- idna==3.4
- importlib-metadata==6.8.0
- importlib-resources==6.1.0
- ipykernel==6.25.2
- ipython==8.18.1
- isoduration==20.11.0
- jedi==0.19.1
- Jinja2==3.1.2
- joblib==1.3.2
- json5==0.9.14
- jsonlines==4.0.0
- jsonpatch==1.33
- jsonpointer==2.4
- jsonschema==4.19.1
- jsonschema-specifications==2023.7.1
- jupyter-events==0.7.0
- jupyter-lsp==2.2.0
- jupyter_client==8.3.1
- jupyter_core==5.3.2
- jupyter_server==2.7.3
- jupyter_server_terminals==0.4.4
- jupyterlab==4.0.6
- jupyterlab-pygments==0.2.2
- jupyterlab_server==2.25.0
- keras==2.15.0
- kiwisolver==1.4.5
- langchain==0.1.8
- langchain-community==0.0.21
- langchain-core==0.1.25
- langcodes==3.3.0
- langdetect==1.0.9
- langsmith==0.1.5
- libclang==16.0.6
- lxml==5.1.0
- Markdown==3.5.2
- markdown-it-py==3.0.0
- markdown2==2.4.13
- MarkupSafe==2.1.5
- marshmallow==3.20.2
- matplotlib==3.8.0
- matplotlib-inline @ file:///opt/conda/conda-bld/matplotlib-inline_1662014470464/work
- mbstrdecoder==1.1.3
- mdurl==0.1.2
- mistune==3.0.2
- ml-collections==0.1.1
- ml-dtypes==0.2.0
- more-itertools==10.2.0
- mpmath==1.3.0
- multidict==6.0.4
- multiprocess==0.70.15
- murmurhash==1.0.10
- mypy-extensions==1.0.0
- nbclient==0.8.0
- nbconvert==7.9.2
- nbformat==5.9.2
- nest-asyncio==1.5.8
- networkx==3.1
- nh3==0.2.17
- ninja==1.11.1.1
- nltk==3.8.1
- notebook==7.0.4
- notebook_shim==0.2.3
- numexpr==2.9.0
- numpy==1.26.0
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==8.9.2.26
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-ml-py3==7.352.0
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- oauthlib==3.2.2
- omegaconf==2.3.0
- opt-einsum==3.3.0
- optimum==1.16.2
- orjson==3.10.3
- overrides==7.4.0
- packaging==23.2
- pandas==2.1.1
- pandocfilters==1.5.0
- parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
- pathvalidate==3.2.0
- patsy==0.5.3
- peft==0.8.2
- pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
- pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
- Pillow==10.0.1
- platformdirs==3.11.0
- plotly==5.17.0
- plotly-express==0.4.1
- portalocker==2.8.2
- preshed==3.0.9
- prometheus-client==0.17.1
- prompt-toolkit==3.0.43
- protobuf==3.20.3
- psutil==5.9.5
- ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
- pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
- pyarrow==13.0.0
- pyarrow-hotfix==0.6
- pyasn1==0.5.1
- pyasn1-modules==0.3.0
- pybind11==2.11.1
- pycosat==0.6.3
- pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
- pydantic==2.6.1
- pydantic_core==2.16.2
- pydub==0.25.1
- Pygments==2.16.1
- pyOpenSSL @ file:///opt/conda/conda-bld/pyopenssl_1643788558760/work
- pyparsing==3.1.1
- PySocks @ file:///tmp/build/80754af9/pysocks_1605305812635/work
- pytablewriter==1.2.0
- python-dateutil==2.8.2
- python-dotenv==1.0.1
- python-helper==0.3.74
- python-json-logger==2.0.7
- python-multipart==0.0.9
- pytz==2023.3.post1
- PyYAML==6.0.1
- pyzmq==25.1.1
- referencing==0.30.2
- regex==2023.10.3
- requests==2.31.0
- requests-oauthlib==1.3.1
- responses==0.18.0
- rfc3339-validator==0.1.4
- rfc3986-validator==0.1.1
- rich==13.7.1
- rotary-embedding-torch==0.5.3
- rouge==1.0.1
- rouge-score==0.1.2
- rpds-py==0.10.4
- rsa==4.9
- ruamel-yaml-conda @ file:///tmp/build/80754af9/ruamel_yaml_1616016711199/work
- ruff==0.4.3
- sacrebleu==2.4.0
- safetensors==0.4.3
- scikit-learn==1.4.1.post1
- scipy==1.11.3
- seaborn==0.13.0
- semantic-version==2.10.0
- Send2Trash==1.8.2
- sentencepiece==0.2.0
- shellingham==1.5.4
- shortuuid==1.0.13
- shtab==1.7.1
- six @ file:///tmp/build/80754af9/six_1644875935023/work
- smart-open==6.4.0
- sniffio==1.3.0
- soupsieve==2.5
- spacy==3.7.4
- spacy-legacy==3.0.12
- spacy-loggers==1.0.5
- speculative-decoding==0.1.2
- SQLAlchemy==2.0.27
- sqlitedict==2.1.0
- srsly==2.4.8
- stack-data==0.6.3
- starlette==0.37.2
- statsmodels==0.14.0
- submitit==1.5.1
- svgwrite==1.4.3
- sympy==1.12
- tabledata==1.3.3
- tabulate==0.9.0
- tcolorpy==0.1.4
- tenacity==8.2.3
- tensorboard==2.15.1
- tensorboard-data-server==0.7.2
- tensorflow==2.15.0.post1
- tensorflow-estimator==2.15.0
- tensorflow-io-gcs-filesystem==0.35.0
- tensorrt==8.6.1.post1
- tensorrt-bindings==8.6.1
- tensorrt-libs==8.6.1
- termcolor==2.4.0
- terminado==0.17.1
- thinc==8.2.3
- threadpoolctl==3.3.0
- tiktoken==0.6.0
- tinycss2==1.2.1
- tk==0.1.0
- tokenizers==0.19.1
- tomli==2.0.1
- tomlkit==0.12.0
- toolz==0.12.1
- torch==2.1.0
- torchaudio==2.1.0
- torchvision==0.16.0
- tornado==6.3.3
- tqdm==4.66.1
- tqdm-multiprocess==0.0.11
297
- traitlets==5.11.2
298
- -e git+https://github.com/huggingface/transformers.git@bbaa8ceff696c479aecdb4575b2deb1349efd3aa#egg=transformers
299
- triton==2.1.0
300
- trl==0.8.6
301
- typepy==1.3.2
302
- typer==0.12.3
303
- types-python-dateutil==2.8.19.14
304
- typing-inspect==0.9.0
305
- typing_extensions==4.8.0
306
- tyro==0.8.3
307
- tzdata==2023.3
308
- ujson==5.9.0
309
- unsloth @ git+https://github.com/unslothai/unsloth.git@4211cc01409e3ced4f7abebaf68e244193b46e2c
310
- uri-template==1.3.0
311
- urllib3==2.0.6
312
- uvicorn==0.29.0
313
- uvloop==0.19.0
314
- wasabi==1.1.2
315
- watchfiles==0.21.0
316
- wavedrom==2.0.3.post3
317
- wcwidth==0.2.8
318
- weasel==0.3.4
319
- webcolors==1.13
320
- webencodings==0.5.1
321
- websocket-client==1.6.4
322
- websockets==11.0.3
323
- Werkzeug==3.0.1
324
- word2number==1.1
325
- wrapt==1.14.1
326
- xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl
327
- xxhash==3.4.1
328
- yarl==1.9.2
329
- zipp==3.17.0
330
- zstandard==0.22.0
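
Note: the list above is a pip freeze of the deleted training environment; the pins that matter most for reproducing the run are datasets, tokenizers, torch, and the transformers commit installed from source. As a minimal sketch (a hypothetical helper, not part of this repo), a few of those pins can be checked against the active environment:

# Hypothetical sanity check: compare key pins from the list above against
# whatever is currently installed in the active environment.
from importlib.metadata import PackageNotFoundError, version

PINS = {"datasets": "2.19.1", "tokenizers": "0.19.1", "torch": "2.1.0", "peft": "0.8.2"}

for name, expected in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    print(f"{name}: pinned {expected}, found {installed}")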
 
special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": "<unk>",
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
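
Note: special_tokens_map.json pins the LLaMA special tokens: <s> and </s> as BOS/EOS, with <unk> doubling as both the unknown and the padding token. A minimal sketch (assuming a local copy of the deleted checkpoint at a hypothetical path) of how transformers surfaces this file:

# Hypothetical path; any directory containing the tokenizer files from this
# commit would work the same way.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint_dir")
print(tok.special_tokens_map)
# Expected, per the file above:
# {'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<unk>', 'unk_token': '<unk>'}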
 
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
- size 499723
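
Note: tokenizer.model is tracked by Git LFS, so the diff records only the pointer (a sha256 oid and a byte size), not the SentencePiece model itself. A minimal sketch (a hypothetical check, assuming the real file was pulled before this deletion) of verifying a local copy against that pointer:

# Verify a pulled LFS file against the oid and size in the pointer above.
import hashlib
from pathlib import Path

blob = Path("tokenizer.model").read_bytes()
assert len(blob) == 499723, "size mismatch with the LFS pointer"
expected = "9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347"
assert hashlib.sha256(blob).hexdigest() == expected, "sha256 mismatch with the LFS pointer"
print("tokenizer.model matches its LFS pointer")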
 
tokenizer_config.json DELETED
@@ -1,45 +0,0 @@
- {
-   "add_bos_token": true,
-   "add_eos_token": false,
-   "add_prefix_space": true,
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "</s>",
-   "legacy": true,
-   "model_max_length": 2048,
-   "pad_token": "<unk>",
-   "padding": "max_length",
-   "return_tensors": "pt",
-   "sp_model_kwargs": {},
-   "spaces_between_special_tokens": false,
-   "tokenizer_class": "LlamaTokenizer",
-   "unk_token": "<unk>",
-   "use_default_system_prompt": false,
-   "use_fast": true
- }
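
Note: tokenizer_config.json rebuilds a LlamaTokenizer that prepends BOS but never appends EOS (add_bos_token true, add_eos_token false), caps sequences at a model_max_length of 2048, and pads with <unk>. A minimal sketch (hypothetical checkpoint path; exact token ids depend on the deleted tokenizer files) of how those settings play out at encode time:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint_dir")  # hypothetical local copy
enc = tok("hello world", padding="max_length", max_length=16, return_tensors="pt")
print(enc["input_ids"].shape)  # torch.Size([1, 16]): BOS prepended, no EOS, <unk>-padded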
 
training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9ce5f4c1939d798f9579c06cb7c41ca4f80497b830ef82299a5b5b802ba651a2
- size 5176
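
Note: training_args.bin is the TrainingArguments object that the transformers Trainer pickles with torch.save, again stored here as an LFS pointer. A minimal sketch (hypothetical, and version-sensitive since it unpickles a Python object) of inspecting a pulled copy:

# TrainingArguments is a pickled Python object, so weights_only must stay False;
# only load files from sources you trust.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)  # e.g. TrainingArguments
print(args.learning_rate, args.per_device_train_batch_size)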