Shteyman committed on
Commit 558c5b7
1 Parent(s): d7537dd
cmd.txt DELETED
@@ -1 +0,0 @@
- /var/spool/slurmd/job117535/slurm_script 05-06_00-42
 
 
commit.txt DELETED
@@ -1,5 +0,0 @@
- commit c4fe47d125efdcc428a5dd46500d754dc07f4a94
- Author: Shteyman <[email protected]>
- Date: Sun Jun 2 08:25:22 2024 -0700
-
- clean version of run_clm.py
 
 
config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "_name_or_path": "JackFram/llama-68m",
-   "architectures": [
-     "LlamaForCausalLM"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "bos_token_id": 0,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 768,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "max_position_embeddings": 2048,
-   "model_type": "llama",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 2,
-   "num_key_value_heads": 12,
-   "pad_token_id": 1,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "tie_word_embeddings": false,
-   "torch_dtype": "float32",
-   "transformers_version": "4.41.0.dev0",
-   "use_cache": true,
-   "vocab_size": 32000
- }
 
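The deleted config.json pins the JackFram/llama-68m architecture: a 2-layer, 768-hidden, 12-head LLaMA with a 32,000-token vocabulary (roughly 68M parameters). A minimal sketch, not part of the commit and assuming only the public transformers API, of rebuilding the same model from that config:

from transformers import AutoConfig, AutoModelForCausalLM

# Same architecture the deleted config.json describes: 2 layers, hidden size 768,
# 12 attention heads, vocab 32000, float32.
config = AutoConfig.from_pretrained("JackFram/llama-68m")
model = AutoModelForCausalLM.from_config(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")  # roughly 68M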
 
experiment_code/config/config1.yaml DELETED
@@ -1,28 +0,0 @@
- config_name: "JackFram/llama-68m"
- tokenizer_name: "JackFram/llama-68m"
- validation_split_percentage: 2
- train_file: "/home/dshteyma/shareGPT_data/ShareGPT_V3_unfiltered_cleaned_split.json"
- dataset_name_hub: "anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json"
- dataset_name_local: "ShareGPT"
- # max_train_samples: 1000
- # max_eval_samples: 10
- do_train: True
- do_eval: True
- output_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- overwrite_output_dir: True
- per_device_train_batch_size: 4
- gradient_accumulation_steps: 1
- report_to: "tensorboard"
- logging_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- logging_steps: 500
- save_steps: 1000
- eval_strategy: "steps"
- eval_steps: 1000
- learning_rate: 0.0001
- gradient_accumulation_steps: 1
- weight_decay: 0.01
- warmup_ratio: 0.05
- push_to_hub: True
- hub_model_id: "DorinSht/ShareGPT_llama2_68M"
- hub_strategy: "checkpoint"
-
 
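run_clm.py (deleted below) consumes these YAML files through HfArgumentParser, so every key must map onto a field of its ModelArguments, DataTrainingArguments, or TrainingArguments dataclasses. A minimal sketch of that round trip, assuming the same transformers API the script imports; it parses TrainingArguments alone, so the unrelated keys are tolerated via allow_extra_keys:

from transformers import HfArgumentParser, TrainingArguments

# run_clm.py parses the full (ModelArguments, DataTrainingArguments,
# TrainingArguments) triple; this sketch pulls out TrainingArguments only.
parser = HfArgumentParser(TrainingArguments)
(training_args,) = parser.parse_yaml_file("config/config1.yaml", allow_extra_keys=True)
print(training_args.learning_rate, training_args.save_steps)  # 0.0001 1000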
 
experiment_code/config/config_redpajama.yaml DELETED
@@ -1,27 +0,0 @@
- config_name: "JackFram/llama-68m"
- tokenizer_name: "JackFram/llama-68m"
- validation_split_percentage: 2
- train_file: "/home/dshteyma/target_draft_coupling_code/dataset_dict.json"
- dataset_name_local: "RedPajama"
- dataset_name: "togethercomputer/RedPajama-Data-1T-Sample"
- dataset_name_hub: "togethercomputer/RedPajama-Data-1T-Sample"
- # max_train_samples: 1000
- # max_eval_samples: 10
- do_train: True
- do_eval: True
- output_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- overwrite_output_dir: True
- per_device_train_batch_size: 4
- gradient_accumulation_steps: 3
- report_to: "tensorboard"
- logging_dir: "/home/dshteyma/target_draft_coupling_code/target_draft_training/training_outputs"
- logging_steps: 10000
- save_steps: 10000
- eval_strategy: "steps"
- eval_steps: 10000
- learning_rate: 0.0001
- weight_decay: 0.01
- warmup_ratio: 0.05
- push_to_hub: False
- hub_model_id: "DorinSht/llama_68M_redpajama"
- hub_strategy: "all_checkpoints"
 
 
experiment_code/prepare_sharegpt.py DELETED
@@ -1,44 +0,0 @@
- """
- This script is largely copied from the Vicuna repo: https://github.com/lm-sys/FastChat/blob/main/fastchat/data/split_long_conversation.py
- We fixed a bug in `split_one_sample`, which previously included long conversations in the processed data. Now we skip these long conversations.
- """
- import argparse
- from concurrent.futures import ProcessPoolExecutor
- import json
- import transformers
- from tqdm import tqdm
-
- def shareGPT_pipeline(tokenizer, raw_datasets, overwrite_cache):
-
-     def preprocess_conversation(convo):
-         key_mapping = {"role": "from", "content": "value"}
-         value_mapping = {"user": "user", "human": "user", "gpt": "assistant", "system": "assistant", "bing": "assistant", "chatgpt": "assistant", "bard": "assistant"}
-         # mapping = {"human": "user", "gpt": "assistant"}
-         if value_mapping[convo[0][key_mapping['role']]] != 'user':
-             convo = convo[1:]
-         preproc_convos_user = [{"role": 'user', "content": convo_elem[key_mapping['content']]} for i, convo_elem in enumerate(convo) if (i % 2 == 0 and value_mapping[convo_elem[key_mapping['role']]] == 'user')]
-         preproc_convos_assistant = [{"role": 'assistant', "content": convo_elem[key_mapping['content']]} for i, convo_elem in enumerate(convo) if (i % 2 == 1 and value_mapping[convo_elem[key_mapping['role']]] == 'assistant')]
-         if len(preproc_convos_user) != len(preproc_convos_assistant):
-             return []
-         preproc_convos = [conv_elem for pair in zip(preproc_convos_user, preproc_convos_assistant) for conv_elem in pair]
-         return preproc_convos
-
-     def filter_incorrect_conversations(examples):
-         convos = examples["conversations"]
-         ids_to_remove = [preprocess_conversation(convo) == [] for convo in convos]
-         return {"ids_to_remove": ids_to_remove}
-
-     def formatting_prompts_func(examples):
-         convos = examples["conversations"]
-         # preproc_convos = [convo for convo in convos if (convo[0]['from'] == 'human' or convo[0]['from'] == 'user')]
-         preproc_convos = [preprocess_conversation(convo) for convo in convos]
-         # preproc_convos2 = [preproc_convo for preproc_convo in preproc_convos if preproc_convo[0]['role'] == 'user']
-         texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in preproc_convos]
-         return {"text": texts}
-
-     filtered_datasets = raw_datasets.filter(lambda example: example['conversations'] != [], load_from_cache_file=not overwrite_cache)
-     dataset = filtered_datasets.map(filter_incorrect_conversations, batched=True, load_from_cache_file=not overwrite_cache)
-     filtered_datasets2 = dataset.filter(lambda example: not example['ids_to_remove'], load_from_cache_file=not overwrite_cache)
-     raw_datasets_preprocessed = filtered_datasets2.map(formatting_prompts_func, batched=True, load_from_cache_file=not overwrite_cache)
-
-     return raw_datasets_preprocessed
 
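shareGPT_pipeline normalizes ShareGPT's from/value records into role/content chat messages, drops conversations whose user/assistant turns do not alternate cleanly, and renders the rest with the tokenizer's chat template. An illustrative sketch of that normalization on a single record; the mappings mirror the ones hard-coded above, and the sample conversation is invented:

# Illustrative only: mirrors preprocess_conversation's key/role mapping.
convo = [
    {"from": "human", "value": "Hi, who are you?"},
    {"from": "gpt", "value": "A small assistant model trained on ShareGPT."},
]
value_mapping = {"human": "user", "gpt": "assistant"}
messages = [{"role": value_mapping[t["from"]], "content": t["value"]} for t in convo]
print(messages)
# [{'role': 'user', 'content': 'Hi, who are you?'},
#  {'role': 'assistant', 'content': 'A small assistant model trained on ShareGPT.'}]
# The real pipeline then feeds this into tokenizer.apply_chat_template(..., tokenize=False).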
 
experiment_code/requirements.txt DELETED
@@ -1,2 +0,0 @@
- huggingface-hub==0.22.2
- -e git+https://github.com/huggingface/transformers.git@bbaa8ceff696c479aecdb4575b2deb1349efd3aa#egg=transformers
 
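The editable VCS install pins transformers to the exact commit that satisfies the check_min_version("4.41.0.dev0") guard at the top of run_clm.py. A quick sanity check after installing (a sketch; the exact version string reported by that pinned commit is an assumption):

import transformers
from transformers.utils import check_min_version

print(transformers.__version__)   # expected: 4.41.0.dev0 from the pinned commit
check_min_version("4.41.0.dev0")  # the same guard run_clm.py runs at import time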
 
 
experiment_code/run_clm.py DELETED
@@ -1,754 +0,0 @@
- #!/usr/bin/env python
- # coding=utf-8
- # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
-
- Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
- https://huggingface.co/models?filter=text-generation
- """
- # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
- import random
- import logging
- import math
- import os
- from datetime import datetime
- import sys
- import warnings
- from dataclasses import dataclass, field
- from itertools import chain
- from typing import Optional
- import datasets
- import evaluate
- import torch
- from datasets import load_dataset
- import argparse
- import transformers
- from prepare_sharegpt import shareGPT_pipeline
- from transformers import (
-     CONFIG_MAPPING,
-     MODEL_FOR_CAUSAL_LM_MAPPING,
-     AutoConfig,
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     HfArgumentParser,
-     Trainer,
-     TrainingArguments,
-     default_data_collator,
-     set_seed,
- )
- from transformers.testing_utils import CaptureLogger
- from transformers.trainer_utils import get_last_checkpoint
- from transformers.utils import check_min_version, send_example_telemetry
- from transformers.utils.versions import require_version
- from functools import partial
-
- from omegaconf import DictConfig, OmegaConf
- import hydra
-
- # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
- check_min_version("4.41.0.dev0")
-
- require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
-
- logger = logging.getLogger(__name__)
-
- MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
- MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
- random.seed(42)
-
- @dataclass
- class ModelArguments:
-     """
-     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
-     """
-
-     model_name_or_path: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
-             )
-         },
-     )
-     model_type: Optional[str] = field(
-         default=None,
-         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
-     )
-     padding_side: str = field(
-         default="right", metadata={"help": "The padding side in tokenizer"}
-     )
-     config_overrides: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "Override some existing default config settings when a model is trained from scratch. Example: "
-                 "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
-             )
-         },
-     )
-     config_name: Optional[str] = field(
-         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-     )
-     tokenizer_name: Optional[str] = field(
-         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-     )
-     cache_dir: Optional[str] = field(
-         default=None,
-         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-     )
-     use_fast_tokenizer: bool = field(
-         default=True,
-         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
-     )
-     model_revision: str = field(
-         default="main",
-         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
-     )
-     token: str = field(
-         default=None,
-         metadata={
-             "help": (
-                 "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
-                 "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
-             )
-         },
-     )
-     use_auth_token: bool = field(
-         default=None,
-         metadata={
-             "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
-         },
-     )
-     trust_remote_code: bool = field(
-         default=True,
-         metadata={
-             "help": (
-                 "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                 "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                 "execute code present on the Hub on your local machine."
-             )
-         },
-     )
-     torch_dtype: Optional[str] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
-                 "dtype will be automatically derived from the model's weights."
-             ),
-             "choices": ["auto", "bfloat16", "float16", "float32"],
-         },
-     )
-     low_cpu_mem_usage: bool = field(
-         default=False,
-         metadata={
-             "help": (
-                 "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
-                 "set True will benefit LLM loading time and RAM consumption."
-             )
-         },
-     )
-
-     def __post_init__(self):
-         if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
-             raise ValueError(
-                 "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
-             )
-
-
-
- @dataclass
- class DataTrainingArguments:
-     """
-     Arguments pertaining to what data we are going to input our model for training and eval.
-     """
-     dataset_name: Optional[str] = field(
-         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-     )
-     dataset_name_hub: Optional[str] = field(
-         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-     )
-     dataset_name_local: Optional[str] = field(
-         default=None, metadata={"help": "The name of the dataset for identification."}
-     )
-     dataset_config_name: Optional[str] = field(
-         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-     )
-     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
-     validation_file: Optional[str] = field(
-         default=None,
-         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
-     )
-     max_train_samples: Optional[int] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "For debugging purposes or quicker training, truncate the number of training examples to this "
-                 "value if set."
-             )
-         },
-     )
-     max_eval_samples: Optional[int] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
-                 "value if set."
-             )
-         },
-     )
-     streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
-     block_size: Optional[int] = field(
-         default=None,
-         metadata={
-             "help": (
-                 "Optional input sequence length after tokenization. "
-                 "The training dataset will be truncated in block of this size for training. "
-                 "Default to the model max input length for single sentence inputs (take into account special tokens)."
-             )
-         },
-     )
-     overwrite_cache: bool = field(
-         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-     )
-     validation_split_percentage: Optional[int] = field(
-         default=5,
-         metadata={
-             "help": "The percentage of the train set used as validation set in case there's no validation split"
-         },
-     )
-     preprocessing_num_workers: Optional[int] = field(
-         default=None,
-         metadata={"help": "The number of processes to use for the preprocessing."},
-     )
-     keep_linebreaks: bool = field(
-         default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
-     )
-     lazy_preprocess: bool = False
-
-     def __post_init__(self):
-         if self.streaming:
-             require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
-
-         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-             raise ValueError("Need either a dataset name or a training/validation file.")
-         else:
-             if self.train_file is not None:
-                 extension = self.train_file.split(".")[-1]
-                 assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
-             if self.validation_file is not None:
-                 extension = self.validation_file.split(".")[-1]
-                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
-
- # @dataclass
- # class TrainingArguments(transformers.TrainingArguments):
- #     cache_dir: Optional[str] = field(default=None)
- #     optim: str = field(default="adamw_torch")
- #     model_max_length: int = field(
- #         default=2048,
- #         metadata={
- #             "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
- #         },
- #     )
-
- def create_output_directory(dir_root_path):
-     # Get the current date and time
-     current_time = datetime.now()
-     # Format the date and time as a string
-     # Example format: YYYYMMDD_HHMMSS
-     formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
-     # Define the directory name with the formatted time
-     directory_full_path = os.path.join(dir_root_path, f"training_outputs_{formatted_time}")
-     # Create the directory
-     os.makedirs(directory_full_path)
-     print(f"Directory '{directory_full_path}' created successfully.")
-     return directory_full_path
-
- def main():
-     # See all possible arguments in src/transformers/training_args.py
-     # or by passing the --help flag to this script.
-     # We now keep distinct sets of args, for a cleaner separation of concerns.
-     parser = argparse.ArgumentParser(description="parser for arguments from .py script call")
-     parser.add_argument('--output_dir', type=str, help='Path for training_args.output_dir')
-     parser.add_argument('--logging_dir', type=str, help='Path for training_args.logging_dir')
-     parser.add_argument('--config_file', type=str, help='An additional required option.')
-     args = parser.parse_args()
-
-     parser_hf = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-     if args.config_file is not None and args.output_dir is not None and args.logging_dir is not None:
-         # If we pass only one argument to the script and it's the path to a json file,
-         # let's parse it to get our arguments.
-         model_args, data_args, training_args = parser_hf.parse_yaml_file(args.config_file)
-         training_args.output_dir = args.output_dir
-         training_args.logging_dir = args.logging_dir
-     else:
-         # use the preset config file defined in the slurm .sh script
-         # model_args, data_args, training_args = parser_hf.parse_yaml_file(os.getenv("DEFAULT_CONFIG_FILE"))
-         model_args, data_args, training_args = parser_hf.parse_yaml_file('./config/config1.yaml')
-
-
-     if model_args.use_auth_token is not None:
-         warnings.warn(
-             "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
-             FutureWarning,
-         )
-         if model_args.token is not None:
-             raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
-         model_args.token = model_args.use_auth_token
-
-     # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
-     # information sent is the one passed as arguments along with your Python/PyTorch versions.
-     send_example_telemetry("run_clm", model_args, data_args)
-
-     # Setup logging
-     logging.basicConfig(
-         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-         datefmt="%m/%d/%Y %H:%M:%S",
-         handlers=[logging.StreamHandler(sys.stdout)],
-     )
-
-     if training_args.should_log:
-         # The default of training_args.log_level is passive, so we set log level at info here to have that default.
-         transformers.utils.logging.set_verbosity_info()
-
-     log_level = training_args.get_process_log_level()
-     logger.setLevel(log_level)
-     datasets.utils.logging.set_verbosity(log_level)
-     transformers.utils.logging.set_verbosity(log_level)
-     transformers.utils.logging.enable_default_handler()
-     transformers.utils.logging.enable_explicit_format()
-
-     # Log on each process the small summary:
-     logger.warning(
-         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
-         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
-     )
-     logger.info(f"Training/evaluation parameters {training_args}")
-
-     # Detecting last checkpoint.
-     last_checkpoint = None
-     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
-         last_checkpoint = get_last_checkpoint(training_args.output_dir)
-         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
-             raise ValueError(
-                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
-                 "Use --overwrite_output_dir to overcome."
-             )
-         elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
-             logger.info(
-                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-             )
-
-     # Set seed before initializing model.
-     set_seed(training_args.seed)
-
-     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
-     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
-     # (the dataset will be downloaded automatically from the datasets Hub).
-     #
-     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
-     # 'text' is found. You can easily tweak this behavior (see below).
-     #
-     # In distributed training, the load_dataset function guarantee that only one local process can concurrently
-     # download the dataset.
-     if data_args.dataset_name is not None:
-         # Downloading and loading a dataset from the hub.
-         raw_datasets = load_dataset(
-             data_args.dataset_name,
-             data_args.dataset_config_name,
-             cache_dir=model_args.cache_dir,
-             token=model_args.token,
-             streaming=data_args.streaming,
-         )
-         if "validation" not in raw_datasets.keys():
-             raw_datasets["validation"] = load_dataset(
-                 data_args.dataset_name,
-                 data_args.dataset_config_name,
-                 split=f"train[:{data_args.validation_split_percentage}%]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 streaming=data_args.streaming,
-             )
-             raw_datasets["train"] = load_dataset(
-                 data_args.dataset_name,
-                 data_args.dataset_config_name,
-                 split=f"train[{data_args.validation_split_percentage}%:]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 streaming=data_args.streaming,
-             )
-     else:
-         data_files = {}
-         dataset_args = {}
-         if data_args.train_file is not None:
-             data_files["train"] = data_args.train_file
-         if data_args.validation_file is not None:
-             data_files["validation"] = data_args.validation_file
-         extension = (
-             data_args.train_file.split(".")[-1]
-             if data_args.train_file is not None
-             else data_args.validation_file.split(".")[-1]
-         )
-         if extension == "txt":
-             extension = "text"
-             dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
-         raw_datasets = load_dataset(
-             extension,
-             data_files=data_files,
-             cache_dir=model_args.cache_dir,
-             token=model_args.token,
-             **dataset_args,
-         )
-         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-         if "validation" not in raw_datasets.keys():
-             raw_datasets["validation"] = load_dataset(
-                 extension,
-                 data_files=data_files,
-                 split=f"train[:{data_args.validation_split_percentage}%]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 **dataset_args,
-             )
-             raw_datasets["train"] = load_dataset(
-                 extension,
-                 data_files=data_files,
-                 split=f"train[{data_args.validation_split_percentage}%:]",
-                 cache_dir=model_args.cache_dir,
-                 token=model_args.token,
-                 **dataset_args,
-             )
-
-     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-     # https://huggingface.co/docs/datasets/loading_datasets.
-
-     # Load pretrained model and tokenizer
-     #
-     # Distributed training:
-     # The .from_pretrained methods guarantee that only one local process can concurrently
-     # download model & vocab.
-
-     config_kwargs = {
-         "cache_dir": model_args.cache_dir,
-         "revision": model_args.model_revision,
-         "token": model_args.token,
-         "trust_remote_code": model_args.trust_remote_code,
-     }
-     if model_args.config_name:
-         config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
-     elif model_args.model_name_or_path:
-         config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
-     else:
-         config = CONFIG_MAPPING[model_args.model_type]()
-         logger.warning("You are instantiating a new config instance from scratch.")
-         if model_args.config_overrides is not None:
-             logger.info(f"Overriding config: {model_args.config_overrides}")
-             config.update_from_string(model_args.config_overrides)
-             logger.info(f"New config: {config}")
-
-     tokenizer_kwargs = {
-         "cache_dir": model_args.cache_dir,
-         "use_fast": model_args.use_fast_tokenizer,
-         "revision": model_args.model_revision,
-         "token": model_args.token,
-         "padding": 'max_length',
-         "trust_remote_code": model_args.trust_remote_code,
-         "model_max_length": config.max_position_embeddings,
-         "return_tensors": 'pt'
-     }
-     if model_args.tokenizer_name:
-         tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
-     elif model_args.model_name_or_path:
-         tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
-     else:
-         raise ValueError(
-             "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
-             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
-         )
-     if tokenizer.pad_token != tokenizer.unk_token:
-         tokenizer.pad_token = tokenizer.unk_token
-
-     if model_args.model_name_or_path:
-         torch_dtype = (
-             model_args.torch_dtype
-             if model_args.torch_dtype in ["auto", None]
-             else getattr(torch, model_args.torch_dtype)
-         )
-         model = AutoModelForCausalLM.from_pretrained(
-             model_args.model_name_or_path,
-             from_tf=bool(".ckpt" in model_args.model_name_or_path),
-             config=config,
-             cache_dir=model_args.cache_dir,
-             revision=model_args.model_revision,
-             token=model_args.token,
-             trust_remote_code=model_args.trust_remote_code,
-             torch_dtype=torch_dtype,
-             low_cpu_mem_usage=model_args.low_cpu_mem_usage,
-         )
-     else:
-         model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
-         n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
-         logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
-
-     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
-     # on a small vocab and want a smaller embedding size, remove this test.
-     embedding_size = model.get_input_embeddings().weight.shape[0]
-     if len(tokenizer) > embedding_size:
-         model.resize_token_embeddings(len(tokenizer))
-
-     if "ShareGPT" == data_args.dataset_name_local:
-         raw_datasets_preprocessed = shareGPT_pipeline(tokenizer=tokenizer, raw_datasets=raw_datasets, overwrite_cache=data_args.overwrite_cache)
-     if "RedPajama" == data_args.dataset_name_local:
-         raw_datasets_preprocessed = raw_datasets
-
-
-     # Preprocessing the datasets.
-     # First we tokenize all the texts.
-     if training_args.do_train:
-         column_names = list(raw_datasets_preprocessed["train"].features)
-     else:
-         column_names = list(raw_datasets_preprocessed["validation"].features)
-     text_column_name = "text"
-
-
-     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
-     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
-
-     def tokenize_function(examples):
-         with CaptureLogger(tok_logger) as cl:
-             # print(tokenizer(examples[text_column_name]))
-             # output = tokenizer(examples[text_column_name])
-             output = tokenizer(
-                 examples[text_column_name],
-                 return_tensors="pt",
-                 padding="max_length",
-                 max_length=tokenizer.model_max_length,
-                 truncation=True,
-             )
-         # output = input_ids.clone()
-         # clm input could be much much longer than block_size
-         if "Token indices sequence length is longer than the" in cl.out:
-             tok_logger.warning(
-                 "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
-                 " before being passed to the model."
-             )
-         return output
-
-     with training_args.main_process_first(desc="dataset map tokenization"):
-         if not data_args.streaming:
-             tokenized_datasets = raw_datasets_preprocessed.map(
-                 tokenize_function,
-                 batched=True,
-                 num_proc=data_args.preprocessing_num_workers,
-                 remove_columns=column_names,
-                 load_from_cache_file=not data_args.overwrite_cache,
-                 desc="Running tokenizer on dataset",
-             )
-         else:
-             tokenized_datasets = raw_datasets_preprocessed.map(
-                 tokenize_function,
-                 batched=True,
-                 remove_columns=column_names,
-                 load_from_cache_file=not data_args.overwrite_cache,
-             )
-     if hasattr(config, "max_position_embeddings"):
-         max_pos_embeddings = config.max_position_embeddings
-     else:
-         # Define a default value if the attribute is missing in the config.
-         max_pos_embeddings = 1024
-
-     if data_args.block_size is None:
-         block_size = tokenizer.model_max_length
-         if block_size > max_pos_embeddings:
-             logger.warning(
-                 f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                 f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
-             )
-             if max_pos_embeddings > 0:
-                 block_size = min(1024, max_pos_embeddings)
-             else:
-                 block_size = 1024
-     else:
-         if data_args.block_size > tokenizer.model_max_length:
-             logger.warning(
-                 f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
-                 f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
-             )
-         block_size = min(data_args.block_size, tokenizer.model_max_length)
-
-     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
-     def group_texts(examples):
-         # Concatenate all texts.
-         concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
-         total_length = len(concatenated_examples[list(examples.keys())[0]])
-         # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
-         # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
-         total_length = (total_length // block_size) * block_size
-         # Split by chunks of max_len.
-         result = {
-             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
-             for k, t in concatenated_examples.items()
-         }
-         result["labels"] = result["input_ids"].copy()
-         return result
-
-     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
-     # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
-     # to preprocess.
-     #
-     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
-     # https://huggingface.co/docs/datasets/process#map
-
-     with training_args.main_process_first(desc="grouping texts together"):
-         if not data_args.streaming:
-             lm_datasets = tokenized_datasets.map(
-                 group_texts,
-                 batched=True,
-                 num_proc=data_args.preprocessing_num_workers,
-                 load_from_cache_file=not data_args.overwrite_cache,
-                 desc=f"Grouping texts in chunks of {block_size}",
-             )
-         else:
-             lm_datasets = tokenized_datasets.map(
-                 group_texts,
-                 batched=True,
-                 load_from_cache_file=not data_args.overwrite_cache,
-             )
-
-     if training_args.do_train:
-         if "train" not in tokenized_datasets:
-             raise ValueError("--do_train requires a train dataset")
-         train_dataset = lm_datasets["train"]
-         if data_args.max_train_samples is not None:
-             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
-             train_dataset = train_dataset.select(range(max_train_samples))
-
-     if training_args.do_eval:
-         if "validation" not in tokenized_datasets:
-             raise ValueError("--do_eval requires a validation dataset")
-         eval_dataset = lm_datasets["validation"]
-         if data_args.max_eval_samples is not None:
-             max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
-             eval_dataset = eval_dataset.select(range(max_eval_samples))
-
-     def preprocess_logits_for_metrics(logits, labels):
-         if isinstance(logits, tuple):
-             # Depending on the model and config, logits may contain extra tensors,
-             # like past_key_values, but logits always come first
-             logits = logits[0]
-         return logits.argmax(dim=-1)
-
-
-     def compute_metrics(eval_preds):
-         accuracy = evaluate.load("accuracy", cache_dir=model_args.cache_dir)
-         perplexity = evaluate.load("perplexity", module_type="metric")
-         preds, labels = eval_preds
-         # preds have the same shape as the labels, after the argmax(-1) has been calculated
-         # by preprocess_logits_for_metrics but we need to shift the labels
-         labels = labels[:, 1:].reshape(-1)
-         preds = preds[:, :-1].reshape(-1)
-         accuracy = accuracy.compute(predictions=preds, references=labels)
-         # perplexity = perplexity.compute(predictions=preds, model_id='llama')
-         return accuracy
-
-     # Initialize the optimizer
-     optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
-     # Calculate the number of training steps
-     train_steps = (len(train_dataset) // (training_args.per_device_train_batch_size * training_args._n_gpu)) * training_args.num_train_epochs
-
-     # Initialize the scheduler
-     linear_scheduler = transformers.get_linear_schedule_with_warmup(
-         optimizer,
-         num_warmup_steps=train_steps * training_args.warmup_ratio,
-         num_training_steps=train_steps
-     )
-
-     # Initialize our Trainer
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=train_dataset if training_args.do_train else None,
-         eval_dataset=eval_dataset if training_args.do_eval else None,
-         tokenizer=tokenizer,
-         optimizers=(optimizer, linear_scheduler),
-         # Data collator will default to DataCollatorWithPadding, so we change it.
-         data_collator=default_data_collator,
-         compute_metrics=compute_metrics if training_args.do_eval else None,
-         preprocess_logits_for_metrics=preprocess_logits_for_metrics
-         if training_args.do_eval else None,
-     )
-
-     # Training
-     if training_args.do_train:
-         checkpoint = None
-         if training_args.resume_from_checkpoint is not None:
-             checkpoint = training_args.resume_from_checkpoint
-         elif last_checkpoint is not None:
-             checkpoint = last_checkpoint
-         train_result = trainer.train(resume_from_checkpoint=checkpoint)
-         trainer.save_model()  # Saves the tokenizer too for easy upload
-
-         metrics = train_result.metrics
-
-         max_train_samples = (
-             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-         )
-         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-
-         trainer.log_metrics("train", metrics)
-         trainer.save_metrics("train", metrics)
-         trainer.save_state()
-         try:
-             torch.save([vars(a) for a in [training_args, data_args, model_args]], os.path.join(training_args.output_dir, "args.bin"))
-         except Exception:
-             logger.info("Failed to save arguments")
-
-     # Evaluation
-     if training_args.do_eval:
-         logger.info("*** Evaluate ***")
-
-         metrics = trainer.evaluate()
-
-         max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
-         metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
-         try:
-             perplexity = math.exp(metrics["eval_loss"])
-         except OverflowError:
-             perplexity = float("inf")
-         metrics["perplexity"] = perplexity
-
-         trainer.log_metrics("eval", metrics)
-         trainer.save_metrics("eval", metrics)
-
-     kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
-     if data_args.dataset_name is not None:
-         kwargs["dataset_tags"] = data_args.dataset_name
-         if data_args.dataset_config_name is not None:
-             kwargs["dataset_args"] = data_args.dataset_config_name
-             kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-         else:
-             kwargs["dataset"] = data_args.dataset_name
-     elif data_args.dataset_name_hub is not None:
-         kwargs["dataset"] = data_args.dataset_name_hub
-
-     if training_args.push_to_hub:
-         trainer.push_to_hub(**kwargs)
-     else:
-         trainer.create_model_card(**kwargs)
-
- if __name__ == "__main__":
-     main()
 
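The packing step in run_clm.py, group_texts, concatenates the tokenized examples in a batch and re-slices them into fixed block_size chunks, copying input_ids into labels (the model shifts them internally). A self-contained sketch of that logic on toy data; block_size here is illustrative, while the script derives it from the tokenizer and config:

from itertools import chain

block_size = 4  # illustrative; run_clm.py uses min(1024, max_position_embeddings) by default

def group_texts(examples):
    # Concatenate every column, drop the tail shorter than block_size, re-chunk.
    concatenated = {k: list(chain(*examples[k])) for k in examples}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

batch = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}
print(group_texts(batch))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]], 'labels': [[1, 2, 3, 4], [5, 6, 7, 8]]}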
 
experiment_code/submit_job.sh DELETED
@@ -1,91 +0,0 @@
- #!/bin/bash
- #SBATCH -p g24
- #SBATCH --job-name=myjob_shareGPT
- #SBATCH --qos=normal
- #SBATCH --nodes=1 # Number of nodes
- #SBATCH --ntasks=1 # Number of tasks (one for each script)
- #SBATCH --cpus-per-task=60
- #SBATCH --gres=gpu:6
- #SBATCH --array=1-1 # Array range
- # #SBATCH --output=./slurm_outputs/run_clm_job_%A_task_%a.out # Standard output
- #SBATCH --output=/dev/null # Discard standard output; everything is written to log.txt instead
-
- # Get the current date and time
- current_time=$(date +"%d-%m_%H-%M")
- OUTPUT_DIR="./training_outputs_job_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}_${current_time}"
-
- while test $# -gt 0; do
-     echo $1
-     case "$1" in
-         --output_dir)
-             shift
-             OUTPUT_DIR=$1
-             shift
-             ;;
-     esac
- done
-
- mkdir_is_exists() {
-     if [ -d "$1" ]; then
-         echo "Directory '$1' already exists."
-     else
-         mkdir -p "$1"
-         echo "Directory '$1' created."
-     fi
- }
-
-
- mkdir_is_exists $OUTPUT_DIR
- mkdir_is_exists $OUTPUT_DIR/experiment_code
- git log -n 1 > $OUTPUT_DIR/commit.txt
- pip freeze > $OUTPUT_DIR/pip_freeze.txt
- echo $0 $ARGS $current_time > $OUTPUT_DIR/cmd.txt
- cp -r ./run_clm.py $OUTPUT_DIR/experiment_code
- cp -r ./prepare_sharegpt.py $OUTPUT_DIR/experiment_code
- cp -r config $OUTPUT_DIR/experiment_code
- cp -r ./submit_job.sh $OUTPUT_DIR/experiment_code
- cp -r ./requirements.txt $OUTPUT_DIR/experiment_code
-
- # Define the Python scripts and their corresponding input files
- declare -A scripts_and_inputs=(
-     ["1"]="./config/config1.yaml"
-     # ["2"]="./config/config_redpajama.yaml"
-     # ["3"]="./config/config1.yaml"
-     # ["4"]="./config/config1.yaml"
-     # ["5"]="./config/config1.yaml"
-     # ["6"]="./config/config1.yaml"
-     # ["7"]="./config/config1.yaml"
-     # ["8"]="./config/config1.yaml"
-     # ["9"]="./config/config1.yaml"
-     # ["10"]="./config/config1.yaml"
-     # ["11"]="./config/config1.yaml"
-     # ["12"]="./config/config1.yaml"
-     # ["13"]="./config/config1.yaml"
-     # ["14"]="./config/config1.yaml"
-     # ["15"]="./config/config1.yaml"
-     # ["16"]="./config/config1.yaml"
-     # ["17"]="./config/config1.yaml"
-     # ["18"]="./config/config1.yaml"
-     # ["19"]="./config/config1.yaml"
-     # ["20"]="./config/config1.yaml"
- )
-
- # Launch each script with its corresponding input file as a separate task
- echo "Starting job array task: $SLURM_ARRAY_TASK_ID"
-
- INPUT_DIR="${scripts_and_inputs[$SLURM_ARRAY_TASK_ID]}"
- export DEFAULT_CONFIG_FILE="./config/config1.yaml"
- srun --exclusive python run_clm.py --output_dir $OUTPUT_DIR --logging_dir $OUTPUT_DIR --config_file $INPUT_DIR 2>&1 | tee $OUTPUT_DIR/log.txt
-
-
- # Wait for all background jobs to complete
- wait
-
- # Print a message indicating completion
- echo "All Python scripts have been executed."
-
-
- # mv ./slurm_outputs/run_clm_job_$SLURM_ARRAY_JOB_ID*$SLURM_ARRAY_TASK_ID* "$output_dir/"
-
-
- # python -m torch.distributed.launch ~/target_draft_coupling_code/target_draft_training/run_clm.py --multirun task=1,2
 
 
last-checkpoint/config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "_name_or_path": "JackFram/llama-68m",
-   "architectures": [
-     "LlamaForCausalLM"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "bos_token_id": 0,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 768,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "max_position_embeddings": 2048,
-   "model_type": "llama",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 2,
-   "num_key_value_heads": 12,
-   "pad_token_id": 1,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "tie_word_embeddings": false,
-   "torch_dtype": "float32",
-   "transformers_version": "4.41.0.dev0",
-   "use_cache": true,
-   "vocab_size": 32000
- }
 
 
last-checkpoint/generation_config.json DELETED
@@ -1,7 +0,0 @@
- {
-   "_from_model_config": true,
-   "bos_token_id": 0,
-   "eos_token_id": 2,
-   "pad_token_id": 1,
-   "transformers_version": "4.41.0.dev0"
- }
 
 
last-checkpoint/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:baf7620b0c51ef17a030b63cfe26c514df5d88602a1b8140fb12c4968dfa6ff4
- size 272123144
 
 
 
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:908d9d7ed41d479f7f47a9fd0646de3f7800df94e052115c9815ea463d99e70d
- size 544259743
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
- size 14244
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:394be853393fcf0db07e5bdfe4c0d7e15ce8f5fac5bdbb2ad1b413385499af51
- size 1000
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": "<unk>",
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
- size 499723
 
 
 
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,45 +0,0 @@
- {
-   "add_bos_token": true,
-   "add_eos_token": false,
-   "add_prefix_space": true,
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "</s>",
-   "legacy": true,
-   "model_max_length": 2048,
-   "pad_token": "<unk>",
-   "padding": "max_length",
-   "return_tensors": "pt",
-   "sp_model_kwargs": {},
-   "spaces_between_special_tokens": false,
-   "tokenizer_class": "LlamaTokenizer",
-   "unk_token": "<unk>",
-   "use_default_system_prompt": false,
-   "use_fast": true
- }
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,125 +0,0 @@
- {
-   "best_metric": null,
-   "best_model_checkpoint": null,
-   "epoch": 1.0576414595452142,
-   "eval_steps": 1000,
-   "global_step": 4000,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 0.13220518244315177,
-       "grad_norm": 0.8546391725540161,
-       "learning_rate": 8.816009873931059e-05,
-       "loss": 5.1118,
-       "step": 500
-     },
-     {
-       "epoch": 0.26441036488630354,
-       "grad_norm": 0.8593688607215881,
-       "learning_rate": 9.59831475011252e-05,
-       "loss": 3.406,
-       "step": 1000
-     },
-     {
-       "epoch": 0.26441036488630354,
-       "eval_accuracy": 0.5035306174465283,
-       "eval_loss": 3.23445987701416,
-       "eval_runtime": 73.8676,
-       "eval_samples_per_second": 24.909,
-       "eval_steps_per_second": 0.528,
-       "step": 1000
-     },
-     {
-       "epoch": 0.3966155473294553,
-       "grad_norm": 0.9617258906364441,
-       "learning_rate": 9.134314230431938e-05,
-       "loss": 3.0005,
-       "step": 1500
-     },
-     {
-       "epoch": 0.5288207297726071,
-       "grad_norm": 0.8953185677528381,
-       "learning_rate": 8.670313710751356e-05,
-       "loss": 2.8119,
-       "step": 2000
-     },
-     {
-       "epoch": 0.5288207297726071,
-       "eval_accuracy": 0.5365118094348038,
-       "eval_loss": 2.821384906768799,
-       "eval_runtime": 72.909,
-       "eval_samples_per_second": 25.237,
-       "eval_steps_per_second": 0.535,
-       "step": 2000
-     },
-     {
-       "epoch": 0.6610259122157589,
-       "grad_norm": 1.4154396057128906,
-       "learning_rate": 8.206313191070773e-05,
-       "loss": 2.686,
-       "step": 2500
-     },
-     {
-       "epoch": 0.7932310946589106,
-       "grad_norm": 1.821349024772644,
-       "learning_rate": 7.742312671390191e-05,
-       "loss": 2.607,
-       "step": 3000
-     },
-     {
-       "epoch": 0.7932310946589106,
-       "eval_accuracy": 0.5497897240925214,
-       "eval_loss": 2.657219886779785,
-       "eval_runtime": 73.4297,
-       "eval_samples_per_second": 25.058,
-       "eval_steps_per_second": 0.531,
-       "step": 3000
-     },
-     {
-       "epoch": 0.9254362771020624,
-       "grad_norm": 2.0297396183013916,
-       "learning_rate": 7.278312151709609e-05,
-       "loss": 2.5642,
-       "step": 3500
-     },
-     {
-       "epoch": 1.0576414595452142,
-       "grad_norm": 2.8318285942077637,
-       "learning_rate": 6.814311632029027e-05,
-       "loss": 2.4734,
-       "step": 4000
-     },
-     {
-       "epoch": 1.0576414595452142,
-       "eval_accuracy": 0.5582058048894458,
-       "eval_loss": 2.5735702514648438,
-       "eval_runtime": 73.4679,
-       "eval_samples_per_second": 25.045,
-       "eval_steps_per_second": 0.531,
-       "step": 4000
-     }
-   ],
-   "logging_steps": 500,
-   "max_steps": 11346,
-   "num_input_tokens_seen": 0,
-   "num_train_epochs": 3,
-   "save_steps": 1000,
-   "stateful_callbacks": {
-     "TrainerControl": {
-       "args": {
-         "should_epoch_stop": false,
-         "should_evaluate": false,
-         "should_log": false,
-         "should_save": true,
-         "should_training_stop": false
-       },
-       "attributes": {}
-     }
-   },
-   "total_flos": 5.124838835670221e+16,
-   "train_batch_size": 24,
-   "trial_name": null,
-   "trial_params": null
- }
 
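run_clm.py reports perplexity as exp(eval_loss), so the eval losses logged in this checkpoint state translate directly. A quick check, with the loss values copied from the log history above:

import math

eval_losses = {1000: 3.23445987701416, 2000: 2.821384906768799,
               3000: 2.657219886779785, 4000: 2.5735702514648438}
for step, loss in sorted(eval_losses.items()):
    print(step, round(math.exp(loss), 2))
# 1000 25.39, 2000 16.8, 3000 14.26, 4000 13.11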
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9ce5f4c1939d798f9579c06cb7c41ca4f80497b830ef82299a5b5b802ba651a2
- size 5176
 
 
 
 
log.txt DELETED
The diff for this file is too large to render. See raw diff
 
model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:baf7620b0c51ef17a030b63cfe26c514df5d88602a1b8140fb12c4968dfa6ff4
- size 272123144
 
 
 
 
pip_freeze.txt DELETED
@@ -1,330 +0,0 @@
- absl-py==2.1.0
- accelerate==0.26.1
- aiofiles==23.2.1
- aiohttp==3.8.6
- aiosignal==1.3.1
- altair==5.3.0
- annotated-types==0.6.0
- antlr4-python3-runtime==4.9.3
- anyio==4.0.0
- argon2-cffi==23.1.0
- argon2-cffi-bindings==21.2.0
- arrow==1.3.0
- asttokens==2.4.0
- astunparse==1.6.3
- async-lru==2.0.4
- async-timeout==4.0.3
- attrs==23.1.0
- auto-gptq==0.6.0
- Babel==2.13.0
- backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
- beartype==0.17.2
- beautifulsoup4==4.12.2
- bitsandbytes==0.43.1
- bleach==6.1.0
- blis==0.7.11
- brotlipy==0.7.0
- cachetools==5.3.2
- catalogue==2.0.10
- certifi==2023.7.22
- cffi==1.16.0
- chardet==5.2.0
- charset-normalizer==3.3.0
- click==8.1.7
- cloudpathlib==0.16.0
- cloudpickle==3.0.0
- colorama @ file:///tmp/build/80754af9/colorama_1607707115595/work
- coloredlogs==15.0.1
- comm==0.1.4
- conda==4.12.0
- conda-content-trust @ file:///tmp/build/80754af9/conda-content-trust_1617045594566/work
- conda-package-handling @ file:///tmp/build/80754af9/conda-package-handling_1649105784853/work
- confection==0.1.4
- contextlib2==21.6.0
- contexttimer==0.3.3
- contourpy==1.1.1
- cryptography @ file:///tmp/build/80754af9/cryptography_1639414572950/work
- cycler==0.12.1
- cymem==2.0.8
- dataclasses-json==0.6.4
- DataProperty==1.0.1
- datasets==2.19.1
- debugpy==1.8.0
- decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
- defusedxml==0.7.1
- dill==0.3.7
- dnspython==2.6.1
- docstring_parser==0.16
- dos2unix==1
- einops==0.8.0
- eval_type_backport==0.2.0
- evaluate==0.4.1
- exceptiongroup==1.1.3
- executing==2.0.0
- fastapi==0.111.0
- fastapi-cli==0.0.2
- fastchat==0.1.0
- fastjsonschema==2.18.1
- ffmpy==0.3.2
- filelock==3.12.4
- fire==0.5.0
- flash-attn==2.5.8
- flatbuffers==23.5.26
- fonttools==4.43.1
- fqdn==1.5.1
- frozenlist==1.4.0
- fschat==0.2.36
- fsspec==2023.6.0
- gast==0.5.4
- gekko==1.0.6
- globals==0.3.36
- google-auth==2.27.0
- google-auth-oauthlib==1.2.0
- google-pasta==0.2.0
- gradio==4.29.0
- gradio_client==0.16.1
- greenlet==3.0.3
- grpcio==1.60.1
- h11==0.14.0
- h5py==3.10.0
- httpcore==1.0.5
- httptools==0.6.1
- httpx==0.27.0
- huggingface-hub==0.22.2
- humanfriendly==10.0
- hydra-core==1.3.2
- hydra-joblib-launcher==1.2.0
- hydra-submitit-launcher==1.2.0
- idna==3.4
- importlib-metadata==6.8.0
- importlib-resources==6.1.0
- ipykernel==6.25.2
- ipython==8.18.1
- isoduration==20.11.0
- jedi==0.19.1
- Jinja2==3.1.2
- joblib==1.3.2
- json5==0.9.14
- jsonlines==4.0.0
- jsonpatch==1.33
- jsonpointer==2.4
- jsonschema==4.19.1
- jsonschema-specifications==2023.7.1
- jupyter-events==0.7.0
- jupyter-lsp==2.2.0
- jupyter_client==8.3.1
- jupyter_core==5.3.2
- jupyter_server==2.7.3
- jupyter_server_terminals==0.4.4
- jupyterlab==4.0.6
- jupyterlab-pygments==0.2.2
- jupyterlab_server==2.25.0
- keras==2.15.0
- kiwisolver==1.4.5
- langchain==0.1.8
- langchain-community==0.0.21
- langchain-core==0.1.25
- langcodes==3.3.0
- langdetect==1.0.9
- langsmith==0.1.5
- libclang==16.0.6
- lxml==5.1.0
- Markdown==3.5.2
- markdown-it-py==3.0.0
- markdown2==2.4.13
- MarkupSafe==2.1.5
- marshmallow==3.20.2
- matplotlib==3.8.0
- matplotlib-inline @ file:///opt/conda/conda-bld/matplotlib-inline_1662014470464/work
- mbstrdecoder==1.1.3
- mdurl==0.1.2
- mistune==3.0.2
- ml-collections==0.1.1
- ml-dtypes==0.2.0
- more-itertools==10.2.0
- mpmath==1.3.0
- multidict==6.0.4
- multiprocess==0.70.15
- murmurhash==1.0.10
- mypy-extensions==1.0.0
- nbclient==0.8.0
- nbconvert==7.9.2
- nbformat==5.9.2
- nest-asyncio==1.5.8
- networkx==3.1
- nh3==0.2.17
- ninja==1.11.1.1
- nltk==3.8.1
- notebook==7.0.4
- notebook_shim==0.2.3
- numexpr==2.9.0
- numpy==1.26.0
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==8.9.2.26
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-ml-py3==7.352.0
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- oauthlib==3.2.2
- omegaconf==2.3.0
- opt-einsum==3.3.0
- optimum==1.16.2
- orjson==3.10.3
- overrides==7.4.0
- packaging==23.2
- pandas==2.1.1
- pandocfilters==1.5.0
- parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
- pathvalidate==3.2.0
- patsy==0.5.3
- peft==0.8.2
- pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
- pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
- Pillow==10.0.1
- platformdirs==3.11.0
- plotly==5.17.0
- plotly-express==0.4.1
- portalocker==2.8.2
- preshed==3.0.9
- prometheus-client==0.17.1
- prompt-toolkit==3.0.43
- protobuf==3.20.3
- psutil==5.9.5
- ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
- pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
- pyarrow==13.0.0
- pyarrow-hotfix==0.6
- pyasn1==0.5.1
- pyasn1-modules==0.3.0
- pybind11==2.11.1
- pycosat==0.6.3
- pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
- pydantic==2.6.1
- pydantic_core==2.16.2
- pydub==0.25.1
- Pygments==2.16.1
- pyOpenSSL @ file:///opt/conda/conda-bld/pyopenssl_1643788558760/work
- pyparsing==3.1.1
- PySocks @ file:///tmp/build/80754af9/pysocks_1605305812635/work
- pytablewriter==1.2.0
- python-dateutil==2.8.2
- python-dotenv==1.0.1
- python-helper==0.3.74
- python-json-logger==2.0.7
- python-multipart==0.0.9
- pytz==2023.3.post1
- PyYAML==6.0.1
- pyzmq==25.1.1
- referencing==0.30.2
- regex==2023.10.3
- requests==2.31.0
- requests-oauthlib==1.3.1
- responses==0.18.0
- rfc3339-validator==0.1.4
- rfc3986-validator==0.1.1
- rich==13.7.1
- rotary-embedding-torch==0.5.3
- rouge==1.0.1
- rouge-score==0.1.2
- rpds-py==0.10.4
- rsa==4.9
- ruamel-yaml-conda @ file:///tmp/build/80754af9/ruamel_yaml_1616016711199/work
- ruff==0.4.3
- sacrebleu==2.4.0
- safetensors==0.4.3
- scikit-learn==1.4.1.post1
- scipy==1.11.3
- seaborn==0.13.0
- semantic-version==2.10.0
- Send2Trash==1.8.2
- sentencepiece==0.2.0
- shellingham==1.5.4
- shortuuid==1.0.13
- shtab==1.7.1
- six @ file:///tmp/build/80754af9/six_1644875935023/work
- smart-open==6.4.0
- sniffio==1.3.0
- soupsieve==2.5
- spacy==3.7.4
- spacy-legacy==3.0.12
- spacy-loggers==1.0.5
- speculative-decoding==0.1.2
- SQLAlchemy==2.0.27
- sqlitedict==2.1.0
- srsly==2.4.8
- stack-data==0.6.3
- starlette==0.37.2
- statsmodels==0.14.0
- submitit==1.5.1
- svgwrite==1.4.3
- sympy==1.12
- tabledata==1.3.3
- tabulate==0.9.0
- tcolorpy==0.1.4
- tenacity==8.2.3
- tensorboard==2.15.1
- tensorboard-data-server==0.7.2
- tensorflow==2.15.0.post1
- tensorflow-estimator==2.15.0
- tensorflow-io-gcs-filesystem==0.35.0
- tensorrt==8.6.1.post1
- tensorrt-bindings==8.6.1
- tensorrt-libs==8.6.1
- termcolor==2.4.0
- terminado==0.17.1
- thinc==8.2.3
- threadpoolctl==3.3.0
- tiktoken==0.6.0
- tinycss2==1.2.1
- tk==0.1.0
- tokenizers==0.19.1
- tomli==2.0.1
- tomlkit==0.12.0
- toolz==0.12.1
- torch==2.1.0
- torchaudio==2.1.0
- torchvision==0.16.0
- tornado==6.3.3
- tqdm==4.66.1
- tqdm-multiprocess==0.0.11
297
- traitlets==5.11.2
298
- -e git+https://github.com/huggingface/transformers.git@bbaa8ceff696c479aecdb4575b2deb1349efd3aa#egg=transformers
299
- triton==2.1.0
300
- trl==0.8.6
301
- typepy==1.3.2
302
- typer==0.12.3
303
- types-python-dateutil==2.8.19.14
304
- typing-inspect==0.9.0
305
- typing_extensions==4.8.0
306
- tyro==0.8.3
307
- tzdata==2023.3
308
- ujson==5.9.0
309
- unsloth @ git+https://github.com/unslothai/unsloth.git@4211cc01409e3ced4f7abebaf68e244193b46e2c
310
- uri-template==1.3.0
311
- urllib3==2.0.6
312
- uvicorn==0.29.0
313
- uvloop==0.19.0
314
- wasabi==1.1.2
315
- watchfiles==0.21.0
316
- wavedrom==2.0.3.post3
317
- wcwidth==0.2.8
318
- weasel==0.3.4
319
- webcolors==1.13
320
- webencodings==0.5.1
321
- websocket-client==1.6.4
322
- websockets==11.0.3
323
- Werkzeug==3.0.1
324
- word2number==1.1
325
- wrapt==1.14.1
326
- xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl
327
- xxhash==3.4.1
328
- yarl==1.9.2
329
- zipp==3.17.0
330
- zstandard==0.22.0
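
Note: the list above is a pip freeze of the deleted training environment; the pins that matter most for reproducing the run are datasets, tokenizers, torch, and the transformers commit installed from source. As a minimal sketch (a hypothetical helper, not part of this repo), a few of those pins can be checked against the active environment:

# Hypothetical sanity check: compare key pins from the list above against
# whatever is currently installed in the active environment.
from importlib.metadata import PackageNotFoundError, version

PINS = {"datasets": "2.19.1", "tokenizers": "0.19.1", "torch": "2.1.0", "peft": "0.8.2"}

for name, expected in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    print(f"{name}: pinned {expected}, found {installed}")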
 
special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": "<unk>",
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": true,
-     "rstrip": false,
-     "single_word": false
-   }
- }
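
Note: special_tokens_map.json pins the LLaMA special tokens: <s> and </s> as BOS/EOS, with <unk> doubling as both the unknown and the padding token. A minimal sketch (assuming a local copy of the deleted checkpoint at a hypothetical path) of how transformers surfaces this file:

# Hypothetical path; any directory containing the tokenizer files from this
# commit would work the same way.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint_dir")
print(tok.special_tokens_map)
# Expected, per the file above:
# {'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<unk>', 'unk_token': '<unk>'}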
 
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
- size 499723
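
Note: tokenizer.model is tracked by Git LFS, so the diff records only the pointer (a sha256 oid and a byte size), not the SentencePiece model itself. A minimal sketch (a hypothetical check, assuming the real file was pulled before this deletion) of verifying a local copy against that pointer:

# Verify a pulled LFS file against the oid and size in the pointer above.
import hashlib
from pathlib import Path

blob = Path("tokenizer.model").read_bytes()
assert len(blob) == 499723, "size mismatch with the LFS pointer"
expected = "9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347"
assert hashlib.sha256(blob).hexdigest() == expected, "sha256 mismatch with the LFS pointer"
print("tokenizer.model matches its LFS pointer")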
 
tokenizer_config.json DELETED
@@ -1,45 +0,0 @@
- {
-   "add_bos_token": true,
-   "add_eos_token": false,
-   "add_prefix_space": true,
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": true,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "</s>",
-   "legacy": true,
-   "model_max_length": 2048,
-   "pad_token": "<unk>",
-   "padding": "max_length",
-   "return_tensors": "pt",
-   "sp_model_kwargs": {},
-   "spaces_between_special_tokens": false,
-   "tokenizer_class": "LlamaTokenizer",
-   "unk_token": "<unk>",
-   "use_default_system_prompt": false,
-   "use_fast": true
- }
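
Note: tokenizer_config.json rebuilds a LlamaTokenizer that prepends BOS but never appends EOS (add_bos_token true, add_eos_token false), caps sequences at a model_max_length of 2048, and pads with <unk>. A minimal sketch (hypothetical checkpoint path; exact token ids depend on the deleted tokenizer files) of how those settings play out at encode time:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint_dir")  # hypothetical local copy
enc = tok("hello world", padding="max_length", max_length=16, return_tensors="pt")
print(enc["input_ids"].shape)  # torch.Size([1, 16]): BOS prepended, no EOS, <unk>-padded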
 
training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9ce5f4c1939d798f9579c06cb7c41ca4f80497b830ef82299a5b5b802ba651a2
- size 5176
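
Note: training_args.bin is the TrainingArguments object that the transformers Trainer pickles with torch.save, again stored here as an LFS pointer. A minimal sketch (hypothetical, and version-sensitive since it unpickles a Python object) of inspecting a pulled copy:

# TrainingArguments is a pickled Python object, so weights_only must stay False;
# only load files from sources you trust.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)  # e.g. TrainingArguments
print(args.learning_rate, args.per_device_train_batch_size)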