pretrain model
- .gitignore +3 -1
- scripts/prepare_pretrain_dataset.py +1 -1
- scripts/pretrain-model.yaml +1 -1
.gitignore CHANGED
@@ -163,4 +163,6 @@ cython_debug/
 .DS_Store
 .ruff_cache
 venv*/
-data/
+data/
+pretrain-data/
+contrain-data/
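pretrain-data/ matches the new output_dir written by scripts/prepare_pretrain_dataset.py below; contrain-data/ presumably holds a similarly generated dataset for continued training. Both are generated artifacts, so they are kept out of version control alongside data/.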
scripts/prepare_pretrain_dataset.py CHANGED
@@ -233,7 +233,7 @@ datasets_names = [
 outputs = optimize(
     fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
     inputs=datasets_names,
-    output_dir='../data/',
+    output_dir='../pretrain-data/',
     # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
     chunk_size=(4097 * 4006),
     num_workers=16,
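For reference, chunk_size = 4097 * 4006 is about 16.4 million tokens per chunk; at 4 bytes per token that is roughly 65.6 MB, which is where the "roughly 64MB" comment comes from (assuming tokens are stored as 4-byte integers). A minimal sketch of reading the optimized output back, not part of this commit, assuming the litdata package that provides optimize() and that the code runs from the scripts/ directory:

from litdata import StreamingDataset, StreamingDataLoader

# Points at the directory written by optimize() above; the path is
# relative to scripts/, as in prepare_pretrain_dataset.py.
dataset = StreamingDataset(input_dir='../pretrain-data/')
loader = StreamingDataLoader(dataset, batch_size=8, num_workers=4)

for batch in loader:
    # Each item is one tokenized sequence produced by tokenize_fn.
    break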
scripts/pretrain-model.yaml CHANGED
@@ -46,7 +46,7 @@ data:
   class_path: LitData
 
   init_args:
-    data_path: "../data/"
+    data_path: "../pretrain-data/"
     num_workers: 16
 
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
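For context, a minimal sketch (not part of this commit) of what the YAML above configures, assuming class_path: LitData resolves to litgpt's bundled data module:

from litgpt.data import LitData

# Mirrors init_args from scripts/pretrain-model.yaml; data_path is the
# directory produced by scripts/prepare_pretrain_dataset.py.
data = LitData(data_path='../pretrain-data/', num_workers=16)

With litgpt installed, the config itself would typically be passed straight to the pretraining entry point, e.g. litgpt pretrain --config scripts/pretrain-model.yaml (assuming a recent litgpt CLI).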