update model weights

- cfg.yaml +21 -20
- generation_config.json +6 -6
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
cfg.yaml
CHANGED

@@ -1,6 +1,5 @@
 architecture:
     backbone_dtype: bfloat16
-    force_embedding_gradients: false
     gradient_checkpointing: true
     intermediate_dropout: 0.0
     pretrained: true
@@ -27,19 +26,19 @@ dataset:
     personalize: false
     prompt_column:
     - instruction
-    system_column:
+    system_column: system
     text_answer_separator: <|answer|>
     text_prompt_start: <|prompt|>
     text_system_start: <|system|>
-    train_dataframe: /home/
+    train_dataframe: /home/user/src/h2o-llmstudio/data/user/japanese_hh-rlhf-49k/japanese_hh-rlhf-49k.csv
     validation_dataframe: None
     validation_size: 0.01
     validation_strategy: automatic
 environment:
     compile_model: false
-    deepspeed_allgather_bucket_size:
+    deepspeed_allgather_bucket_size: 100000000
     deepspeed_method: ZeRO2
-    deepspeed_reduce_bucket_size:
+    deepspeed_reduce_bucket_size: 100000000
     deepspeed_stage3_param_persistence_threshold: 1000000
     deepspeed_stage3_prefetch_bucket_size: 1000000
     find_unused_parameters: false
@@ -47,22 +46,23 @@ environment:
     - '0'
     - '1'
     huggingface_branch: main
-    mixed_precision:
+    mixed_precision: false
+    mixed_precision_dtype: bfloat16
     number_of_workers: 8
     seed: -1
     trust_remote_code: true
     use_deepspeed: true
-experiment_name:
+experiment_name: llama-3-8b-ja
 llm_backbone: meta-llama/Meta-Llama-3-8B-Instruct
 logging:
     logger: None
     neptune_project: ''
-output_directory: /home/
+output_directory: /home/user/src/h2o-llmstudio/output/user/llama-3-8b-ja/
 prediction:
     batch_size_inference: 0
     do_sample: false
-    max_length_inference:
-    max_time:
+    max_length_inference: 512
+    max_time: 0.0
     metric: Perplexity
     metric_gpt_model: gpt-3.5-turbo-0301
     metric_gpt_template: general
@@ -77,11 +77,9 @@ prediction:
 problem_type: text_causal_language_modeling
 tokenizer:
     add_prompt_answer_tokens: false
-    max_length:
-    max_length_answer: 4064
-    max_length_prompt: 4096
+    max_length: 1024
     padding_quantile: 1.0
-    use_fast: true
+    tokenizer_kwargs: '{"use_fast": true, "add_prefix_space": false}'
 training:
     batch_size: 2
     differential_learning_rate: 1.0e-05
@@ -90,19 +88,22 @@ training:
     epochs: 1
     evaluate_before_training: false
     evaluation_epochs: 1.0
-
+    freeze_layers: []
+    grad_accumulation: 4
     gradient_clip: 0.0
-    learning_rate:
+    learning_rate: 1.0e-05
     lora: true
     lora_alpha: 16
     lora_dropout: 0.05
     lora_r: 4
-    lora_target_modules:
+    lora_target_modules: ''
+    lora_unfreeze_layers: []
     loss_function: TokenAveragedCrossEntropy
     optimizer: AdamW
-
+    save_checkpoint: last
     schedule: Cosine
     train_validation_data: false
-
-
+    use_dora: false
+    use_flash_attention_2: true
+    warmup_epochs: 0.05
     weight_decay: 0.0
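Note: the dataset section above wires the CSV's `instruction` and `system` columns into a prompt template delimited by `<|system|>`, `<|prompt|>`, and `<|answer|>`. The sketch below illustrates that framing using only values visible in cfg.yaml; the actual concatenation logic lives inside H2O LLM Studio, and the `answer` column name is a hypothetical stand-in (the answer column setting is not part of this diff).

```python
# Sketch: frame one CSV row into a single causal-LM training string,
# using the separator tokens from cfg.yaml. Illustrative only.
TEXT_SYSTEM_START = "<|system|>"   # cfg: text_system_start
TEXT_PROMPT_START = "<|prompt|>"   # cfg: text_prompt_start
TEXT_ANSWER_SEP   = "<|answer|>"   # cfg: text_answer_separator

def build_sample(row: dict) -> str:
    # system_column: system, prompt_column: instruction (both from cfg.yaml);
    # "answer" is a hypothetical column name used here for illustration.
    return (
        f"{TEXT_SYSTEM_START}{row['system']}"
        f"{TEXT_PROMPT_START}{row['instruction']}"
        f"{TEXT_ANSWER_SEP}{row['answer']}"
    )

row = {
    "system": "You are a helpful assistant.",
    "instruction": "Explain LoRA in one sentence.",
    "answer": "LoRA fine-tunes a model by training small low-rank adapters.",
}
print(build_sample(row))
```

With `lora_r: 4` and `lora_alpha: 16`, the adapter scaling factor is alpha/r = 4; the empty `lora_target_modules: ''` presumably leaves module selection to the tool's defaults for this backbone.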
generation_config.json
CHANGED

@@ -1,10 +1,10 @@
 {
   "_from_model_config": true,
   "bos_token_id": 128000,
-  "eos_token_id":
-  "
-  "
-  "
-  "
-  "transformers_version": "4.
+  "eos_token_id": 128009,
+  "pad_token_id": 128009,
+  "temperature": null,
+  "top_k": null,
+  "top_p": null,
+  "transformers_version": "4.40.2"
 }
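The updated defaults pin both end-of-sequence and padding to token id 128009 (`<|eot_id|>` in the Llama 3 vocabulary) and null out the sampling parameters, so `generate()` falls back to greedy decoding unless sampling arguments are passed per call. A minimal sketch of reading these values back with `transformers`, assuming the repository files sit in the current directory:

```python
# Sketch: load the updated generation defaults with transformers.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained(".")  # reads ./generation_config.json
assert gen_cfg.eos_token_id == 128009  # <|eot_id|> stops generation
assert gen_cfg.pad_token_id == 128009  # padding reuses the same token
# temperature/top_k/top_p are null here, so decoding stays greedy
# unless sampling parameters are supplied explicitly at call time.
print(gen_cfg)
```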
model-00001-of-00004.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:97f086559f12923ca1b59a1c1c0367d4b0913ab6bfc66efd687af54e5dff8a41
 size 4976698672
model-00002-of-00004.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ac035c87cda3f09cc332ab50ec588537b44b447d165a81fb92d510155198b777
 size 4999802720
model-00003-of-00004.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:99313f6223273a020ecc15a3a249307eb34e49ef4ca24ce60982cda6430c7272
 size 4915916176
model-00004-of-00004.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1c7049e9604e4838318948587dd94b0f31cd101a4a90529ec266540ac1d2035f
 size 1168138808
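The `.safetensors` shards are stored through Git LFS, so the repository tracks only a pointer with each blob's SHA-256 and byte size; this commit swaps the hashes while every shard size stays the same, consistent with new weights of identical shape. A stdlib-only sketch for verifying a downloaded shard against its pointer, using the fourth shard's hash from above:

```python
# Sketch: check a downloaded shard against the oid in its LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

expected = "1c7049e9604e4838318948587dd94b0f31cd101a4a90529ec266540ac1d2035f"
assert sha256_of("model-00004-of-00004.safetensors") == expected, "hash mismatch"
```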