ZeroUniqueness committed
Commit 0acb17c • 1 Parent(s): a831524
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. adapter_config.json +4 -4
  2. adapter_model.bin +1 -1
  3. checkpoint-58000/README.md +0 -20
  4. checkpoint-58000/adapter_config.json +0 -26
  5. checkpoint-58000/adapter_model.bin +0 -3
  6. checkpoint-58000/adapter_model/README.md +0 -20
  7. checkpoint-58000/adapter_model/adapter_config.json +0 -26
  8. checkpoint-58000/adapter_model/adapter_model.bin +0 -3
  9. checkpoint-59000/README.md +0 -20
  10. checkpoint-59000/adapter_config.json +0 -26
  11. checkpoint-59000/adapter_model.bin +0 -3
  12. {checkpoint-56000 → checkpoint-69000}/README.md +1 -0
  13. {checkpoint-56000/adapter_model → checkpoint-69000}/adapter_config.json +4 -4
  14. {checkpoint-56000/adapter_model → checkpoint-69000}/adapter_model.bin +1 -1
  15. {checkpoint-57000 → checkpoint-69000/adapter_model}/README.md +1 -0
  16. {checkpoint-56000 → checkpoint-69000/adapter_model}/adapter_config.json +4 -4
  17. {checkpoint-57000 → checkpoint-69000/adapter_model}/adapter_model.bin +1 -1
  18. {checkpoint-57000 → checkpoint-69000}/optimizer.pt +2 -2
  19. {checkpoint-58000 → checkpoint-69000}/rng_state.pth +1 -1
  20. {checkpoint-58000 → checkpoint-69000}/scheduler.pt +1 -1
  21. {checkpoint-59000 → checkpoint-69000}/trainer_state.json +288 -5
  22. {checkpoint-56000 → checkpoint-69000}/training_args.bin +2 -2
  23. {checkpoint-57000/adapter_model → checkpoint-69500}/README.md +1 -0
  24. {checkpoint-57000 → checkpoint-69500}/adapter_config.json +4 -4
  25. {checkpoint-56000 → checkpoint-69500}/adapter_model.bin +1 -1
  26. {checkpoint-56000 → checkpoint-69500}/adapter_model/README.md +1 -0
  27. {checkpoint-57000 → checkpoint-69500}/adapter_model/adapter_config.json +4 -4
  28. {checkpoint-57000 → checkpoint-69500}/adapter_model/adapter_model.bin +1 -1
  29. {checkpoint-56000 → checkpoint-69500}/optimizer.pt +2 -2
  30. {checkpoint-59000 → checkpoint-69500}/rng_state.pth +1 -1
  31. {checkpoint-56000 → checkpoint-69500}/scheduler.pt +1 -1
  32. {checkpoint-58000 → checkpoint-69500}/trainer_state.json +370 -5
  33. {checkpoint-58000 → checkpoint-69500}/training_args.bin +2 -2
  34. checkpoint-70000/README.md +21 -0
  35. checkpoint-70000/adapter_config.json +26 -0
  36. checkpoint-70000/adapter_model.bin +3 -0
  37. checkpoint-70000/adapter_model/README.md +21 -0
  38. checkpoint-70000/adapter_model/adapter_config.json +26 -0
  39. checkpoint-70000/adapter_model/adapter_model.bin +3 -0
  40. {checkpoint-58000 → checkpoint-70000}/optimizer.pt +2 -2
  41. {checkpoint-57000 → checkpoint-70000}/rng_state.pth +1 -1
  42. {checkpoint-57000 → checkpoint-70000}/scheduler.pt +1 -1
  43. {checkpoint-57000 → checkpoint-70000}/trainer_state.json +452 -5
  44. {checkpoint-59000 → checkpoint-70000}/training_args.bin +2 -2
  45. checkpoint-70500/README.md +21 -0
  46. checkpoint-70500/adapter_config.json +26 -0
  47. checkpoint-70500/adapter_model.bin +3 -0
  48. checkpoint-70500/adapter_model/README.md +21 -0
  49. checkpoint-70500/adapter_model/adapter_config.json +26 -0
  50. checkpoint-70500/adapter_model/adapter_model.bin +3 -0
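To fetch the repository exactly as of this commit, a download can be pinned to revision 0acb17c. A minimal sketch using huggingface_hub's snapshot_download; the repo id is not shown on this page, so the one below is a hypothetical placeholder:

```python
# Sketch: pin a download to this exact commit so later pushes cannot change
# what you get. REPO_ID is hypothetical -- substitute the actual repository.
from huggingface_hub import snapshot_download

REPO_ID = "some-user/qlora-adapter"  # placeholder, not from this page
local_dir = snapshot_download(repo_id=REPO_ID, revision="0acb17c")
print(local_dir)
```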
adapter_config.json CHANGED
@@ -14,12 +14,12 @@
  "r": 32,
  "revision": null,
  "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
+ "gate_proj",
  "v_proj",
+ "down_proj",
  "k_proj",
- "gate_proj",
+ "q_proj",
+ "up_proj",
  "o_proj"
  ],
  "task_type": "CAUSAL_LM"
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f56b8a333605f03b496496aac3531e5eb50e390d67be06083619275a78de77da
+ oid sha256:8a26259b6c7f10eacd37169a51779a24aa9d6a76d8fdef027422bdcbf2557c2f
  size 500897101
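The .bin entries in this diff are Git LFS pointer files, not the weights themselves: three key/value lines giving the spec version, the sha256 oid of the stored object, and its byte size. As a small illustrative sketch (not part of this repo), a pointer can be parsed like so:

```python
# Sketch: parse a Git LFS pointer file such as the adapter_model.bin entries
# above. The actual weights live in LFS storage, addressed by the sha256 oid.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Expected result shape:
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:8a26...', 'size': '500897101'}
```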
checkpoint-58000/README.md DELETED
@@ -1,20 +0,0 @@
- ---
- library_name: peft
- ---
- ## Training procedure
-
-
- The following `bitsandbytes` quantization config was used during training:
- - load_in_8bit: False
- - load_in_4bit: True
- - llm_int8_threshold: 6.0
- - llm_int8_skip_modules: None
- - llm_int8_enable_fp32_cpu_offload: False
- - llm_int8_has_fp16_weight: False
- - bnb_4bit_quant_type: nf4
- - bnb_4bit_use_double_quant: True
- - bnb_4bit_compute_dtype: bfloat16
- ### Framework versions
-
-
- - PEFT 0.5.0.dev0
checkpoint-58000/adapter_config.json DELETED
@@ -1,26 +0,0 @@
- {
- "auto_mapping": null,
- "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
- "bias": "none",
- "fan_in_fan_out": null,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16,
- "lora_dropout": 0.05,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 32,
- "revision": null,
- "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
- "v_proj",
- "k_proj",
- "gate_proj",
- "o_proj"
- ],
- "task_type": "CAUSAL_LM"
- }
checkpoint-58000/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cf9efdf73d7ecc9f45ca166bec5b70555182c38338e6de139c6203b8a009fc59
- size 500897101
checkpoint-58000/adapter_model/README.md DELETED
@@ -1,20 +0,0 @@
- ---
- library_name: peft
- ---
- ## Training procedure
-
-
- The following `bitsandbytes` quantization config was used during training:
- - load_in_8bit: False
- - load_in_4bit: True
- - llm_int8_threshold: 6.0
- - llm_int8_skip_modules: None
- - llm_int8_enable_fp32_cpu_offload: False
- - llm_int8_has_fp16_weight: False
- - bnb_4bit_quant_type: nf4
- - bnb_4bit_use_double_quant: True
- - bnb_4bit_compute_dtype: bfloat16
- ### Framework versions
-
-
- - PEFT 0.5.0.dev0
checkpoint-58000/adapter_model/adapter_config.json DELETED
@@ -1,26 +0,0 @@
- {
- "auto_mapping": null,
- "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
- "bias": "none",
- "fan_in_fan_out": null,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16,
- "lora_dropout": 0.05,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 32,
- "revision": null,
- "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
- "v_proj",
- "k_proj",
- "gate_proj",
- "o_proj"
- ],
- "task_type": "CAUSAL_LM"
- }
checkpoint-58000/adapter_model/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cf9efdf73d7ecc9f45ca166bec5b70555182c38338e6de139c6203b8a009fc59
- size 500897101
checkpoint-59000/README.md DELETED
@@ -1,20 +0,0 @@
- ---
- library_name: peft
- ---
- ## Training procedure
-
-
- The following `bitsandbytes` quantization config was used during training:
- - load_in_8bit: False
- - load_in_4bit: True
- - llm_int8_threshold: 6.0
- - llm_int8_skip_modules: None
- - llm_int8_enable_fp32_cpu_offload: False
- - llm_int8_has_fp16_weight: False
- - bnb_4bit_quant_type: nf4
- - bnb_4bit_use_double_quant: True
- - bnb_4bit_compute_dtype: bfloat16
- ### Framework versions
-
-
- - PEFT 0.5.0.dev0
checkpoint-59000/adapter_config.json DELETED
@@ -1,26 +0,0 @@
- {
- "auto_mapping": null,
- "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
- "bias": "none",
- "fan_in_fan_out": null,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "lora_alpha": 16,
- "lora_dropout": 0.05,
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 32,
- "revision": null,
- "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
- "v_proj",
- "k_proj",
- "gate_proj",
- "o_proj"
- ],
- "task_type": "CAUSAL_LM"
- }
checkpoint-59000/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f56b8a333605f03b496496aac3531e5eb50e390d67be06083619275a78de77da
- size 500897101
{checkpoint-56000 → checkpoint-69000}/README.md RENAMED
@@ -5,6 +5,7 @@ library_name: peft


  The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
  - load_in_8bit: False
  - load_in_4bit: True
  - llm_int8_threshold: 6.0
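The quant_method: bitsandbytes line added to each checkpoint README is a field that newer PEFT versions record in the quantization config; the settings themselves are unchanged. As a hedged sketch, the same 4-bit setup expressed with transformers' BitsAndBytesConfig might look like:

```python
# Sketch of the quantization config these READMEs record, assuming a recent
# transformers version that exposes BitsAndBytesConfig.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load_in_4bit: True / load_in_8bit: False
    bnb_4bit_quant_type="nf4",              # bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant=True,         # bnb_4bit_use_double_quant: True
    bnb_4bit_compute_dtype=torch.bfloat16,  # bnb_4bit_compute_dtype: bfloat16
    llm_int8_threshold=6.0,                 # llm_int8_threshold: 6.0
)
```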
{checkpoint-56000/adapter_model → checkpoint-69000}/adapter_config.json RENAMED
@@ -14,12 +14,12 @@
  "r": 32,
  "revision": null,
  "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
+ "gate_proj",
  "v_proj",
+ "down_proj",
  "k_proj",
- "gate_proj",
+ "q_proj",
+ "up_proj",
  "o_proj"
  ],
  "task_type": "CAUSAL_LM"
{checkpoint-56000/adapter_model → checkpoint-69000}/adapter_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2d61b2ab661f17f1b28e07a6ea4c559efd2487b69440f512fbda902147b2007f
+ oid sha256:16127581d1b65765200af747a5c98d27b237b49430e306dfd23a9c3ad6af3b9c
  size 500897101
{checkpoint-57000 → checkpoint-69000/adapter_model}/README.md RENAMED
@@ -5,6 +5,7 @@ library_name: peft


  The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
  - load_in_8bit: False
  - load_in_4bit: True
  - llm_int8_threshold: 6.0
{checkpoint-56000 β†’ checkpoint-69000/adapter_model}/adapter_config.json RENAMED
@@ -14,12 +14,12 @@
  "r": 32,
  "revision": null,
  "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
+ "gate_proj",
  "v_proj",
+ "down_proj",
  "k_proj",
- "gate_proj",
+ "q_proj",
+ "up_proj",
  "o_proj"
  ],
  "task_type": "CAUSAL_LM"
{checkpoint-57000 → checkpoint-69000/adapter_model}/adapter_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9463fbc37a2c37f850b2aa713212bd675cce373b2a226f9fecf647f60157d1a1
+ oid sha256:16127581d1b65765200af747a5c98d27b237b49430e306dfd23a9c3ad6af3b9c
  size 500897101
{checkpoint-57000 → checkpoint-69000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d85e0cce4ea774ece1bba3b083129dd4ea4f075278346655fd271c9663edf7a0
- size 1001723453
+ oid sha256:52478f59ec5c65d4db6d79009fc0c477e003ba9db2b5648781779b6963bc40cb
+ size 1001724605
{checkpoint-58000 → checkpoint-69000}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5923ba7d43395d2ab7a25af40d67f773d9e67e462f9250548814d4e4d1853054
+ oid sha256:b7732edd0ae5999edb700e14bae64e828df5241beb83fbee05815f6c10b73570
  size 14575
{checkpoint-58000 → checkpoint-69000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:eb6d23b542a910d4d880a9ad37544effe8607b091db3f3b955d778af0357176f
+ oid sha256:f0f5690258b17f07cbd583d2e586e1be27217d957aa1adadeb296ee58f808a87
  size 627
{checkpoint-59000 → checkpoint-69000}/trainer_state.json RENAMED
@@ -1,8 +1,9 @@
  {
- "best_metric": 0.4893116354942322,
- "best_model_checkpoint": "./qlora-out/checkpoint-59000",
- "epoch": 2.1997688378509377,
- "global_step": 59000,
+ "best_metric": 0.4789520502090454,
+ "best_model_checkpoint": "./qlora-out/checkpoint-69000",
+ "epoch": 2.5726110137578764,
+ "eval_steps": 500,
+ "global_step": 69000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -4018,11 +4019,293 @@
  "eval_samples_per_second": 0.436,
  "eval_steps_per_second": 0.436,
  "step": 59000
+ },
+ { "epoch": 2.22, "learning_rate": 3.167411635594364e-05, "loss": 0.3867, "step": 59500 },
+ { "epoch": 2.22, "eval_loss": 0.48985520005226135, "eval_runtime": 1240.4608, "eval_samples_per_second": 0.437, "eval_steps_per_second": 0.437, "step": 59500 },
+ { "epoch": 2.24, "learning_rate": 3.0261604379828834e-05, "loss": 0.3736, "step": 60000 },
+ { "epoch": 2.24, "eval_loss": 0.489548921585083, "eval_runtime": 1234.7527, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 60000 },
+ { "epoch": 2.26, "learning_rate": 2.887567598106955e-05, "loss": 0.361, "step": 60500 },
+ { "epoch": 2.26, "eval_loss": 0.4885287582874298, "eval_runtime": 1231.4045, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 60500 },
+ { "epoch": 2.27, "learning_rate": 2.7516859461678857e-05, "loss": 0.3778, "step": 61000 },
+ { "epoch": 2.27, "eval_loss": 0.4883672893047333, "eval_runtime": 1235.8497, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 61000 },
+ { "epoch": 2.29, "learning_rate": 2.618567278889328e-05, "loss": 0.3791, "step": 61500 },
+ { "epoch": 2.29, "eval_loss": 0.4874744415283203, "eval_runtime": 1231.8195, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 61500 },
+ { "epoch": 2.31, "learning_rate": 2.4882623397728655e-05, "loss": 0.3705, "step": 62000 },
+ { "epoch": 2.31, "eval_loss": 0.486933171749115, "eval_runtime": 1227.5583, "eval_samples_per_second": 0.442, "eval_steps_per_second": 0.442, "step": 62000 },
+ { "epoch": 2.33, "learning_rate": 2.3608207997551255e-05, "loss": 0.3698, "step": 62500 },
+ { "epoch": 2.33, "eval_loss": 0.48592954874038696, "eval_runtime": 1282.2531, "eval_samples_per_second": 0.423, "eval_steps_per_second": 0.423, "step": 62500 },
+ { "epoch": 2.35, "learning_rate": 2.2362912382736857e-05, "loss": 0.381, "step": 63000 },
+ { "epoch": 2.35, "eval_loss": 0.4852922856807709, "eval_runtime": 1229.4457, "eval_samples_per_second": 0.441, "eval_steps_per_second": 0.441, "step": 63000 },
+ { "epoch": 2.37, "learning_rate": 2.1147211247491084e-05, "loss": 0.3728, "step": 63500 },
+ { "epoch": 2.37, "eval_loss": 0.484967440366745, "eval_runtime": 1296.2845, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 63500 },
+ { "epoch": 2.39, "learning_rate": 1.9961568004900565e-05, "loss": 0.3695, "step": 64000 },
+ { "epoch": 2.39, "eval_loss": 0.4844016432762146, "eval_runtime": 1317.5418, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 64000 },
+ { "epoch": 2.4, "learning_rate": 1.8806434610284497e-05, "loss": 0.3682, "step": 64500 },
+ { "epoch": 2.4, "eval_loss": 0.4838670790195465, "eval_runtime": 1337.5922, "eval_samples_per_second": 0.405, "eval_steps_per_second": 0.405, "step": 64500 },
+ { "epoch": 2.42, "learning_rate": 1.768225138891393e-05, "loss": 0.3594, "step": 65000 },
+ { "epoch": 2.42, "eval_loss": 0.48305046558380127, "eval_runtime": 1317.2888, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 65000 },
+ { "epoch": 2.44, "learning_rate": 1.6589446868164037e-05, "loss": 0.367, "step": 65500 },
+ { "epoch": 2.44, "eval_loss": 0.48225167393684387, "eval_runtime": 1315.9763, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 65500 },
+ { "epoch": 2.46, "learning_rate": 1.552843761416395e-05, "loss": 0.3781, "step": 66000 },
+ { "epoch": 2.46, "eval_loss": 0.48182958364486694, "eval_runtime": 1298.0711, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66000 },
+ { "epoch": 2.48, "learning_rate": 1.4499628073005733e-05, "loss": 0.3632, "step": 66500 },
+ { "epoch": 2.48, "eval_loss": 0.48136985301971436, "eval_runtime": 1295.6256, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66500 },
+ { "epoch": 2.5, "learning_rate": 1.350341041657378e-05, "loss": 0.3707, "step": 67000 },
+ { "epoch": 2.5, "eval_loss": 0.48081424832344055, "eval_runtime": 1297.8801, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 67000 },
+ { "epoch": 2.52, "learning_rate": 1.2540164393052622e-05, "loss": 0.3657, "step": 67500 },
+ { "epoch": 2.52, "eval_loss": 0.48031187057495117, "eval_runtime": 1299.2471, "eval_samples_per_second": 0.417, "eval_steps_per_second": 0.417, "step": 67500 },
+ { "epoch": 2.54, "learning_rate": 1.1610257182170914e-05, "loss": 0.3742, "step": 68000 },
+ { "epoch": 2.54, "eval_loss": 0.479922354221344, "eval_runtime": 1275.2567, "eval_samples_per_second": 0.425, "eval_steps_per_second": 0.425, "step": 68000 },
+ { "epoch": 2.55, "learning_rate": 1.0714043255236094e-05, "loss": 0.3761, "step": 68500 },
+ { "epoch": 2.55, "eval_loss": 0.4795922338962555, "eval_runtime": 1321.5276, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.41, "step": 68500 },
+ { "epoch": 2.57, "learning_rate": 9.851864240013509e-06, "loss": 0.3754, "step": 69000 },
+ { "epoch": 2.57, "eval_loss": 0.4789520502090454, "eval_runtime": 1345.4528, "eval_samples_per_second": 0.403, "eval_steps_per_second": 0.403, "step": 69000
  }
  ],
+ "logging_steps": 500,
  "max_steps": 80463,
  "num_train_epochs": 3,
- "total_flos": 1.6542001385066742e+19,
+ "save_steps": 500,
+ "total_flos": 1.9364073941589443e+19,
  "trial_name": null,
  "trial_params": null
  }
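trainer_state.json accumulates the full training log, which is why each later checkpoint's diff only appends entries and bumps the header fields; here best_metric improves from 0.4893 at step 59000 to 0.4790 at step 69000. A sketch of pulling the eval-loss curve back out of such a file (the path is assumed):

```python
# Sketch: extract the eval-loss curve from a trainer_state.json like the one
# diffed above. log_history mixes train entries (loss/learning_rate) and
# eval entries (eval_loss etc.), distinguishable by their keys.
import json

with open("checkpoint-69000/trainer_state.json") as f:
    state = json.load(f)

evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print("best:", state["best_metric"], "at", state["best_model_checkpoint"])
for step, loss in evals[-5:]:
    print(step, loss)
```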
{checkpoint-56000 → checkpoint-69000}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f8ca8c55b410908f1a6fb4d78d55fe6aad82bbca76ec8021e18981496f18fa70
- size 4027
+ oid sha256:35750ce2c97e67db338d1121db50269062def2ea29de48747dfd43b7a072ee79
+ size 4155
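training_args.bin is a pickled TrainingArguments object; the size change (4027 to 4155 bytes) suggests the serialized arguments gained fields, consistent with the new eval_steps/logging_steps/save_steps keys appearing in trainer_state.json (plausibly a transformers upgrade mid-run, though the commit does not say so). A hedged sketch of inspecting it:

```python
# Sketch: unpickle the TrainingArguments stored in training_args.bin.
# weights_only=False is needed on newer torch because this is an arbitrary
# Python object -- only load files you trust.
import torch

args = torch.load("checkpoint-69000/training_args.bin", weights_only=False)
print(args.learning_rate, args.logging_steps, args.save_steps)
```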
{checkpoint-57000/adapter_model → checkpoint-69500}/README.md RENAMED
@@ -5,6 +5,7 @@ library_name: peft


  The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
  - load_in_8bit: False
  - load_in_4bit: True
  - llm_int8_threshold: 6.0
{checkpoint-57000 → checkpoint-69500}/adapter_config.json RENAMED
@@ -14,12 +14,12 @@
  "r": 32,
  "revision": null,
  "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
+ "gate_proj",
  "v_proj",
+ "down_proj",
  "k_proj",
- "gate_proj",
+ "q_proj",
+ "up_proj",
  "o_proj"
  ],
  "task_type": "CAUSAL_LM"
{checkpoint-56000 → checkpoint-69500}/adapter_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2d61b2ab661f17f1b28e07a6ea4c559efd2487b69440f512fbda902147b2007f
+ oid sha256:3d536051f2a1ab536e6e716808efa406b8fc4bc641ebcf6102a663de9eab5ffe
  size 500897101
{checkpoint-56000 → checkpoint-69500}/adapter_model/README.md RENAMED
@@ -5,6 +5,7 @@ library_name: peft


  The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
  - load_in_8bit: False
  - load_in_4bit: True
  - llm_int8_threshold: 6.0
{checkpoint-57000 → checkpoint-69500}/adapter_model/adapter_config.json RENAMED
@@ -14,12 +14,12 @@
  "r": 32,
  "revision": null,
  "target_modules": [
- "up_proj",
- "down_proj",
- "q_proj",
+ "gate_proj",
  "v_proj",
+ "down_proj",
  "k_proj",
- "gate_proj",
+ "q_proj",
+ "up_proj",
  "o_proj"
  ],
  "task_type": "CAUSAL_LM"
{checkpoint-57000 → checkpoint-69500}/adapter_model/adapter_model.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9463fbc37a2c37f850b2aa713212bd675cce373b2a226f9fecf647f60157d1a1
+ oid sha256:3d536051f2a1ab536e6e716808efa406b8fc4bc641ebcf6102a663de9eab5ffe
  size 500897101
{checkpoint-56000 → checkpoint-69500}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e0d0299416431a6687f29eb725bd8536e5bc5512ff27981755266d125bd960dc
- size 1001723453
+ oid sha256:a0df421a10c3784a131b0ab37e1485ed063b6fa56024cc56104f9dbaad09ebe1
+ size 1001724605
{checkpoint-59000 → checkpoint-69500}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8bd334de4d3525ea70c0977c8fe7956563ce9e7d3af12dc2b9fcbbc68894cb2d
+ oid sha256:c9ee221e71303f97217b0d58a1364dcc9e4c1fac4ba0baf829b9e79b7ae1680b
  size 14575
{checkpoint-56000 → checkpoint-69500}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:40f07a3bf2b8b7e85bd7ec32b459bd8eba34e3ffd70129884ee8cac79708a84f
+ oid sha256:bca69d6e74edb4d1fa3e9c45efbdb18d22e7412cb25b7cb947ef97719376c1f2
  size 627
{checkpoint-58000 → checkpoint-69500}/trainer_state.json RENAMED
@@ -1,8 +1,9 @@
  {
- "best_metric": 0.4916069805622101,
- "best_model_checkpoint": "./qlora-out/checkpoint-58000",
- "epoch": 2.1624846202602437,
- "global_step": 58000,
+ "best_metric": 0.47866225242614746,
+ "best_model_checkpoint": "./qlora-out/checkpoint-69500",
+ "epoch": 2.591253122553223,
+ "eval_steps": 500,
+ "global_step": 69500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3950,11 +3951,375 @@
  "eval_samples_per_second": 0.436,
  "eval_steps_per_second": 0.436,
  "step": 58000
+ },
+ { "epoch": 2.17, "learning_rate": 3.576595345767464e-05, "loss": 0.3759, "step": 58100 },
+ { "epoch": 2.17, "learning_rate": 3.5467166397551524e-05, "loss": 0.3987, "step": 58200 },
+ { "epoch": 2.17, "learning_rate": 3.5169363338208094e-05, "loss": 0.3809, "step": 58300 },
+ { "epoch": 2.18, "learning_rate": 3.4872548820564455e-05, "loss": 0.3851, "step": 58400 },
+ { "epoch": 2.18, "learning_rate": 3.457672737046737e-05, "loss": 0.3832, "step": 58500 },
+ { "epoch": 2.18, "learning_rate": 3.42819034986213e-05, "loss": 0.3923, "step": 58600 },
+ { "epoch": 2.19, "learning_rate": 3.398808170051951e-05, "loss": 0.3609, "step": 58700 },
+ { "epoch": 2.19, "learning_rate": 3.369526645637556e-05, "loss": 0.3538, "step": 58800 },
+ { "epoch": 2.2, "learning_rate": 3.3403462231055107e-05, "loss": 0.3941, "step": 58900 },
+ { "epoch": 2.2, "learning_rate": 3.3112673474007584e-05, "loss": 0.3984, "step": 59000 },
+ { "epoch": 2.2, "eval_loss": 0.4893116354942322, "eval_runtime": 1243.7748, "eval_samples_per_second": 0.436, "eval_steps_per_second": 0.436, "step": 59000 },
+ { "epoch": 2.22, "learning_rate": 3.167411635594364e-05, "loss": 0.3867, "step": 59500 },
+ { "epoch": 2.22, "eval_loss": 0.48985520005226135, "eval_runtime": 1240.4608, "eval_samples_per_second": 0.437, "eval_steps_per_second": 0.437, "step": 59500 },
+ { "epoch": 2.24, "learning_rate": 3.0261604379828834e-05, "loss": 0.3736, "step": 60000 },
+ { "epoch": 2.24, "eval_loss": 0.489548921585083, "eval_runtime": 1234.7527, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 60000 },
+ { "epoch": 2.26, "learning_rate": 2.887567598106955e-05, "loss": 0.361, "step": 60500 },
+ { "epoch": 2.26, "eval_loss": 0.4885287582874298, "eval_runtime": 1231.4045, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 60500 },
+ { "epoch": 2.27, "learning_rate": 2.7516859461678857e-05, "loss": 0.3778, "step": 61000 },
+ { "epoch": 2.27, "eval_loss": 0.4883672893047333, "eval_runtime": 1235.8497, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 61000 },
+ { "epoch": 2.29, "learning_rate": 2.618567278889328e-05, "loss": 0.3791, "step": 61500 },
+ { "epoch": 2.29, "eval_loss": 0.4874744415283203, "eval_runtime": 1231.8195, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 61500 },
+ { "epoch": 2.31, "learning_rate": 2.4882623397728655e-05, "loss": 0.3705, "step": 62000 },
+ { "epoch": 2.31, "eval_loss": 0.486933171749115, "eval_runtime": 1227.5583, "eval_samples_per_second": 0.442, "eval_steps_per_second": 0.442, "step": 62000 },
+ { "epoch": 2.33, "learning_rate": 2.3608207997551255e-05, "loss": 0.3698, "step": 62500 },
+ { "epoch": 2.33, "eval_loss": 0.48592954874038696, "eval_runtime": 1282.2531, "eval_samples_per_second": 0.423, "eval_steps_per_second": 0.423, "step": 62500 },
+ { "epoch": 2.35, "learning_rate": 2.2362912382736857e-05, "loss": 0.381, "step": 63000 },
+ { "epoch": 2.35, "eval_loss": 0.4852922856807709, "eval_runtime": 1229.4457, "eval_samples_per_second": 0.441, "eval_steps_per_second": 0.441, "step": 63000 },
+ { "epoch": 2.37, "learning_rate": 2.1147211247491084e-05, "loss": 0.3728, "step": 63500 },
+ { "epoch": 2.37, "eval_loss": 0.484967440366745, "eval_runtime": 1296.2845, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 63500 },
+ { "epoch": 2.39, "learning_rate": 1.9961568004900565e-05, "loss": 0.3695, "step": 64000 },
+ { "epoch": 2.39, "eval_loss": 0.4844016432762146, "eval_runtime": 1317.5418, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 64000 },
+ { "epoch": 2.4, "learning_rate": 1.8806434610284497e-05, "loss": 0.3682, "step": 64500 },
+ { "epoch": 2.4, "eval_loss": 0.4838670790195465, "eval_runtime": 1337.5922, "eval_samples_per_second": 0.405, "eval_steps_per_second": 0.405, "step": 64500 },
+ { "epoch": 2.42, "learning_rate": 1.768225138891393e-05, "loss": 0.3594, "step": 65000 },
+ { "epoch": 2.42, "eval_loss": 0.48305046558380127, "eval_runtime": 1317.2888, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 65000 },
+ { "epoch": 2.44, "learning_rate": 1.6589446868164037e-05, "loss": 0.367, "step": 65500 },
+ { "epoch": 2.44, "eval_loss": 0.48225167393684387, "eval_runtime": 1315.9763, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 65500 },
+ { "epoch": 2.46, "learning_rate": 1.552843761416395e-05, "loss": 0.3781, "step": 66000 },
+ { "epoch": 2.46, "eval_loss": 0.48182958364486694, "eval_runtime": 1298.0711, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66000 },
+ { "epoch": 2.48, "learning_rate": 1.4499628073005733e-05, "loss": 0.3632, "step": 66500 },
+ { "epoch": 2.48, "eval_loss": 0.48136985301971436, "eval_runtime": 1295.6256, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66500 },
+ { "epoch": 2.5, "learning_rate": 1.350341041657378e-05, "loss": 0.3707, "step": 67000 },
+ { "epoch": 2.5, "eval_loss": 0.48081424832344055, "eval_runtime": 1297.8801, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 67000 },
+ { "epoch": 2.52, "learning_rate": 1.2540164393052622e-05, "loss": 0.3657, "step": 67500 },
+ { "epoch": 2.52, "eval_loss": 0.48031187057495117, "eval_runtime": 1299.2471, "eval_samples_per_second": 0.417, "eval_steps_per_second": 0.417, "step": 67500 },
+ { "epoch": 2.54, "learning_rate": 1.1610257182170914e-05, "loss": 0.3742, "step": 68000 },
+ { "epoch": 2.54, "eval_loss": 0.479922354221344, "eval_runtime": 1275.2567, "eval_samples_per_second": 0.425, "eval_steps_per_second": 0.425, "step": 68000 },
+ { "epoch": 2.55, "learning_rate": 1.0714043255236094e-05, "loss": 0.3761, "step": 68500 },
+ { "epoch": 2.55, "eval_loss": 0.4795922338962555, "eval_runtime": 1321.5276, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.41, "step": 68500 },
+ { "epoch": 2.57, "learning_rate": 9.851864240013509e-06, "loss": 0.3754, "step": 69000 },
+ { "epoch": 2.57, "eval_loss": 0.4789520502090454, "eval_runtime": 1345.4528, "eval_samples_per_second": 0.403, "eval_steps_per_second": 0.403, "step": 69000 },
+ { "epoch": 2.59, "learning_rate": 9.024048790501272e-06, "loss": 0.3594, "step": 69500 },
+ { "epoch": 2.59, "eval_loss": 0.47866225242614746, "eval_runtime": 1316.9883, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 69500
  }
  ],
+ "logging_steps": 500,
  "max_steps": 80463,
  "num_train_epochs": 3,
- "total_flos": 1.6261229153876214e+19,
+ "save_steps": 500,
+ "total_flos": 1.950603151563399e+19,
  "trial_name": null,
  "trial_params": null
  }
{checkpoint-58000 → checkpoint-69500}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f8ca8c55b410908f1a6fb4d78d55fe6aad82bbca76ec8021e18981496f18fa70
- size 4027
+ oid sha256:35750ce2c97e67db338d1121db50269062def2ea29de48747dfd43b7a072ee79
+ size 4155
checkpoint-70000/README.md ADDED
@@ -0,0 +1,21 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+
+ - PEFT 0.5.0.dev0
checkpoint-70000/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "auto_mapping": null,
+ "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "v_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
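Across this commit the target_modules change in the updated configs is only a reordering: old and new lists name the same seven projection modules, and PEFT treats the list as a set, so the adapter shape is unchanged; the new weight oids come from the additional training steps. A minimal sketch of the equivalent LoRA configuration with peft's LoraConfig (module order chosen arbitrarily):

```python
# Sketch of the LoRA configuration these adapter_config.json files describe,
# assuming peft's LoraConfig API; target_modules order is not significant.
from peft import LoraConfig

lora_config = LoraConfig(
    r=32,                 # "r": 32
    lora_alpha=16,        # "lora_alpha": 16
    lora_dropout=0.05,    # "lora_dropout": 0.05
    bias="none",          # "bias": "none"
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
```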
checkpoint-70000/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35b27172603bfaa42af020910d0f3a0724656396738e74f39eebef1c4c53cd6c
+ size 500897101
checkpoint-70000/adapter_model/README.md ADDED
@@ -0,0 +1,21 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+
+ - PEFT 0.5.0.dev0
checkpoint-70000/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "auto_mapping": null,
+ "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "v_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
checkpoint-70000/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35b27172603bfaa42af020910d0f3a0724656396738e74f39eebef1c4c53cd6c
+ size 500897101
{checkpoint-58000 → checkpoint-70000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:67f192e31625a5f9d71aaeb75826e3461458c994c58bc8d3d5b3b59fa56efc4b
- size 1001723453
+ oid sha256:3757834dca752ceb36448c74c65b6c698a3cf7eac3b443be1d20520a1ef75c80
+ size 1001724605
{checkpoint-57000 → checkpoint-70000}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e7b0ae395ccd0b4875fa94f8cd4ee3274662f44279f744979610604a15d72da0
+ oid sha256:e3b4a721a0714cca4311a027981bf55d9c240a69a7f46c912f368eb795c5d17f
  size 14575
{checkpoint-57000 → checkpoint-70000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7c18b73ff8e0ca9bda5d92134e841aafa154377e996a5dd3b1b1a3a0b329e74e
+ oid sha256:d978803312071ed04341fcce57866c271d97c1ced7225c7be19f70453e4d9836
  size 627
{checkpoint-57000 → checkpoint-70000}/trainer_state.json RENAMED
@@ -1,8 +1,9 @@
  {
- "best_metric": 0.49361398816108704,
- "best_model_checkpoint": "./qlora-out/checkpoint-57000",
- "epoch": 2.12520040266955,
- "global_step": 57000,
+ "best_metric": 0.47838443517684937,
+ "best_model_checkpoint": "./qlora-out/checkpoint-70000",
+ "epoch": 2.6098952313485704,
+ "eval_steps": 500,
+ "global_step": 70000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3882,11 +3883,457 @@
  "eval_samples_per_second": 0.449,
  "eval_steps_per_second": 0.449,
  "step": 57000
+ },
+ { "epoch": 2.13, "learning_rate": 3.8806934461220826e-05, "loss": 0.3512, "step": 57100 },
+ { "epoch": 2.13, "learning_rate": 3.8498560410244546e-05, "loss": 0.3715, "step": 57200 },
+ { "epoch": 2.14, "learning_rate": 3.819112413715791e-05, "loss": 0.3803, "step": 57300 },
+ { "epoch": 2.14, "learning_rate": 3.7884630329768875e-05, "loss": 0.3785, "step": 57400 },
+ { "epoch": 2.14, "learning_rate": 3.757908366151463e-05, "loss": 0.3626, "step": 57500 },
+ { "epoch": 2.15, "learning_rate": 3.72744887913904e-05, "loss": 0.3981, "step": 57600 },
+ { "epoch": 2.15, "learning_rate": 3.697085036387822e-05, "loss": 0.3918, "step": 57700 },
+ { "epoch": 2.16, "learning_rate": 3.6668173008876324e-05, "loss": 0.3876, "step": 57800 },
+ { "epoch": 2.16, "learning_rate": 3.6366461341628396e-05, "loss": 0.3878, "step": 57900 },
+ { "epoch": 2.16, "learning_rate": 3.606571996265321e-05, "loss": 0.3674, "step": 58000 },
+ { "epoch": 2.16, "eval_loss": 0.4916069805622101, "eval_runtime": 1244.109, "eval_samples_per_second": 0.436, "eval_steps_per_second": 0.436, "step": 58000 },
+ { "epoch": 2.17, "learning_rate": 3.576595345767464e-05, "loss": 0.3759, "step": 58100 },
+ { "epoch": 2.17, "learning_rate": 3.5467166397551524e-05, "loss": 0.3987, "step": 58200 },
+ { "epoch": 2.17, "learning_rate": 3.5169363338208094e-05, "loss": 0.3809, "step": 58300 },
+ { "epoch": 2.18, "learning_rate": 3.4872548820564455e-05, "loss": 0.3851, "step": 58400 },
+ { "epoch": 2.18, "learning_rate": 3.457672737046737e-05, "loss": 0.3832, "step": 58500 },
+ { "epoch": 2.18, "learning_rate": 3.42819034986213e-05, "loss": 0.3923, "step": 58600 },
+ { "epoch": 2.19, "learning_rate": 3.398808170051951e-05, "loss": 0.3609, "step": 58700 },
+ { "epoch": 2.19, "learning_rate": 3.369526645637556e-05, "loss": 0.3538, "step": 58800 },
+ { "epoch": 2.2, "learning_rate": 3.3403462231055107e-05, "loss": 0.3941, "step": 58900 },
+ { "epoch": 2.2, "learning_rate": 3.3112673474007584e-05, "loss": 0.3984, "step": 59000 },
+ { "epoch": 2.2, "eval_loss": 0.4893116354942322, "eval_runtime": 1243.7748, "eval_samples_per_second": 0.436, "eval_steps_per_second": 0.436, "step": 59000 },
+ { "epoch": 2.22, "learning_rate": 3.167411635594364e-05, "loss": 0.3867, "step": 59500 },
+ { "epoch": 2.22, "eval_loss": 0.48985520005226135, "eval_runtime": 1240.4608, "eval_samples_per_second": 0.437, "eval_steps_per_second": 0.437, "step": 59500 },
+ { "epoch": 2.24, "learning_rate": 3.0261604379828834e-05, "loss": 0.3736, "step": 60000 },
+ { "epoch": 2.24, "eval_loss": 0.489548921585083, "eval_runtime": 1234.7527, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 60000 },
+ { "epoch": 2.26, "learning_rate": 2.887567598106955e-05, "loss": 0.361, "step": 60500 },
+ { "epoch": 2.26, "eval_loss": 0.4885287582874298, "eval_runtime": 1231.4045, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 60500 },
+ { "epoch": 2.27, "learning_rate": 2.7516859461678857e-05, "loss": 0.3778, "step": 61000 },
+ { "epoch": 2.27, "eval_loss": 0.4883672893047333, "eval_runtime": 1235.8497, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 61000 },
+ { "epoch": 2.29, "learning_rate": 2.618567278889328e-05, "loss": 0.3791, "step": 61500 },
+ { "epoch": 2.29, "eval_loss": 0.4874744415283203, "eval_runtime": 1231.8195, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 61500 },
+ { "epoch": 2.31, "learning_rate": 2.4882623397728655e-05, "loss": 0.3705, "step": 62000 },
+ { "epoch": 2.31, "eval_loss": 0.486933171749115, "eval_runtime": 1227.5583, "eval_samples_per_second": 0.442, "eval_steps_per_second": 0.442, "step": 62000 },
+ { "epoch": 2.33, "learning_rate": 2.3608207997551255e-05, "loss": 0.3698, "step": 62500 },
+ { "epoch": 2.33, "eval_loss": 0.48592954874038696, "eval_runtime": 1282.2531, "eval_samples_per_second": 0.423, "eval_steps_per_second": 0.423, "step": 62500 },
+ { "epoch": 2.35, "learning_rate": 2.2362912382736857e-05, "loss": 0.381, "step": 63000 },
+ { "epoch": 2.35, "eval_loss": 0.4852922856807709, "eval_runtime": 1229.4457, "eval_samples_per_second": 0.441, "eval_steps_per_second": 0.441, "step": 63000 },
+ { "epoch": 2.37, "learning_rate": 2.1147211247491084e-05, "loss": 0.3728, "step": 63500 },
+ { "epoch": 2.37, "eval_loss": 0.484967440366745, "eval_runtime": 1296.2845, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 63500 },
+ { "epoch": 2.39, "learning_rate": 1.9961568004900565e-05, "loss": 0.3695, "step": 64000 },
+ { "epoch": 2.39, "eval_loss": 0.4844016432762146, "eval_runtime": 1317.5418, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 64000 },
+ { "epoch": 2.4, "learning_rate": 1.8806434610284497e-05, "loss": 0.3682, "step": 64500 },
+ { "epoch": 2.4, "eval_loss": 0.4838670790195465, "eval_runtime": 1337.5922, "eval_samples_per_second": 0.405, "eval_steps_per_second": 0.405, "step": 64500 },
+ { "epoch": 2.42, "learning_rate": 1.768225138891393e-05, "loss": 0.3594, "step": 65000 },
+ { "epoch": 2.42, "eval_loss": 0.48305046558380127, "eval_runtime": 1317.2888, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 65000 },
+ { "epoch": 2.44, "learning_rate": 1.6589446868164037e-05, "loss": 0.367, "step": 65500 },
+ { "epoch": 2.44, "eval_loss": 0.48225167393684387, "eval_runtime": 1315.9763, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 65500 },
+ { "epoch": 2.46, "learning_rate": 1.552843761416395e-05, "loss": 0.3781, "step": 66000 },
+ { "epoch": 2.46, "eval_loss": 0.48182958364486694, "eval_runtime": 1298.0711, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66000 },
+ { "epoch": 2.48, "learning_rate": 1.4499628073005733e-05, "loss": 0.3632, "step": 66500 },
+ { "epoch": 2.48, "eval_loss": 0.48136985301971436, "eval_runtime": 1295.6256, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66500 },
+ { "epoch": 2.5, "learning_rate": 1.350341041657378e-05, "loss": 0.3707, "step": 67000 },
+ { "epoch": 2.5, "eval_loss": 0.48081424832344055, "eval_runtime": 1297.8801, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 67000 },
+ { "epoch": 2.52, "learning_rate": 1.2540164393052622e-05, "loss": 0.3657, "step": 67500 },
+ { "epoch": 2.52, "eval_loss": 0.48031187057495117, "eval_runtime": 1299.2471, "eval_samples_per_second": 0.417, "eval_steps_per_second": 0.417, "step": 67500 },
+ { "epoch": 2.54, "learning_rate": 1.1610257182170914e-05, "loss": 0.3742, "step": 68000 },
+ { "epoch": 2.54, "eval_loss": 0.479922354221344, "eval_runtime": 1275.2567, "eval_samples_per_second": 0.425, "eval_steps_per_second": 0.425, "step": 68000 },
+ { "epoch": 2.55, "learning_rate": 1.0714043255236094e-05, "loss": 0.3761, "step": 68500 },
+ { "epoch": 2.55, "eval_loss": 0.4795922338962555, "eval_runtime": 1321.5276, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.41, "step": 68500 },
+ { "epoch": 2.57, "learning_rate": 9.851864240013509e-06, "loss": 0.3754, "step": 69000 },
+ { "epoch": 2.57, "eval_loss": 0.4789520502090454, "eval_runtime": 1345.4528, "eval_samples_per_second": 0.403, "eval_steps_per_second": 0.403, "step": 69000 },
+ { "epoch": 2.59, "learning_rate": 9.024048790501272e-06, "loss": 0.3594, "step": 69500 },
+ { "epoch": 2.59, "eval_loss": 0.47866225242614746, "eval_runtime": 1316.9883, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 69500 },
+ { "epoch": 2.61, "learning_rate": 8.230912461650797e-06, "loss": 0.3601, "step": 70000 },
+ { "epoch": 2.61, "eval_loss": 0.47838443517684937, "eval_runtime": 1306.7325, "eval_samples_per_second": 0.415, "eval_steps_per_second": 0.415, "step": 70000
  }
  ],
+ "logging_steps": 500,
  "max_steps": 80463,
  "num_train_epochs": 3,
- "total_flos": 1.5981607298407956e+19,
+ "save_steps": 500,
+ "total_flos": 1.96476655962565e+19,
  "trial_name": null,
  "trial_params": null
  }
{checkpoint-59000 → checkpoint-70000}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f8ca8c55b410908f1a6fb4d78d55fe6aad82bbca76ec8021e18981496f18fa70
- size 4027
+ oid sha256:35750ce2c97e67db338d1121db50269062def2ea29de48747dfd43b7a072ee79
+ size 4155
checkpoint-70500/README.md ADDED
@@ -0,0 +1,21 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+
+ - PEFT 0.5.0.dev0
checkpoint-70500/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "auto_mapping": null,
+ "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "v_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
checkpoint-70500/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a26259b6c7f10eacd37169a51779a24aa9d6a76d8fdef027422bdcbf2557c2f
+ size 500897101
checkpoint-70500/adapter_model/README.md ADDED
@@ -0,0 +1,21 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+ ### Framework versions
+
+
+ - PEFT 0.5.0.dev0
checkpoint-70500/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "auto_mapping": null,
+ "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "v_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "up_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
checkpoint-70500/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a26259b6c7f10eacd37169a51779a24aa9d6a76d8fdef027422bdcbf2557c2f
+ size 500897101
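To use one of these checkpoints, the LoRA adapter is loaded on top of the base model recorded in adapter_config.json. A minimal sketch, assuming the base model path is available locally and loading it 4-bit as during training:

```python
# Sketch: load the checkpoint-70500 adapter onto its base model with peft.
# Paths are taken from adapter_config.json; adjust to your local layout.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "checkpoint-70500")
```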