Tristan committed on
Commit
39872b4
1 Parent(s): 8e5853d

Training in progress, epoch 1

Browse files
eval_job_output.txt CHANGED
@@ -1,4 +1,4 @@
1
- slurm submission log: 2024-05-09 15:03:32.459554
2
  created following sbatch script:
3
 
4
  ###############################
@@ -7,13 +7,13 @@ created following sbatch script:
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
- #SBATCH --dependency=afterok:7592317
11
  #SBATCH --gres=gpu:1
12
- #SBATCH --job-name=tthrush-job-964700
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx2
15
  #SBATCH --open-mode=append
16
- #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_xnli_en/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
@@ -24,7 +24,7 @@ created following sbatch script:
24
  cd .
25
 
26
  # launch commands
27
- srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_xnli_en,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_xnli_en/perf'
28
 
29
  ###############################
30
 
@@ -34,7 +34,133 @@ submission to slurm complete!
34
  ###############################
35
  slurm submission output
36
 
37
- Submitted batch job 7592318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
 
 
1
+ slurm submission log: 2024-05-09 22:15:09.685549
2
  created following sbatch script:
3
 
4
  ###############################
 
7
 
8
  #SBATCH --account=nlp
9
  #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:7593066
11
  #SBATCH --gres=gpu:1
12
+ #SBATCH --job-name=tthrush-job-4826856
13
  #SBATCH --mem=60G
14
  #SBATCH --nodelist=sphinx2
15
  #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/eval_job_output.txt
17
  #SBATCH --partition=sphinx
18
  #SBATCH --time=14-0
19
 
 
24
  cd .
25
 
26
  # launch commands
27
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/perf'
28
 
29
  ###############################
30
 
 
34
  ###############################
35
  slurm submission output
36
 
37
+ Submitted batch job 7593067
38
+
39
+
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-09 23:03:12.596676
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7593144
53
+ #SBATCH --gres=gpu:1
54
+ #SBATCH --job-name=tthrush-job-1449593
55
+ #SBATCH --mem=60G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/eval_job_output.txt
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/perf'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7593145
80
+
81
+
82
+
83
+ ###############################
84
+
85
+ slurm submission log: 2024-05-10 08:21:52.650420
86
+ created following sbatch script:
87
+
88
+ ###############################
89
+
90
+ #!/bin/bash
91
+
92
+ #SBATCH --account=nlp
93
+ #SBATCH --cpus-per-task=16
94
+ #SBATCH --dependency=afterok:7593605
95
+ #SBATCH --gres=gpu:1
96
+ #SBATCH --job-name=tthrush-job-1511651
97
+ #SBATCH --mem=60G
98
+ #SBATCH --nodelist=sphinx1
99
+ #SBATCH --open-mode=append
100
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/eval_job_output.txt
101
+ #SBATCH --partition=sphinx
102
+ #SBATCH --time=14-0
103
+
104
+ # activate your desired anaconda environment
105
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
106
+
107
+ # cd to working directory
108
+ cd .
109
+
110
+ # launch commands
111
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/perf'
112
+
113
+ ###############################
114
+
115
+ submission to slurm complete!
116
+
117
+
118
+ ###############################
119
+ slurm submission output
120
+
121
+ Submitted batch job 7593606
122
+
123
+
124
+
125
+ ###############################
126
+
127
+ slurm submission log: 2024-05-10 08:23:18.600174
128
+ created following sbatch script:
129
+
130
+ ###############################
131
+
132
+ #!/bin/bash
133
+
134
+ #SBATCH --account=nlp
135
+ #SBATCH --cpus-per-task=16
136
+ #SBATCH --dependency=afterok:7593618
137
+ #SBATCH --gres=gpu:1
138
+ #SBATCH --job-name=tthrush-job-2660082
139
+ #SBATCH --mem=60G
140
+ #SBATCH --nodelist=sphinx2
141
+ #SBATCH --open-mode=append
142
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/eval_job_output.txt
143
+ #SBATCH --partition=sphinx
144
+ #SBATCH --time=14-0
145
+
146
+ # activate your desired anaconda environment
147
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
148
+
149
+ # cd to working directory
150
+ cd .
151
+
152
+ # launch commands
153
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/perf'
154
+
155
+ ###############################
156
+
157
+ submission to slurm complete!
158
+
159
+
160
+ ###############################
161
+ slurm submission output
162
+
163
+ Submitted batch job 7593619
164
 
165
 
166
 
logs/events.out.tfevents.1715378050.sphinx2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:367b56a7197e3e691141f03054206d000081646ff06bfbe1388b784a99ac7de9
3
+ size 11521
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b273ff2a9804e3488f6f847e84b099718af578ca28877329de9c2dc7acf7293
3
  size 281715176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75ce8b0dffc07c7350558ed3e247117b68570fc0d7d717b2f8906caa9f00f00
3
  size 281715176
train_job_o ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ slurm submission log: 2024-05-09 22:15:09.451748
2
+ created following sbatch script:
3
+
4
+ ###############################
5
+
6
+ #!/bin/bash
7
+
8
+ #SBATCH --account=nlp
9
+ #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:7593065
11
+ #SBATCH --gres=gpu:2
12
+ #SBATCH --job-name=tthrush-job-3358034
13
+ #SBATCH --mem=400G
14
+ #SBATCH --nodelist=sphinx2
15
+ #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/train_job_o
17
+ #SBATCH --partition=sphinx
18
+ #SBATCH --time=14-0
19
+
20
+ # activate your desired anaconda environment
21
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
22
+
23
+ # cd to working directory
24
+ cd .
25
+
26
+ # launch commands
27
+ srun --unbuffered run_as_child_processes 'utput.txt' 'torchrun --master_port 29500 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/xnli_en --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en --o utput_hub_id pythia-70m_xnli_en --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
28
+
29
+ ###############################
30
+
31
+ submission to slurm complete!
32
+
33
+
34
+ ###############################
35
+ slurm submission output
36
+
37
+ Submitted batch job 7593066
38
+
39
+
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-09 23:03:12.351551
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7593143
53
+ #SBATCH --gres=gpu:2
54
+ #SBATCH --job-name=tthrush-job-2078504
55
+ #SBATCH --mem=400G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en/train_job_o
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'utput.txt' 'torchrun --master_port 29500 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/xnli_en --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_xnli_en --o utput_hub_id pythia-70m_xnli_en --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7593144
80
+
81
+
82
+
83
+ ###############################
84
+
train_job_output.txt CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ff883ad86802128c0aa97e80f71b71c55a94c13a43f3cae8dfab6dfcb3e074d
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d9e15a46006853a122970f426d6179ad8f955096cd5269037dcc8e49a307e60
3
  size 5112