Training in progress, epoch 1
Browse files- eval_job_output.txt +100 -4
- logs/events.out.tfevents.1715270185.sphinx2 +3 -0
- model.safetensors +1 -1
- train_job_output.txt +0 -0
eval_job_output.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
slurm submission log: 2024-05-
|
2 |
created following sbatch script:
|
3 |
|
4 |
###############################
|
@@ -7,9 +7,9 @@ created following sbatch script:
|
|
7 |
|
8 |
#SBATCH --account=nlp
|
9 |
#SBATCH --cpus-per-task=16
|
10 |
-
#SBATCH --dependency=afterok:
|
11 |
#SBATCH --gres=gpu:1
|
12 |
-
#SBATCH --job-name=tthrush-job-
|
13 |
#SBATCH --mem=60G
|
14 |
#SBATCH --nodelist=sphinx2
|
15 |
#SBATCH --open-mode=append
|
@@ -34,7 +34,103 @@ submission to slurm complete!
|
|
34 |
###############################
|
35 |
slurm submission output
|
36 |
|
37 |
-
Submitted batch job
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
|
40 |
|
|
|
1 |
+
slurm submission log: 2024-05-08 15:15:14.783860
|
2 |
created following sbatch script:
|
3 |
|
4 |
###############################
|
|
|
7 |
|
8 |
#SBATCH --account=nlp
|
9 |
#SBATCH --cpus-per-task=16
|
10 |
+
#SBATCH --dependency=afterok:7590683
|
11 |
#SBATCH --gres=gpu:1
|
12 |
+
#SBATCH --job-name=tthrush-job-534086
|
13 |
#SBATCH --mem=60G
|
14 |
#SBATCH --nodelist=sphinx2
|
15 |
#SBATCH --open-mode=append
|
|
|
34 |
###############################
|
35 |
slurm submission output
|
36 |
|
37 |
+
Submitted batch job 7590684
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
###############################
|
42 |
+
|
43 |
+
###############################
|
44 |
+
start time: 2024-05-08 16:30:21.427634
|
45 |
+
machine: sphinx2
|
46 |
+
conda env: pretraining-coreset-selection
|
47 |
+
###############################
|
48 |
+
running following processes
|
49 |
+
|
50 |
+
lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_xnli_en,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_xnli_en/perf
|
51 |
+
|
52 |
+
|
53 |
+
###############################
|
54 |
+
command outputs:
|
55 |
+
|
56 |
+
|
57 |
+
2024-05-08:16:30:23,469 INFO [utils.py:145] Note: detected 255 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
|
58 |
+
2024-05-08:16:30:23,469 INFO [utils.py:148] Note: NumExpr detected 255 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
|
59 |
+
2024-05-08:16:30:23,469 INFO [utils.py:160] NumExpr defaulting to 8 threads.
|
60 |
+
2024-05-08:16:30:23,683 INFO [config.py:58] PyTorch version 2.2.2 available.
|
61 |
+
2024-05-08:16:30:26,810 INFO [__main__.py:156] Verbosity set to INFO
|
62 |
+
2024-05-08:16:30:32,728 WARNING [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
|
63 |
+
/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py:1429: FutureWarning: The repository for hails/mmlu_no_train contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hails/mmlu_no_train
|
64 |
+
You can avoid this message in future by passing the argument `trust_remote_code=True`.
|
65 |
+
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
|
66 |
+
warnings.warn(
|
67 |
+
2024-05-08:16:31:48,007 WARNING [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
|
68 |
+
2024-05-08:16:31:48,012 INFO [__main__.py:229] Selected Tasks: ['arc_easy', 'lambada', 'piqa', 'sciq', 'xnli_en', 'xnli_fr']
|
69 |
+
2024-05-08:16:31:48,364 INFO [huggingface.py:148] Using device 'cuda'
|
70 |
+
Traceback (most recent call last):
|
71 |
+
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/lm_eval", line 8, in <module>
|
72 |
+
sys.exit(cli_evaluate())
|
73 |
+
File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/__main__.py", line 231, in cli_evaluate
|
74 |
+
results = evaluator.simple_evaluate(
|
75 |
+
File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/utils.py", line 415, in _wrapper
|
76 |
+
return fn(*args, **kwargs)
|
77 |
+
File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/evaluator.py", line 98, in simple_evaluate
|
78 |
+
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
|
79 |
+
File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/api/model.py", line 134, in create_from_arg_string
|
80 |
+
return cls(**args, **args2)
|
81 |
+
File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/models/huggingface.py", line 174, in __init__
|
82 |
+
self._get_config(
|
83 |
+
File "/sailhome/tthrush/lm-evaluation-harness/lm_eval/models/huggingface.py", line 420, in _get_config
|
84 |
+
self._config = transformers.AutoConfig.from_pretrained(
|
85 |
+
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 1138, in from_pretrained
|
86 |
+
config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
87 |
+
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 631, in get_config_dict
|
88 |
+
config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
|
89 |
+
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 686, in _get_config_dict
|
90 |
+
resolved_config_file = cached_file(
|
91 |
+
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/utils/hub.py", line 369, in cached_file
|
92 |
+
raise EnvironmentError(
|
93 |
+
OSError: /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_xnli_en does not appear to have a file named config.json. Checkout 'https://huggingface.co//juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_xnli_en/tree/main' for available files.
|
94 |
+
###############################
|
95 |
+
end time: 2024-05-08 16:31:51.524193
|
96 |
+
elapsed time: 0:01:30.096559
|
97 |
+
slurm submission log: 2024-05-09 07:34:36.889218
|
98 |
+
created following sbatch script:
|
99 |
+
|
100 |
+
###############################
|
101 |
+
|
102 |
+
#!/bin/bash
|
103 |
+
|
104 |
+
#SBATCH --account=nlp
|
105 |
+
#SBATCH --cpus-per-task=16
|
106 |
+
#SBATCH --dependency=afterok:7591646
|
107 |
+
#SBATCH --gres=gpu:1
|
108 |
+
#SBATCH --job-name=tthrush-job-4681876
|
109 |
+
#SBATCH --mem=60G
|
110 |
+
#SBATCH --nodelist=sphinx2
|
111 |
+
#SBATCH --open-mode=append
|
112 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_xnli_en/eval_job_output.txt
|
113 |
+
#SBATCH --partition=sphinx
|
114 |
+
#SBATCH --time=14-0
|
115 |
+
|
116 |
+
# activate your desired anaconda environment
|
117 |
+
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
118 |
+
|
119 |
+
# cd to working directory
|
120 |
+
cd .
|
121 |
+
|
122 |
+
# launch commands
|
123 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_xnli_en,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_xnli_en/perf'
|
124 |
+
|
125 |
+
###############################
|
126 |
+
|
127 |
+
submission to slurm complete!
|
128 |
+
|
129 |
+
|
130 |
+
###############################
|
131 |
+
slurm submission output
|
132 |
+
|
133 |
+
Submitted batch job 7591647
|
134 |
|
135 |
|
136 |
|
logs/events.out.tfevents.1715270185.sphinx2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b1c1c687b1100aa90b7c7592fa5bf79868362accc8d1fa9250cb6649bc893ab
|
3 |
+
size 95282
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 281715176
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b5637eacff0cbb3d000b41088168f088919a8cc4a1f613b4eec974fc29ba7202
|
3 |
size 281715176
|
train_job_output.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|