none
committed on
Commit
•
e235434
1
Parent(s):
3592a0d
wip
Browse files
- Dockerfile +3 -2
- learn.py +13 -2
- start.sh +6 -2
Dockerfile
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
# build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
|
2 |
# run with
|
3 |
-
# docker run --gpus all --user=42420:42420 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh hf_TOKEN
|
|
|
4 |
FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
|
5 |
# FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
|
6 |
# RUN mkdir -p /workspace
|
@@ -9,7 +10,6 @@ RUN /usr/sbin/useradd -u 42420 --gid 42420 -m -d /workspace -s /bin/bash ovh
|
|
9 |
RUN apt update -y && apt-get install -y curl git git-lfs screen
|
10 |
COPY --chmod=777 start.sh /start.sh
|
11 |
COPY learn.py /learn.py
|
12 |
-
COPY preload.py /preload.py
|
13 |
# Mandatory to run the jobs in rootless mode
|
14 |
USER root
|
15 |
RUN chown -R 42420:42420 /workspace
|
@@ -25,6 +25,7 @@ RUN . /workspace/.miniconda3/bin/activate \
|
|
25 |
&& pip install -U Pillow \
|
26 |
&& pip install -U torchvision torchaudio
|
27 |
RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
|
|
|
28 |
# Mandatory to run the jobs in rootless mode
|
29 |
# USER root
|
30 |
# RUN chown -R 42420:42420 /workspace
|
|
|
1 |
# build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
|
2 |
# run with
|
3 |
+
# docker run --gpus all --user=42420:42420 -p 8080:8080 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh sleep infinity hf_TOKEN
|
4 |
+
# docker run --gpus all --user=42420:42420 -p 8080:8080 -it sctg/roco-idefics3:0.0.5 bash -i /start.sh python /learn.py hf_...
|
5 |
FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
|
6 |
# FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
|
7 |
# RUN mkdir -p /workspace
|
|
|
10 |
RUN apt update -y && apt-get install -y curl git git-lfs screen
|
11 |
COPY --chmod=777 start.sh /start.sh
|
12 |
COPY learn.py /learn.py
|
|
|
13 |
# Mandatory to run the jobs in rootless mode
|
14 |
USER root
|
15 |
RUN chown -R 42420:42420 /workspace
|
|
|
25 |
&& pip install -U Pillow \
|
26 |
&& pip install -U torchvision torchaudio
|
27 |
RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
|
28 |
+
RUN rm -f /workspace/miniconda.sh
|
29 |
# Mandatory to run the jobs in rootless mode
|
30 |
# USER root
|
31 |
# RUN chown -R 42420:42420 /workspace
|
learn.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2 |
# License: Apache License 2.0
|
3 |
# Description: Train the model on the dataset
|
4 |
import os
|
|
|
5 |
import torch
|
6 |
|
7 |
from huggingface_hub import login as hf_login
|
@@ -12,11 +13,17 @@ from datasets.utils.logging import disable_progress_bar
|
|
12 |
disable_progress_bar()
|
13 |
|
14 |
HF_TOKEN = ""
|
|
|
15 |
|
16 |
if os.environ.get('HF_TOKEN') is not None:
|
17 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
18 |
print(f"Hugging Face token found in environment variable")
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
hf_login(
|
21 |
token=HF_TOKEN,
|
22 |
add_to_git_credential=True
|
@@ -27,7 +34,10 @@ source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
|
|
27 |
destination_model_id = "eltorio/IDEFICS3_ROCOv2"
|
28 |
output_dir = "IDEFICS3_ROCOv2"
|
29 |
cache_dir = "/workspace/data"
|
30 |
-
|
|
|
|
|
|
|
31 |
|
32 |
DEVICE = "cuda:0"
|
33 |
USE_LORA = False
|
@@ -127,7 +137,7 @@ training_args = TrainingArguments(
|
|
127 |
gradient_accumulation_steps = 8,
|
128 |
dataloader_pin_memory = False,
|
129 |
save_total_limit = 3,
|
130 |
-
|
131 |
save_strategy = "steps",
|
132 |
eval_steps = 100,
|
133 |
save_steps = 10, # checkpoint each 10 steps
|
@@ -146,6 +156,7 @@ trainer = Trainer(
|
|
146 |
args = training_args,
|
147 |
data_collator = data_collator,
|
148 |
train_dataset = train_dataset,
|
|
|
149 |
)
|
150 |
|
151 |
trainer.train()
|
|
|
2 |
# License: Apache License 2.0
|
3 |
# Description: Train the model on the dataset
|
4 |
import os
|
5 |
+
import sys
|
6 |
import torch
|
7 |
|
8 |
from huggingface_hub import login as hf_login
|
|
|
13 |
disable_progress_bar()
|
14 |
|
15 |
HF_TOKEN = ""
|
16 |
+
arguments = sys.argv[1:]
|
17 |
|
18 |
if os.environ.get('HF_TOKEN') is not None:
|
19 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
20 |
print(f"Hugging Face token found in environment variable")
|
21 |
|
22 |
+
# If HF_TOKEN is empty, check whether the first argument looks like the token (i.e. starts with "hf_")
|
23 |
+
if not HF_TOKEN and arguments and arguments[0].startswith("hf_"):
|
24 |
+
HF_TOKEN = arguments[0]
|
25 |
+
print(f"Hugging Face token found in script arguments")
|
26 |
+
|
27 |
hf_login(
|
28 |
token=HF_TOKEN,
|
29 |
add_to_git_credential=True
|
|
|
34 |
destination_model_id = "eltorio/IDEFICS3_ROCOv2"
|
35 |
output_dir = "IDEFICS3_ROCOv2"
|
36 |
cache_dir = "/workspace/data"
|
37 |
+
|
38 |
+
full_dataset = load_dataset(dataset_id,keep_in_memory=False)
|
39 |
+
train_dataset = full_dataset["train"]
|
40 |
+
eval_dataset = full_dataset["validation"]
|
41 |
|
42 |
DEVICE = "cuda:0"
|
43 |
USE_LORA = False
|
|
|
137 |
gradient_accumulation_steps = 8,
|
138 |
dataloader_pin_memory = False,
|
139 |
save_total_limit = 3,
|
140 |
+
eval_strategy = "steps",
|
141 |
save_strategy = "steps",
|
142 |
eval_steps = 100,
|
143 |
save_steps = 10, # checkpoint each 10 steps
|
|
|
156 |
args = training_args,
|
157 |
data_collator = data_collator,
|
158 |
train_dataset = train_dataset,
|
159 |
+
eval_dataset = train_dataset,
|
160 |
)
|
161 |
|
162 |
trainer.train()
|
start.sh
CHANGED
@@ -6,14 +6,18 @@ export HOME=/workspace
|
|
6 |
cd /workspace
|
7 |
git lfs install
|
8 |
if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
9 |
-
export HF_TOKEN=$
|
10 |
-
unset $
|
11 |
fi
|
12 |
|
13 |
echo "HF_TOKEN: $HF_TOKEN"
|
14 |
. /workspace/.bashrc
|
15 |
. /workspace/.miniconda3/bin/activate
|
|
|
|
|
16 |
git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
|
|
|
|
|
17 |
git config --global credential.helper store
|
18 |
|
19 |
huggingface-cli login --add-to-git-credential --token $HF_TOKEN
|
|
|
6 |
cd /workspace
|
7 |
git lfs install
|
8 |
if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
9 |
+
export HF_TOKEN=${!#}
|
10 |
+
unset ${!#}
|
11 |
fi
|
12 |
|
13 |
echo "HF_TOKEN: $HF_TOKEN"
|
14 |
. /workspace/.bashrc
|
15 |
. /workspace/.miniconda3/bin/activate
|
16 |
+
export SHELL=/bin/bash
|
17 |
+
|
18 |
git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
|
19 |
+
git config --global user.email "[email protected]"
|
20 |
+
git config --global user.name "[email protected]"
|
21 |
git config --global credential.helper store
|
22 |
|
23 |
huggingface-cli login --add-to-git-credential --token $HF_TOKEN
|