none committed on
Commit
e235434
1 Parent(s): 3592a0d
Files changed (3) hide show
  1. Dockerfile +3 -2
  2. learn.py +13 -2
  3. start.sh +6 -2
Dockerfile CHANGED
@@ -1,6 +1,7 @@
1
  # build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
2
  # run with
3
- # docker run --gpus all --user=42420:42420 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh hf_TOKEN
 
4
  FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
5
  # FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
6
  # RUN mkdir -p /workspace
@@ -9,7 +10,6 @@ RUN /usr/sbin/useradd -u 42420 --gid 42420 -m -d /workspace -s /bin/bash ovh
9
  RUN apt update -y && apt-get install -y curl git git-lfs screen
10
  COPY --chmod=777 start.sh /start.sh
11
  COPY learn.py /learn.py
12
- COPY preload.py /preload.py
13
  # Mandatory to run the jobs in rootless mode
14
  USER root
15
  RUN chown -R 42420:42420 /workspace
@@ -25,6 +25,7 @@ RUN . /workspace/.miniconda3/bin/activate \
25
  && pip install -U Pillow \
26
  && pip install -U torchvision torchaudio
27
  RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
 
28
  # Mandatory to run the jobs in rootless mode
29
  # USER root
30
  # RUN chown -R 42420:42420 /workspace
 
1
  # build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
2
  # run with
3
+ # docker run --gpus all --user=42420:42420 -p 8080:8080 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh sleep infinity hf_TOKEN
4
+ # docker run --gpus all --user=42420:42420 -p 8080:8080 -it sctg/roco-idefics3:0.0.5 bash -i /start.sh python /learn.py hf_...
5
  FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
6
  # FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
7
  # RUN mkdir -p /workspace
 
10
  RUN apt update -y && apt-get install -y curl git git-lfs screen
11
  COPY --chmod=777 start.sh /start.sh
12
  COPY learn.py /learn.py
 
13
  # Mandatory to run the jobs in rootless mode
14
  USER root
15
  RUN chown -R 42420:42420 /workspace
 
25
  && pip install -U Pillow \
26
  && pip install -U torchvision torchaudio
27
  RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
28
+ RUN rm -f /workspace/miniconda.sh
29
  # Mandatory to run the jobs in rootless mode
30
  # USER root
31
  # RUN chown -R 42420:42420 /workspace
learn.py CHANGED
@@ -2,6 +2,7 @@
2
  # License: Apache License 2.0
3
  # Description: Train the model on the dataset
4
  import os
 
5
  import torch
6
 
7
  from huggingface_hub import login as hf_login
@@ -12,11 +13,17 @@ from datasets.utils.logging import disable_progress_bar
12
  disable_progress_bar()
13
 
14
  HF_TOKEN = ""
 
15
 
16
  if os.environ.get('HF_TOKEN') is not None:
17
  HF_TOKEN = os.environ.get('HF_TOKEN')
18
  print(f"Hugging Face token found in environment variable")
19
 
 
 
 
 
 
20
  hf_login(
21
  token=HF_TOKEN,
22
  add_to_git_credential=True
@@ -27,7 +34,10 @@ source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
27
  destination_model_id = "eltorio/IDEFICS3_ROCOv2"
28
  output_dir = "IDEFICS3_ROCOv2"
29
  cache_dir = "/workspace/data"
30
- train_dataset = load_dataset(dataset_id, split="train", cache_dir=cache_dir)
 
 
 
31
 
32
  DEVICE = "cuda:0"
33
  USE_LORA = False
@@ -127,7 +137,7 @@ training_args = TrainingArguments(
127
  gradient_accumulation_steps = 8,
128
  dataloader_pin_memory = False,
129
  save_total_limit = 3,
130
- evaluation_strategy = None,
131
  save_strategy = "steps",
132
  eval_steps = 100,
133
  save_steps = 10, # checkpoint each 10 steps
@@ -146,6 +156,7 @@ trainer = Trainer(
146
  args = training_args,
147
  data_collator = data_collator,
148
  train_dataset = train_dataset,
 
149
  )
150
 
151
  trainer.train()
 
2
  # License: Apache License 2.0
3
  # Description: Train the model on the dataset
4
  import os
5
+ import sys
6
  import torch
7
 
8
  from huggingface_hub import login as hf_login
 
13
  disable_progress_bar()
14
 
15
  HF_TOKEN = ""
16
+ arguments = sys.argv[1:]
17
 
18
  if os.environ.get('HF_TOKEN') is not None:
19
  HF_TOKEN = os.environ.get('HF_TOKEN')
20
  print(f"Hugging Face token found in environment variable")
21
 
22
+ # If HF_TOKEN is still empty, check whether the first argument looks like a token (i.e. starts with "hf_")
23
+ if not HF_TOKEN and arguments and arguments[0].startswith("hf_"):
24
+ HF_TOKEN = arguments[0]
25
+ print(f"Hugging Face token found in script arguments")
26
+
27
  hf_login(
28
  token=HF_TOKEN,
29
  add_to_git_credential=True
 
34
  destination_model_id = "eltorio/IDEFICS3_ROCOv2"
35
  output_dir = "IDEFICS3_ROCOv2"
36
  cache_dir = "/workspace/data"
37
+
38
+ full_dataset = load_dataset(dataset_id,keep_in_memory=False)
39
+ train_dataset = full_dataset["train"]
40
+ eval_dataset = full_dataset["validation"]
41
 
42
  DEVICE = "cuda:0"
43
  USE_LORA = False
 
137
  gradient_accumulation_steps = 8,
138
  dataloader_pin_memory = False,
139
  save_total_limit = 3,
140
+ eval_strategy = "steps",
141
  save_strategy = "steps",
142
  eval_steps = 100,
143
  save_steps = 10, # checkpoint each 10 steps
 
156
  args = training_args,
157
  data_collator = data_collator,
158
  train_dataset = train_dataset,
159
+ eval_dataset = train_dataset,
160
  )
161
 
162
  trainer.train()
start.sh CHANGED
@@ -6,14 +6,18 @@ export HOME=/workspace
6
  cd /workspace
7
  git lfs install
8
  if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
9
- export HF_TOKEN=$1
10
- unset $1
11
  fi
12
 
13
  echo "HF_TOKEN: $HF_TOKEN"
14
  . /workspace/.bashrc
15
  . /workspace/.miniconda3/bin/activate
 
 
16
  git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
 
 
17
  git config --global credential.helper store
18
 
19
  huggingface-cli login --add-to-git-credential --token $HF_TOKEN
 
6
  cd /workspace
7
  git lfs install
8
  if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
9
+ export HF_TOKEN=${!#}
10
+ unset ${!#}
11
  fi
12
 
13
  echo "HF_TOKEN: $HF_TOKEN"
14
  . /workspace/.bashrc
15
  . /workspace/.miniconda3/bin/activate
16
+ export SHELL=/bin/bash
17
+
18
  git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
19
+ git config --global user.email "[email protected]"
20
+ git config --global user.name "[email protected]"
21
  git config --global credential.helper store
22
 
23
  huggingface-cli login --add-to-git-credential --token $HF_TOKEN