eltorio
/

IDEFICS3_ROCOv2

none committed 8 days ago

Commit

e235434

•

1 Parent(s): 3592a0d

wip

Files changed (3) hide show

Dockerfile CHANGED Viewed

@@ -1,6 +1,7 @@
 # build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
 # run with
-# docker run --gpus all --user=42420:42420 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh hf_TOKEN
 FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
 # FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
 # RUN mkdir -p /workspace
@@ -9,7 +10,6 @@ RUN /usr/sbin/useradd -u 42420 --gid 42420 -m -d /workspace -s /bin/bash ovh
 RUN apt update -y && apt-get install -y curl git git-lfs screen
 COPY --chmod=777 start.sh /start.sh
 COPY learn.py /learn.py
-COPY preload.py /preload.py
 # Mandatory to run the jobs in rootless mode
 USER root
 RUN chown -R 42420:42420 /workspace
@@ -25,6 +25,7 @@ RUN . /workspace/.miniconda3/bin/activate \
     && pip install -U Pillow \
     && pip install -U torchvision torchaudio
 RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
 # Mandatory to run the jobs in rootless mode
 # USER root
 # RUN chown -R 42420:42420 /workspace

 # build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
 # run with
+# docker run --gpus all --user=42420:42420 -p 8080:8080 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh sleep infinity hf_TOKEN
+# docker run --gpus all --user=42420:42420 -p 8080:8080 -it sctg/roco-idefics3:0.0.5 bash -i /start.sh python /learn.py hf_...
 FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
 # FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
 # RUN mkdir -p /workspace
 RUN apt update -y && apt-get install -y curl git git-lfs screen
 COPY --chmod=777 start.sh /start.sh
 COPY learn.py /learn.py
 # Mandatory to run the jobs in rootless mode
 USER root
 RUN chown -R 42420:42420 /workspace
     && pip install -U Pillow \
     && pip install -U torchvision torchaudio
 RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
+RUN rm -f /workspace/miniconda.sh
 # Mandatory to run the jobs in rootless mode
 # USER root
 # RUN chown -R 42420:42420 /workspace

learn.py CHANGED Viewed

@@ -2,6 +2,7 @@
 # License: Apache License 2.0
 # Description: Train the model on the dataset
 import os
 import torch
 from huggingface_hub import login as hf_login
@@ -12,11 +13,17 @@ from datasets.utils.logging import disable_progress_bar
 disable_progress_bar()
 HF_TOKEN = ""
 if os.environ.get('HF_TOKEN') is not None:
   HF_TOKEN = os.environ.get('HF_TOKEN')
   print(f"Hugging Face token found in environment variable")
 hf_login(
   token=HF_TOKEN,
   add_to_git_credential=True
@@ -27,7 +34,10 @@ source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
 destination_model_id = "eltorio/IDEFICS3_ROCOv2"
 output_dir = "IDEFICS3_ROCOv2"
 cache_dir = "/workspace/data"
-train_dataset = load_dataset(dataset_id, split="train", cache_dir=cache_dir)
 DEVICE = "cuda:0"
 USE_LORA = False
@@ -127,7 +137,7 @@ training_args = TrainingArguments(
     gradient_accumulation_steps = 8,
     dataloader_pin_memory = False,
     save_total_limit = 3,
-    evaluation_strategy = None,
     save_strategy = "steps",
     eval_steps = 100,
     save_steps = 10, # checkpoint each 10 steps
@@ -146,6 +156,7 @@ trainer = Trainer(
     args = training_args,
     data_collator = data_collator,
     train_dataset = train_dataset,
 )
 trainer.train()

 # License: Apache License 2.0
 # Description: Train the model on the dataset
 import os
+import sys
 import torch
 from huggingface_hub import login as hf_login
 disable_progress_bar()
 HF_TOKEN = ""
+arguments = sys.argv[1:]
 if os.environ.get('HF_TOKEN') is not None:
   HF_TOKEN = os.environ.get('HF_TOKEN')
   print(f"Hugging Face token found in environment variable")
+# If HF_TOKEN is empty, check whether the first argument looks like a token (i.e. starts with "hf_").
+if not HF_TOKEN and arguments and arguments[0].startswith("hf_"):
+    HF_TOKEN = arguments[0]
+    print(f"Hugging Face token found in script arguments")
 hf_login(
   token=HF_TOKEN,
   add_to_git_credential=True
 destination_model_id = "eltorio/IDEFICS3_ROCOv2"
 output_dir = "IDEFICS3_ROCOv2"
 cache_dir = "/workspace/data"
+full_dataset = load_dataset(dataset_id,keep_in_memory=False)
+train_dataset = full_dataset["train"]
+eval_dataset = full_dataset["validation"]
 DEVICE = "cuda:0"
 USE_LORA = False
     gradient_accumulation_steps = 8,
     dataloader_pin_memory = False,
     save_total_limit = 3,
+    eval_strategy = "steps",
     save_strategy = "steps",
     eval_steps = 100,
     save_steps = 10, # checkpoint each 10 steps
     args = training_args,
     data_collator = data_collator,
     train_dataset = train_dataset,
+    eval_dataset = train_dataset,
 )
 trainer.train()

start.sh CHANGED Viewed

@@ -6,14 +6,18 @@ export HOME=/workspace
 cd /workspace
 git lfs install
 if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
-  export HF_TOKEN=$1
-  unset $1
 fi
 echo "HF_TOKEN: $HF_TOKEN"
 . /workspace/.bashrc
 . /workspace/.miniconda3/bin/activate
 git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
 git config --global credential.helper store
 huggingface-cli login --add-to-git-credential --token $HF_TOKEN

 cd /workspace
 git lfs install
 if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
+  export HF_TOKEN=${!#}
+  unset ${!#}
 fi
 echo "HF_TOKEN: $HF_TOKEN"
 . /workspace/.bashrc
 . /workspace/.miniconda3/bin/activate
+export SHELL=/bin/bash
 git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
+git config --global user.email "[email protected]"
+git config --global user.name "[email protected]"
 git config --global credential.helper store
 huggingface-cli login --add-to-git-credential --token $HF_TOKEN