Ronan committed on
Commit
44fc622
1 Parent(s): a820973
Files changed (3) hide show
  1. Dockerfile +35 -0
  2. learn.py +152 -0
  3. start.sh +31 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build with:
#   docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
# Run with:
#   docker run --gpus all --user=42420:42420 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh hf_TOKEN
FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
# FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
# RUN mkdir -p /workspace

# Unprivileged user/group "ovh" (uid/gid 42420) whose home directory is /workspace.
RUN /usr/sbin/addgroup --gid 42420 ovh
RUN /usr/sbin/useradd -u 42420 --gid 42420 -m -d /workspace -s /bin/bash ovh

# Use apt-get throughout (apt is meant for interactive use), never prompt during
# the build, and drop the package lists in the same layer to keep the image small.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y curl git git-lfs screen \
    && rm -rf /var/lib/apt/lists/*

# The entry point only needs to be readable/executable, not world-writable.
COPY --chmod=755 start.sh /start.sh
COPY learn.py /learn.py
COPY preload.py /preload.py

# Mandatory to run the jobs in rootless mode
USER root
RUN chown -R 42420:42420 /workspace
USER 42420

# Download, install and remove the Miniconda installer in a single layer so the
# installer does not stay in the image; -f makes curl fail on HTTP errors
# instead of saving an error page as the installer.
RUN curl -fsSL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /workspace/miniconda.sh \
    && /bin/bash /workspace/miniconda.sh -b -p /workspace/.miniconda3 \
    && rm -f /workspace/miniconda.sh
RUN . /workspace/.miniconda3/bin/activate && conda init --all

# Python dependencies for training (extras are quoted so the shell never
# treats the square brackets as a glob pattern).
RUN . /workspace/.miniconda3/bin/activate \
    && pip install -U "safetensors>=0.4.5" \
    && pip install -U "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.44.2.dev0-py3-none-manylinux_2_24_x86_64.whl" \
    && pip install -U git+https://github.com/huggingface/transformers.git \
    && pip install "huggingface_hub[cli]" accelerate datasets peft \
    && pip install -U Pillow \
    && pip install -U torchvision torchaudio
RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter

# Mandatory to run the jobs in rootless mode
# USER root
# RUN chown -R 42420:42420 /workspace
USER 42420
WORKDIR /workspace
# RUN export HOME=/workspace && cd /workspace && . /workspace/.miniconda3/bin/activate \
#     && mkdir -p /workspace/data \
#     && python /preload.py
learn.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024 Ronan Le Meillat
2
+ # License: Apache License 2.0
3
+ # Description: Train the model on the dataset
4
+ import os
5
+ import torch
6
+
7
+ from huggingface_hub import login as hf_login
8
+ from datasets import load_dataset
9
+ from peft import LoraConfig
10
+ from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration, TrainingArguments, Trainer
11
+ from datasets.utils.logging import disable_progress_bar
12
# Turn off the `datasets` progress bars.
disable_progress_bar()

# Use the HF_TOKEN environment variable when it is defined (even if empty);
# otherwise fall back to an empty token.
HF_TOKEN = os.environ.get('HF_TOKEN', "")
if 'HF_TOKEN' in os.environ:
    print("Hugging Face token found in environment variable")

# Authenticate against the Hugging Face Hub and store the token for git.
hf_login(token=HF_TOKEN, add_to_git_credential=True)
24
# --- Run configuration -------------------------------------------------------
dataset_id = "eltorio/ROCO-radiology"
prompt = "You are an expert radiologist certified with over 15 years of experience in diagnostic imaging, describe this image"
source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
destination_model_id = "eltorio/IDEFICS3_ROCOv2"
output_dir = "IDEFICS3_ROCOv2"
cache_dir = "/workspace/data"

DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True

# --- Data and processor ------------------------------------------------------
train_dataset = load_dataset(dataset_id, split="train", cache_dir=cache_dir)

processor = AutoProcessor.from_pretrained(source_model_id, do_image_splitting=False)

# --- Model -------------------------------------------------------------------
if not (USE_QLORA or USE_LORA):
    # Full fine-tuning: load the fp16 model and move it to the GPU.
    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",  # This works for A100 or H100
    ).to(DEVICE)
else:
    # Adapter-based fine-tuning (LoRA, or QLoRA on a 4-bit quantized base model).
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        use_dora=not USE_QLORA,  # DoRA is only enabled for plain (non-quantized) LoRA
        init_lora_weights="gaussian",
    )

    bnb_config = None
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
69
+
70
class MyDataCollator:
    """Collate raw dataset samples into a padded batch with labels.

    Each sample is a mapping with an ``"image"`` entry (an object exposing
    ``convert('RGB')``) and a ``"caption"`` entry (the target text). For every
    sample a three-turn chat (system prompt, user image, assistant caption) is
    rendered with the processor's chat template, then all texts and images are
    encoded together by the processor.

    Args:
        processor: Processor used for chat templating, encoding and padding.
        system_prompt: Optional system prompt. When ``None`` the module-level
            ``prompt`` is used, which keeps ``MyDataCollator(processor)``
            behaving as before.
    """

    def __init__(self, processor, system_prompt=None):
        self.processor = processor
        self.system_prompt = system_prompt
        tokenizer = processor.tokenizer
        # Id of the "<image>" special token, looked up by its position in the
        # tokenizer's list of additional special tokens.
        self.image_token_id = tokenizer.additional_special_tokens_ids[
            tokenizer.additional_special_tokens.index("<image>")
        ]

    @staticmethod
    def _build_messages(system_prompt, answer):
        """Return the system/user/assistant chat for one training sample."""
        return [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "image"}],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": answer}],
            },
        ]

    def __call__(self, samples):
        """Encode ``samples`` and return the processor batch with ``"labels"`` added."""
        # Resolve the prompt at call time so the module-level default is only
        # needed when no explicit prompt was given.
        system_prompt = prompt if self.system_prompt is None else self.system_prompt

        texts = []
        images = []
        for sample in samples:
            messages = self._build_messages(system_prompt, sample["caption"])
            # Use the processor this collator was built with, not a global one.
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([sample["image"].convert('RGB')])

        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        # Labels are a copy of the input ids in which padding positions are
        # replaced by the image token id.
        # NOTE(review): presumably the model's loss ignores the image token id,
        # so padding does not contribute to the loss - confirm against the
        # installed Idefics3 implementation.
        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch
115
+
116
# Local import: only this section needs it.
from transformers.trainer_utils import get_last_checkpoint

data_collator = MyDataCollator(processor)

# Training configuration.
# - No evaluation is configured: no eval dataset is passed to the Trainer and
#   the evaluation strategy is left at its default ("no"); the deprecated
#   `evaluation_strategy` keyword is therefore not passed at all.
# - `resume_from_checkpoint` is intentionally NOT set here: the Trainer does
#   not read it from TrainingArguments, it must be given to `trainer.train()`.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=False,
    auto_find_batch_size=True,
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    save_total_limit=3,
    save_strategy="steps",
    eval_steps=100,
    save_steps=10,  # checkpoint each 10 steps
    logging_steps=5,
    remove_unused_columns=False,  # the collator needs the raw "image"/"caption" columns
    push_to_hub=True,
    label_names=["labels"],
    load_best_model_at_end=False,
    report_to="none",
    optim="paged_adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Resume from the most recent checkpoint in output_dir when there is one,
# otherwise start from scratch. The directory check avoids an error from
# get_last_checkpoint() when output_dir does not exist yet.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")

trainer.train(resume_from_checkpoint=last_checkpoint)
152
+
start.sh ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Container entry point: prepares the workspace, logs in to the Hugging Face
# Hub, starts Jupyter Lab in a detached screen session and finally runs the
# command given as arguments.
#
# Usage: start.sh [hf_token] [command [args...]]
#   The token is taken from the HF_TOKEN environment variable when it is set
#   and starts with "hf_"; otherwise the first argument is used as the token.

JOB_URL_SCHEME=${JOB_URL_SCHEME:-"http://"}
JOB_ID=${JOB_ID:-'localhost'}
JOB_HOST=${JOB_HOST:-'local'}
# Exported so that the bash started by `screen` below can expand them; without
# this the defaults above would be empty inside that child shell.
export JOB_URL_SCHEME JOB_ID JOB_HOST

export HOME=/workspace
cd /workspace
git lfs install

if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
    export HF_TOKEN="$1"
    # Drop the token from the argument list so that `exec "$@"` below does not
    # try to execute it as a command.
    if [[ $# -gt 0 ]]; then
        shift
    fi
elif [[ "$1" =~ ^hf_ ]]; then
    # A token was also passed as first argument although HF_TOKEN is already
    # valid: discard it for the same reason.
    shift
fi

# Never print the token itself: container logs are not a safe place for secrets.
if [[ -n "$HF_TOKEN" ]]; then
    echo "HF_TOKEN: set (value hidden)"
else
    echo "HF_TOKEN: not set" >&2
fi

. /workspace/.bashrc
. /workspace/.miniconda3/bin/activate

# Configure credentials before cloning so the clone can authenticate.
git config --global credential.helper store
huggingface-cli login --add-to-git-credential --token "$HF_TOKEN"

# Skip the clone when the repository is already present (e.g. after a restart).
if [[ ! -d IDEFICS3_ROCOv2/.git ]]; then
    git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
fi

# NOTE(review): Jupyter Lab is started without a token, with every origin
# allowed and XSRF checks disabled. Presumably access control is provided by
# the hosting platform - confirm before exposing port 8080 anywhere else.
screen -dmS jupyter bash -c 'jupyter lab --ip=0.0.0.0 --port=8080 --no-browser --allow-root \
--notebook-dir=/workspace \
--LabApp.token="" \
--LabApp.custom_display_url=${JOB_URL_SCHEME}${JOB_ID}-8080.${JOB_HOST} \
--LabApp.allow_remote_access=True \
--LabApp.allow_origin="*" \
--LabApp.disable_check_xsrf=True'

echo "Jupyter Lab is running at ${JOB_URL_SCHEME}${JOB_ID}-8080.${JOB_HOST}"
# Hand over to the remaining command line (if any).
exec "$@"