student-abdullah committed
Commit 0637b0e
1 Parent(s): 2030587

Upload llama_3_1_8b_+_unsloth_2x_faster_finetuning.py

llama_3_1_8b_+_unsloth_2x_faster_finetuning.py ADDED
# -*- coding: utf-8 -*-
"""Mine Llama-3.1 8b + Unsloth 2x faster finetuning.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1eI70VMZWms-GCr4j3vf-tcprenY6JCnP
"""

# Commented out IPython magic to ensure Python compatibility.
# %%capture
# # Install Unsloth, Xformers (Flash Attention) and all other required packages.
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#
# # Check the Torch version to pick the matching Xformers release (Torch 2.3 -> xformers 0.0.27).
# from torch import __version__; from packaging.version import Version as V
# xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# !pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
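# Illustrative sketch (not part of the original notebook): verify the environment the
# install cell above assumes -- which Torch build is present, whether CUDA/bfloat16 are
# usable, and which of the installed packages resolved.
import importlib.metadata
import torch as _torch_check

print("torch:", _torch_check.__version__)
print("cuda available:", _torch_check.cuda.is_available())
print("bf16 supported:", _torch_check.cuda.is_available() and _torch_check.cuda.is_bf16_supported())
for _pkg in ("unsloth", "xformers", "trl", "peft", "bitsandbytes"):
    try:
        print(_pkg, importlib.metadata.version(_pkg))
    except importlib.metadata.PackageNotFoundError:
        print(_pkg, "not installed")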
from unsloth import FastLanguageModel
import torch

max_seq_length = 512  # Choose any length; Unsloth supports RoPE scaling internally.
dtype = None          # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True   # Use 4-bit quantization to reduce memory usage. Can be False.
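# Illustrative sketch (assumption, not from the original script): roughly what dtype = None
# resolves to in practice -- bfloat16 on Ampere-or-newer GPUs (compute capability >= 8),
# float16 on older cards such as T4/V100.
if torch.cuda.is_available():
    _major, _ = torch.cuda.get_device_capability()
    _resolved_dtype = torch.bfloat16 if _major >= 8 else torch.float16
    print("auto-detected dtype would likely be:", _resolved_dtype)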
# 4-bit pre-quantized models we support, for 4x faster downloading and no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",       # Llama-3.1 (trained on 15 trillion tokens), 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",     # We also uploaded 4-bit weights for 405B!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",  # New Mistral 12B, 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",         # Mistral v0.3, 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",            # Phi-3.5, 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",             # Gemma 2, 2x faster!
]  # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...",  # required for gated models such as meta-llama/Meta-Llama-3.1-8B
)
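# Illustrative sketch (not part of the original notebook): a quick sanity check on the
# loaded 4-bit model -- its approximate in-memory footprint and context window.
print(f"Approx. model footprint: {model.get_memory_footprint() / 1024**3:.2f} GB")
print("Max position embeddings:", model.config.max_position_embeddings)
print("Vocab size:", model.config.vocab_size)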
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,           # Choose any number > 0. Suggested: 8, 16, 32, 64, 128.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 10,
    lora_dropout = 0,  # Supports any value, but 0 is optimized.
    bias = "none",     # Supports any value, but "none" is optimized.
    # [NEW] "unsloth" uses 30% less VRAM and fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context.
    random_state = 3407,
    use_rslora = False,   # We support rank-stabilized LoRA.
    loftq_config = None,  # And LoftQ.
)
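# Illustrative sketch (not part of the original notebook): count how many parameters the
# LoRA adapters add on top of the frozen 4-bit base model.
_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
_total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {_trainable:,} ({100 * _trainable / _total:.2f}% of {_total:,})")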
alpaca_prompt = """Below is a conversation between a healthcare provider and a patient. The healthcare provider should respond appropriately to the patient's query.

### Question:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    # contexts = examples["Context"]
    questions = examples["Question"]
    answers = examples["Answer"]
    texts = []
    for question, answer in zip(questions, answers):
        # Must add EOS_TOKEN, otherwise generation will go on forever!
        text = alpaca_prompt.format(question, answer) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
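# Illustrative sketch (not part of the original notebook): preview what one training
# example looks like after formatting. The Question/Answer strings below are made up.
_preview = formatting_prompts_func({
    "Question": ["What is paracetamol used for?"],
    "Answer": ["Paracetamol is commonly used to relieve mild pain and reduce fever."],
})
print(_preview["text"][0])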
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Load your data. With test_size=0.001, roughly 99.8% of rows stay in the training split;
# the validation and test splits are tiny (~0.1% each).
df = pd.read_csv('/content/data/train.csv')
train_df, test_df = train_test_split(df, test_size=0.001, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.001, random_state=42)

# Create datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine datasets into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

# Apply the formatting function to the datasets
dataset_dict = dataset_dict.map(formatting_prompts_func, batched=True)
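# Illustrative sketch (not part of the original notebook): confirm the split sizes and
# inspect one formatted example before handing the data to the trainer.
print(dataset_dict)  # shows the number of rows in train/validation/test
print(dataset_dict["train"][0]["text"][:300])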
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

dataset = dataset_dict
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # Packing can make training 5x faster for short sequences; disabled here.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 32,
        warmup_steps = 5,
        # num_train_epochs = 1,  # Set this for one full training run.
        max_steps = 160,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
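# Illustrative sketch (not part of the original notebook): the effective batch size and
# the number of examples this run will see on a single GPU, given the hyperparameters above.
_effective_batch = 4 * 32              # per_device_train_batch_size * gradient_accumulation_steps = 128
_examples_seen = 160 * _effective_batch  # max_steps * effective batch = 20,480 examples
print(f"Effective batch size: {_effective_batch}, examples seen: {_examples_seen}")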
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_stats = trainer.train()
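# Illustrative sketch (assumption, not in the original script): persist the trained LoRA
# adapters and tokenizer locally; "lora_model" is a hypothetical directory name.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")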
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Input:
{}

### Response:
{}"""

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is alternate for Rifagut 400 mg?",  # input
            "",  # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs, skip_special_tokens = True)
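# Illustrative sketch (not part of the original notebook): model.generate returns the prompt
# tokens followed by the new tokens, so slicing off the prompt length yields only the
# generated response. Note that this inference template ("### Input:") differs from the
# "### Question:" template used during fine-tuning above.
_prompt_length = inputs["input_ids"].shape[1]
_response_only = tokenizer.batch_decode(outputs[:, _prompt_length:], skip_special_tokens=True)[0]
print(_response_only)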
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Input:
{}

### Response:
{}"""

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "what is alternate of Ocurax 400mg tablets?",  # input
            "",  # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 200)
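# Illustrative sketch (assumption, not in the original script): the call above relies on
# the model's default generation settings; sampling behaviour can be controlled explicitly
# with standard transformers generate() kwargs. The values below are only examples.
_ = model.generate(
    **inputs,
    streamer = text_streamer,
    max_new_tokens = 200,
    do_sample = True,   # sample instead of greedy decoding
    temperature = 0.7,  # illustrative value
    top_p = 0.9,        # illustrative value
)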
# Available GGUF quantization methods: f16 | q4_k_m | q5_k_m | q8_0

# Save a 16-bit (f16) GGUF export locally
model.save_pretrained_gguf("model-16bit", tokenizer, quantization_method="f16")

# Push the 16-bit (f16) GGUF to the Hugging Face Hub
model.push_to_hub_gguf("student-abdullah/Llama3.1_medicine_fine-tuned_24-09_16bit_gguf", tokenizer, quantization_method="f16", token="hf_...")
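# Illustrative sketch (assumption, not part of the original script): one way to load the
# exported GGUF locally with llama-cpp-python. The file name below is hypothetical --
# check the "model-16bit" directory for the actual .gguf file that was written.
# from llama_cpp import Llama
# llm = Llama(model_path="model-16bit/<exported-file>.gguf", n_ctx=512)
# print(llm("### Question:\nWhat is alternate for Rifagut 400 mg?\n\n### Response:\n", max_tokens=64))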