PathFinderKR
committed on
Commit
•
9ee8f14
1
Parent(s):
c2a7866
Update README.md
Browse files
README.md
CHANGED
@@ -155,7 +155,7 @@ The model is trained on the [MarkrAI/KoCommercial-Dataset](https://huggingface.c
|
|
155 |
|
156 |
### Training Procedure
|
157 |
|
158 |
-
The model training used LoRA for computational efficiency. 0.
|
159 |
|
160 |
#### Training Hyperparameters
|
161 |
|
@@ -173,8 +173,8 @@ bnb_4bit_use_double_quant=True
|
|
173 |
################################################################################
|
174 |
task_type="CAUSAL_LM"
|
175 |
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
|
176 |
-
r=
|
177 |
-
lora_alpha=
|
178 |
lora_dropout=0.1
|
179 |
bias="none"
|
180 |
|
@@ -182,19 +182,19 @@ bias="none"
|
|
182 |
# TrainingArguments parameters
|
183 |
################################################################################
|
184 |
num_train_epochs=1
|
185 |
-
per_device_train_batch_size=
|
186 |
gradient_accumulation_steps=1
|
187 |
gradient_checkpointing=True
|
188 |
learning_rate=2e-5
|
189 |
lr_scheduler_type="cosine"
|
190 |
warmup_ratio=0.1
|
191 |
-
optim = "
|
192 |
weight_decay=0.01
|
193 |
|
194 |
################################################################################
|
195 |
# SFT parameters
|
196 |
################################################################################
|
197 |
-
max_seq_length=
|
198 |
packing=False
|
199 |
```
|
200 |
|
|
|
155 |
|
156 |
### Training Procedure
|
157 |
|
158 |
+
The model training used LoRA for computational efficiency. 0.04 billion parameters(0.51% of total parameters) were trained.
|
159 |
|
160 |
#### Training Hyperparameters
|
161 |
|
|
|
173 |
################################################################################
|
174 |
task_type="CAUSAL_LM"
|
175 |
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
|
176 |
+
r=16
|
177 |
+
lora_alpha=32
|
178 |
lora_dropout=0.1
|
179 |
bias="none"
|
180 |
|
|
|
182 |
# TrainingArguments parameters
|
183 |
################################################################################
|
184 |
num_train_epochs=1
|
185 |
+
per_device_train_batch_size=1
|
186 |
gradient_accumulation_steps=1
|
187 |
gradient_checkpointing=True
|
188 |
learning_rate=2e-5
|
189 |
lr_scheduler_type="cosine"
|
190 |
warmup_ratio=0.1
|
191 |
+
optim = "paged_adamw_32bit"
|
192 |
weight_decay=0.01
|
193 |
|
194 |
################################################################################
|
195 |
# SFT parameters
|
196 |
################################################################################
|
197 |
+
max_seq_length=2048
|
198 |
packing=False
|
199 |
```
|
200 |
|