BeaverAI
/

mistral-dory-12b

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

mistral-dory-12b / axolotl_config.yaml

Fizzarolli's picture

Create axolotl_config.yaml

3f5a614 verified 4 months ago

history blame contribute delete

2.46 kB

	# Weights & Biases logging: project and run name are set; the entity,
	# watch, and log-model options are explicitly left unset.
	wandb_project: nemo-instruct-tune
	wandb_name: v1
	wandb_entity: null
	wandb_watch: null
	wandb_log_model: null

	# Base checkpoint, model/tokenizer types, and the chat template name
	base_model: 'mistralai/Mistral-Nemo-Base-2407'
	tokenizer_type: AutoTokenizer
	model_type: AutoModelForCausalLM
	chat_template: alpaca

	# Hugging Face Hub settings: target repo and hub strategy are set; the
	# dataset-push and auth-token options are explicitly left unset.
	hub_model_id: 'Fizzarolli/nemo-instruct-tune-v1'
	hub_strategy: all_checkpoints
	hf_use_auth_token: null
	push_dataset_to_hub: null

	# Checkpointing: output directory, save cadence, and format.
	# save_steps and resume_from_checkpoint are explicitly left unset.
	output_dir: ./lora-out
	save_safetensors: true
	saves_per_epoch: 10
	save_total_limit: 2
	save_steps: null
	resume_from_checkpoint: null

	# Precision flags: bf16 is the only one switched on
	tf32: false
	fp16: false
	bf16: true

	# Loading flags: 4-bit on, 8-bit off, strict off
	load_in_4bit: true
	load_in_8bit: false
	strict: false

	# Sequence length plus the packing, padding, and grouping flags
	sequence_len: 16384
	pad_to_sequence_len: true
	sample_packing: true
	eval_sample_packing: true
	group_by_length: false
	train_on_inputs: false
	s2_attention: false

	# Adapter settings (adapter type is qlora, with DoRA switched on).
	# Values that are actually set come first.
	adapter: qlora
	lora_r: 64
	lora_alpha: 64
	lora_dropout: 0.125
	peft_use_dora: true
	lora_target_modules:
	- gate_proj
	- down_proj
	- up_proj
	- q_proj
	- v_proj
	- k_proj
	- o_proj
	# Options explicitly left unset.
	lora_model_dir: null
	lora_fan_in_fan_out: null
	lora_target_linear: null
	save_embedding_layers: null
	peft_layers_to_transform: null
	peft_use_rslora: null
	peft_layer_replication: null
	lora_modules_to_save: null

	# Parameters to unfreeze for FFT: none listed (explicitly unset)
	unfrozen_parameters: null

	# Dataset config
	# `type` is a field of the dataset entry that starts with `- path:`, so
	# it is indented beneath that list item. Left at the same level as
	# `datasets:` it would be read as a separate top-level `type` key and the
	# entry would carry only a path.
	datasets:
	- path: BeaverAI/Nemo-Inst-Tune-ds
	  type: chat_template
	# Evaluation split size and cadence; evaluation_strategy, eval_steps and
	# test_datasets are left unset.
	val_set_size: 0.05
	evaluation_strategy:
	eval_steps:
	evals_per_epoch: 20
	test_datasets:
	dataset_prepared_path: ./prepared-datasets
	shuffle_merged_datasets: true

	# Training hyperparameters, grouped by concern.
	# Epochs and batch sizing
	num_epochs: 1
	micro_batch_size: 1
	eval_batch_size: 1
	gradient_accumulation_steps: 2
	# Optimizer and regularization values
	optimizer: paged_adamw_8bit
	learning_rate: 0.000007
	weight_decay: 0.1
	max_grad_norm: 1
	loraplus_lr_ratio: 8
	loraplus_lr_embedding: null
	# Learning-rate schedule
	lr_scheduler: cosine
	cosine_min_lr_ratio: 0.1
	warmup_steps: 25
	# Logging cadence
	logging_steps: 1

	# Optimization switches.
	# Attention backends: flash attention is the only one enabled
	flash_attention: true
	xformers_attention: false
	sdp_attention: false
	# Gradient checkpointing is set to the "unsloth" mode; the remaining
	# unsloth_* flags are all off
	gradient_checkpointing: unsloth
	unsloth_cross_entropy_loss: false
	unsloth_lora_mlp: false
	unsloth_lora_qkv: false
	unsloth_lora_o: false

	# Loss monitoring config
	# early_stopping_patience is documented in the axolotl config reference
	# as an integer count, so "not used" is written as null instead of the
	# boolean `false`.
	# NOTE(review): confirm no tooling relied on the literal `false`.
	early_stopping_patience: null
	loss_watchdog_threshold: 100.0
	loss_watchdog_patience: 3

	# Debug flag (on) and the random seed
	seed: 42
	debug: true

	# DeepSpeed config path is set; both FSDP keys are explicitly unset
	deepspeed: deepspeed_configs/zero2.json
	fsdp: null
	fsdp_config: null

	# Token config
	# pad_token is an entry of the special_tokens mapping, so it is indented
	# beneath it. Left at the same level, special_tokens would be null and
	# pad_token would be read as an unrelated top-level key.
	special_tokens:
	  pad_token: "<pad>"
	# Extra tokens to add: none listed (left unset).
	tokens: