---
license: mit
license_link: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/LICENSE
language:
- en
pipeline_tag: text-generation
tags:
- nlp
- code
widget:
- messages:
  - role: user
    content: Can you provide ways to eat combinations of bananas and dragonfruits?
---

<h1 align="center">Phi-3 Mini-128K-ChatQA-v0.1</h1>

Phi-3 Mini-128K-ChatQA-v0.1 is an experimental LoRA fine-tune of the [Phi-3 Mini-128K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) model on a subset of the [nvidia/ChatQA-Training-Data](https://huggingface.co/datasets/nvidia/ChatQA-Training-Data) dataset. The goal is a low-parameter model that is usable in RAG (retrieval-augmented generation) applications.
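
For illustration, here is a minimal inference sketch with [mlx-lm](https://github.com/ml-explore/mlx-examples/tree/main/llms/mlx_lm). It assumes a recent mlx-lm release with Phi-3 support and reuses the `model` and `adapter_path` values from the training config below; the context-plus-question prompt is only an example of RAG-style usage, not necessarily the exact format seen during training.

```python
from mlx_lm import load, generate

# Same model path and adapter_path as in the training config below.
model, tokenizer = load(
    "phi-3-mini-128k-instruct",
    adapter_path="adapters/phi-3-mini-128k-chatqa",
)

# RAG-style input: a retrieved passage followed by the user's question.
context = "Dragon fruit has a mild, slightly sweet taste and pairs well with other tropical fruit."
question = "Can you provide ways to eat combinations of bananas and dragonfruits?"

messages = [{"role": "user", "content": f"{context}\n\n{question}"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

print(generate(model, tokenizer, prompt=prompt, max_tokens=256))
```

Calling `load` without `adapter_path` returns the base Instruct model, which makes it easy to compare answers with and without the fine-tune.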

Below is the `config.yaml` used to train the model (training was run with [mlx-lm](https://github.com/ml-explore/mlx-examples/tree/main/llms/mlx_lm)):

```yaml
# The path to the local model directory or Hugging Face repo.
model: "phi-3-mini-128k-instruct"

# Whether or not to train (boolean)
train: true

# Directory with {train, valid, test}.jsonl files
data: "data"

# The PRNG seed
seed: 31

# Number of layers to fine-tune
lora_layers: 32

# Minibatch size.
batch_size: 2

# Iterations to train for.
iters: 8

# Number of validation batches, -1 uses the entire validation set.
val_batches: 25

# Adam learning rate.
learning_rate: 2e-5

# Number of training steps between loss reporting.
steps_per_report: 2

# Number of training steps between validations.
steps_per_eval: 2

# Load path to resume training with the given adapter weights.
resume_adapter_file: null

# Save/load path for the trained adapter weights.
adapter_path: "adapters/phi-3-mini-128k-chatqa"

# Save the model every N iterations.
save_every: 8

# Evaluate on the test set after training
test: true

# Number of test set batches, -1 uses the entire test set.
test_batches: 32

# Maximum sequence length.
max_seq_length: 131072

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: false

# Use DoRA instead of LoRA.
use_dora: false

# LoRA parameters can only be specified in a config file
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied for the last lora_layers
  keys: ["self_attn.q_proj", "self_attn.v_proj", "self_attn.k_proj", "mlp.down_proj", "mlp.gate_up_proj"]
  rank: 256
  alpha: 16.0
  scale: 10.0
  dropout: 0.0

# Schedule can only be specified in a config file, uncomment to use.
#lr_schedule:
#  name: cosine_decay
#  warmup: 100 # 0 for no warmup
#  warmup_init: 1e-7 # 0 if not specified
#  arguments: [1e-5, 1000, 1e-7] # passed to scheduler
```
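
For reference, the options above follow mlx-lm's example LoRA configuration; a comparable run can typically be launched by pointing the mlx-lm LoRA entry point at this file (for example `python -m mlx_lm.lora --config config.yaml`), though exact flag names may vary between mlx-lm releases.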