Model Card for Llama3-8B-DPO

License: Apache-2.0
Datasets: CultriX/llama70B-dpo-dataset
Language: English
Base Model: NousResearch/Hermes-3-Llama-3.1-8B
Pipeline Tag: Text-Generation
Tags: DPO, Llama3, General
Library: Transformers

Performance

Model Name	AGIEval	TruthfulQA	BigBench
Hermes-3-Llama-3.1-8B	41.51	58.61	43.08
Llama3-8B-DPO	41.87	71.38	44.5

Training Script

# Install required libraries
# NOTE: the leading "!" is notebook shell-escape syntax, so these lines are
# meant to be run in a notebook cell; they are not valid plain Python.
!pip install --upgrade pip
# transformers, peft and trl are installed straight from their GitHub
# repositories (no tag or version pin), not from a pinned release.
!pip install git+https://github.com/huggingface/transformers
!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/huggingface/trl.git
!pip install --upgrade wandb accelerate datasets

import os
import gc
import torch
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from trl import DPOTrainer, DPOConfig
from huggingface_hub import notebook_login

# Log in to Hugging Face and WandB
# For each service: use the token from the environment when one is set,
# otherwise fall back to the interactive login flow.
from huggingface_hub import login

hf_token = os.getenv('HF_TOKEN')
if hf_token:
    # notebook_login() takes no `token` argument; login() is the call that
    # accepts a token for non-interactive authentication.
    login(token=hf_token)
else:
    notebook_login()

wb_token = os.getenv('WANDB_API_KEY')
if wb_token:
    wandb.login(key=wb_token)
else:
    wandb.login()

# Set model names
model_name = "NousResearch/Hermes-3-Llama-3.1-8B"
# Alias used by the loading code below.
base_model_name = model_name
# Name of the resulting fine-tune; matches the model card title
# (this script trains with DPO, not ORPO).
fine_tuned_model_name = "Llama3-8B-DPO"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in bfloat16 and let device_map="auto" decide placement.
model_load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_load_kwargs)
# Turn off `use_cache` on the model config before training.
model.config.use_cache = False

# Apply LoRA for fine-tuning
# Adapters are attached to the q/k/v/o projection modules only.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
# Wrap the base model with the adapters, then enable gradient checkpointing
# on the wrapped model.
model = get_peft_model(model, peft_config)
model.gradient_checkpointing_enable()

# Load and format dataset
# Only the "train" split of the preference dataset is used.
dataset_splits = load_dataset("CultriX/llama70B-dpo-dataset")
dataset = dataset_splits["train"]

def chatml_format(example):
    """Convert one preference record into ChatML-formatted DPO fields.

    Reads the ``system``, ``question``, ``chosen`` and ``rejected`` keys of
    *example* (each defaulting to an empty string when missing) and returns a
    dict with ``prompt``, ``chosen`` and ``rejected`` strings. The system
    turn is emitted only when the system text is truthy, and the prompt
    always ends with an open assistant turn. Both completions are terminated
    with ``<|im_end|>`` followed by a newline.
    """
    system_text = example.get("system", "")
    system_turn = (
        f"<|im_start|>system\n{system_text}<|im_end|>\n" if system_text else ""
    )
    user_turn = f"<|im_start|>user\n{example.get('question', '')}<|im_end|>\n"

    formatted = {"prompt": f"{system_turn}{user_turn}<|im_start|>assistant\n"}
    for field in ("chosen", "rejected"):
        formatted[field] = f"{example.get(field, '')}<|im_end|>\n"
    return formatted

# Apply chatml_format to every record and drop all of the original columns,
# leaving only the fields that chatml_format returns.
raw_columns = dataset.column_names
dataset = dataset.map(chatml_format, remove_columns=raw_columns)

# Fine-tune the model using DPO Trainer
# No intermediate checkpoints are written (save_strategy="no"); the final
# weights are saved explicitly after training.
training_args = DPOConfig(
    output_dir="model-output",
    logging_steps=50,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=4,
    save_strategy="no",
    optim="adamw_torch",
    warmup_ratio=0.03,
    bf16=True,
    report_to="wandb",
    beta=0.1,
    max_prompt_length=2048,
    max_length=4096,
    disable_dropout=False,
    # The policy is a PEFT model and an explicit reference model is passed
    # below, so the trainer must be told to accept both.
    force_use_ref_model=True,
)

# Separate, untouched copy of the base model used as the DPO reference.
ref_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
)

trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    # TRL is installed from git HEAD above; current DPOTrainer takes the
    # tokenizer through `processing_class` (the old `tokenizer` keyword was
    # deprecated and then removed).
    processing_class=tokenizer,
    train_dataset=dataset,
)
trainer.train()

# Save fine-tuned model
# The trained model and then the tokenizer are written to the same directory.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained("final_ckpt")

# Test the fine-tuned model
from transformers import pipeline

# Reload the checkpoint saved above and wrap it in a text-generation pipeline.
fine_tuned_model = AutoModelForCausalLM.from_pretrained("final_ckpt", torch_dtype=torch.bfloat16)
text_gen_pipeline = pipeline(
    "text-generation",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    max_length=4096,
)

messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant chatbot that provides concise answers.",
    },
    {
        "role": "user",
        "content": "What are GPUs and why would I use them for machine learning tasks?",
    },
]
# Render the conversation in the same ChatML layout used for training.
prompt = "".join(f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n" for msg in messages)
# Open the assistant turn so the model answers as the assistant; every
# training prompt built by chatml_format ends with this same marker.
prompt += "<|im_start|>assistant\n"

sequences = text_gen_pipeline(prompt, do_sample=True, temperature=0.7, top_p=0.9)
print(sequences[0]["generated_text"])

CultriX
/

Llama3-8B-DPO

Model Card for Llama3-8B-DPO

Performance

Training Script

Model tree for CultriX/Llama3-8B-DPO

Dataset used to train CultriX/Llama3-8B-DPO