Will it be possible to run this on a PC with 8 GeForce RTX 3060 cards with 8 GB of VRAM each?

#11
by ai2p - opened

Can it correctly spread the required VRAM across many GPU cards? Or does all of the required VRAM need to be on a single video card?

Together org

@ai2p Sure you can! Here is an example to load model across multiple devices (need to install accelerate first):

from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from accelerate.utils import get_balanced_memory, infer_auto_device_map
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
import torch

def load_model(model_name):
    """Load a causal LM from the Hub, sharded across all available devices.

    Initializes the model on the "meta" device (no weight allocation),
    computes a balanced per-device memory budget assuming fp16 weights,
    then loads and dispatches the checkpoint according to that plan.

    Args:
        model_name: Hugging Face Hub repo id (or local path) of the model.

    Returns:
        The model with its weights dispatched across the available devices.
    """
    # Download the checkpoint files (or reuse the local HF cache).
    weights_path = snapshot_download(model_name)

    config = AutoConfig.from_pretrained(model_name)

    # Build the model skeleton with meta tensors: no memory is actually
    # allocated for the weights yet.
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)

    # Per-device memory budget, computed assuming fp16 weights.
    # GPTNeoXLayer blocks must stay whole on one device, so layer
    # boundaries are the only allowed split points.
    max_memory = get_balanced_memory(
        model,
        max_memory=None,
        no_split_module_classes=["GPTNeoXLayer"],
        dtype='float16',
        low_zero=False,
    )

    device_map = infer_auto_device_map(
        model,
        max_memory=max_memory,
        no_split_module_classes=["GPTNeoXLayer"],
        dtype='float16'
    )

    # Fix: pass dtype=torch.float16 here as well. The memory plan above
    # assumes fp16, but without an explicit dtype the checkpoint is loaded
    # in its stored precision (fp32), roughly doubling VRAM use and
    # overflowing the planned per-device budgets.
    model = load_checkpoint_and_dispatch(
        model,
        weights_path,
        device_map=device_map,
        no_split_module_classes=["GPTNeoXLayer"],
        dtype=torch.float16,
    )

    return model

# 20B-parameter chat model; loading it downloads ~40 GB of checkpoint
# files on first run and then dispatches the weights across all GPUs.
model_name = 'togethercomputer/GPT-NeoXT-Chat-Base-20B'
model = load_model(model_name)

Sign up or log in to comment