Will it be possible to run this on a PC with 8 GeForce RTX 3060 cards with 8 GB VRAM each?
#11 opened by ai2p
Can it correctly split the required VRAM across multiple GPU cards, or does all of the required VRAM need to fit on a single card?
Yes
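For a quick sanity check, a minimal sketch (assuming `accelerate` is installed): `transformers` can shard the weights across all visible GPUs on its own via `device_map="auto"`:

import torch
from transformers import AutoModelForCausalLM

# device_map="auto" lets accelerate place layers on every visible GPU,
# spilling to CPU RAM only if the combined VRAM is not enough.
model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/GPT-NeoXT-Chat-Base-20B",
    device_map="auto",
    torch_dtype=torch.float16,
)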
@ai2p
Sure you can! Here is an example that loads the model across multiple devices (you need to install accelerate first):
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from accelerate.utils import get_balanced_memory, infer_auto_device_map
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
import torch


def load_model(model_name):
    # Download the checkpoint files and read the model config.
    weights_path = snapshot_download(model_name)
    config = AutoConfig.from_pretrained(model_name)

    # This inits the model with meta tensors, which allocates no real memory yet.
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)

    # Work out a per-device memory budget that balances the layers across all GPUs.
    max_memory = get_balanced_memory(
        model,
        max_memory=None,
        no_split_module_classes=["GPTNeoXLayer"],
        dtype="float16",
        low_zero=False,
    )

    # Assign each submodule to a device; a GPTNeoXLayer is never split across devices.
    device_map = infer_auto_device_map(
        model,
        max_memory=max_memory,
        no_split_module_classes=["GPTNeoXLayer"],
        dtype="float16",
    )

    # Load the real weights and dispatch them according to the device map.
    model = load_checkpoint_and_dispatch(
        model, weights_path, device_map=device_map, no_split_module_classes=["GPTNeoXLayer"]
    )
    return model


model_name = "togethercomputer/GPT-NeoXT-Chat-Base-20B"
model = load_model(model_name)
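Once loaded, the dispatched model can be used like any other; a short usage sketch (the `<human>:`/`<bot>:` prompt format follows the model card, and the generation settings are placeholders):

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Inputs go to the first device (cuda:0); accelerate's hooks move
# activations between GPUs automatically during the forward pass.
inputs = tokenizer("<human>: Hello!\n<bot>:", return_tensors="pt").to(0)
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.7)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))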