How to load on a multi-GPU instance?
#19 opened by aastha6
import os, torch, gc
from mistral_inference.model import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
torch.cuda.empty_cache()
gc.collect()
# 1. load tokenizer
model_folder_path = "/home/ubuntu/mistral_models/codestral-22b-v0x1"
# model_folder_path = "/home/ubuntu/mistral_models/mistral-7b-instruct-v0x3"
mistral_tokenizer = MistralTokenizer.from_file(model_folder_path+"/tokenizer.model.v3")
# 2. chat completion request
user_question = "Explain Travelling Salesman Problem in a nutshell."
completion_request = ChatCompletionRequest(
    messages=[UserMessage(content=user_question)]
)
# 3. encode message
tokens = mistral_tokenizer.encode_chat_completion(completion_request).tokens
# 4. load model
model = Transformer.from_folder(
    model_folder_path,
    # num_pipeline_ranks=4,
    # device="cuda",
    dtype=torch.bfloat16
)
# 5. generate results
out_tokens, _ = generate(
    [tokens],
    model,
    max_tokens=64,
    temperature=0.0,
    eos_id=mistral_tokenizer.instruct_tokenizer.tokenizer.eos_id
)
# 6. decode generated tokens
result = mistral_tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
print(result)
I'm using an A10G instance with 4 GPUs of 24 GB each.
Error: OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU
I tried setting num_pipeline_ranks=4, but then I hit a different error: ValueError: Default process group has not been initialized, please make sure to call init_process_group.
Any suggestions would be really helpful!
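For context, 22B parameters in bfloat16 come to roughly 44 GB of weights alone, so the model cannot fit on a single 24 GB A10G and has to be sharded across the 4 GPUs. The num_pipeline_ranks path in mistral_inference builds on torch.distributed, which is why it complains about the missing process group when the script is launched with plain python. Below is a minimal sketch of one way to address that, assuming a torchrun launch; the script name, environment variables, and NCCL backend choice are assumptions based on how torchrun is normally used, not something confirmed in this thread.

# Sketch only: launch with `torchrun --nproc-per-node 4 load_codestral.py`
# (load_codestral.py is a hypothetical name). torchrun sets the
# RANK / LOCAL_RANK / WORLD_SIZE environment variables for each process.
import os
import torch
import torch.distributed as dist
from mistral_inference.model import Transformer

if not dist.is_initialized():
    dist.init_process_group(backend="nccl")  # NCCL backend for GPU-to-GPU communication

local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)  # pin each rank to its own GPU

model_folder_path = "/home/ubuntu/mistral_models/codestral-22b-v0x1"
model = Transformer.from_folder(
    model_folder_path,
    num_pipeline_ranks=dist.get_world_size(),  # 4 processes -> 4 pipeline stages
    dtype=torch.bfloat16,
)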
I would recommend launching with vLLM; simply set --tensor-parallel-size 4
for 4 GPUs, and set CUDA_VISIBLE_DEVICES=0,1,2,3. https://docs.vllm.ai/en/stable/
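For example, here is a minimal offline-inference sketch with vLLM's Python API; the Hugging Face model ID and the bfloat16 dtype are assumptions, and a local path to the weights works in place of the ID.

# Sketch: tensor-parallel inference across 4 GPUs with vLLM.
from vllm import LLM, SamplingParams

llm = LLM(
    model="mistralai/Codestral-22B-v0.1",  # assumed HF repo; a local folder path also works
    tensor_parallel_size=4,                # shard weights and compute across 4 GPUs
    dtype="bfloat16",
)
params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["Explain Travelling Salesman Problem in a nutshell."], params)
print(outputs[0].outputs[0].text)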
How does Transformer.from_folder support multi-GPU inference? And could we use the fill-in-the-middle (FIM) function if we use vLLM? I need code completion.
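Regarding FIM: Codestral is trained for fill-in-the-middle, and mistral_common exposes a FIMRequest that MistralTokenizer can encode. A minimal sketch with mistral_inference follows; the prefix/suffix strings are made up for illustration, and the FIMRequest import path is my best understanding of the mistral_common layout rather than something quoted from its docs.

# Sketch: fill-in-the-middle completion with mistral_common + mistral_inference.
import torch
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.fim.request import FIMRequest
from mistral_inference.model import Transformer
from mistral_inference.generate import generate

model_folder_path = "/home/ubuntu/mistral_models/codestral-22b-v0x1"
tokenizer = MistralTokenizer.from_file(model_folder_path + "/tokenizer.model.v3")
model = Transformer.from_folder(model_folder_path, dtype=torch.bfloat16)

prefix = "def fibonacci(n: int):"  # code before the gap (illustrative)
suffix = "    return result"       # code after the gap (illustrative)

# encode_fim wraps the prefix/suffix in Codestral's FIM control tokens
tokens = tokenizer.encode_fim(FIMRequest(prompt=prefix, suffix=suffix)).tokens

out_tokens, _ = generate(
    [tokens],
    model,
    max_tokens=128,
    temperature=0.0,
    eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id,
)
print(tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0]))

Since vLLM accepts pre-tokenized prompts, the same token ids could in principle be fed to vLLM for FIM-style completion, though I have not verified that path here.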
The same goes for me; did you solve it? @phbll