SageMaker deployment
#14 opened by vibranium
Could someone help me deploy this model on SageMaker? I'm hitting all kinds of errors. Here is what I have tried:
```python
import json

import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Assumes this runs where a SageMaker execution role is available.
role = sagemaker.get_execution_role()


def deploy_mixtral():
    instance_type = "ml.g5.12xlarge"
    number_of_gpu = 1
    health_check_timeout = 300

    # Model and endpoint configuration parameters
    config = {
        "HF_MODEL_ID": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
        "SM_NUM_GPUS": json.dumps(number_of_gpu),
        "HF_TASK": "text-generation",
        "HF_MODEL_QUANTIZE": "gptq",
    }

    llm_model = HuggingFaceModel(
        role=role,
        image_uri=get_huggingface_llm_image_uri("huggingface"),
        name="mistral",
        env=config,
        model_server_workers=number_of_gpu,
    )

    llm = llm_model.deploy(
        initial_instance_count=1,
        instance_type=instance_type,
        endpoint_name="mistral",
        container_startup_health_check_timeout=health_check_timeout,
    )
    print(llm)
```
The container fails during startup. From the CloudWatch logs:

```
/opt/conda/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:602: UserWarning: You are using a Backend <class 'text_generation_server.utils.dist.FakeGroup'> as a ProcessGroup. This usage is deprecated since PyTorch 2.0. Please use a public API of PyTorch Distributed instead.
  warnings.warn(
Traceback (most recent call last):
  File "/opt/conda/bin/text-generation-server", line 8, in <module>
    sys.exit(app())
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/cli.py", line 89, in serve
    server.serve(
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 228, in serve
    asyncio.run(
  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 174, in serve_inner
    model = get_model(
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/__init__.py", line 310, in get_model
    return FlashMixtral(
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_mixtral.py", line 21, in __init__
    super(FlashMixtral, self).__init__(
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_mistral.py", line 333, in __init__
    model = model_cls(config, weights)
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 820, in __init__
    self.model = MixtralModel(config, weights)
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 757, in __init__
    [
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 758, in <listcomp>
    MixtralLayer(
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 692, in __init__
    self.self_attn = MixtralAttention(
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 232, in __init__
    self.query_key_value = load_attention(config, prefix, weights)
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 125, in load_attention
    return _load_gqa(config, prefix, weights)
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 158, in _load_gqa
    get_linear(weight, bias=None, quantize=config.quantize)
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/layers.py", line 330, in get_linear
    linear = ExllamaQuantLinear(
  File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/gptq/exllamav2.py", line 145, in __init__
    assert qzeros.shape == (
AssertionError
rank=0
2024-02-06T16:43:07.427575Z ERROR text_generation_launcher: Shard 0 failed to start
2024-02-06T16:43:07.427609Z  INFO text_generation_launcher: Shutting down shards
Error: ShardCannotStart
```
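For reference, this is how I intend to invoke the endpoint once it comes up healthy (a sketch assuming the standard TGI request/response schema and the `llm` predictor returned by `deploy()` above):

```python
# Sketch of the planned invocation, assuming the endpoint eventually
# starts and follows the standard TGI request/response schema.
# The prompt uses Mixtral-Instruct's [INST] ... [/INST] format.
payload = {
    "inputs": "[INST] What is Amazon SageMaker? [/INST]",
    "parameters": {"max_new_tokens": 256, "temperature": 0.7},
}
response = llm.predict(payload)  # llm is the predictor from deploy()
print(response[0]["generated_text"])
```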
@vibranium were you able to figure this out? I am also having issues deploying to a SageMaker endpoint.
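One thing I am going to try is pinning the TGI container version instead of taking the default, and matching `SM_NUM_GPUS` to the instance's GPU count, since ml.g5.12xlarge has 4 GPUs. A sketch below; the `version` string is an assumption on my part, not a confirmed fix for the `qzeros` assertion:

```python
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Pin the TGI image explicitly; "1.4.0" is an assumed version string --
# substitute a release known to support GPTQ-quantized Mixtral.
image_uri = get_huggingface_llm_image_uri("huggingface", version="1.4.0")

config = {
    "HF_MODEL_ID": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
    "SM_NUM_GPUS": "4",  # ml.g5.12xlarge exposes 4 A10G GPUs
    "HF_TASK": "text-generation",
    "HF_MODEL_QUANTIZE": "gptq",
}
```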