[Cache Request] meta-llama/Meta-Llama-3-70B-Instruct #130
opened by Taper5749
Please add the following model to the neuron cache.
This model is already cached, but perhaps not for the configuration you selected.
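If it helps, you can first list which configurations are already cached for this model and compare them against the settings you need; a minimal check, assuming optimum-neuron is installed locally:

optimum-cli neuron cache lookup meta-llama/Meta-Llama-3-70B-Instruct

This prints the cached combinations (batch size, sequence length, number of cores, precision), so you can match the environment variables in the snippet below against an existing entry.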
For instance, you can deploy it on SageMaker using the following code snippet:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]
# Hub Model configuration. https://huggingface.co/models
hub = {
    "HF_MODEL_ID": "meta-llama/Meta-Llama-3-70B-Instruct",
    "HF_NUM_CORES": "24",
    "HF_AUTO_CAST_TYPE": "fp16",
    "MAX_BATCH_SIZE": "1",
    "MAX_INPUT_LENGTH": "7373",
    # MAX_TOTAL_TOKENS must be strictly greater than MAX_INPUT_LENGTH;
    # Llama 3 supports an 8192-token context.
    "MAX_TOTAL_TOKENS": "8192",
    "HF_TOKEN": "<REPLACE WITH YOUR TOKEN>",
}
assert hub["HF_TOKEN"] != "<REPLACE WITH YOUR TOKEN>", "Please replace '<REPLACE WITH YOUR TOKEN>' with your Hugging Face Hub API token"
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    image_uri=get_huggingface_llm_image_uri("huggingface-neuronx", version="0.0.23"),
    env=hub,
    role=role,
)
# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.inf2.48xlarge",  # 24 Neuron cores, matching HF_NUM_CORES above
    container_startup_health_check_timeout=3600,
    volume_size=512,
)
# send request
predictor.predict(
    {
        "inputs": "What is the capital of France?",
        "parameters": {
            "do_sample": True,
            "max_new_tokens": 128,
            "temperature": 0.7,
            "top_k": 50,
            "top_p": 0.95,
        },
    }
)
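When you are done experimenting, remember to tear the endpoint down, since the ml.inf2.48xlarge instance keeps billing while it runs; a minimal cleanup sketch using the same predictor object:

# clean up: delete the model and the endpoint to stop incurring charges
predictor.delete_model()
predictor.delete_endpoint()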