404 Client Error
I am getting the following error when trying to use this model. That said, if there is a simpler way, I am open to it: I just want to use a Phi-3 model with quantization, because I am hitting OOM errors on my 16 GB V100.
Runtime: Python 3.10.14 (main, Mar 21 2024, 16:24:04) [GCC 11.2.0]
GPU: NVIDIA V100
Optimum 1.20.0
Transformers 4.41.2
This is the code snippet that I have:
import torch
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

torch.random.manual_seed(0)

model = ORTModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct-onnx",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    quantization_config=quantization_config,  # defined elsewhere (not shown)
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct-onnx")
This is the error I am getting:
The ONNX file phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx is not a regular name used in optimum.onnxruntime that are ['model.onnx', 'model_quantized.onnx', 'model_optimized.onnx', 'decoder_with_past_model.onnx', 'decoder_with_past_model_quantized.onnx', 'decoder_with_past_model_optimized.onnx'], the ORTModelForCausalLM might not behave as expected.
HTTPError Traceback (most recent call last)
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:304, in hf_raise_for_status(response, endpoint_name)
303 try:
--> 304 response.raise_for_status()
305 except HTTPError as e:
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/requests/models.py:1024, in Response.raise_for_status(self)
1023 if http_error_msg:
-> 1024 raise HTTPError(http_error_msg, response=self)
HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx/resolve/main/phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx
The above exception was the direct cause of the following exception:
EntryNotFoundError Traceback (most recent call last)
Cell In[19], line 6
2 from optimum.onnxruntime import ORTModelForCausalLM
4 torch.random.manual_seed(0)
----> 6 model = ORTModelForCausalLM.from_pretrained(
7 "microsoft/Phi-3-mini-128k-instruct-onnx",
8 device_map="cuda",
9 torch_dtype="auto",
10 #quantization_config=quantization_config
11 )
12 tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct-onnx")
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/onnxruntime/modeling_ort.py:669, in ORTModel.from_pretrained(cls, model_id, export, force_download, use_auth_token, cache_dir, subfolder, config, local_files_only, provider, session_options, provider_options, use_io_binding, **kwargs)
620 @classmethod
621 @add_start_docstrings(FROM_PRETRAINED_START_DOCSTRING)
622 def from_pretrained(
(...)
636 **kwargs,
637 ):
638 """
639 provider (str, defaults to "CPUExecutionProvider"):
640 ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ for
(...)
667 ORTModel: The loaded ORTModel model.
668 """
--> 669 return super().from_pretrained(
670 model_id,
671 export=export,
672 force_download=force_download,
673 use_auth_token=use_auth_token,
674 cache_dir=cache_dir,
675 subfolder=subfolder,
676 config=config,
677 local_files_only=local_files_only,
678 provider=provider,
679 session_options=session_options,
680 provider_options=provider_options,
681 use_io_binding=use_io_binding,
682 **kwargs,
683 )
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/modeling_base.py:402, in OptimizedModel.from_pretrained(cls, model_id, export, force_download, use_auth_token, cache_dir, subfolder, config, local_files_only, trust_remote_code, revision, **kwargs)
398 trust_remote_code = False
400 from_pretrained_method = cls._from_transformers if export else cls._from_pretrained
--> 402 return from_pretrained_method(
403 model_id=model_id,
404 config=config,
405 revision=revision,
406 cache_dir=cache_dir,
407 force_download=force_download,
408 use_auth_token=use_auth_token,
409 subfolder=subfolder,
410 local_files_only=local_files_only,
411 trust_remote_code=trust_remote_code,
412 **kwargs,
413 )
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/onnxruntime/modeling_decoder.py:495, in ORTModelForCausalLM._from_pretrained(cls, model_id, config, use_auth_token, revision, force_download, cache_dir, file_name, subfolder, use_cache, local_files_only, use_merged, provider, session_options, provider_options, use_io_binding, model_save_dir, **kwargs)
489 if file_name not in regular_file_names:
490 logger.warning(
491 f"The ONNX file {file_name} is not a regular name used in optimum.onnxruntime that are {regular_file_names}, the "
492 f"{cls.name} might not behave as expected."
493 )
--> 495 model_cache_path, preprocessors = cls._cached_file(
496 model_path=model_path,
497 use_auth_token=use_auth_token,
498 revision=revision,
499 force_download=force_download,
500 cache_dir=cache_dir,
501 file_name=file_name,
502 subfolder=subfolder,
503 local_files_only=local_files_only,
504 )
505 new_model_save_dir = model_cache_path.parent
507 # model_save_dir can be provided in kwargs as a TemporaryDirectory instance, in which case we want to keep it
508 # instead of the path only.
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/onnxruntime/modeling_ort.py:873, in ORTModel._cached_file(model_path, use_auth_token, revision, force_download, cache_dir, file_name, subfolder, local_files_only)
871 preprocessors = maybe_load_preprocessors(model_path.as_posix())
872 else:
--> 873 model_cache_path = hf_hub_download(
874 repo_id=model_path.as_posix(),
875 filename=file_name,
876 subfolder=subfolder,
877 use_auth_token=use_auth_token,
878 revision=revision,
879 cache_dir=cache_dir,
880 force_download=force_download,
881 local_files_only=local_files_only,
882 )
883 # try download external data
884 try:
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1221, in hf_hub_download(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, user_agent, force_download, proxies, etag_timeout, token, local_files_only, headers, endpoint, legacy_cache_layout, resume_download, force_filename, local_dir_use_symlinks)
1202 return _hf_hub_download_to_local_dir(
1203 # Destination
1204 local_dir=local_dir,
(...)
1218 local_files_only=local_files_only,
1219 )
1220 else:
-> 1221 return _hf_hub_download_to_cache_dir(
1222 # Destination
1223 cache_dir=cache_dir,
1224 # File info
1225 repo_id=repo_id,
1226 filename=filename,
1227 repo_type=repo_type,
1228 revision=revision,
1229 # HTTP info
1230 headers=headers,
1231 proxies=proxies,
1232 etag_timeout=etag_timeout,
1233 endpoint=endpoint,
1234 # Additional options
1235 local_files_only=local_files_only,
1236 force_download=force_download,
1237 )
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1282, in _hf_hub_download_to_cache_dir(cache_dir, repo_id, filename, repo_type, revision, headers, proxies, etag_timeout, endpoint, local_files_only, force_download)
1278 return pointer_path
1280 # Try to get metadata (etag, commit_hash, url, size) from the server.
1281 # If we can't, a HEAD request error is returned.
-> 1282 (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
1283 repo_id=repo_id,
1284 filename=filename,
1285 repo_type=repo_type,
1286 revision=revision,
1287 endpoint=endpoint,
1288 proxies=proxies,
1289 etag_timeout=etag_timeout,
1290 headers=headers,
1291 local_files_only=local_files_only,
1292 storage_folder=storage_folder,
1293 relative_filename=relative_filename,
1294 )
1296 # etag can be None for several reasons:
1297 # 1. we passed local_files_only.
1298 # 2. we don't have a connection
(...)
1304 # If the specified revision is a commit hash, look inside "snapshots".
1305 # If the specified revision is a branch or tag, look inside "refs".
1306 if head_call_error is not None:
1307 # Couldn't make a HEAD call => let's try to find a local file
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1722, in _get_metadata_or_catch_error(repo_id, filename, repo_type, revision, endpoint, proxies, etag_timeout, headers, local_files_only, relative_filename, storage_folder)
1720 try:
1721 try:
-> 1722 metadata = get_hf_file_metadata(url=url, proxies=proxies, timeout=etag_timeout, headers=headers)
1723 except EntryNotFoundError as http_error:
1724 if storage_folder is not None and relative_filename is not None:
1725 # Cache the non-existence of the file
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1645, in get_hf_file_metadata(url, token, proxies, timeout, library_name, library_version, user_agent, headers)
1642 headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file
1644 # Retrieve metadata
-> 1645 r = _request_wrapper(
1646 method="HEAD",
1647 url=url,
1648 headers=headers,
1649 allow_redirects=False,
1650 follow_relative_redirects=True,
1651 proxies=proxies,
1652 timeout=timeout,
1653 )
1654 hf_raise_for_status(r)
1656 # Return
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:372, in _request_wrapper(method, url, follow_relative_redirects, **params)
370 # Recursively follow relative redirects
371 if follow_relative_redirects:
--> 372 response = _request_wrapper(
373 method=method,
374 url=url,
375 follow_relative_redirects=False,
376 **params,
377 )
379 # If redirection, we redirect only relative paths.
380 # This is useful in case of a renamed repository.
381 if 300 <= response.status_code <= 399:
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:396, in _request_wrapper(method, url, follow_relative_redirects, **params)
394 # Perform request and return if status_code is not in the retry list.
395 response = get_session().request(method=method, url=url, **params)
--> 396 hf_raise_for_status(response)
397 return response
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:315, in hf_raise_for_status(response, endpoint_name)
313 elif error_code == "EntryNotFound":
314 message = f"{response.status_code} Client Error." + "\n\n" + f"Entry Not Found for url: {response.url}."
--> 315 raise EntryNotFoundError(message, response) from e
317 elif error_code == "GatedRepo":
318 message = (
319 f"{response.status_code} Client Error." + "\n\n" + f"Cannot access gated repo for url {response.url}."
320 )
EntryNotFoundError: 404 Client Error. (Request ID: Root=1-668356a1-2a1c334c7f7dd8405dbde41e;25cd8dfc-0b0f-4777-9251-ecd1670d385f)
Entry Not Found for url: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx/resolve/main/phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.
Instead of specifying microsoft/Phi-3-mini-128k-instruct-onnx in the ORTModelForCausalLM.from_pretrained and AutoTokenizer.from_pretrained methods, you need to download the files locally and pass the path to one of the sub-folders within the microsoft/Phi-3-mini-128k-instruct-onnx repo. Here is an example of how to specify the path.
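A minimal sketch of that approach. The cuda/cuda-int4-rtn-block-32 sub-folder name is an assumption based on the repo layout at the time of writing; check the repo's file browser for the exact folder name before running this.

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Fetch only one quantized variant instead of the whole repo.
# The sub-folder pattern below is an assumption; verify it on the Hub.
local_dir = snapshot_download(
    repo_id="microsoft/Phi-3-mini-128k-instruct-onnx",
    allow_patterns=["cuda/cuda-int4-rtn-block-32/*"],
)
model_dir = f"{local_dir}/cuda/cuda-int4-rtn-block-32"

# Load from the local sub-folder. CUDAExecutionProvider requires the
# onnxruntime-gpu package. Optimum may still warn about the non-standard
# .onnx file name, but it should pick up the .onnx file in the folder.
model = ORTModelForCausalLM.from_pretrained(
    model_dir,
    provider="CUDAExecutionProvider",
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

This sidesteps the 404 in the traceback: when given the Hub repo id, optimum inferred the ONNX file name from the repo listing but tried to resolve it at the repo root (resolve/main/phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx), where the file does not exist because it lives in a sub-folder.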
Gotcha, thank you very much for your help. I'll close this thread then.