404 Client Error
I am getting the following error when trying to use this model. That said, if there is a simpler way, I am open to it: I just want to use a Phi-3 model with quantization, because I am hitting OOM errors on my 16 GB V100.
Runtime: Python 3.10.14 (main, Mar 21 2024, 16:24:04) [GCC 11.2.0]
GPU: NVIDIA V100
Optimum 1.20.0
Transformers 4.41.2
This is the code snippet that I have:
import torch
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

torch.random.manual_seed(0)

model = ORTModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct-onnx",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    quantization_config=quantization_config,  # defined elsewhere (not shown)
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct-onnx")
This is the error I am getting:
The ONNX file phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx is not a regular name used in optimum.onnxruntime that are ['model.onnx', 'model_quantized.onnx', 'model_optimized.onnx', 'decoder_with_past_model.onnx', 'decoder_with_past_model_quantized.onnx', 'decoder_with_past_model_optimized.onnx'], the ORTModelForCausalLM might not behave as expected.
HTTPError Traceback (most recent call last)
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:304, in hf_raise_for_status(response, endpoint_name)
303 try:
--> 304 response.raise_for_status()
305 except HTTPError as e:
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/requests/models.py:1024, in Response.raise_for_status(self)
1023 if http_error_msg:
-> 1024 raise HTTPError(http_error_msg, response=self)
HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx/resolve/main/phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx
The above exception was the direct cause of the following exception:
EntryNotFoundError Traceback (most recent call last)
Cell In[19], line 6
2 from optimum.onnxruntime import ORTModelForCausalLM
4 torch.random.manual_seed(0)
----> 6 model = ORTModelForCausalLM.from_pretrained(
7 "microsoft/Phi-3-mini-128k-instruct-onnx",
8 device_map="cuda",
9 torch_dtype="auto",
10 #quantization_config=quantization_config
11 )
12 tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct-onnx")
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/onnxruntime/modeling_ort.py:669, in ORTModel.from_pretrained(cls, model_id, export, force_download, use_auth_token, cache_dir, subfolder, config, local_files_only, provider, session_options, provider_options, use_io_binding, **kwargs)
620 @classmethod
621 @add_start_docstrings(FROM_PRETRAINED_START_DOCSTRING)
622 def from_pretrained(
(...)
636 **kwargs,
637 ):
638 """
639 provider (str, defaults to "CPUExecutionProvider"):
640 ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ for
(...)
667 ORTModel: The loaded ORTModel model.
668 """
--> 669 return super().from_pretrained(
670 model_id,
671 export=export,
672 force_download=force_download,
673 use_auth_token=use_auth_token,
674 cache_dir=cache_dir,
675 subfolder=subfolder,
676 config=config,
677 local_files_only=local_files_only,
678 provider=provider,
679 session_options=session_options,
680 provider_options=provider_options,
681 use_io_binding=use_io_binding,
682 **kwargs,
683 )
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/modeling_base.py:402, in OptimizedModel.from_pretrained(cls, model_id, export, force_download, use_auth_token, cache_dir, subfolder, config, local_files_only, trust_remote_code, revision, **kwargs)
398 trust_remote_code = False
400 from_pretrained_method = cls._from_transformers if export else cls._from_pretrained
--> 402 return from_pretrained_method(
403 model_id=model_id,
404 config=config,
405 revision=revision,
406 cache_dir=cache_dir,
407 force_download=force_download,
408 use_auth_token=use_auth_token,
409 subfolder=subfolder,
410 local_files_only=local_files_only,
411 trust_remote_code=trust_remote_code,
412 **kwargs,
413 )
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/onnxruntime/modeling_decoder.py:495, in ORTModelForCausalLM._from_pretrained(cls, model_id, config, use_auth_token, revision, force_download, cache_dir, file_name, subfolder, use_cache, local_files_only, use_merged, provider, session_options, provider_options, use_io_binding, model_save_dir, **kwargs)
489 if file_name not in regular_file_names:
490 logger.warning(
491 f"The ONNX file {file_name} is not a regular name used in optimum.onnxruntime that are {regular_file_names}, the "
492 f"{cls.name} might not behave as expected."
493 )
--> 495 model_cache_path, preprocessors = cls._cached_file(
496 model_path=model_path,
497 use_auth_token=use_auth_token,
498 revision=revision,
499 force_download=force_download,
500 cache_dir=cache_dir,
501 file_name=file_name,
502 subfolder=subfolder,
503 local_files_only=local_files_only,
504 )
505 new_model_save_dir = model_cache_path.parent
507 # model_save_dir can be provided in kwargs as a TemporaryDirectory instance, in which case we want to keep it
508 # instead of the path only.
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/optimum/onnxruntime/modeling_ort.py:873, in ORTModel._cached_file(model_path, use_auth_token, revision, force_download, cache_dir, file_name, subfolder, local_files_only)
871 preprocessors = maybe_load_preprocessors(model_path.as_posix())
872 else:
--> 873 model_cache_path = hf_hub_download(
874 repo_id=model_path.as_posix(),
875 filename=file_name,
876 subfolder=subfolder,
877 use_auth_token=use_auth_token,
878 revision=revision,
879 cache_dir=cache_dir,
880 force_download=force_download,
881 local_files_only=local_files_only,
882 )
883 # try download external data
884 try:
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1221, in hf_hub_download(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, user_agent, force_download, proxies, etag_timeout, token, local_files_only, headers, endpoint, legacy_cache_layout, resume_download, force_filename, local_dir_use_symlinks)
1202 return _hf_hub_download_to_local_dir(
1203 # Destination
1204 local_dir=local_dir,
(...)
1218 local_files_only=local_files_only,
1219 )
1220 else:
-> 1221 return _hf_hub_download_to_cache_dir(
1222 # Destination
1223 cache_dir=cache_dir,
1224 # File info
1225 repo_id=repo_id,
1226 filename=filename,
1227 repo_type=repo_type,
1228 revision=revision,
1229 # HTTP info
1230 headers=headers,
1231 proxies=proxies,
1232 etag_timeout=etag_timeout,
1233 endpoint=endpoint,
1234 # Additional options
1235 local_files_only=local_files_only,
1236 force_download=force_download,
1237 )
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1282, in _hf_hub_download_to_cache_dir(cache_dir, repo_id, filename, repo_type, revision, headers, proxies, etag_timeout, endpoint, local_files_only, force_download)
1278 return pointer_path
1280 # Try to get metadata (etag, commit_hash, url, size) from the server.
1281 # If we can't, a HEAD request error is returned.
-> 1282 (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
1283 repo_id=repo_id,
1284 filename=filename,
1285 repo_type=repo_type,
1286 revision=revision,
1287 endpoint=endpoint,
1288 proxies=proxies,
1289 etag_timeout=etag_timeout,
1290 headers=headers,
1291 local_files_only=local_files_only,
1292 storage_folder=storage_folder,
1293 relative_filename=relative_filename,
1294 )
1296 # etag can be None for several reasons:
1297 # 1. we passed local_files_only.
1298 # 2. we don't have a connection
(...)
1304 # If the specified revision is a commit hash, look inside "snapshots".
1305 # If the specified revision is a branch or tag, look inside "refs".
1306 if head_call_error is not None:
1307 # Couldn't make a HEAD call => let's try to find a local file
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1722, in _get_metadata_or_catch_error(repo_id, filename, repo_type, revision, endpoint, proxies, etag_timeout, headers, local_files_only, relative_filename, storage_folder)
1720 try:
1721 try:
-> 1722 metadata = get_hf_file_metadata(url=url, proxies=proxies, timeout=etag_timeout, headers=headers)
1723 except EntryNotFoundError as http_error:
1724 if storage_folder is not None and relative_filename is not None:
1725 # Cache the non-existence of the file
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1645, in get_hf_file_metadata(url, token, proxies, timeout, library_name, library_version, user_agent, headers)
1642 headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file
1644 # Retrieve metadata
-> 1645 r = _request_wrapper(
1646 method="HEAD",
1647 url=url,
1648 headers=headers,
1649 allow_redirects=False,
1650 follow_relative_redirects=True,
1651 proxies=proxies,
1652 timeout=timeout,
1653 )
1654 hf_raise_for_status(r)
1656 # Return
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:372, in _request_wrapper(method, url, follow_relative_redirects, **params)
370 # Recursively follow relative redirects
371 if follow_relative_redirects:
--> 372 response = _request_wrapper(
373 method=method,
374 url=url,
375 follow_relative_redirects=False,
376 **params,
377 )
379 # If redirection, we redirect only relative paths.
380 # This is useful in case of a renamed repository.
381 if 300 <= response.status_code <= 399:
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:396, in _request_wrapper(method, url, follow_relative_redirects, **params)
394 # Perform request and return if status_code is not in the retry list.
395 response = get_session().request(method=method, url=url, **params)
--> 396 hf_raise_for_status(response)
397 return response
File /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:315, in hf_raise_for_status(response, endpoint_name)
313 elif error_code == "EntryNotFound":
314 message = f"{response.status_code} Client Error." + "\n\n" + f"Entry Not Found for url: {response.url}."
--> 315 raise EntryNotFoundError(message, response) from e
317 elif error_code == "GatedRepo":
318 message = (
319 f"{response.status_code} Client Error." + "\n\n" + f"Cannot access gated repo for url {response.url}."
320 )
EntryNotFoundError: 404 Client Error. (Request ID: Root=1-668356a1-2a1c334c7f7dd8405dbde41e;25cd8dfc-0b0f-4777-9251-ecd1670d385f)
Entry Not Found for url: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx/resolve/main/phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx.
Instead of specifying microsoft/Phi-3-mini-128k-instruct-onnx in the ORTModelForCausalLM.from_pretrained and AutoTokenizer.from_pretrained methods, you need to download the files locally and pass the path to one of the sub-folders within the microsoft/Phi-3-mini-128k-instruct-onnx repo. Here is an example of how to specify the path.
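A minimal sketch of that approach. The cuda/cuda-int4-rtn-block-32 sub-folder name is an assumption based on the repo layout at the time of writing; check the repo's file browser for the exact folder name before running this.

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Fetch only one quantized variant instead of the whole repo.
# The sub-folder pattern below is an assumption; verify it on the Hub.
local_dir = snapshot_download(
    repo_id="microsoft/Phi-3-mini-128k-instruct-onnx",
    allow_patterns=["cuda/cuda-int4-rtn-block-32/*"],
)
model_dir = f"{local_dir}/cuda/cuda-int4-rtn-block-32"

# Load from the local sub-folder. CUDAExecutionProvider requires the
# onnxruntime-gpu package. Optimum may still warn about the non-standard
# .onnx file name, but it should pick up the .onnx file in the folder.
model = ORTModelForCausalLM.from_pretrained(
    model_dir,
    provider="CUDAExecutionProvider",
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

This sidesteps the 404 in the traceback: when given the Hub repo id, optimum inferred the ONNX file name from the repo listing but tried to resolve it at the repo root (resolve/main/phi3-mini-128k-instruct-cpu-int4-rtn-block-32-acc-level-4.onnx), where the file does not exist because it lives in a sub-folder.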
Gotcha, thank you very much for your help. I'll close this thread then.