|
03/06 17:33:07 - OpenCompass - INFO - Task [my_api/siqa] |
|
/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. |
|
torch.utils._pytree._register_pytree_node( |
|
/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. |
|
torch.utils._pytree._register_pytree_node( |
|
03/06 17:33:14 - OpenCompass - INFO - Start inferencing [my_api/siqa] |
|
[2024-03-06 17:33:14,870] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process... |
|
0%| | 0/245 [00:00<?, ?it/s]Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Connection error, reconnect. |
|
Request Error:HTTPConnectionPool(host='127.0.0.1', port=12345): Max retries exceeded with url: /testing (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f7d26a733d0>: Failed to establish a new connection: [Errno 111] Connection refused')) |
|
Request Error:HTTPConnectionPool(host='127.0.0.1', port=12345): Max retries exceeded with url: /testing (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f7d26ab8c10>: Failed to establish a new connection: [Errno 111] Connection refused')) |
|
Request Error:HTTPConnectionPool(host='127.0.0.1', port=12345): Max retries exceeded with url: /testing (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f7d26ab9480>: Failed to establish a new connection: [Errno 111] Connection refused')) |
|
Request Error:HTTPConnectionPool(host='127.0.0.1', port=12345): Max retries exceeded with url: /testing (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f7d26ab9cf0>: Failed to establish a new connection: [Errno 111] Connection refused')) |
|
[2024-03-06 17:33:40,851] torch.distributed.elastic.agent.server.api: [WARNING] Received Signals.SIGINT death signal, shutting down workers |
|
[2024-03-06 17:33:40,852] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 264780 closing signal SIGINT |
|
Request Error:HTTPConnectionPool(host='127.0.0.1', port=12345): Max retries exceeded with url: /testing (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f7d26aba560>: Failed to establish a new connection: [Errno 111] Connection refused')) |
|
[2024-03-06 17:33:41,044] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 264780 closing signal SIGTERM |
|
Traceback (most recent call last): |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 727, in run |
|
result = self._invoke_run(role) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 868, in _invoke_run |
|
time.sleep(monitor_interval) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler |
|
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) |
|
torch.distributed.elastic.multiprocessing.api.SignalException: Process 264707 got signal: 2 |
|
|
|
During handling of the above exception, another exception occurred: |
|
|
|
Traceback (most recent call last): |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 734, in run |
|
self._shutdown(e.sigval) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 311, in _shutdown |
|
self._pcontext.close(death_sig) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 318, in close |
|
self._close(death_sig=death_sig, timeout=timeout) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 706, in _close |
|
handler.proc.wait(time_to_wait) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1209, in wait |
|
return self._wait(timeout=timeout) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1953, in _wait |
|
time.sleep(delay) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler |
|
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) |
|
torch.distributed.elastic.multiprocessing.api.SignalException: Process 264707 got signal: 2 |
|
|
|
During handling of the above exception, another exception occurred: |
|
|
|
Traceback (most recent call last): |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/bin/torchrun", line 8, in <module> |
|
sys.exit(main()) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper |
|
return f(*args, **kwargs) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main |
|
run(args) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run |
|
elastic_launch( |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ |
|
return launch_agent(self._config, self._entrypoint, list(args)) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent |
|
result = agent.run() |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper |
|
result = f(*args, **kwargs) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 739, in run |
|
self._shutdown() |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 311, in _shutdown |
|
self._pcontext.close(death_sig) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 318, in close |
|
self._close(death_sig=death_sig, timeout=timeout) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 706, in _close |
|
handler.proc.wait(time_to_wait) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1209, in wait |
|
return self._wait(timeout=timeout) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1953, in _wait |
|
time.sleep(delay) |
|
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler |
|
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) |
|
torch.distributed.elastic.multiprocessing.api.SignalException: Process 264707 got signal: 2 |
|
|