"""
Helpers for distributed training.
"""
import datetime
import os
import socket
import subprocess
from contextlib import closing
import torch as th
import torch.distributed as dist
from torch.distributed import barrier, is_initialized, broadcast
# Change this to reflect your cluster layout.
# The GPU for a given rank is (rank % GPUS_PER_NODE).
GPUS_PER_NODE = 8
SETUP_RETRY_COUNT = 3
def find_free_port() -> int:
    """Ask the OS for a currently free TCP port by binding to port 0."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", 0))
        return s.getsockname()[1]
def check_if_port_open(port: int) -> bool:
    """Return True if `port` can currently be bound on this host."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            s.bind(("", port))
            return True
        except OSError:
            return False
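# A hedged usage sketch: take the default rendezvous port if it is free,
# otherwise fall back to an OS-assigned one. Note the probe is inherently
# racy -- another process may grab the port between the check and the bind:
#
#     port = 29500 if check_if_port_open(29500) else find_free_port()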
def initialized():
return dist.is_initialized()
def finalize():
if dist.is_initialized():
dist.destroy_process_group()
def initialize():
    # torchrun / torch.distributed.launch export these four variables; under a
    # bare mpirun they are absent, so we derive them from the MPI communicator.
    is_mpirun = not (
        "RANK" in os.environ
        and "WORLD_SIZE" in os.environ
        and "MASTER_ADDR" in os.environ
        and "MASTER_PORT" in os.environ
    )
    if is_mpirun:
        from mpi4py import MPI  # imported lazily: only needed on the mpirun path
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        world_size = comm.Get_size()
        master_addr = None
        master_port = None
        if rank == 0:
            result = subprocess.check_output("hostname -I", shell=True)
            master_addr = result.decode("utf-8").split()[0]
            base_port = os.environ.get(
                "MASTER_PORT", "29500"
            )  # TORCH_DISTRIBUTED_DEFAULT_PORT
            if check_if_port_open(int(base_port)):
                master_port = base_port
            else:
                master_port = find_free_port()
        master_addr = comm.bcast(master_addr, root=0)
        master_port = comm.bcast(master_port, root=0)
        # Determine the local rank by assuming hostnames are unique per node: a
        # process's local rank is the number of lower-ranked processes that
        # share its hostname.
        proc_name = MPI.Get_processor_name()
        all_procs = comm.allgather(proc_name)
        local_rank = sum(name == proc_name for name in all_procs[:rank])
        uniq_proc_names = set(all_procs)
        host_rank = sorted(uniq_proc_names).index(proc_name)
        os.environ["LOCAL_RANK"] = str(local_rank)
        os.environ["HOST_RANK"] = str(host_rank)
        os.environ["NUM_HOSTS"] = str(len(uniq_proc_names))
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)
        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["OMP_NUM_THREADS"] = "1"
    # Initialize torch.distributed: NCCL for GPU runs, Gloo as the CPU fallback.
    backend = "nccl" if th.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=3600))
    if th.cuda.is_available():
        th.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))
    if is_mpirun and dist.get_rank() == 0:
        print("Distributed setup")
        print("LOCAL_RANK", os.environ["LOCAL_RANK"])
        print("HOST_RANK", os.environ["HOST_RANK"])
        print("NUM_HOSTS", os.environ["NUM_HOSTS"])
        print("WORLD_SIZE", os.environ["WORLD_SIZE"])
def local_host_gather(data):
    """Gather `data` from every rank on the same host (uses MPI, not NCCL/Gloo)."""
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    host_rank = os.environ["HOST_RANK"]
    all_data = comm.allgather((host_rank, data))
    return [d[1] for d in all_data if d[0] == host_rank]
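# A hedged example: collect one entry per local rank (in rank order), e.g. the
# PIDs of all processes sharing this node:
#
#     local_pids = local_host_gather(os.getpid())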
def in_distributed_mode():
    # True once a process group has been set up via init_process_group().
    return dist.is_available() and dist.is_initialized()
def is_master():
return get_rank() == 0
def is_local_master():
return get_local_rank() == 0
def get_rank():
return dist.get_rank() if in_distributed_mode() else 0
def get_local_rank():
    # Defaults match the single-process case, mirroring get_rank()/get_world_size().
    return int(os.environ.get("LOCAL_RANK", "0"))
def worker_host_idx():
    return int(os.environ.get("HOST_RANK", "0"))
def num_hosts():
    return int(os.environ.get("NUM_HOSTS", "1"))
def get_world_size():
return dist.get_world_size() if in_distributed_mode() else 1
def gpu_visible_device_list():
    # Per the GPUS_PER_NODE comment above, rank r uses GPU (r % GPUS_PER_NODE).
    return str(dist.get_rank() % GPUS_PER_NODE) if in_distributed_mode() else None
def get_device():
"""
Get the device to use for torch.distributed.
"""
if th.cuda.is_available():
return th.device("cuda")
return th.device("cpu")
def sync_params(params):
"""
Synchronize a sequence of Tensors across ranks from rank 0.
"""
for p in params:
with th.no_grad():
dist.broadcast(p, 0)
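# A hedged usage sketch: after constructing the model identically on every
# rank, broadcast rank 0's weights so all replicas start from the same state:
#
#     sync_params(model.parameters())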
def print0(*args, **kwargs):
if get_rank() == 0:
print(*args, **kwargs)
def allreduce(t: th.Tensor, async_op=False):
    if not dist.is_initialized():
        return None
    if not t.is_cuda and th.cuda.is_available():
        # The NCCL backend only reduces CUDA tensors, so round-trip CPU
        # tensors through the GPU and copy the result back in place.
        cu = t.detach().cuda()
        ret = dist.all_reduce(cu, async_op=async_op)
        if async_op:
            ret.wait()  # the reduction must finish before copying back
        t.copy_(cu.cpu())
        return ret
    # CUDA tensor (NCCL), or CPU tensor under the Gloo backend: reduce in place.
    return dist.all_reduce(t, async_op=async_op)
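# A hedged example: average a scalar loss across ranks for logging (all_reduce
# sums by default, so divide by the world size afterwards):
#
#     loss_t = loss.detach().clone()
#     allreduce(loss_t)
#     mean_loss = loss_t / get_world_size()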
def allgather(t: th.Tensor, cat=True):
    if dist.is_initialized():
        if not t.is_cuda and th.cuda.is_available():
            # NCCL gathers CUDA tensors only; move the input to this rank's GPU.
            t = t.cuda()
        # all_gather assumes every rank contributes a tensor of the same shape.
        ls = [th.empty_like(t) for _ in range(get_world_size())]
        dist.all_gather(ls, t)
    else:
        ls = [t]
    if cat:
        ls = th.cat(ls, dim=0)
    return ls
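# A hedged example: pool per-rank evaluation outputs before computing metrics
# on the master rank (every rank receives the gathered result):
#
#     preds = allgather(batch_preds)        # (world_size * B, ...) tensor
#     if is_master():
#         metrics = compute_metrics(preds)  # compute_metrics is hypothetical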