import os
import logging

import torch
import torch.distributed as dist

logger = logging.getLogger(__name__)
|
|
def setup_for_distributed(is_master):
    """Suppress warnings and logging on non-master processes.

    A warning can still be emitted from any rank by passing ``force=True``
    to ``warnings.warn``.
    """
    import warnings

    builtin_warn = warnings.warn

    def warn(*args, **kwargs):
        force = kwargs.pop("force", False)
        if is_master or force:
            builtin_warn(*args, **kwargs)

    # Show each warning at most once, and only on the master process
    # (unless forced).
    warnings.warn = warn
    warnings.simplefilter("once", UserWarning)

    if not is_master:
        # With no argument, silence all logging calls up to and including CRITICAL.
        logging.disable()
|
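# Illustrative sketch (not part of the original module): after
# setup_for_distributed(False) runs on a worker rank, plain warnings and log
# records are dropped, but a warning can still be forced through:
#
#     import warnings
#     setup_for_distributed(is_master=False)
#     warnings.warn("silenced on workers")            # suppressed
#     warnings.warn("shown everywhere", force=True)   # still emitted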
|
def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True
|
|
def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()
|
|
def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()
|
|
def is_main_process():
    return get_rank() == 0
|
|
def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)
|
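# Illustrative usage (sketch, not part of the original module): the helpers
# above are safe to call whether or not torch.distributed is initialized, so
# checkpointing code can be written once and run unchanged on 1 or N GPUs.
# `model` here stands for any nn.Module built by the training script:
#
#     if get_world_size() > 1:
#         logger.info(f"rank {get_rank()} of {get_world_size()}")
#     save_on_master(model.state_dict(), "checkpoint.pth")  # writes on rank 0 only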
|
def is_port_in_use(port):
    import socket

    # connect_ex returns 0 when the connection succeeds, i.e. something is
    # already listening on the port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(('localhost', port)) == 0
|
|
def init_distributed_mode(args):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        # Launched with torchrun / torch.distributed.launch.
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        # Launched with SLURM.
        local_rank = int(os.environ['SLURM_LOCALID'])
        global_rank = int(os.environ['SLURM_PROCID'])

        # SLURM_TASKS_PER_NODE looks like "8" or "8(x4)"; keep the leading integer.
        tasks_per_node = int(os.environ["SLURM_TASKS_PER_NODE"].split("(")[0])
        world_size = int(os.environ["SLURM_NNODES"]) * tasks_per_node
        logger.info(f"SLURM world_size: {world_size}")

        args.rank = global_rank
        args.gpu = local_rank
        args.world_size = world_size
    else:
        logger.info('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'

    # If the requested TCP rendezvous port is taken, probe for a free one.
    if "tcp" in args.dist_url:
        dist_port = int(args.dist_url.split(":")[-1])
        while is_port_in_use(dist_port):
            dist_port += 10
        args.dist_url = ":".join(args.dist_url.split(":")[:-1] + [str(dist_port)])
        logger.info(f"dist_url: {args.dist_url}")

    logger.info('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url))
    if "SLURM_JOB_ID" in os.environ:
        logger.info(f"SLURM_JOB_ID {os.environ['SLURM_JOB_ID']}")
    torch.distributed.init_process_group(
        backend=args.dist_backend, init_method=args.dist_url,
        world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
|
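# Illustrative launch sketch (not part of the original module). The function
# above expects an argparse-style namespace with at least `dist_url` set; the
# argument names and the script name `train.py` below are assumptions:
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--dist_url", default="env://")
#     args = parser.parse_args()
#     init_distributed_mode(args)  # sets args.rank / args.gpu / args.world_size
#
# launched e.g. with:  torchrun --nproc_per_node=8 train.py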
|
class GatherLayer(torch.autograd.Function):
    """
    Gather tensors from all workers with support for backward propagation:
    unlike torch.distributed.all_gather, this implementation does not cut the
    gradient graph.
    """

    @staticmethod
    def forward(ctx, x):
        output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
        dist.all_gather(output, x)
        return tuple(output)

    @staticmethod
    def backward(ctx, *grads):
        # Sum the incoming gradients across ranks, then return the slice that
        # corresponds to this rank's original input.
        all_gradients = torch.stack(grads)
        dist.all_reduce(all_gradients)
        return all_gradients[dist.get_rank()]
|
|
def gather_tensor_along_batch_with_backward(tensor, dim=0):
    world_size = get_world_size()

    if world_size < 2:
        return tensor

    tensor_list = GatherLayer.apply(tensor)
    tensor_list = torch.cat(tensor_list, dim=dim)
    return tensor_list
|
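# Illustrative sketch (not part of the original module): gathering embeddings
# across ranks while keeping them in the autograd graph, e.g. to match each
# local embedding against the globally gathered set in a contrastive loss.
# `local_emb` is a hypothetical per-rank tensor of shape (batch, dim) that
# requires grad:
#
#     batch = local_emb.size(0)
#     all_emb = gather_tensor_along_batch_with_backward(local_emb)  # (world_size * batch, dim)
#     logits = local_emb @ all_emb.t()
#     targets = torch.arange(batch, device=logits.device) + get_rank() * batch
#     loss = torch.nn.functional.cross_entropy(logits, targets)
#     loss.backward()  # gradients flow back through GatherLayer on every rank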
|
@torch.no_grad()
def gather_tensor_along_batch(tensor, dim=0):
    """
    Performs all_gather operation on the provided tensors.
    *** Warning ***: torch.distributed.all_gather has no gradient.
    """
    world_size = get_world_size()

    if world_size < 2:
        return tensor

    tensor_list = [torch.zeros_like(tensor) for _ in range(world_size)]
    dist.all_gather(tensor_list, tensor)
    tensor_list = torch.cat(tensor_list, dim=dim)
    return tensor_list
|
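# Illustrative sketch (not part of the original module): the no-grad variant is
# suited to evaluation-time collection, e.g. pooling per-rank predictions before
# computing a metric. `local_logits` and `local_labels` are hypothetical
# per-rank tensors:
#
#     all_logits = gather_tensor_along_batch(local_logits)
#     all_labels = gather_tensor_along_batch(local_labels)
#     if is_main_process():
#         accuracy = (all_logits.argmax(dim=1) == all_labels).float().mean()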