Try to fix the job hang issue

reference: https://github.com/nv-tlabs/LION/issues/32#issuecomment-1496997294
This commit is contained in:
xzeng 2023-04-05 03:21:20 -04:00
parent b84169e724
commit 0467d21990

View file

@ -1130,24 +1130,26 @@ def init_processes(rank, size, fn, args, config):
""" Initialize the distributed environment. """ """ Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = args.master_address os.environ['MASTER_ADDR'] = args.master_address
os.environ['MASTER_PORT'] = '6020' os.environ['MASTER_PORT'] = '6020'
if args.num_proc_node == 1: logger.info('set MASTER_PORT: {}, MASTER_PORT: {}', os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
import socket
import errno # if args.num_proc_node == 1: # try to solve the port occupied issue
a_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # import socket
for p in range(6010, 6030): # import errno
location = (args.master_address, p) # "127.0.0.1", p) # a_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try: # for p in range(6010, 6030):
a_socket.bind((args.master_address, p)) # location = (args.master_address, p) # "127.0.0.1", p)
logger.debug('set port as {}', p) # try:
os.environ['MASTER_PORT'] = '%d' % p # a_socket.bind((args.master_address, p))
a_socket.close() # logger.debug('set port as {}', p)
break # os.environ['MASTER_PORT'] = '%d' % p
except socket.error as e: # a_socket.close()
a = 0 # break
# if e.errno == errno.EADDRINUSE: # except socket.error as e:
# # logger.debug("Port {} is already in use", p) # a = 0
# else: # # if e.errno == errno.EADDRINUSE:
# logger.debug(e) # # # logger.debug("Port {} is already in use", p)
# # else:
# # logger.debug(e)
logger.info('init_process: rank={}, world_size={}', rank, size) logger.info('init_process: rank={}, world_size={}', rank, size)
torch.cuda.set_device(args.local_rank) torch.cuda.set_device(args.local_rank)