Try to fix the job hang issue
Reference: https://github.com/nv-tlabs/LION/issues/32#issuecomment-1496997294
This commit is contained in:
parent
b84169e724
commit
0467d21990
|
@ -1130,24 +1130,26 @@ def init_processes(rank, size, fn, args, config):
|
||||||
""" Initialize the distributed environment. """
|
""" Initialize the distributed environment. """
|
||||||
os.environ['MASTER_ADDR'] = args.master_address
|
os.environ['MASTER_ADDR'] = args.master_address
|
||||||
os.environ['MASTER_PORT'] = '6020'
|
os.environ['MASTER_PORT'] = '6020'
|
||||||
if args.num_proc_node == 1:
|
logger.info('set MASTER_PORT: {}, MASTER_PORT: {}', os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
|
||||||
import socket
|
|
||||||
import errno
|
# if args.num_proc_node == 1: # try to solve the port occupied issue
|
||||||
a_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
# import socket
|
||||||
for p in range(6010, 6030):
|
# import errno
|
||||||
location = (args.master_address, p) # "127.0.0.1", p)
|
# a_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
try:
|
# for p in range(6010, 6030):
|
||||||
a_socket.bind((args.master_address, p))
|
# location = (args.master_address, p) # "127.0.0.1", p)
|
||||||
logger.debug('set port as {}', p)
|
# try:
|
||||||
os.environ['MASTER_PORT'] = '%d' % p
|
# a_socket.bind((args.master_address, p))
|
||||||
a_socket.close()
|
# logger.debug('set port as {}', p)
|
||||||
break
|
# os.environ['MASTER_PORT'] = '%d' % p
|
||||||
except socket.error as e:
|
# a_socket.close()
|
||||||
a = 0
|
# break
|
||||||
# if e.errno == errno.EADDRINUSE:
|
# except socket.error as e:
|
||||||
# # logger.debug("Port {} is already in use", p)
|
# a = 0
|
||||||
# else:
|
# # if e.errno == errno.EADDRINUSE:
|
||||||
# logger.debug(e)
|
# # # logger.debug("Port {} is already in use", p)
|
||||||
|
# # else:
|
||||||
|
# # logger.debug(e)
|
||||||
|
|
||||||
logger.info('init_process: rank={}, world_size={}', rank, size)
|
logger.info('init_process: rank={}, world_size={}', rank, size)
|
||||||
torch.cuda.set_device(args.local_rank)
|
torch.cuda.set_device(args.local_rank)
|
||||||
|
|
Loading…
Reference in a new issue