I use the following code to finally find the bug.
### Import something needed
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
import torch.nn as nn
from torch.multiprocessing import Process
import torch.distributed as dist
### extra Import pytorch geometric package
from torch_geometric.data import Data
### Args Configuration
class Args:
def __init__(self):
self.distributed_thread_size = 8
## Other Args
### Model
class linear_model(nn.Module):
def __init__(self,input_size,output_size):
super(linear_model, self).__init__()
self.fc = nn.Linear(input_size,output_size)
def forward(self,x):
return self.fc(x)
### Torch.distributed to train the model
def run_one_thread(rank, size, opt):
""" Distributed Synchronous training of one thread """
### multiple GPUs
dev_num = torch.cuda.device_count()
device = torch.device("cuda:{}".format(rank % dev_num))
### Model Initialize
generator = linear_model(8,64)
model = generator.to(device)
print("train %d" %(rank))
### Train Code....
def init_processes(rank, size, fn, opt, backend='nccl'):
""" Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29502'
dist.init_process_group(backend, rank=rank, world_size=size)
fn(rank, size, opt)
def execute_train_distributed(opt):
size = opt.distributed_thread_size # the number of threads or GPUs we used
processes = []
for rank in range(size):
p = Process(target=init_processes, args=(rank, size, run_one_thread, opt))
p.start()
processes.append(p)
for p in processes:
p.join()
### Main function
def main():
opt = Args()
execute_train_distributed(opt)
### Execute it.
main()
The above code will output the following logs.
Process Process-1:
Traceback (most recent call last):
File "/home/my_name/anaconda3/envs/env_name/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/home/my_name/anaconda3/envs/env_name/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "<ipython-input-8-20cc82d816d1>", line 40, in init_processes
fn(rank, size, opt)
File "<ipython-input-8-20cc82d816d1>", line 31, in run_one_thread
model = generator.to(device)
File "/home/my_name/anaconda3/envs/env_name/lib/python3.6/site-packages/torch/nn/modules/module.py", line 381, in to
return self._apply(convert)
File "/home/my_name/anaconda3/envs/env_name/lib/python3.6/site-packages/torch/nn/modules/module.py", line 187, in _apply
module._apply(fn)
File "/home/my_name/anaconda3/envs/env_name/lib/python3.6/site-packages/torch/nn/modules/module.py", line 193, in _apply
param.data = fn(param.data)
File "/home/my_name/anaconda3/envs/env_name/lib/python3.6/site-packages/torch/nn/modules/module.py", line 379, in convert
return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
File "/home/my_name/anaconda3/envs/env_name/lib/python3.6/site-packages/torch/cuda/__init__.py", line 162, in _lazy_init
torch._C._cuda_init()
RuntimeError: cuda runtime error (3) : initialization error at /opt/conda/conda-bld/pytorch_1544174967633/work/aten/src/THC/THCGeneral.cpp:51
If I comment the code
from torch_geometric.data import Data
I can successfully run the code and get the results like:
train 7
train 6
train 1
train 2
train 5
train 3
train 4
train 0
So I want to ask why pytorch geometric will affect the distributed training of pytorch model.
How can I solve this problem?
This crashes because we call torch.cuda.is_available() in PyG (see here).
Thanks a lot for your quick feedback! Would you further suggest how I can fix this issue?
According to this issue, the following fixes the problem:
if __name__ == '__main__':
torch.multiprocessing.set_start_method('spawn')
main()
Most helpful comment
According to this issue, the following fixes the problem: