AssertionError: Default process group is not initialized
jakeum opened this issue · comments
jakeum commented
I am trying to use ResNet-101 for a cascade network:
model_name = "./mask_cascade_rcnn_ResNeSt_101_FPN_syncBN_1x.yaml"
cfg.merge_from_file(model_name)
cfg.MODEL.WEIGHTS = os.path.join("./mask_cascade_rcnn_R_101_FPN_syncbn_1x-8cec1631.pth") # path to the model we just trained
I got the following error:
Traceback (most recent call last):
File "train.py", line 1330, in <module>
main()
File "train.py", line 1325, in main
train(cfg,args)
File "train.py", line 1303, in train
trainer.train()
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/engine/defaults.py", line 419, in train
super().train(self.start_iter, self.max_iter)
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/engine/train_loop.py", line 134, in train
self.run_step()
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/engine/defaults.py", line 429, in run_step
self._trainer.run_step()
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/engine/train_loop.py", line 228, in run_step
loss_dict = self.model(data)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/modeling/meta_arch/rcnn.py", line 157, in forward
features = self.backbone(images.tensor)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/modeling/backbone/fpn.py", line 127, in forward
bottom_up_features = self.bottom_up(x)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/modeling/backbone/resnet.py", line 438, in forward
x = stage(x)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/modeling/backbone/resnet.py", line 196, in forward
out = self.conv1(x)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/jake/Gits/make_demo_clip/car_detection_detectron2/detectron2/detectron2/layers/wrappers.py", line 80, in forward
x = self.norm(x)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 519, in forward
world_size = torch.distributed.get_world_size(process_group)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 625, in get_world_size
return _get_group_size(group)
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 220, in _get_group_size
_check_default_pg()
File "/home/jake/venv_11.1/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 211, in _check_default_pg
"Default process group is not initialized"
AssertionError: Default process group is not initialized
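Peter-weng commented
I think it is because you are using SyncBN but are not training on multiple GPUs. SyncBN calls torch.distributed.get_world_size() in its forward pass, which fails with this assertion when no distributed process group has been initialized.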