NVlabs / stylegan2-ada-pytorch

StyleGAN2-ADA - Official PyTorch implementation

Home Page: https://arxiv.org/abs/2006.06676

Anyone please? The problem is that once I use 2 GPUs, the program goes wrong.

DJstepbystep opened this issue

Training options:
{
  "num_gpus": 2,
  "image_snapshot_ticks": 10,
  "network_snapshot_ticks": 10,
  "metrics": [
    "fid50k_full"
  ],
  "random_seed": 0,
  "training_set_kwargs": {
    "class_name": "training.dataset.ImageFolderDataset",
    "path": "/root/autodl-tmp/stylegan/img/NEW20230111allrotate.zip",
    "use_labels": false,
    "max_size": 13276,
    "xflip": false,
    "resolution": 512
  },
  "data_loader_kwargs": {
    "pin_memory": true,
    "num_workers": 3,
    "prefetch_factor": 2
  },
  "G_kwargs": {
    "class_name": "training.networks.Generator",
    "z_dim": 512,
    "w_dim": 512,
    "mapping_kwargs": {
      "num_layers": 8
    },
    "synthesis_kwargs": {
      "channel_base": 32768,
      "channel_max": 512,
      "num_fp16_res": 4,
      "conv_clamp": 256
    }
  },
  "D_kwargs": {
    "class_name": "training.networks.Discriminator",
    "block_kwargs": {},
    "mapping_kwargs": {},
    "epilogue_kwargs": {
      "mbstd_group_size": 4
    },
    "channel_base": 32768,
    "channel_max": 512,
    "num_fp16_res": 4,
    "conv_clamp": 256
  },
  "G_opt_kwargs": {
    "class_name": "torch.optim.Adam",
    "lr": 0.002,
    "betas": [
      0,
      0.99
    ],
    "eps": 1e-08
  },
  "D_opt_kwargs": {
    "class_name": "torch.optim.Adam",
    "lr": 0.002,
    "betas": [
      0,
      0.99
    ],
    "eps": 1e-08
  },
  "loss_kwargs": {
    "class_name": "training.loss.StyleGAN2Loss",
    "r1_gamma": 6.6
  },
  "total_kimg": 3000,
  "batch_size": 4,
  "batch_gpu": 2,
  "ema_kimg": 10,
  "ema_rampup": null,
  "ada_target": 0.6,
  "augment_kwargs": {
    "class_name": "training.augment.AugmentPipe",
    "xflip": 1,
    "rotate90": 1,
    "xint": 1,
    "scale": 1,
    "rotate": 1,
    "aniso": 1,
    "xfrac": 1,
    "brightness": 1,
    "contrast": 1,
    "lumaflip": 1,
    "hue": 1,
    "saturation": 1
  },
  "run_dir": "./training-runs/00013-NEW20230111allrotate-stylegan2-gamma6.6-kimg3000-batch4"
}

Output directory: ./training-runs/00013-NEW20230111allrotate-stylegan2-gamma6.6-kimg3000-batch4
Training data: /root/autodl-tmp/stylegan/img/NEW20230111allrotate.zip
Training duration: 3000 kimg
Number of GPUs: 2
Number of images: 13276
Image resolution: 512
Conditional model: False
Dataset x-flips: False

Creating output directory...
Launching processes...
Loading training set...

Num images: 13276
Image shape: [3, 512, 512]
Label shape: [0]

Constructing networks...
Setting up PyTorch plugin "bias_act_plugin"... Done.
Setting up PyTorch plugin "upfirdn2d_plugin"... Done.

Generator Parameters Buffers Output shape Datatype


mapping.fc0 262656 - [2, 512] float32
mapping.fc1 262656 - [2, 512] float32
mapping.fc2 262656 - [2, 512] float32
mapping.fc3 262656 - [2, 512] float32
mapping.fc4 262656 - [2, 512] float32
mapping.fc5 262656 - [2, 512] float32
mapping.fc6 262656 - [2, 512] float32
mapping.fc7 262656 - [2, 512] float32
mapping - 512 [2, 16, 512] float32
synthesis.b4.conv1 2622465 32 [2, 512, 4, 4] float32
synthesis.b4.torgb 264195 - [2, 3, 4, 4] float32
synthesis.b4:0 8192 16 [2, 512, 4, 4] float32
synthesis.b4:1 - - [2, 512, 4, 4] float32
synthesis.b8.conv0 2622465 80 [2, 512, 8, 8] float32
synthesis.b8.conv1 2622465 80 [2, 512, 8, 8] float32
synthesis.b8.torgb 264195 - [2, 3, 8, 8] float32
synthesis.b8:0 - 16 [2, 512, 8, 8] float32
synthesis.b8:1 - - [2, 512, 8, 8] float32
synthesis.b16.conv0 2622465 272 [2, 512, 16, 16] float32
synthesis.b16.conv1 2622465 272 [2, 512, 16, 16] float32
synthesis.b16.torgb 264195 - [2, 3, 16, 16] float32
synthesis.b16:0 - 16 [2, 512, 16, 16] float32
synthesis.b16:1 - - [2, 512, 16, 16] float32
synthesis.b32.conv0 2622465 1040 [2, 512, 32, 32] float32
synthesis.b32.conv1 2622465 1040 [2, 512, 32, 32] float32
synthesis.b32.torgb 264195 - [2, 3, 32, 32] float32
synthesis.b32:0 - 16 [2, 512, 32, 32] float32
synthesis.b32:1 - - [2, 512, 32, 32] float32
synthesis.b64.conv0 2622465 4112 [2, 512, 64, 64] float16
synthesis.b64.conv1 2622465 4112 [2, 512, 64, 64] float16
synthesis.b64.torgb 264195 - [2, 3, 64, 64] float16
synthesis.b64:0 - 16 [2, 512, 64, 64] float16
synthesis.b64:1 - - [2, 512, 64, 64] float32
synthesis.b128.conv0 1442561 16400 [2, 256, 128, 128] float16
synthesis.b128.conv1 721409 16400 [2, 256, 128, 128] float16
synthesis.b128.torgb 132099 - [2, 3, 128, 128] float16
synthesis.b128:0 - 16 [2, 256, 128, 128] float16
synthesis.b128:1 - - [2, 256, 128, 128] float32
synthesis.b256.conv0 426369 65552 [2, 128, 256, 256] float16
synthesis.b256.conv1 213249 65552 [2, 128, 256, 256] float16
synthesis.b256.torgb 66051 - [2, 3, 256, 256] float16
synthesis.b256:0 - 16 [2, 128, 256, 256] float16
synthesis.b256:1 - - [2, 128, 256, 256] float32
synthesis.b512.conv0 139457 262160 [2, 64, 512, 512] float16
synthesis.b512.conv1 69761 262160 [2, 64, 512, 512] float16
synthesis.b512.torgb 33027 - [2, 3, 512, 512] float16
synthesis.b512:0 - 16 [2, 64, 512, 512] float16
synthesis.b512:1 - - [2, 64, 512, 512] float32


Total 30276583 699904 - -

Discriminator Parameters Buffers Output shape Datatype


b512.fromrgb 256 16 [2, 64, 512, 512] float16
b512.skip 8192 16 [2, 128, 256, 256] float16
b512.conv0 36928 16 [2, 64, 512, 512] float16
b512.conv1 73856 16 [2, 128, 256, 256] float16
b512 - 16 [2, 128, 256, 256] float16
b256.skip 32768 16 [2, 256, 128, 128] float16
b256.conv0 147584 16 [2, 128, 256, 256] float16
b256.conv1 295168 16 [2, 256, 128, 128] float16
b256 - 16 [2, 256, 128, 128] float16
b128.skip 131072 16 [2, 512, 64, 64] float16
b128.conv0 590080 16 [2, 256, 128, 128] float16
b128.conv1 1180160 16 [2, 512, 64, 64] float16
b128 - 16 [2, 512, 64, 64] float16
b64.skip 262144 16 [2, 512, 32, 32] float16
b64.conv0 2359808 16 [2, 512, 64, 64] float16
b64.conv1 2359808 16 [2, 512, 32, 32] float16
b64 - 16 [2, 512, 32, 32] float16
b32.skip 262144 16 [2, 512, 16, 16] float32
b32.conv0 2359808 16 [2, 512, 32, 32] float32
b32.conv1 2359808 16 [2, 512, 16, 16] float32
b32 - 16 [2, 512, 16, 16] float32
b16.skip 262144 16 [2, 512, 8, 8] float32
b16.conv0 2359808 16 [2, 512, 16, 16] float32
b16.conv1 2359808 16 [2, 512, 8, 8] float32
b16 - 16 [2, 512, 8, 8] float32
b8.skip 262144 16 [2, 512, 4, 4] float32
b8.conv0 2359808 16 [2, 512, 8, 8] float32
b8.conv1 2359808 16 [2, 512, 4, 4] float32
b8 - 16 [2, 512, 4, 4] float32
b4.mbstd - - [2, 513, 4, 4] float32
b4.conv 2364416 16 [2, 512, 4, 4] float32
b4.fc 4194816 - [2, 512] float32
b4.out 513 - [2, 1] float32


Total 28982849 480 - -

Setting up augmentation...
Distributing across 2 GPUs...
Setting up training phases...
Exporting sample images...
Initializing logs...
Skipping tfevents export: No module named 'tensorboard'
Training for 3000 kimg...

tick 0 kimg 0.0 time 30s sec/tick 2.4 sec/kimg 601.84 maintenance 27.9 cpumem 4.56 gpumem 5.39 augment 0.000
Traceback (most recent call last):
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/train.py", line 538, in
main() # pylint: disable=no-value-for-parameter
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 1128, in call
return self.main(*args, **kwargs)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/decorators.py", line 26, in new_func
return f(get_current_context(), *args, **kwargs)
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/train.py", line 533, in main
torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/train.py", line 383, in subprocess_fn
training_loop.training_loop(rank=rank, **args)
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/training/training_loop.py", line 409, in training_loop
misc.check_ddp_consistency(module, ignore_regex=r'.*\.w_avg')
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/torch_utils/misc.py", line 187, in check_ddp_consistency
assert (nan_to_num(tensor) == nan_to_num(other)).all(), fullname
AssertionError: Discriminator.b512.fromrgb.weight

The bash command used was as below:
--outdir=./training-runs
--data=/root/autodl-tmp/stylegan/img/NEW20230111allrotate.zip
--gpus=2
--batch=4
--gamma=6.6
--cfg=stylegan2
--kimg=3000
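
For context, the assertion at the end of the traceback comes from a consistency check that runs before multi-GPU training proceeds: every rank compares each parameter and buffer of the freshly constructed networks against rank 0's copy, and the run aborts if any tensor differs. A minimal sketch of that kind of check (an illustration of the idea, not the repository's exact code; it assumes torch.distributed is already initialized and a PyTorch version that provides torch.nan_to_num) looks like:

```python
import re
import torch

def check_ddp_consistency_sketch(module, ignore_regex=None):
    # Compare every parameter/buffer on this rank against rank 0's copy.
    for name, tensor in list(module.named_parameters()) + list(module.named_buffers()):
        fullname = type(module).__name__ + '.' + name
        if ignore_regex is not None and re.fullmatch(ignore_regex, fullname):
            continue
        other = tensor.detach().clone()
        torch.distributed.broadcast(other, src=0)  # every rank receives rank 0's tensor
        # nan_to_num avoids spurious mismatches caused by NaNs
        assert (torch.nan_to_num(tensor) == torch.nan_to_num(other)).all(), fullname
```

So `AssertionError: Discriminator.b512.fromrgb.weight` means the two processes ended up with different discriminator weights before the first real training step.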

got the same problem

I've got the same problem, do you have any solutions? Thank you so much!!

Make sure you call misc.print_module_summary for both G and D.

This function makes the parameters at all nodes become the same.
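
If running the module summary on every rank is not convenient, another possible workaround is to force every rank to adopt rank 0's weights right after the networks are constructed, so all processes start from identical parameters and the consistency check passes. This is only a sketch under the assumption that torch.distributed is already initialized (as it is inside subprocess_fn); the call site named in the comment is hypothetical:

```python
import torch

def sync_params_from_rank0(*modules):
    # Overwrite this rank's parameters/buffers in place with rank 0's copies.
    for module in modules:
        if module is None:
            continue
        for tensor in list(module.parameters()) + list(module.buffers()):
            torch.distributed.broadcast(tensor, src=0)

# Hypothetical call site: in training_loop.py, after G, D and G_ema are built
# and before the training phases are set up, e.g.
#   if num_gpus > 1:
#       sync_params_from_rank0(G, D, G_ema)
```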