One bias in DnCNN becomes nan

Question

One bias in DnCNN becomes nan

alireza78h opened this issue 7 months ago · comments

Seyed Alireza Hosseini commented 7 months ago

I have my own sets of training and test images; all of them are 512x512 RGB images in PNG format. Here is my config file:

{
  "task": "dncnn25"  //  root/task/images-models-options
  , "model": "plain" // "plain"
  , "gpu_ids": [0]

  , "scale": 1       // broadcast to "netG" if SISR
  , "n_channels": 3  // broadcast to "datasets", 1 for grayscale, 3 for color

  , "merge_bn": true               // BN for DnCNN
  , "merge_bn_startpoint": 400000  // merge BN after N iterations

  , "path": {
    "root": "denoising"            // "denoising" | "superresolution"
    , "pretrained_netG": null      // path of pretrained model
  }

  , "datasets": {
    "train": {
      "name": "train_dataset"           // just name
      , "dataset_type": "dncnn"         // "dncnn" | "dnpatch" for dncnn,  | "fdncnn" | "ffdnet" | "sr" | "srmd" | "dpsr" | "plain" | "plainpatch"
      , "dataroot_H": "trainsets/trainH"// path of H training dataset
      , "dataroot_L": null              // path of L training dataset
      , "H_size": 40                    // patch size 40 | 64 | 96 | 128 | 192

      , "sigma": 25                     // 15, 25, 50 for DnCNN | [0, 75] for FFDNet and FDnCNN
      , "sigma_test": 25                // 15, 25, 50 for DnCNN and ffdnet

      , "dataloader_shuffle": true
      , "dataloader_num_workers": 8
      , "dataloader_batch_size": 16     // batch size 1 | 16 | 32 | 48 | 64 | 128
    }
    , "test": {
      "name": "test_dataset"            // just name
      , "dataset_type": "dncnn"         // "dncnn" | "dnpatch" for dncnn,  | "fdncnn" | "ffdnet" | "sr" | "srmd" | "dpsr" | "plain" | "plainpatch"
      , "dataroot_H": "testsets/liko"  // path of H testing dataset
      , "dataroot_L": null              // path of L testing dataset

      , "sigma": 25                     // 15, 25, 50 for DnCNN | [0, 75] for FFDNet and FDnCNN
      , "sigma_test": 25                // 15, 25, 50 for DnCNN and ffdnet

    }
  }

  , "netG": {
    "net_type": "dncnn" // "dncnn" | "fdncnn" | "ffdnet" | "srmd" | "dpsr" | "msrresnet0" |  "msrresnet1" | "rrdb" 
    , "in_nc": 1        // input channel number
    , "out_nc": 1       // output channel number
    , "nc": 64          // 64 for "dncnn"
    , "nb": 17          // 17 for "dncnn", 20 for dncnn3, 16 for "srresnet"
    , "gc": 32          // unused
    , "ng": 2           // unused
    , "reduction" : 16  // unused
    , "act_mode": "BR"  // "BR" for BN+ReLU | "R" for ReLU
    , "upsample_mode": "convtranspose"  // "pixelshuffle" | "convtranspose" | "upconv"
    , "downsample_mode": "strideconv"   // "strideconv" | "avgpool" | "maxpool"

    , "init_type": "orthogonal"         // "orthogonal" | "normal" | "uniform" | "xavier_normal" | "xavier_uniform" | "kaiming_normal" | "kaiming_uniform"
    , "init_bn_type": "uniform"         // "uniform" | "constant"
    , "init_gain": 0.2
  }

  , "train": {
    "G_lossfn_type": "l1"               // "l1" preferred | "l2sum" | "l2" | "ssim" 
    , "G_lossfn_weight": 1.0            // default

    , "G_optimizer_type": "adam"        // fixed, adam is enough
    , "G_optimizer_lr": 1e-4            // learning rate
    , "G_optimizer_clipgrad": null      // unused

    , "G_scheduler_type": "MultiStepLR" // "MultiStepLR" is enough
    , "G_scheduler_milestones": [200000, 400000, 600000, 800000, 1000000, 2000000]
    , "G_scheduler_gamma": 0.5

    , "G_regularizer_orthstep": null    // unused
    , "G_regularizer_clipstep": null    // unused

    , "checkpoint_test": 5000           // for testing
    , "checkpoint_save": 5000           // for saving model
    , "checkpoint_print": 200           // for print
  }
}

I ran `!python main_train_dncnn.py`, and the output is:

export CUDA_VISIBLE_DEVICES=0
number of GPUs is: 1
LogHandlers setup!
24-02-03 05:08:15.855 :   task: dncnn25
  model: plain
  gpu_ids: [0]
  scale: 1
  n_channels: 3
  merge_bn: False
  merge_bn_startpoint: 40
  path:[
    root: denoising
    pretrained_netG: None
    task: denoising/dncnn25
    log: denoising/dncnn25
    options: denoising/dncnn25/options
    models: denoising/dncnn25/models
    images: denoising/dncnn25/images
  ]
  datasets:[
    train:[
      name: train_dataset
      dataset_type: dncnn
      dataroot_H: trainsets/trainH
      dataroot_L: None
      H_size: 16
      sigma: 25
      sigma_test: 25
      dataloader_shuffle: True
      dataloader_num_workers: 8
      dataloader_batch_size: 16
      phase: train
      scale: 1
      n_channels: 3
    ]
    test:[
      name: test_dataset
      dataset_type: dncnn
      dataroot_H: testsets/liko
      dataroot_L: None
      sigma: 25
      sigma_test: 25
      phase: test
      scale: 1
      n_channels: 3
    ]
  ]
  netG:[
    net_type: dncnn
    in_nc: 1
    out_nc: 1
    nc: 64
    nb: 17
    gc: 32
    ng: 2
    reduction: 16
    act_mode: BR
    upsample_mode: convtranspose
    downsample_mode: strideconv
    init_type: orthogonal
    init_bn_type: uniform
    init_gain: 0.2
    scale: 1
  ]
  train:[
    G_lossfn_type: l1
    G_lossfn_weight: 1.0
    G_optimizer_type: adam
    G_optimizer_lr: 0.0001
    G_optimizer_clipgrad: None
    G_scheduler_type: MultiStepLR
    G_scheduler_milestones: [200000, 400000, 600000, 800000, 1000000, 2000000]
    G_scheduler_gamma: 0.5
    G_regularizer_orthstep: None
    G_regularizer_clipstep: None
    checkpoint_test: 5000
    checkpoint_save: 5000
    checkpoint_print: 200
    F_feature_layer: 34
    F_weights: 1.0
    F_lossfn_type: l1
    F_use_input_norm: True
    F_use_range_norm: False
    G_optimizer_betas: [0.9, 0.999]
    G_scheduler_restart_weights: 1
    G_optimizer_wd: 0
    G_optimizer_reuse: False
    G_param_strict: True
    E_param_strict: True
    E_decay: 0
  ]
  opt_path: options/train_dncnn.json
  is_train: True
  find_unused_parameters: True
  use_static_graph: False
  dist: False
  num_gpu: 1

24-02-03 05:08:15.856 : Random seed: 9679
Dataset: Denosing on AWGN with fixed sigma. Only dataroot_H is needed.
Dataset [DatasetDnCNN - train_dataset] is created.
24-02-03 05:08:15.856 : Number of train images: 14, iters: 1
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:557: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(_create_warning_msg(
Dataset: Denosing on AWGN with fixed sigma. Only dataroot_H is needed.
Dataset [DatasetDnCNN - test_dataset] is created.
Initialization method [orthogonal + uniform], gain is [0.20]
Training model [ModelPlain] is created.
24-02-03 05:08:16.109 : 
Networks name: DnCNN
Params number: 557057
Net structure:
DnCNN(
  (model): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (4): ReLU(inplace=True)
    (5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (7): ReLU(inplace=True)
    (8): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (13): ReLU(inplace=True)
    (14): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (16): ReLU(inplace=True)
    (17): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (19): ReLU(inplace=True)
    (20): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (21): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (22): ReLU(inplace=True)
    (23): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (24): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (25): ReLU(inplace=True)
    (26): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (28): ReLU(inplace=True)
    (29): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (30): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (31): ReLU(inplace=True)
    (32): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (33): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (34): ReLU(inplace=True)
    (35): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (36): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (37): ReLU(inplace=True)
    (38): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (39): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (40): ReLU(inplace=True)
    (41): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (42): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (43): ReLU(inplace=True)
    (44): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (45): BatchNorm2d(64, eps=0.0001, momentum=0.9, affine=True, track_running_stats=True)
    (46): ReLU(inplace=True)
    (47): Conv2d(64, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
)

24-02-03 05:08:16.147 : 
 |  mean  |  min   |  max   |  std   || shape               
 |  0.001 | -0.074 |  0.071 |  0.025 | torch.Size([64, 1, 3, 3]) || model.0.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.0.bias
 | -0.000 | -0.037 |  0.035 |  0.008 | torch.Size([64, 64, 3, 3]) || model.2.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.2.bias
 |  0.569 |  0.107 |  0.999 |  0.299 | torch.Size([64]) || model.3.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.3.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.3.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.3.running_var
 | -0.000 | -0.038 |  0.036 |  0.008 | torch.Size([64, 64, 3, 3]) || model.5.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.5.bias
 |  0.524 |  0.104 |  0.994 |  0.263 | torch.Size([64]) || model.6.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.6.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.6.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.6.running_var
 | -0.000 | -0.038 |  0.033 |  0.008 | torch.Size([64, 64, 3, 3]) || model.8.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.8.bias
 |  0.560 |  0.105 |  0.996 |  0.271 | torch.Size([64]) || model.9.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.9.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.9.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.9.running_var
 | -0.000 | -0.031 |  0.035 |  0.008 | torch.Size([64, 64, 3, 3]) || model.11.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.11.bias
 |  0.490 |  0.113 |  0.962 |  0.254 | torch.Size([64]) || model.12.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.12.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.12.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.12.running_var
 | -0.000 | -0.032 |  0.031 |  0.008 | torch.Size([64, 64, 3, 3]) || model.14.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.14.bias
 |  0.579 |  0.106 |  0.979 |  0.253 | torch.Size([64]) || model.15.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.15.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.15.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.15.running_var
 | -0.000 | -0.039 |  0.033 |  0.008 | torch.Size([64, 64, 3, 3]) || model.17.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.17.bias
 |  0.606 |  0.105 |  0.997 |  0.267 | torch.Size([64]) || model.18.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.18.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.18.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.18.running_var
 |  0.000 | -0.031 |  0.033 |  0.008 | torch.Size([64, 64, 3, 3]) || model.20.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.20.bias
 |  0.548 |  0.114 |  0.994 |  0.262 | torch.Size([64]) || model.21.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.21.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.21.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.21.running_var
 |  0.000 | -0.034 |  0.036 |  0.008 | torch.Size([64, 64, 3, 3]) || model.23.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.23.bias
 |  0.556 |  0.110 |  0.998 |  0.254 | torch.Size([64]) || model.24.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.24.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.24.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.24.running_var
 |  0.000 | -0.034 |  0.030 |  0.008 | torch.Size([64, 64, 3, 3]) || model.26.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.26.bias
 |  0.560 |  0.114 |  0.987 |  0.263 | torch.Size([64]) || model.27.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.27.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.27.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.27.running_var
 | -0.000 | -0.036 |  0.034 |  0.008 | torch.Size([64, 64, 3, 3]) || model.29.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.29.bias
 |  0.526 |  0.114 |  0.980 |  0.258 | torch.Size([64]) || model.30.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.30.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.30.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.30.running_var
 |  0.000 | -0.033 |  0.042 |  0.008 | torch.Size([64, 64, 3, 3]) || model.32.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.32.bias
 |  0.544 |  0.107 |  0.995 |  0.292 | torch.Size([64]) || model.33.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.33.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.33.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.33.running_var
 |  0.000 | -0.035 |  0.033 |  0.008 | torch.Size([64, 64, 3, 3]) || model.35.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.35.bias
 |  0.517 |  0.105 |  0.953 |  0.276 | torch.Size([64]) || model.36.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.36.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.36.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.36.running_var
 |  0.000 | -0.033 |  0.036 |  0.008 | torch.Size([64, 64, 3, 3]) || model.38.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.38.bias
 |  0.502 |  0.107 |  0.988 |  0.240 | torch.Size([64]) || model.39.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.39.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.39.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.39.running_var
 | -0.000 | -0.033 |  0.037 |  0.008 | torch.Size([64, 64, 3, 3]) || model.41.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.41.bias
 |  0.600 |  0.130 |  0.995 |  0.271 | torch.Size([64]) || model.42.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.42.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.42.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.42.running_var
 | -0.000 | -0.034 |  0.034 |  0.008 | torch.Size([64, 64, 3, 3]) || model.44.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.44.bias
 |  0.582 |  0.107 |  0.992 |  0.253 | torch.Size([64]) || model.45.weight
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.45.bias
 |  0.000 |  0.000 |  0.000 |  0.000 | torch.Size([64]) || model.45.running_mean
 |  1.000 |  1.000 |  1.000 |  0.000 | torch.Size([64]) || model.45.running_var
 | -0.000 | -0.022 |  0.022 |  0.008 | torch.Size([1, 64, 3, 3]) || model.47.weight
 |  0.000 |  0.000 |  0.000 |    nan | torch.Size([1]) || model.47.bias

As you can see, the std for model.47.bias is nan, and the code just freezes.
Any help would be appreciated.