Have you ever tried batch normalization?
twmht opened this issue · comments
I tried to add batch normalization to your modified ResNet-20, but the loss became 87.3365. As far as I know, BN helps the network learn more quickly. Is it possible to use batch normalization together with AM-Softmax?
Here is the prototxt
# Network input: declares the "data" blob with shape 1 x 3 x 160 x 160
# (presumably N x C x H x W, Caffe's usual blob layout -- confirm).
layer {
  name: "input"
  type: "Input"
  top: "data"
  input_param { shape { dim: 1 dim: 3 dim: 160 dim: 160 } }
}
# ---- conv1_1 group: Convolution -> BatchNorm -> Scale -> PReLU ----
# The last three layers all use "conv1_1" as both bottom and top, i.e. they
# run in place on the convolution output.

# 3x3 convolution, stride 2, pad 1, 64 outputs, reading "data".
# The two param blocks are, in Caffe's order, weights then bias.
layer {
  name: "conv1_1"
  type: "Convolution"
  bottom: "data"
  top: "conv1_1"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler { type: "xavier" }
  }
}
# Batch normalization; all three param blocks have lr_mult 0, so the solver
# applies no learning-rate-scaled update to this layer's blobs.
layer {
  name: "conv1_1/bn"
  type: "BatchNorm"
  bottom: "conv1_1"
  top: "conv1_1"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias (bias_term: true); neither is weight-decayed.
layer {
  name: "conv1_1/scale"
  type: "Scale"
  bottom: "conv1_1"
  top: "conv1_1"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# Activation: the layer is named relu1_1 but its type is PReLU.
layer {
  name: "relu1_1"
  type: "PReLU"
  bottom: "conv1_1"
  top: "conv1_1"
}
# ---- Residual unit res1_3: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 64 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "conv1_1".
layer {
  name: "conv1_2"
  type: "Convolution"
  bottom: "conv1_1"
  top: "conv1_2"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv1_2/bn"
  type: "BatchNorm"
  bottom: "conv1_2"
  top: "conv1_2"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv1_2/scale"
  type: "Scale"
  bottom: "conv1_2"
  top: "conv1_2"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation (layer name says relu, type is PReLU).
layer {
  name: "relu1_2"
  type: "PReLU"
  bottom: "conv1_2"
  top: "conv1_2"
}
# Second convolution of the unit, reading "conv1_2".
layer {
  name: "conv1_3"
  type: "Convolution"
  bottom: "conv1_2"
  top: "conv1_3"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv1_3/bn"
  type: "BatchNorm"
  bottom: "conv1_3"
  top: "conv1_3"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv1_3/scale"
  type: "Scale"
  bottom: "conv1_3"
  top: "conv1_3"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu1_3"
  type: "PReLU"
  bottom: "conv1_3"
  top: "conv1_3"
}
# Shortcut merge of "conv1_1" and "conv1_3". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res1_3"
  type: "Eltwise"
  bottom: "conv1_1"
  bottom: "conv1_3"
  top: "res1_3"
}
# ---- conv2_1 group: Convolution -> BatchNorm -> Scale -> PReLU ----
# BatchNorm, Scale and PReLU run in place on the "conv2_1" blob.

# 3x3 convolution, stride 2, pad 1, 128 outputs, reading "res1_3".
# The two param blocks are, in Caffe's order, weights then bias.
layer {
  name: "conv2_1"
  type: "Convolution"
  bottom: "res1_3"
  top: "conv2_1"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv2_1/bn"
  type: "BatchNorm"
  bottom: "conv2_1"
  top: "conv2_1"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias (bias_term: true), no weight decay.
layer {
  name: "conv2_1/scale"
  type: "Scale"
  bottom: "conv2_1"
  top: "conv2_1"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation (layer name says relu, type is PReLU).
layer {
  name: "relu2_1"
  type: "PReLU"
  bottom: "conv2_1"
  top: "conv2_1"
}
# ---- Residual unit res2_3: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 128 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "conv2_1".
layer {
  name: "conv2_2"
  type: "Convolution"
  bottom: "conv2_1"
  top: "conv2_2"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv2_2/bn"
  type: "BatchNorm"
  bottom: "conv2_2"
  top: "conv2_2"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv2_2/scale"
  type: "Scale"
  bottom: "conv2_2"
  top: "conv2_2"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu2_2"
  type: "PReLU"
  bottom: "conv2_2"
  top: "conv2_2"
}
# Second convolution of the unit, reading "conv2_2".
layer {
  name: "conv2_3"
  type: "Convolution"
  bottom: "conv2_2"
  top: "conv2_3"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv2_3/bn"
  type: "BatchNorm"
  bottom: "conv2_3"
  top: "conv2_3"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv2_3/scale"
  type: "Scale"
  bottom: "conv2_3"
  top: "conv2_3"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu2_3"
  type: "PReLU"
  bottom: "conv2_3"
  top: "conv2_3"
}
# Shortcut merge of "conv2_1" and "conv2_3". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res2_3"
  type: "Eltwise"
  bottom: "conv2_1"
  bottom: "conv2_3"
  top: "res2_3"
}
# ---- Residual unit res2_5: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 128 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "res2_3".
layer {
  name: "conv2_4"
  type: "Convolution"
  bottom: "res2_3"
  top: "conv2_4"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv2_4/bn"
  type: "BatchNorm"
  bottom: "conv2_4"
  top: "conv2_4"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv2_4/scale"
  type: "Scale"
  bottom: "conv2_4"
  top: "conv2_4"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu2_4"
  type: "PReLU"
  bottom: "conv2_4"
  top: "conv2_4"
}
# Second convolution of the unit, reading "conv2_4".
layer {
  name: "conv2_5"
  type: "Convolution"
  bottom: "conv2_4"
  top: "conv2_5"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv2_5/bn"
  type: "BatchNorm"
  bottom: "conv2_5"
  top: "conv2_5"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv2_5/scale"
  type: "Scale"
  bottom: "conv2_5"
  top: "conv2_5"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu2_5"
  type: "PReLU"
  bottom: "conv2_5"
  top: "conv2_5"
}
# Shortcut merge of "res2_3" and "conv2_5". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res2_5"
  type: "Eltwise"
  bottom: "res2_3"
  bottom: "conv2_5"
  top: "res2_5"
}
# ---- conv3_1 group: Convolution -> BatchNorm -> Scale -> PReLU ----
# BatchNorm, Scale and PReLU run in place on the "conv3_1" blob.

# 3x3 convolution, stride 2, pad 1, 256 outputs, reading "res2_5".
# The two param blocks are, in Caffe's order, weights then bias.
layer {
  name: "conv3_1"
  type: "Convolution"
  bottom: "res2_5"
  top: "conv3_1"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_1/bn"
  type: "BatchNorm"
  bottom: "conv3_1"
  top: "conv3_1"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias (bias_term: true), no weight decay.
layer {
  name: "conv3_1/scale"
  type: "Scale"
  bottom: "conv3_1"
  top: "conv3_1"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation (layer name says relu, type is PReLU).
layer {
  name: "relu3_1"
  type: "PReLU"
  bottom: "conv3_1"
  top: "conv3_1"
}
# ---- Residual unit res3_3: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 256 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "conv3_1".
layer {
  name: "conv3_2"
  type: "Convolution"
  bottom: "conv3_1"
  top: "conv3_2"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_2/bn"
  type: "BatchNorm"
  bottom: "conv3_2"
  top: "conv3_2"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_2/scale"
  type: "Scale"
  bottom: "conv3_2"
  top: "conv3_2"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_2"
  type: "PReLU"
  bottom: "conv3_2"
  top: "conv3_2"
}
# Second convolution of the unit, reading "conv3_2".
layer {
  name: "conv3_3"
  type: "Convolution"
  bottom: "conv3_2"
  top: "conv3_3"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_3/bn"
  type: "BatchNorm"
  bottom: "conv3_3"
  top: "conv3_3"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_3/scale"
  type: "Scale"
  bottom: "conv3_3"
  top: "conv3_3"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_3"
  type: "PReLU"
  bottom: "conv3_3"
  top: "conv3_3"
}
# Shortcut merge of "conv3_1" and "conv3_3". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res3_3"
  type: "Eltwise"
  bottom: "conv3_1"
  bottom: "conv3_3"
  top: "res3_3"
}
# ---- Residual unit res3_5: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 256 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "res3_3".
layer {
  name: "conv3_4"
  type: "Convolution"
  bottom: "res3_3"
  top: "conv3_4"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_4/bn"
  type: "BatchNorm"
  bottom: "conv3_4"
  top: "conv3_4"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_4/scale"
  type: "Scale"
  bottom: "conv3_4"
  top: "conv3_4"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_4"
  type: "PReLU"
  bottom: "conv3_4"
  top: "conv3_4"
}
# Second convolution of the unit, reading "conv3_4".
layer {
  name: "conv3_5"
  type: "Convolution"
  bottom: "conv3_4"
  top: "conv3_5"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_5/bn"
  type: "BatchNorm"
  bottom: "conv3_5"
  top: "conv3_5"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_5/scale"
  type: "Scale"
  bottom: "conv3_5"
  top: "conv3_5"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_5"
  type: "PReLU"
  bottom: "conv3_5"
  top: "conv3_5"
}
# Shortcut merge of "res3_3" and "conv3_5". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res3_5"
  type: "Eltwise"
  bottom: "res3_3"
  bottom: "conv3_5"
  top: "res3_5"
}
# ---- Residual unit res3_7: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 256 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "res3_5".
layer {
  name: "conv3_6"
  type: "Convolution"
  bottom: "res3_5"
  top: "conv3_6"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_6/bn"
  type: "BatchNorm"
  bottom: "conv3_6"
  top: "conv3_6"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_6/scale"
  type: "Scale"
  bottom: "conv3_6"
  top: "conv3_6"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_6"
  type: "PReLU"
  bottom: "conv3_6"
  top: "conv3_6"
}
# Second convolution of the unit, reading "conv3_6".
layer {
  name: "conv3_7"
  type: "Convolution"
  bottom: "conv3_6"
  top: "conv3_7"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_7/bn"
  type: "BatchNorm"
  bottom: "conv3_7"
  top: "conv3_7"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_7/scale"
  type: "Scale"
  bottom: "conv3_7"
  top: "conv3_7"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_7"
  type: "PReLU"
  bottom: "conv3_7"
  top: "conv3_7"
}
# Shortcut merge of "res3_5" and "conv3_7". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res3_7"
  type: "Eltwise"
  bottom: "res3_5"
  bottom: "conv3_7"
  top: "res3_7"
}
# ---- Residual unit res3_9: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 256 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "res3_7".
layer {
  name: "conv3_8"
  type: "Convolution"
  bottom: "res3_7"
  top: "conv3_8"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_8/bn"
  type: "BatchNorm"
  bottom: "conv3_8"
  top: "conv3_8"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_8/scale"
  type: "Scale"
  bottom: "conv3_8"
  top: "conv3_8"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_8"
  type: "PReLU"
  bottom: "conv3_8"
  top: "conv3_8"
}
# Second convolution of the unit, reading "conv3_8".
layer {
  name: "conv3_9"
  type: "Convolution"
  bottom: "conv3_8"
  top: "conv3_9"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv3_9/bn"
  type: "BatchNorm"
  bottom: "conv3_9"
  top: "conv3_9"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv3_9/scale"
  type: "Scale"
  bottom: "conv3_9"
  top: "conv3_9"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu3_9"
  type: "PReLU"
  bottom: "conv3_9"
  top: "conv3_9"
}
# Shortcut merge of "res3_7" and "conv3_9". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res3_9"
  type: "Eltwise"
  bottom: "res3_7"
  bottom: "conv3_9"
  top: "res3_9"
}
# ---- conv4_1 group: Convolution -> BatchNorm -> Scale -> PReLU ----
# BatchNorm, Scale and PReLU run in place on the "conv4_1" blob.

# 3x3 convolution, stride 2, pad 1, 512 outputs, reading "res3_9".
# The two param blocks are, in Caffe's order, weights then bias.
layer {
  name: "conv4_1"
  type: "Convolution"
  bottom: "res3_9"
  top: "conv4_1"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv4_1/bn"
  type: "BatchNorm"
  bottom: "conv4_1"
  top: "conv4_1"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias (bias_term: true), no weight decay.
layer {
  name: "conv4_1/scale"
  type: "Scale"
  bottom: "conv4_1"
  top: "conv4_1"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation (layer name says relu, type is PReLU).
layer {
  name: "relu4_1"
  type: "PReLU"
  bottom: "conv4_1"
  top: "conv4_1"
}
# ---- Residual unit res4_3: two conv/BN/Scale/PReLU stacks + Eltwise ----
# Both convolutions are 3x3, stride 1, pad 1, 512 outputs. The BatchNorm,
# Scale and PReLU layers after each one run in place on its top blob.

# First convolution of the unit, reading "conv4_1".
layer {
  name: "conv4_2"
  type: "Convolution"
  bottom: "conv4_1"
  top: "conv4_2"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv4_2/bn"
  type: "BatchNorm"
  bottom: "conv4_2"
  top: "conv4_2"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv4_2/scale"
  type: "Scale"
  bottom: "conv4_2"
  top: "conv4_2"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu4_2"
  type: "PReLU"
  bottom: "conv4_2"
  top: "conv4_2"
}
# Second convolution of the unit, reading "conv4_2".
layer {
  name: "conv4_3"
  type: "Convolution"
  bottom: "conv4_2"
  top: "conv4_3"
  param { lr_mult: 1.0 decay_mult: 1.0 }
  param { lr_mult: 2.0 decay_mult: 0 }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler { type: "xavier" }
  }
}
# Batch normalization with lr_mult 0 on all three param blocks.
layer {
  name: "conv4_3/bn"
  type: "BatchNorm"
  bottom: "conv4_3"
  top: "conv4_3"
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
# Learnable scale plus bias, no weight decay.
layer {
  name: "conv4_3/scale"
  type: "Scale"
  bottom: "conv4_3"
  top: "conv4_3"
  param { lr_mult: 1 decay_mult: 0 }
  param { lr_mult: 2 decay_mult: 0 }
  scale_param { bias_term: true }
}
# PReLU activation.
layer {
  name: "relu4_3"
  type: "PReLU"
  bottom: "conv4_3"
  top: "conv4_3"
}
# Shortcut merge of "conv4_1" and "conv4_3". No eltwise_param is given, so
# Caffe's default Eltwise operation applies (SUM per the Caffe layer docs).
layer {
  name: "res4_3"
  type: "Eltwise"
  bottom: "conv4_1"
  bottom: "conv4_3"
  top: "res4_3"
}
# fc5: InnerProduct layer mapping "res4_3" to a 512-dimensional output.
# Weights use the "xavier" filler; the bias starts at constant 0.
layer {
  name: "fc5"
  type: "InnerProduct"
  bottom: "res4_3"
  top: "fc5"
  inner_product_param {
    num_output: 512
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0 }
  }
}
Please also add BN after fc5. I've tried that. It could converge.
Adding Batch Normalization would improve the results. It is recommended to do so if you want to get higher scores.
Actually, I have tried adding BN after both fc5 and fc6 (fc6 may not need to be normalized), but the loss is still 87.3365. What is your learning rate?
I will try to remove the BN after fc6, so keep this issue open:)
I have added BN after fc5, but the loss is still 87.3365. I also tried reducing the learning rate to 0.01, but it still does not work.
Any advice?
thank you
OK. It seems that there were some problems with my own fork of Caffe. The problem is now solved.