happynear / AMSoftmax

A simple yet effective loss function for face verification.

Have you ever tried batch normalization?

twmht opened this issue

@happynear

I tried to add batch normalization to your modified resnet20, but the loss became 87.3365. As far as I know, BN helps a network learn more quickly. Is it possible to use batch normalization with AM-Softmax?
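(Context: 87.3365 is the value Caffe's softmax loss saturates at when the predicted probability of the true class underflows to the smallest positive single-precision float, i.e. the network has diverged rather than merely trained slowly:

-log(FLT_MIN) = -log(1.17549435e-38) ≈ 87.3365

so a loss pinned at exactly this number is a divergence symptom, not a tuning problem.)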

Here is the prototxt:

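# Input: a single 3x160x160 image.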
layer {
  name: "input"
  type: "Input"
  top: "data"
  input_param {
    shape {
      dim: 1
      dim: 3
      dim: 160
      dim: 160
    }
  }
}
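# Stage 1: stride-2 conv to 64 channels, then one residual block (conv1_2/conv1_3 summed with conv1_1).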
layer {
  name: "conv1_1"
  type: "Convolution"
  bottom: "data"
  top: "conv1_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler {
      type: "xavier"
    }
  }
}
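# Every conv is followed by a BatchNorm + Scale pair. BatchNorm's three blobs
# (mean, variance, moving-average factor) are running statistics rather than
# learned weights, hence lr_mult: 0; the affine part is learned by the Scale layer.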
layer {
  name: "conv1_1/bn"
  type: "BatchNorm"
  bottom: "conv1_1"
  top: "conv1_1"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv1_1/scale"
  type: "Scale"
  bottom: "conv1_1"
  top: "conv1_1"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu1_1"
  type: "PReLU"
  bottom: "conv1_1"
  top: "conv1_1"
}
layer {
  name: "conv1_2"
  type: "Convolution"
  bottom: "conv1_1"
  top: "conv1_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv1_2/bn"
  type: "BatchNorm"
  bottom: "conv1_2"
  top: "conv1_2"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv1_2/scale"
  type: "Scale"
  bottom: "conv1_2"
  top: "conv1_2"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu1_2"
  type: "PReLU"
  bottom: "conv1_2"
  top: "conv1_2"
}
layer {
  name: "conv1_3"
  type: "Convolution"
  bottom: "conv1_2"
  top: "conv1_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv1_3/bn"
  type: "BatchNorm"
  bottom: "conv1_3"
  top: "conv1_3"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv1_3/scale"
  type: "Scale"
  bottom: "conv1_3"
  top: "conv1_3"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu1_3"
  type: "PReLU"
  bottom: "conv1_3"
  top: "conv1_3"
}
layer {
  name: "res1_3"
  type: "Eltwise"
  bottom: "conv1_1"
  bottom: "conv1_3"
  top: "res1_3"
}
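# Stage 2: stride-2 conv to 128 channels, then two residual blocks.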
layer {
  name: "conv2_1"
  type: "Convolution"
  bottom: "res1_3"
  top: "conv2_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv2_1/bn"
  type: "BatchNorm"
  bottom: "conv2_1"
  top: "conv2_1"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv2_1/scale"
  type: "Scale"
  bottom: "conv2_1"
  top: "conv2_1"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu2_1"
  type: "PReLU"
  bottom: "conv2_1"
  top: "conv2_1"
}
layer {
  name: "conv2_2"
  type: "Convolution"
  bottom: "conv2_1"
  top: "conv2_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv2_2/bn"
  type: "BatchNorm"
  bottom: "conv2_2"
  top: "conv2_2"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv2_2/scale"
  type: "Scale"
  bottom: "conv2_2"
  top: "conv2_2"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu2_2"
  type: "PReLU"
  bottom: "conv2_2"
  top: "conv2_2"
}
layer {
  name: "conv2_3"
  type: "Convolution"
  bottom: "conv2_2"
  top: "conv2_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv2_3/bn"
  type: "BatchNorm"
  bottom: "conv2_3"
  top: "conv2_3"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv2_3/scale"
  type: "Scale"
  bottom: "conv2_3"
  top: "conv2_3"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu2_3"
  type: "PReLU"
  bottom: "conv2_3"
  top: "conv2_3"
}
layer {
  name: "res2_3"
  type: "Eltwise"
  bottom: "conv2_1"
  bottom: "conv2_3"
  top: "res2_3"
}
layer {
  name: "conv2_4"
  type: "Convolution"
  bottom: "res2_3"
  top: "conv2_4"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv2_4/bn"
  type: "BatchNorm"
  bottom: "conv2_4"
  top: "conv2_4"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv2_4/scale"
  type: "Scale"
  bottom: "conv2_4"
  top: "conv2_4"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu2_4"
  type: "PReLU"
  bottom: "conv2_4"
  top: "conv2_4"
}
layer {
  name: "conv2_5"
  type: "Convolution"
  bottom: "conv2_4"
  top: "conv2_5"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv2_5/bn"
  type: "BatchNorm"
  bottom: "conv2_5"
  top: "conv2_5"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv2_5/scale"
  type: "Scale"
  bottom: "conv2_5"
  top: "conv2_5"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu2_5"
  type: "PReLU"
  bottom: "conv2_5"
  top: "conv2_5"
}
layer {
  name: "res2_5"
  type: "Eltwise"
  bottom: "res2_3"
  bottom: "conv2_5"
  top: "res2_5"
}
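# Stage 3: stride-2 conv to 256 channels, then four residual blocks.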
layer {
  name: "conv3_1"
  type: "Convolution"
  bottom: "res2_5"
  top: "conv3_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_1/bn"
  type: "BatchNorm"
  bottom: "conv3_1"
  top: "conv3_1"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_1/scale"
  type: "Scale"
  bottom: "conv3_1"
  top: "conv3_1"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_1"
  type: "PReLU"
  bottom: "conv3_1"
  top: "conv3_1"
}
layer {
  name: "conv3_2"
  type: "Convolution"
  bottom: "conv3_1"
  top: "conv3_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_2/bn"
  type: "BatchNorm"
  bottom: "conv3_2"
  top: "conv3_2"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_2/scale"
  type: "Scale"
  bottom: "conv3_2"
  top: "conv3_2"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_2"
  type: "PReLU"
  bottom: "conv3_2"
  top: "conv3_2"
}
layer {
  name: "conv3_3"
  type: "Convolution"
  bottom: "conv3_2"
  top: "conv3_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_3/bn"
  type: "BatchNorm"
  bottom: "conv3_3"
  top: "conv3_3"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_3/scale"
  type: "Scale"
  bottom: "conv3_3"
  top: "conv3_3"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_3"
  type: "PReLU"
  bottom: "conv3_3"
  top: "conv3_3"
}
layer {
  name: "res3_3"
  type: "Eltwise"
  bottom: "conv3_1"
  bottom: "conv3_3"
  top: "res3_3"
}
layer {
  name: "conv3_4"
  type: "Convolution"
  bottom: "res3_3"
  top: "conv3_4"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_4/bn"
  type: "BatchNorm"
  bottom: "conv3_4"
  top: "conv3_4"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_4/scale"
  type: "Scale"
  bottom: "conv3_4"
  top: "conv3_4"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_4"
  type: "PReLU"
  bottom: "conv3_4"
  top: "conv3_4"
}
layer {
  name: "conv3_5"
  type: "Convolution"
  bottom: "conv3_4"
  top: "conv3_5"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_5/bn"
  type: "BatchNorm"
  bottom: "conv3_5"
  top: "conv3_5"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_5/scale"
  type: "Scale"
  bottom: "conv3_5"
  top: "conv3_5"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_5"
  type: "PReLU"
  bottom: "conv3_5"
  top: "conv3_5"
}
layer {
  name: "res3_5"
  type: "Eltwise"
  bottom: "res3_3"
  bottom: "conv3_5"
  top: "res3_5"
}
layer {
  name: "conv3_6"
  type: "Convolution"
  bottom: "res3_5"
  top: "conv3_6"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_6/bn"
  type: "BatchNorm"
  bottom: "conv3_6"
  top: "conv3_6"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_6/scale"
  type: "Scale"
  bottom: "conv3_6"
  top: "conv3_6"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_6"
  type: "PReLU"
  bottom: "conv3_6"
  top: "conv3_6"
}
layer {
  name: "conv3_7"
  type: "Convolution"
  bottom: "conv3_6"
  top: "conv3_7"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_7/bn"
  type: "BatchNorm"
  bottom: "conv3_7"
  top: "conv3_7"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_7/scale"
  type: "Scale"
  bottom: "conv3_7"
  top: "conv3_7"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_7"
  type: "PReLU"
  bottom: "conv3_7"
  top: "conv3_7"
}
layer {
  name: "res3_7"
  type: "Eltwise"
  bottom: "res3_5"
  bottom: "conv3_7"
  top: "res3_7"
}
layer {
  name: "conv3_8"
  type: "Convolution"
  bottom: "res3_7"
  top: "conv3_8"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_8/bn"
  type: "BatchNorm"
  bottom: "conv3_8"
  top: "conv3_8"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_8/scale"
  type: "Scale"
  bottom: "conv3_8"
  top: "conv3_8"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_8"
  type: "PReLU"
  bottom: "conv3_8"
  top: "conv3_8"
}
layer {
  name: "conv3_9"
  type: "Convolution"
  bottom: "conv3_8"
  top: "conv3_9"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv3_9/bn"
  type: "BatchNorm"
  bottom: "conv3_9"
  top: "conv3_9"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv3_9/scale"
  type: "Scale"
  bottom: "conv3_9"
  top: "conv3_9"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu3_9"
  type: "PReLU"
  bottom: "conv3_9"
  top: "conv3_9"
}
layer {
  name: "res3_9"
  type: "Eltwise"
  bottom: "res3_7"
  bottom: "conv3_9"
  top: "res3_9"
}
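# Stage 4: stride-2 conv to 512 channels, then one residual block.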
layer {
  name: "conv4_1"
  type: "Convolution"
  bottom: "res3_9"
  top: "conv4_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 2
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv4_1/bn"
  type: "BatchNorm"
  bottom: "conv4_1"
  top: "conv4_1"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv4_1/scale"
  type: "Scale"
  bottom: "conv4_1"
  top: "conv4_1"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu4_1"
  type: "PReLU"
  bottom: "conv4_1"
  top: "conv4_1"
}
layer {
  name: "conv4_2"
  type: "Convolution"
  bottom: "conv4_1"
  top: "conv4_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv4_2/bn"
  type: "BatchNorm"
  bottom: "conv4_2"
  top: "conv4_2"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv4_2/scale"
  type: "Scale"
  bottom: "conv4_2"
  top: "conv4_2"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu4_2"
  type: "PReLU"
  bottom: "conv4_2"
  top: "conv4_2"
}
layer {
  name: "conv4_3"
  type: "Convolution"
  bottom: "conv4_2"
  top: "conv4_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "conv4_3/bn"
  type: "BatchNorm"
  bottom: "conv4_3"
  top: "conv4_3"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "conv4_3/scale"
  type: "Scale"
  bottom: "conv4_3"
  top: "conv4_3"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "relu4_3"
  type: "PReLU"
  bottom: "conv4_3"
  top: "conv4_3"
}
layer {
  name: "res4_3"
  type: "Eltwise"
  bottom: "conv4_1"
  bottom: "conv4_3"
  top: "res4_3"
}
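# fc5: 512-d feature embedding; the AM-Softmax classifier (the fc6 mentioned later in the thread) is not part of this snippet.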
layer {
  name: "fc5"
  type: "InnerProduct"
  bottom: "res4_3"
  top: "fc5"
  inner_product_param {
    num_output: 512
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}

Please also add BN after fc5. I've tried that, and it converges.
Adding batch normalization does improve the results, so it is recommended if you want higher scores.
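A minimal sketch of what that might look like, reusing the BN + Scale pattern that already follows each convolution above (the layer names fc5/bn and fc5/scale are illustrative, not taken from the repo):

layer {
  name: "fc5/bn"
  type: "BatchNorm"
  bottom: "fc5"
  top: "fc5"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}
layer {
  name: "fc5/scale"
  type: "Scale"
  bottom: "fc5"
  top: "fc5"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}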

@happynear

Actually, I have tried adding BN after fc5 and fc6 (maybe fc6 does not need to be normalized), but the loss is still 87.3365. What is your learning rate?

I will try removing the BN after fc6, so please keep this issue open :)

@happynear

I have added BN after fc5, but the loss is still 87.3365. I also tried reducing the learning rate to 0.01, but it still doesn't work.
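For reference, the rate change goes in the solver prototxt; a minimal sketch assuming plain SGD, where every value other than base_lr is an illustrative placeholder:

base_lr: 0.01
momentum: 0.9
weight_decay: 0.0005
lr_policy: "multistep"
gamma: 0.1
stepvalue: 16000
stepvalue: 24000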

Any advice?

thank you

OK, it seems there were some problems with my own fork of Caffe. The problem is solved now.