Can the model be trained using just the nuScenes mini dataset with 1 GPU?
zzzz23792364 opened this issue · comments
Sorry for the late response. The model will not converge if you train on just the nuScenes mini dataset with 1 GPU. But you can load our provided weights and finetune them on the mini dataset.
Hi，现在我用 1 GPU 训 nuScenes v1.0 全量数据，跑完 24 epoch 之后，mAP 只有 0.30 左右，loss 不再下降了，请问学习率太高了吗？（batch size 2、1 GPU、初始学习率 5e-5）
2023-12-13 20:15:50,297 - mmdet - INFO - Iter [337550/337560] lr: 5.000e-08, eta: 0:00:04, time: 0.422, data_time: 0.007, memory: 3654, frame_0_loss_cls: 0.5035, frame_0_loss_bbox: 0.8621, frame_0_d0.loss_cls: 0.5254, frame_0_d0.loss_bbox: 0.9224, frame_0_d1.loss_cls: 0.5187, frame_0_d1.loss_bbox: 0.9036, frame_0_d2.loss_cls: 0.5141, frame_0_d2.loss_bbox: 0.8810, frame_0_d3.loss_cls: 0.5049, frame_0_d3.loss_bbox: 0.8736, frame_0_d4.loss_cls: 0.5057, frame_0_d4.loss_bbox: 0.8620, frame_0_dn_loss_cls: 0.4425, frame_0_dn_loss_bbox: 0.8301, frame_0_d0.dn_loss_cls: 0.4698, frame_0_d0.dn_loss_bbox: 0.9424, frame_0_d1.dn_loss_cls: 0.4575, frame_0_d1.dn_loss_bbox: 0.8989, frame_0_d2.dn_loss_cls: 0.4512, frame_0_d2.dn_loss_bbox: 0.8623, frame_0_d3.dn_loss_cls: 0.4462, frame_0_d3.dn_loss_bbox: 0.8409, frame_0_d4.dn_loss_cls: 0.4432, frame_0_d4.dn_loss_bbox: 0.8317, frame_0_enc_loss_cls: 0.6009, frame_0_enc_loss_bbox: 0.2750, frame_0_enc_loss_iou: 0.7282, frame_0_centers2d_losses: 0.1873, frame_0_centerness_losses: 0.5275, loss: 18.6126, grad_norm: 99.3941
```
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# The 10 nuScenes detection classes; list order fixes the label indices.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
dataset_type = 'CustomNuScenesDataset'
data_root = './data/nuscenes/'
# Camera-only setup: lidar, radar and map inputs are disabled.
input_modality = {
    'use_lidar': False,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': True,
}
# Annotation/info files are read from the local disk backend.
file_client_args = {'backend': 'disk'}
# Training pipeline: load 6-view images and 2D/3D annotations, filter GT by
# range and class, apply image-space and global BEV augmentation, normalize,
# pad to a 32-divisible size, then bundle and collect tensors + metadata.
train_pipeline = [
    {'type': 'LoadMultiViewImageFromFiles', 'to_float32': True},
    {
        'type': 'LoadAnnotations3D',
        'with_bbox_3d': True,
        'with_label_3d': True,
        'with_bbox': True,
        'with_label': True,
        'with_bbox_depth': True,
    },
    # Drop GT boxes outside the perception range.
    {
        'type': 'ObjectRangeFilter',
        'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
    },
    {
        'type': 'ObjectNameFilter',
        'classes': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
        ],
    },
    # Image-space augmentation; rotation range is (0, 0), i.e. disabled.
    {
        'type': 'ResizeCropFlipRotImage',
        'data_aug_conf': {
            'resize_lim': (0.38, 0.55),
            'final_dim': (256, 704),
            'bot_pct_lim': (0.0, 0.0),
            'rot_lim': (0.0, 0.0),
            'H': 900,
            'W': 1600,
            'rand_flip': True,
        },
        'training': True,
    },
    # Global rotate / scale / translate augmentation in 3D.
    {
        'type': 'GlobalRotScaleTransImage',
        'rot_range': [-0.3925, 0.3925],
        'translation_std': [0, 0, 0],
        'scale_ratio_range': [0.95, 1.05],
        'reverse_angle': True,
        'training': True,
    },
    {
        'type': 'NormalizeMultiviewImage',
        'mean': [123.675, 116.28, 103.53],
        'std': [58.395, 57.12, 57.375],
        'to_rgb': True,
    },
    {'type': 'PadMultiViewImage', 'size_divisor': 32},
    {
        'type': 'PETRFormatBundle3D',
        'class_names': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
        ],
        'collect_keys': [
            'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
            'img_timestamp', 'ego_pose', 'ego_pose_inv', 'prev_exists'
        ],
    },
    {
        'type': 'Collect3D',
        'keys': [
            'gt_bboxes_3d', 'gt_labels_3d', 'img', 'gt_bboxes', 'gt_labels',
            'centers2d', 'depths', 'prev_exists', 'lidar2img', 'intrinsics',
            'extrinsics', 'timestamp', 'img_timestamp', 'ego_pose',
            'ego_pose_inv'
        ],
        'meta_keys': ('filename', 'ori_shape', 'img_shape', 'pad_shape',
                      'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                      'img_norm_cfg', 'scene_token', 'gt_bboxes_3d',
                      'gt_labels_3d'),
    },
]
# Test-time pipeline: same resize/normalize/pad as training but with the
# augmentation transforms in inference mode and no annotation loading.
test_pipeline = [
    {'type': 'LoadMultiViewImageFromFiles', 'to_float32': True},
    {
        'type': 'ResizeCropFlipRotImage',
        'data_aug_conf': {
            'resize_lim': (0.38, 0.55),
            'final_dim': (256, 704),
            'bot_pct_lim': (0.0, 0.0),
            'rot_lim': (0.0, 0.0),
            'H': 900,
            'W': 1600,
            'rand_flip': True,
        },
        'training': False,
    },
    {
        'type': 'NormalizeMultiviewImage',
        'mean': [123.675, 116.28, 103.53],
        'std': [58.395, 57.12, 57.375],
        'to_rgb': True,
    },
    {'type': 'PadMultiViewImage', 'size_divisor': 32},
    # Single-scale, no-flip TTA wrapper around bundle + collect.
    {
        'type': 'MultiScaleFlipAug3D',
        'img_scale': (1333, 800),
        'pts_scale_ratio': 1,
        'flip': False,
        'transforms': [
            {
                'type': 'PETRFormatBundle3D',
                'collect_keys': [
                    'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
                    'img_timestamp', 'ego_pose', 'ego_pose_inv'
                ],
                'class_names': [
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone'
                ],
                'with_label': False,
            },
            {
                'type': 'Collect3D',
                'keys': [
                    'img', 'lidar2img', 'intrinsics', 'extrinsics',
                    'timestamp', 'img_timestamp', 'ego_pose', 'ego_pose_inv'
                ],
                'meta_keys': ('filename', 'ori_shape', 'img_shape',
                              'pad_shape', 'scale_factor', 'flip',
                              'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
                              'scene_token'),
            },
        ],
    },
]
# Evaluation pipeline: loads raw lidar points plus 10 aggregated sweeps.
# Note the class order here differs from `class_names` above; it is kept
# verbatim because it only formats points, not labels (with_label=False).
eval_pipeline = [
    {
        'type': 'LoadPointsFromFile',
        'coord_type': 'LIDAR',
        'load_dim': 5,
        'use_dim': 5,
        'file_client_args': {'backend': 'disk'},
    },
    {
        'type': 'LoadPointsFromMultiSweeps',
        'sweeps_num': 10,
        'file_client_args': {'backend': 'disk'},
    },
    {
        'type': 'DefaultFormatBundle3D',
        'class_names': [
            'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
            'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
        ],
        'with_label': False,
    },
    {'type': 'Collect3D', 'keys': ['points']},
]
# Dataloader configuration: 2 samples per GPU and 4 worker processes per GPU.
# The train/val/test pipelines below are inline copies of the named
# `train_pipeline` / `test_pipeline` variables defined earlier in this file.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=4,
    # Training split, streaming (sequential) mode over temporal sequences.
    train=dict(
        type='CustomNuScenesDataset',
        data_root='./data/nuscenes/',
        ann_file='./data/nuscenes/nuscenes2d_temporal_infos_train.pkl',
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='LoadAnnotations3D',
                with_bbox_3d=True,
                with_label_3d=True,
                with_bbox=True,
                with_label=True,
                with_bbox_depth=True),
            # Drop GT boxes outside the perception range.
            dict(
                type='ObjectRangeFilter',
                point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
            dict(
                type='ObjectNameFilter',
                classes=[
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone'
                ]),
            # Image-space augmentation; rot_lim (0, 0) disables rotation.
            dict(
                type='ResizeCropFlipRotImage',
                data_aug_conf=dict(
                    resize_lim=(0.38, 0.55),
                    final_dim=(256, 704),
                    bot_pct_lim=(0.0, 0.0),
                    rot_lim=(0.0, 0.0),
                    H=900,
                    W=1600,
                    rand_flip=True),
                training=True),
            # Global rotate / scale / translate augmentation in 3D.
            dict(
                type='GlobalRotScaleTransImage',
                rot_range=[-0.3925, 0.3925],
                translation_std=[0, 0, 0],
                scale_ratio_range=[0.95, 1.05],
                reverse_angle=True,
                training=True),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='PETRFormatBundle3D',
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone'
                ],
                collect_keys=[
                    'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
                    'img_timestamp', 'ego_pose', 'ego_pose_inv', 'prev_exists'
                ]),
            dict(
                type='Collect3D',
                keys=[
                    'gt_bboxes_3d', 'gt_labels_3d', 'img', 'gt_bboxes',
                    'gt_labels', 'centers2d', 'depths', 'prev_exists',
                    'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
                    'img_timestamp', 'ego_pose', 'ego_pose_inv'
                ],
                meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                           'scale_factor', 'flip', 'box_mode_3d',
                           'box_type_3d', 'img_norm_cfg', 'scene_token',
                           'gt_bboxes_3d', 'gt_labels_3d'))
        ],
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
        ],
        modality=dict(
            use_lidar=False,
            use_camera=True,
            use_radar=False,
            use_map=False,
            use_external=True),
        test_mode=False,
        box_type_3d='LiDAR',
        # Supervise losses for the last frame only.
        num_frame_losses=1,
        # Each scene sequence is split into 2 chunks for batch sampling.
        seq_split_num=2,
        seq_mode=True,
        collect_keys=[
            'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
            'img_timestamp', 'ego_pose', 'ego_pose_inv', 'img', 'prev_exists',
            'img_metas'
        ],
        queue_length=1,
        use_valid_flag=True,
        filter_empty_gt=False),
    # Validation split (test_mode=True: no annotation loading in pipeline).
    val=dict(
        type='CustomNuScenesDataset',
        # NOTE(review): lacks the leading './' used by train's data_root —
        # resolves to the same directory, but confirm it is intentional.
        data_root='data/nuscenes/',
        ann_file='./data/nuscenes/nuscenes2d_temporal_infos_val.pkl',
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='ResizeCropFlipRotImage',
                data_aug_conf=dict(
                    resize_lim=(0.38, 0.55),
                    final_dim=(256, 704),
                    bot_pct_lim=(0.0, 0.0),
                    rot_lim=(0.0, 0.0),
                    H=900,
                    W=1600,
                    rand_flip=True),
                training=False),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='PadMultiViewImage', size_divisor=32),
            # Single-scale, no-flip wrapper around bundle + collect.
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(1333, 800),
                pts_scale_ratio=1,
                flip=False,
                transforms=[
                    dict(
                        type='PETRFormatBundle3D',
                        collect_keys=[
                            'lidar2img', 'intrinsics', 'extrinsics',
                            'timestamp', 'img_timestamp', 'ego_pose',
                            'ego_pose_inv'
                        ],
                        class_names=[
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone'
                        ],
                        with_label=False),
                    dict(
                        type='Collect3D',
                        keys=[
                            'img', 'lidar2img', 'intrinsics', 'extrinsics',
                            'timestamp', 'img_timestamp', 'ego_pose',
                            'ego_pose_inv'
                        ],
                        meta_keys=('filename', 'ori_shape', 'img_shape',
                                   'pad_shape', 'scale_factor', 'flip',
                                   'box_mode_3d', 'box_type_3d',
                                   'img_norm_cfg', 'scene_token'))
                ])
        ],
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
        ],
        modality=dict(
            use_lidar=False,
            use_camera=True,
            use_radar=False,
            use_map=False,
            use_external=True),
        test_mode=True,
        box_type_3d='LiDAR',
        collect_keys=[
            'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
            'img_timestamp', 'ego_pose', 'ego_pose_inv', 'img', 'img_metas'
        ],
        queue_length=1),
    # Test split: identical to val (same val annotation file and pipeline).
    test=dict(
        type='CustomNuScenesDataset',
        data_root='data/nuscenes/',
        ann_file='./data/nuscenes/nuscenes2d_temporal_infos_val.pkl',
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='ResizeCropFlipRotImage',
                data_aug_conf=dict(
                    resize_lim=(0.38, 0.55),
                    final_dim=(256, 704),
                    bot_pct_lim=(0.0, 0.0),
                    rot_lim=(0.0, 0.0),
                    H=900,
                    W=1600,
                    rand_flip=True),
                training=False),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(1333, 800),
                pts_scale_ratio=1,
                flip=False,
                transforms=[
                    dict(
                        type='PETRFormatBundle3D',
                        collect_keys=[
                            'lidar2img', 'intrinsics', 'extrinsics',
                            'timestamp', 'img_timestamp', 'ego_pose',
                            'ego_pose_inv'
                        ],
                        class_names=[
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone'
                        ],
                        with_label=False),
                    dict(
                        type='Collect3D',
                        keys=[
                            'img', 'lidar2img', 'intrinsics', 'extrinsics',
                            'timestamp', 'img_timestamp', 'ego_pose',
                            'ego_pose_inv'
                        ],
                        meta_keys=('filename', 'ori_shape', 'img_shape',
                                   'pad_shape', 'scale_factor', 'flip',
                                   'box_mode_3d', 'box_type_3d',
                                   'img_norm_cfg', 'scene_token'))
                ])
        ],
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
        ],
        modality=dict(
            use_lidar=False,
            use_camera=True,
            use_radar=False,
            use_map=False,
            use_external=True),
        test_mode=True,
        box_type_3d='LiDAR',
        collect_keys=[
            'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
            'img_timestamp', 'ego_pose', 'ego_pose_inv', 'img', 'img_metas'
        ],
        queue_length=1),
    # Samplers: sequence-aware infinite sampler for shuffled (training)
    # loading; plain distributed sampler otherwise.
    shuffler_sampler=dict(type='InfiniteGroupEachSampleInBatchSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
# Evaluation hook. interval=337560 equals the runner's max_iters, so
# evaluation runs only once, at the very end of training. The pipeline is
# an inline copy of `test_pipeline` defined above.
evaluation = dict(
    interval=337560,
    pipeline=[
        dict(type='LoadMultiViewImageFromFiles', to_float32=True),
        dict(
            type='ResizeCropFlipRotImage',
            data_aug_conf=dict(
                resize_lim=(0.38, 0.55),
                final_dim=(256, 704),
                bot_pct_lim=(0.0, 0.0),
                rot_lim=(0.0, 0.0),
                H=900,
                W=1600,
                rand_flip=True),
            training=False),
        dict(
            type='NormalizeMultiviewImage',
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
            to_rgb=True),
        dict(type='PadMultiViewImage', size_divisor=32),
        # Single-scale, no-flip wrapper around bundle + collect.
        dict(
            type='MultiScaleFlipAug3D',
            img_scale=(1333, 800),
            pts_scale_ratio=1,
            flip=False,
            transforms=[
                dict(
                    type='PETRFormatBundle3D',
                    collect_keys=[
                        'lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
                        'img_timestamp', 'ego_pose', 'ego_pose_inv'
                    ],
                    class_names=[
                        'car', 'truck', 'construction_vehicle', 'bus',
                        'trailer', 'barrier', 'motorcycle', 'bicycle',
                        'pedestrian', 'traffic_cone'
                    ],
                    with_label=False),
                dict(
                    type='Collect3D',
                    keys=[
                        'img', 'lidar2img', 'intrinsics', 'extrinsics',
                        'timestamp', 'img_timestamp', 'ego_pose',
                        'ego_pose_inv'
                    ],
                    meta_keys=('filename', 'ori_shape', 'img_shape',
                               'pad_shape', 'scale_factor', 'flip',
                               'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
                               'scene_token'))
            ])
    ])
# Checkpointing: save every 14065 iterations (one epoch) and keep the
# three most recent checkpoints.
checkpoint_config = {'interval': 14065, 'max_keep_ckpts': 3}
# Log to stdout and TensorBoard every 50 iterations.
log_config = {
    'interval': 50,
    'hooks': [{'type': 'TextLoggerHook'}, {'type': 'TensorboardLoggerHook'}],
}
dist_params = {'backend': 'nccl'}
log_level = 'INFO'
work_dir = 'work_dirs/stream_petr_r50_flash_704_bs2_seq_24e-231208/'
load_from = None
resume_from = None
workflow = [('train', 1)]
opencv_num_threads = 0
mp_start_method = 'fork'
backbone_norm_cfg = {'type': 'LN', 'requires_grad': True}
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
# Schedule bookkeeping: 14065 iters/epoch * 24 epochs = 337560 total iters.
num_gpus = 1
batch_size = 2
num_iters_per_epoch = 14065
num_epochs = 24
queue_length = 1
num_frame_losses = 1
collect_keys = [
    'lidar2img', 'intrinsics', 'extrinsics', 'timestamp', 'img_timestamp',
    'ego_pose', 'ego_pose_inv'
]
# Petr3D (StreamPETR) detector: ResNet-50 image backbone -> CPFPN neck ->
# auxiliary 2D FocalHead + temporal StreamPETR 3D detection head.
model = dict(
    type='Petr3D',
    # Back-propagate head/backbone gradients for the last frame only.
    num_frame_head_grads=1,
    num_frame_backbone_grads=1,
    num_frame_losses=1,
    use_grid_mask=True,
    img_backbone=dict(
        pretrained='torchvision://resnet50',
        type='ResNet',
        depth=50,
        num_stages=4,
        # Output stage-3/stage-4 feature maps for the neck.
        out_indices=(2, 3),
        frozen_stages=-1,
        # BN affine params frozen; norm_eval keeps running stats fixed.
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        # Gradient checkpointing: trade recompute for activation memory.
        with_cp=True,
        style='pytorch'),
    img_neck=dict(
        type='CPFPN', in_channels=[1024, 2048], out_channels=256, num_outs=2),
    # Auxiliary 2D detection head with Hungarian matching in image space.
    img_roi_head=dict(
        type='FocalHead',
        num_classes=10,
        in_channels=256,
        loss_cls2d=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=2.0),
        loss_centerness=dict(
            type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
        loss_bbox2d=dict(type='L1Loss', loss_weight=5.0),
        loss_iou2d=dict(type='GIoULoss', loss_weight=2.0),
        loss_centers2d=dict(type='L1Loss', loss_weight=10.0),
        train_cfg=dict(
            assigner2d=dict(
                type='HungarianAssigner2D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
                centers2d_cost=dict(type='BBox3DL1Cost', weight=10.0)))),
    # Temporal 3D head: propagates top-k queries through a memory queue.
    pts_bbox_head=dict(
        type='StreamPETRHead',
        num_classes=10,
        in_channels=256,
        num_query=644,
        memory_len=1024,
        topk_proposals=256,
        num_propagated=256,
        with_ego_pos=True,
        match_with_velo=False,
        # Denoising-query settings (dn_* losses in the training log).
        scalar=10,
        noise_scale=1.0,
        dn_weight=1.0,
        split=0.75,
        LID=True,
        with_position=True,
        # Position range is wider than point_cloud_range by 10m in x/y.
        position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='PETRTemporalTransformer',
            decoder=dict(
                type='PETRTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='PETRTemporalDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        # Flash-attention variant (cross-attn slot, per
                        # operation_order below).
                        dict(
                            type='PETRMultiheadFlashAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    with_cp=True,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            max_num=300,
            voxel_size=[0.2, 0.2, 8],
            num_classes=10),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        # GIoU loss weight is 0.0 — effectively disabled.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=[0.2, 0.2, 8],
            point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            out_size_factor=4,
            # Hungarian matching costs mirror the loss weights above.
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))))
# Image-space augmentation parameters (same values as in the pipelines).
ida_aug_conf = {
    'resize_lim': (0.38, 0.55),
    'final_dim': (256, 704),
    'bot_pct_lim': (0.0, 0.0),
    'rot_lim': (0.0, 0.0),
    'H': 900,
    'W': 1600,
    'rand_flip': True,
}
# AdamW with a 4x smaller learning rate on the image backbone.
optimizer = {
    'type': 'AdamW',
    'lr': 5e-05,
    'paramwise_cfg': {'custom_keys': {'img_backbone': {'lr_mult': 0.25}}},
    'weight_decay': 0.01,
}
# Mixed-precision training with dynamic loss scaling and gradient clipping.
optimizer_config = {
    'type': 'Fp16OptimizerHook',
    'loss_scale': 'dynamic',
    'grad_clip': {'max_norm': 35, 'norm_type': 2},
}
# Cosine decay to 0.1% of the base LR after a 500-iter linear warmup.
lr_config = {
    'policy': 'CosineAnnealing',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.3333333333333333,
    'min_lr_ratio': 0.001,
}
find_unused_parameters = False
# 24 epochs x 14065 iters/epoch = 337560 total iterations.
runner = {'type': 'IterBasedRunner', 'max_iters': 337560}
gpu_ids = range(0, 1)
```
I think the batch size is too small. You can try batch size = 8 and learning rate = 2e-4.