AttributeError: 'DataParallel' object has no attribute 'train_model'
sxhxliang opened this issue · comments
Thanks for your implementation, but I got an error when using 4 GPUs to train this model.
# model = torch.nn.DataParallel(model, device_ids=[0,1,2,3])
Traceback (most recent call last):
File "bdd_coco.py", line 567, in <module>
model.train_model(dataset_train, dataset_val,
File "/home/user/.conda/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/module.py", line 532, in __getattr__
type(self).__name__, name))
AttributeError: 'DataParallel' object has no attribute 'train_model'
@AaronLeong Note that if you use 'DataParallel', the model is wrapped in DataParallel(). This means you need to change every model.function() call in the code that follows to model.module.function().
For example,
model.train_model --> model.module.train_model
@jytime I have tried this setting, but only one GPU can work well
user@ubuntu:~/rcnn$ nvidia-smi
Sat Sep 22 15:31:48 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.45                 Driver Version: 396.45                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  TITAN Xp COLLEC...  Off  | 00000000:02:00.0  On |                  N/A |
| 32%   57C    P2    73W / 250W |  11354MiB / 12194MiB |      5%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:03:00.0 Off |                  N/A |
| 27%   46C    P8    18W / 250W |     12MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            Off  | 00000000:82:00.0 Off |                  N/A |
| 28%   48C    P8    19W / 250W |     12MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  TITAN Xp            Off  | 00000000:83:00.0 Off |                  N/A |
| 30%   50C    P8    18W / 250W |     12MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
`
import os
import time
import numpy as np
import scipy.misc
import scipy.ndimage
import skimage.color
import skimage.io
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from pycocotools import mask as maskUtils
import zipfile
import urllib.request
import shutil
from config import Config
import utils
import model as modellib
import torch
# Directory the script is launched from; the default paths below are built from it.
ROOT_DIR = os.getcwd()
# Default location of the pretrained weights file.
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.pth")
# Default directory for logs (used as the default of the --logs option).
DEFAULT_LOGS_DIR = os.path.join(ROOT_DIR, "logs")
# Default dataset year; it is appended to the subset name when building the
# annotation file name and the image directory.
DEFAULT_DATASET_YEAR = "2018"
class CocoConfig(Config):
    """Configuration used when training on the BDD dataset.

    Only the three attributes below are overridden; every other setting
    comes from the base ``Config`` class.
    """

    # Name of this configuration.
    NAME = "bdd"

    # Number of GPUs the configuration is sized for.
    GPU_COUNT = 4

    # Images processed on each GPU.
    # NOTE(review): presumably the effective batch is
    # IMAGES_PER_GPU * GPU_COUNT -- confirm against the base Config.
    IMAGES_PER_GPU = 16
class CocoDataset(utils.Dataset):
    """BDD dataset stored in COCO annotation format.

    Classes and images are registered under the source name ``"bdd"``.
    """

    def load_coco(self, dataset_dir, subset, year=DEFAULT_DATASET_YEAR, class_ids=None,
                  class_map=None, return_coco=False, auto_download=False):
        """Register the classes and images of one subset of the dataset.

        dataset_dir: root directory of the dataset.
        subset: subset name; it selects the annotation file and, with
            "minival"/"valminusminival" mapped to "val", the image directory.
        year: dataset year, appended to the subset name in both paths.
        class_ids: if given, only load images containing these category ids;
            otherwise all categories are used.
        class_map: accepted for interface compatibility; not used here.
        return_coco: if true, return the COCO object that was loaded.
        auto_download: if exactly True, call self.auto_download() first.
        """
        if auto_download is True:
            # NOTE(review): auto_download() is not defined in this class as
            # shown -- presumably provided elsewhere; confirm before relying
            # on this option.
            self.auto_download(dataset_dir, subset, year)

        coco = COCO("{}/annotations/instances_{}{}.json".format(dataset_dir, subset, year))
        if subset == "minival" or subset == "valminusminival":
            subset = "val"
        image_dir = "{}/{}/{}".format(dataset_dir, 'images/100k', subset + year)

        # No explicit selection means "all categories".
        if not class_ids:
            class_ids = sorted(coco.getCatIds())

        if class_ids:
            # Collect every image that contains at least one selected category.
            image_ids = []
            for cat_id in class_ids:
                image_ids.extend(list(coco.getImgIds(catIds=[cat_id])))
            # An image may contain several categories; keep each id once.
            image_ids = list(set(image_ids))
        else:
            # The annotation file declares no categories: take all images.
            image_ids = list(coco.imgs.keys())

        for i in class_ids:
            self.add_class("bdd", i, coco.loadCats(i)[0]["name"])

        for i in image_ids:
            self.add_image(
                "bdd", image_id=i,
                path=os.path.join(image_dir, coco.imgs[i]['file_name']),
                width=coco.imgs[i]["width"],
                height=coco.imgs[i]["height"],
                annotations=coco.loadAnns(coco.getAnnIds(
                    imgIds=[i], catIds=class_ids, iscrowd=None)))

        if return_coco:
            return coco

    def load_mask(self, image_id):
        """Build the instance masks of one image.

        Returns a tuple ``(mask, class_ids)``: ``mask`` stacks one mask per
        instance along the last axis and ``class_ids`` is an int32 array with
        one entry per instance (negated for crowd annotations). Images that
        are not from the "bdd" source, or that yield no usable instance, are
        delegated to the base class implementation.
        """
        image_info = self.image_info[image_id]
        if image_info["source"] != "bdd":
            return super(CocoDataset, self).load_mask(image_id)

        instance_masks = []
        class_ids = []
        annotations = self.image_info[image_id]["annotations"]
        for annotation in annotations:
            class_id = self.map_source_class_id(
                "bdd.{}".format(annotation['category_id']))
            if class_id:
                m = self.annToMask(annotation, image_info["height"],
                                   image_info["width"])
                # Skip annotations whose decoded mask has no set pixel.
                if m.max() < 1:
                    continue
                if annotation['iscrowd']:
                    # Crowds are marked with a negative class id.
                    class_id *= -1
                    # If the decoded crowd mask does not match the image
                    # size, fall back to a mask covering the whole image.
                    if m.shape[0] != image_info["height"] or m.shape[1] != image_info["width"]:
                        m = np.ones([image_info["height"], image_info["width"]], dtype=bool)
                instance_masks.append(m)
                class_ids.append(class_id)

        if class_ids:
            mask = np.stack(instance_masks, axis=2)
            class_ids = np.array(class_ids, dtype=np.int32)
            return mask, class_ids
        else:
            # Nothing usable: let the base class produce its default result.
            return super(CocoDataset, self).load_mask(image_id)

    def load_drivable(self, image_id, use_color_maps=False, use_one_hot_label=True):
        """Load the drivable-area label map that belongs to an image.

        The label path is derived from the image path by swapping the
        ``images/100k`` directory for ``drivable_maps/<label dir>`` and the
        ``.jpg`` suffix for the matching ``.png`` suffix.

        use_color_maps: read the colour label file instead of the id file.
        use_one_hot_label: accepted but currently not used by this method.

        Returns a float array with the shape of the (3-channel) label image
        in which channel k (k = 0, 1, 2) is 1.0 where that channel's pixel
        value equals k and 0.0 elsewhere.
        """
        image_info = self.image_info[image_id]
        path = image_info['path']
        if use_color_maps:
            label_name = 'color_labels'
            path = path.replace('.jpg', '_drivable_color.png')
        else:
            label_name = 'labels'
            path = path.replace('.jpg', '_drivable_id.png')
        path = path.replace('images/100k', 'drivable_maps/' + label_name)

        image = skimage.io.imread(path)
        if image.ndim != 3:
            # Replicate a single-channel id map into three channels so each
            # channel can be compared against one label value below.
            image = skimage.color.gray2rgb(image)

        drivable_maps = np.zeros_like(image)
        drivable_maps[:, :, 0] = 0
        drivable_maps[:, :, 1] = 1
        drivable_maps[:, :, 2] = 2
        image = image == drivable_maps
        # Convert the boolean comparison result to floats.
        return image * 1.0

    def image_reference(self, image_id):
        """Return a reference for the image.

        Images whose source is "coco" get a link into the COCO explorer;
        everything else is answered by the base class.
        """
        info = self.image_info[image_id]
        if info["source"] == "coco":
            return "http://cocodataset.org/#explore?id={}".format(info["id"])
        else:
            # The result must be returned; without `return` this branch
            # always produced None.
            return super(CocoDataset, self).image_reference(image_id)

    def annToRLE(self, ann, height, width):
        """Convert an annotation's segmentation to run-length encoding.

        The segmentation may be a list of polygons, an uncompressed RLE
        (its 'counts' is a list) or an already encoded RLE, which is
        returned unchanged.
        """
        segm = ann['segmentation']
        if isinstance(segm, list):
            # Polygons: one object may consist of several parts, which are
            # encoded separately and then merged into a single RLE.
            rles = maskUtils.frPyObjects(segm, height, width)
            rle = maskUtils.merge(rles)
        elif isinstance(segm['counts'], list):
            # Uncompressed RLE.
            rle = maskUtils.frPyObjects(segm, height, width)
        else:
            # Already an encoded RLE.
            rle = ann['segmentation']
        return rle

    def annToMask(self, ann, height, width):
        """Decode an annotation's segmentation into a mask array."""
        rle = self.annToRLE(ann, height, width)
        m = maskUtils.decode(rle)
        return m
def build_coco_results(dataset, image_ids, rois, class_ids, scores, masks, source="bdd"):
    """Arrange detection results in the COCO results format.

    dataset: dataset object providing get_source_class_id().
    image_ids: ids of the images the detections belong to; every detection
        is emitted once per id in this sequence.
    rois: array with one box per row, or None when nothing was detected.
        The box is read as [y1, x1, y2, x2] and written as [x, y, w, h].
    class_ids, scores: per-detection class id and score, indexed like rois.
    masks: per-detection masks stacked along the last axis.
    source: dataset source name used to map internal class ids back to
        category ids. It defaults to "bdd", the source name under which
        CocoDataset registers its classes; the previous hard-coded "coco"
        did not match that registration.

    Returns a list of result dictionaries (empty when rois is None).
    """
    if rois is None:
        return []

    results = []
    for image_id in image_ids:
        for i in range(rois.shape[0]):
            class_id = class_ids[i]
            score = scores[i]
            bbox = np.around(rois[i], 1)
            mask = masks[:, :, i]

            result = {
                "image_id": image_id,
                "category_id": dataset.get_source_class_id(class_id, source),
                "bbox": [bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]],
                "score": score,
                # The mask is converted to Fortran (column-major) order
                # before being passed to encode().
                "segmentation": maskUtils.encode(np.asfortranarray(mask))
            }
            results.append(result)
    return results
def evaluate_coco(model, dataset, coco, eval_type="bbox", limit=0, image_ids=None):
    """Run detection on a dataset and print the COCO evaluation summary.

    model: object exposing detect(list_of_images), which returns one result
        dictionary per image with "rois", "class_ids", "scores" and "masks".
    dataset: dataset providing image_ids, image_info and load_image().
    coco: COCO object holding the ground-truth annotations.
    eval_type: evaluation type handed to COCOeval (e.g. "bbox").
    limit: if non-zero, only the first `limit` images are evaluated.
    image_ids: images to evaluate; None or an empty sequence means all
        images of the dataset.
    """
    # An explicit emptiness test is used instead of `image_ids or ...`
    # because the truth value of a multi-element NumPy array is ambiguous
    # and raises ValueError.
    if image_ids is None or len(image_ids) == 0:
        image_ids = dataset.image_ids

    if limit:
        image_ids = image_ids[:limit]

    # Ids under which the images are known in the annotation file.
    coco_image_ids = [dataset.image_info[i]["id"] for i in image_ids]

    t_prediction = 0
    t_start = time.time()

    results = []
    for image_id, coco_image_id in zip(image_ids, coco_image_ids):
        image = dataset.load_image(image_id)

        # Time only the detection call itself.
        t = time.time()
        r = model.detect([image])[0]
        t_prediction += (time.time() - t)

        image_results = build_coco_results(dataset, [coco_image_id],
                                           r["rois"], r["class_ids"],
                                           r["scores"], r["masks"])
        results.extend(image_results)

    if not results:
        # COCO.loadRes() inspects the first result entry and therefore
        # cannot handle an empty list; this also avoids dividing by zero
        # below when there were no images at all.
        print("No detections to evaluate.")
        return

    coco_results = coco.loadRes(results)

    cocoEval = COCOeval(coco, coco_results, eval_type)
    cocoEval.params.imgIds = coco_image_ids
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()

    print("Prediction time: {}. Average {}/image".format(
        t_prediction, t_prediction / len(image_ids)))
    print("Total time: ", time.time() - t_start)
if __name__ == '__main__':
    import argparse

    def _str2bool(value):
        """Interpret a command-line value such as 'True', 'false' or '1'.

        argparse's ``type=bool`` treats every non-empty string, including
        'False', as true, so the conversion is done explicitly.
        """
        return str(value).strip().lower() in ("1", "true", "t", "yes", "y")

    # ---- Command line ----------------------------------------------------
    parser = argparse.ArgumentParser(
        description='Train Mask R-CNN on MS COCO.')
    parser.add_argument("command",
                        metavar="<command>",
                        help="'train' or 'evaluate' on MS COCO")
    parser.add_argument('--dataset', required=True,
                        metavar="/data1/datasets/bdd100k",
                        help='Directory of the MS-COCO dataset')
    parser.add_argument('--year', required=False,
                        default=DEFAULT_DATASET_YEAR,
                        metavar="<year>",
                        help='Year of the MS-COCO dataset (default={})'.format(
                            DEFAULT_DATASET_YEAR))
    parser.add_argument('--model', required=False,
                        metavar="/path/to/weights.pth",
                        help="Path to weights .pth file or 'coco'")
    parser.add_argument('--logs', required=False,
                        default=DEFAULT_LOGS_DIR,
                        metavar="/path/to/logs/",
                        help='Logs and checkpoints directory (default=logs/)')
    parser.add_argument('--limit', required=False,
                        default=500,
                        metavar="<image count>",
                        help='Images to use for evaluation (default=500)')
    parser.add_argument('--download', required=False,
                        default=False,
                        metavar="<True|False>",
                        help='Automatically download and unzip MS-COCO files (default=False)',
                        type=_str2bool)
    parser.add_argument('--lr', required=False,
                        default=0.001,
                        help='Learning rate')
    parser.add_argument('--batchsize', required=False,
                        default=4,
                        help='Batch size')
    parser.add_argument('--steps', required=False,
                        default=200,
                        help='steps per epoch')
    parser.add_argument('--device', required=False,
                        default="gpu",
                        help='gpu or cpu')
    args = parser.parse_args()
    print("Command: ", args.command)
    print("Model: ", args.model)
    print("Dataset: ", args.dataset)
    print("Year: ", args.year)
    print("Logs: ", args.logs)
    print("Auto Download: ", args.download)

    # ---- Configuration ---------------------------------------------------
    if args.command == "train":
        config = CocoConfig()
    else:
        class InferenceConfig(CocoConfig):
            # Inference runs on one image at a time, so the batch size is 1
            # (GPU_COUNT * IMAGES_PER_GPU).
            GPU_COUNT = 1
            IMAGES_PER_GPU = 1
            DETECTION_MIN_CONFIDENCE = 0
        config = InferenceConfig()
    config.display()

    # ---- Model -----------------------------------------------------------
    # The model is built the same way for training and evaluation.
    net = modellib.MaskRCNN(config=config, model_dir=args.logs)

    device = torch.device("cuda" if args.device == "gpu" else "cpu")
    net = net.to(device)

    # Select the weights file to load.
    if args.model:
        choice = args.model.lower()
        if choice == "coco":
            model_path = COCO_MODEL_PATH
        elif choice == "last":
            model_path = net.find_last()[1]
        elif choice == "imagenet":
            model_path = config.IMAGENET_MODEL_PATH
        else:
            model_path = args.model
    else:
        # No --model given: fall back to the pretrained COCO weights rather
        # than overwriting the selection above unconditionally.
        model_path = COCO_MODEL_PATH

    # Wrap in DataParallel only when running on CUDA with more than one
    # usable GPU; the device list follows the hardware and the configured
    # GPU_COUNT instead of a fixed [0, 1, 2, 3].
    model = net
    gpu_count = min(torch.cuda.device_count(), config.GPU_COUNT)
    if device.type == "cuda" and gpu_count > 1:
        print("Let's use", gpu_count, "GPUs!")
        model = torch.nn.DataParallel(net, device_ids=list(range(gpu_count)))

    # Custom methods (load_pre_weights, train_model, detect) live on the
    # MaskRCNN object, not on the DataParallel wrapper, so they are called
    # on `net`. This also keeps the script working when no wrapper exists.
    # NOTE(review): DataParallel splits work across GPUs only when the
    # wrapper itself is called (its forward pass). Calling train_model() on
    # the underlying module bypasses that, so training still runs on one
    # GPU unless train_model() is changed to run its forward pass through
    # `model`.
    print("Loading weights ", model_path)
    net.load_pre_weights(model_path)

    lr = float(args.lr)
    batchsize = int(args.batchsize)
    steps = int(args.steps)
    print('batchsize', batchsize)
    print('lr', lr)
    print('steps', steps)
    # NOTE(review): `lr` is only printed; the training stages below use
    # config.LEARNING_RATE. Confirm whether --lr is meant to override it.

    # ---- Train or evaluate -----------------------------------------------
    if args.command == "train":
        dataset_train = CocoDataset()
        dataset_train.load_coco(args.dataset, "train", year=args.year, auto_download=args.download)
        dataset_train.prepare()

        dataset_val = CocoDataset()
        dataset_val.load_coco(args.dataset, "val", year=args.year, auto_download=args.download)
        dataset_val.prepare()

        # Stage 1: train the heads only.
        print("Training network heads")
        net.train_model(dataset_train, dataset_val,
                        learning_rate=config.LEARNING_RATE,
                        epochs=1,
                        BatchSize=batchsize,
                        steps=steps,
                        layers='heads')

        # Stage 2: fine-tune from ResNet stage 4 upwards.
        print("Fine tune Resnet stage 4 and up")
        net.train_model(dataset_train, dataset_val,
                        learning_rate=config.LEARNING_RATE,
                        epochs=1,
                        BatchSize=batchsize,
                        steps=steps,
                        layers='4+')

        # Stage 3: fine-tune all layers at a tenth of the learning rate.
        print("Fine tune all layers")
        net.train_model(dataset_train, dataset_val,
                        learning_rate=config.LEARNING_RATE / 10,
                        epochs=2,
                        BatchSize=batchsize,
                        steps=steps,
                        layers='all')

    elif args.command == "evaluate":
        dataset_val = CocoDataset()
        coco = dataset_val.load_coco(args.dataset, "val", year=args.year, return_coco=True, auto_download=args.download)
        dataset_val.prepare()
        print("Running COCO evaluation on {} images.".format(args.limit))
        # evaluate_coco() calls model.detect(), which exists on the MaskRCNN
        # object only, so the unwrapped model is passed.
        evaluate_coco(net, dataset_val, coco, "bbox", limit=int(args.limit))
    else:
        print("'{}' is not recognized. "
              "Use 'train' or 'evaluate'".format(args.command))
`
It's weird since I have checked it on my computer and it works well... I guess you could find some help from this
Possibly I would only have time to solve this after Dec.
@AaronLeong Note that if you use 'DataParallel', the model is wrapped in DataParallel(). This means you need to change every model.function() call in the code that follows to model.module.function().
For example,
model.train_model --> model.module.train_model
I tried that, but it still does not work: it started multiple Python threads on the GPUs, but only one GPU did any work.
So it looks like model.module.xxx gets rid of the errors caused by DataParallel, but it puts us back where we started: calling through .module bypasses DataParallel's multi-GPU execution, so the model runs on a single GPU again.
@zhangliyun9120 Hi, did you solve the problem? I am in the same situation.
Hey guys,
I ran into the same situation while working with Kaggle's 2x GPU setup: I don't see the two GPUs sharing the memory load.