jytime / Mask_RCNN_Pytorch

Mask R-CNN for object detection and instance segmentation on Pytorch

AttributeError: 'DataParallel' object has no attribute 'train_model'

sxhxliang opened this issue · comments

Thanks for your implementation, but I got an error when using 4 GPUs to train this model:

```
# model = torch.nn.DataParallel(model, device_ids=[0,1,2,3])
Traceback (most recent call last):
  File "bdd_coco.py", line 567, in <module>
    model.train_model(dataset_train, dataset_val,
  File "/home/user/.conda/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/module.py", line 532, in __getattr__
    type(self).__name__, name))
AttributeError: 'DataParallel' object has no attribute 'train_model'
```

@AaronLeong Note that if you use `DataParallel`, the model is wrapped in `DataParallel()`. This means you need to change `model.function()` to `model.module.function()` in your code.
For example:
`model.train_model` --> `model.module.train_model`
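
For anyone wondering why the error occurs: `DataParallel` is itself an `nn.Module` that stores your network as its `.module` attribute, and attribute lookup on the wrapper only finds `nn.Module`'s own members, not custom methods like `train_model`. A minimal sketch (the toy `Net` here is made up for illustration and is not this repo's model):

```python
import torch
import torch.nn as nn

class Net(nn.Module):
    """Toy model with a custom method, standing in for modellib.MaskRCNN."""
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(4, 2)

    def forward(self, x):
        return self.fc(x)

    def train_model(self):
        return "training"

model = nn.DataParallel(Net())        # the net is now held as model.module

try:
    model.train_model()               # AttributeError: the wrapper has no such method
except AttributeError as e:
    print(e)

print(model.module.train_model())    # works: call the method on the wrapped module
```

The same applies to any other custom method or attribute defined on the wrapped model.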

@jytime I have tried this setting, but only one GPU actually does any work. As the `nvidia-smi` output below shows, only GPU 0 has significant memory usage and utilization:

```
user@ubuntu:~/rcnn$ nvidia-smi
Sat Sep 22 15:31:48 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.45                 Driver Version: 396.45                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  TITAN Xp COLLEC...  Off  | 00000000:02:00.0  On |                  N/A |
| 32%   57C    P2    73W / 250W | 11354MiB / 12194MiB  |      5%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:03:00.0 Off |                  N/A |
| 27%   46C    P8    18W / 250W |    12MiB / 12196MiB  |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            Off  | 00000000:82:00.0 Off |                  N/A |
| 28%   48C    P8    19W / 250W |    12MiB / 12196MiB  |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  TITAN Xp            Off  | 00000000:83:00.0 Off |                  N/A |
| 30%   50C    P8    18W / 250W |    12MiB / 12196MiB  |      0%      Default |
+-------------------------------+----------------------+----------------------+
```

```python
import os
import time
import numpy as np
import scipy.misc
import scipy.ndimage
import skimage.color
import skimage.io

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from pycocotools import mask as maskUtils

import zipfile
import urllib.request
import shutil

from config import Config
import utils
import model as modellib

import torch

ROOT_DIR = os.getcwd()

COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.pth")

DEFAULT_LOGS_DIR = os.path.join(ROOT_DIR, "logs")
DEFAULT_DATASET_YEAR = "2018"

class CocoConfig(Config):
    NAME = "bdd"
    IMAGES_PER_GPU = 16
    GPU_COUNT = 4

class CocoDataset(utils.Dataset):
    def load_coco(self, dataset_dir, subset, year=DEFAULT_DATASET_YEAR, class_ids=None,
                  class_map=None, return_coco=False, auto_download=False):

        if auto_download is True:
            self.auto_download(dataset_dir, subset, year)

        coco = COCO("{}/annotations/instances_{}{}.json".format(dataset_dir, subset, year))
        if subset == "minival" or subset == "valminusminival":
            subset = "val"
        image_dir = "{}/{}/{}".format(dataset_dir, 'images/100k', subset + year)

        # Use all classes unless a subset of class IDs is requested
        if not class_ids:
            class_ids = sorted(coco.getCatIds())

        if class_ids:
            # Collect all images containing at least one of the requested classes
            image_ids = []
            for id in class_ids:
                image_ids.extend(list(coco.getImgIds(catIds=[id])))
            # Remove duplicates
            image_ids = list(set(image_ids))
        else:
            image_ids = list(coco.imgs.keys())

        # Register classes
        for i in class_ids:
            self.add_class("bdd", i, coco.loadCats(i)[0]["name"])

        # Register images
        for i in image_ids:
            self.add_image(
                "bdd", image_id=i,
                path=os.path.join(image_dir, coco.imgs[i]['file_name']),
                width=coco.imgs[i]["width"],
                height=coco.imgs[i]["height"],
                annotations=coco.loadAnns(coco.getAnnIds(
                    imgIds=[i], catIds=class_ids, iscrowd=None)))
        if return_coco:
            return coco
    def load_mask(self, image_id):

        image_info = self.image_info[image_id]

        # Delegate to the parent class for images from other sources
        if image_info["source"] != "bdd":
            return super(CocoDataset, self).load_mask(image_id)

        instance_masks = []
        class_ids = []
        annotations = self.image_info[image_id]["annotations"]
        for annotation in annotations:

            class_id = self.map_source_class_id(
                "bdd.{}".format(annotation['category_id']))

            if class_id:
                m = self.annToMask(annotation, image_info["height"],
                                   image_info["width"])
                # Skip annotations whose masks are empty
                if m.max() < 1:
                    continue
                # Negative class IDs mark crowd regions
                if annotation['iscrowd']:
                    class_id *= -1
                    if m.shape[0] != image_info["height"] or m.shape[1] != image_info["width"]:
                        m = np.ones([image_info["height"], image_info["width"]], dtype=bool)
                instance_masks.append(m)
                class_ids.append(class_id)

        if class_ids:
            # Pack instance masks into a [height, width, count] array
            mask = np.stack(instance_masks, axis=2)
            class_ids = np.array(class_ids, dtype=np.int32)
            return mask, class_ids
        else:
            return super(CocoDataset, self).load_mask(image_id)
   
    def load_drivable(self, image_id, use_color_maps=False, use_one_hot_label=True):

        image_info = self.image_info[image_id]
        path = image_info['path']

        if use_color_maps:
            label_name = 'color_labels'
            path = path.replace('.jpg', '_drivable_color.png')
        else:
            label_name = 'labels'
            path = path.replace('.jpg', '_drivable_id.png')

        path = path.replace('images/100k', 'drivable_maps/' + label_name)

        image = skimage.io.imread(path)

        if image.ndim != 3:
            # Expand the single-channel ID map into a one-hot [H, W, 3] array:
            # channel k is True where the drivable-area ID equals k
            image = skimage.color.gray2rgb(image)
            drivable_maps = np.zeros_like(image)
            drivable_maps[:, :, 0] = 0
            drivable_maps[:, :, 1] = 1
            drivable_maps[:, :, 2] = 2
            image = image == drivable_maps
        return image * 1.0
    
    def image_reference(self, image_id):

        info = self.image_info[image_id]
        if info["source"] == "coco":
            return "http://cocodataset.org/#explore?id={}".format(info["id"])
        else:
            # Missing `return` here was a bug; the parent result was discarded
            return super(CocoDataset, self).image_reference(image_id)


    def annToRLE(self, ann, height, width):
        """Convert a polygon, uncompressed RLE, or RLE annotation to RLE."""
        segm = ann['segmentation']
        if isinstance(segm, list):
            # Polygon: an object may consist of multiple parts; merge them
            rles = maskUtils.frPyObjects(segm, height, width)
            rle = maskUtils.merge(rles)
        elif isinstance(segm['counts'], list):
            # Uncompressed RLE
            rle = maskUtils.frPyObjects(segm, height, width)
        else:
            # Already RLE
            rle = ann['segmentation']
        return rle

    def annToMask(self, ann, height, width):
        """Convert an annotation to a binary mask via RLE."""
        rle = self.annToRLE(ann, height, width)
        m = maskUtils.decode(rle)
        return m

def build_coco_results(dataset, image_ids, rois, class_ids, scores, masks):

    if rois is None:
        return []

    results = []
    for image_id in image_ids:
        for i in range(rois.shape[0]):
            class_id = class_ids[i]
            score = scores[i]
            bbox = np.around(rois[i], 1)
            mask = masks[:, :, i]

            result = {
                "image_id": image_id,
                "category_id": dataset.get_source_class_id(class_id, "coco"),
                # Convert [y1, x1, y2, x2] to COCO's [x, y, width, height]
                "bbox": [bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]],
                "score": score,
                "segmentation": maskUtils.encode(np.asfortranarray(mask))
            }
            results.append(result)
    return results

def evaluate_coco(model, dataset, coco, eval_type="bbox", limit=0, image_ids=None):

    image_ids = image_ids or dataset.image_ids

    if limit:
        image_ids = image_ids[:limit]

    coco_image_ids = [dataset.image_info[id]["id"] for id in image_ids]

    t_prediction = 0
    t_start = time.time()

    results = []
    for i, image_id in enumerate(image_ids):
        image = dataset.load_image(image_id)

        t = time.time()
        r = model.detect([image])[0]
        t_prediction += (time.time() - t)

        image_results = build_coco_results(dataset, coco_image_ids[i:i + 1],
                                           r["rois"], r["class_ids"],
                                           r["scores"], r["masks"])
        results.extend(image_results)

    coco_results = coco.loadRes(results)

    cocoEval = COCOeval(coco, coco_results, eval_type)
    cocoEval.params.imgIds = coco_image_ids
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()

    print("Prediction time: {}. Average {}/image".format(
        t_prediction, t_prediction / len(image_ids)))
    print("Total time: ", time.time() - t_start)

if __name__ == '__main__':
    import argparse

    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Train Mask R-CNN on MS COCO.')
    parser.add_argument("command",
                        metavar="<command>",
                        help="'train' or 'evaluate' on MS COCO")
    parser.add_argument('--dataset', required=True,
                        metavar="/data1/datasets/bdd100k",
                        help='Directory of the MS-COCO dataset')
    parser.add_argument('--year', required=False,
                        default=DEFAULT_DATASET_YEAR,
                        metavar="<year>",
                        help='Year of the dataset (default=2018)')
    parser.add_argument('--model', required=False,
                        metavar="/path/to/weights.pth",
                        help="Path to weights .pth file or 'coco'")
    parser.add_argument('--logs', required=False,
                        default=DEFAULT_LOGS_DIR,
                        metavar="/path/to/logs/",
                        help='Logs and checkpoints directory (default=logs/)')
    parser.add_argument('--limit', required=False,
                        default=500,
                        metavar="<image count>",
                        help='Images to use for evaluation (default=500)')
    parser.add_argument('--download', required=False,
                        default=False,
                        metavar="<True|False>",
                        help='Automatically download and unzip MS-COCO files (default=False)',
                        type=bool)  # note: bool() treats any non-empty string as True
    parser.add_argument('--lr', required=False,
                        default=0.001,
                        help='Learning rate')
    parser.add_argument('--batchsize', required=False,
                        default=4,
                        help='Batch size')
    parser.add_argument('--steps', required=False,
                        default=200,
                        help='Steps per epoch')
    parser.add_argument('--device', required=False,
                        default="gpu",
                        help='gpu or cpu')
    args = parser.parse_args()

    print("Command: ", args.command)
    print("Model: ", args.model)
    print("Dataset: ", args.dataset)
    print("Year: ", args.year)
    print("Logs: ", args.logs)
    print("Auto Download: ", args.download)

    # Configurations
    if args.command == "train":
        config = CocoConfig()
    else:
        class InferenceConfig(CocoConfig):
            # Run detection on one image at a time on one GPU
            GPU_COUNT = 1
            IMAGES_PER_GPU = 1
            DETECTION_MIN_CONFIDENCE = 0
        config = InferenceConfig()
    config.display()

    # Create the model (the same class handles both training and inference)
    model = modellib.MaskRCNN(config=config, model_dir=args.logs)

    if args.device == "gpu":
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    model = model.to(device)

    # Select the weights file to load
    if args.model:
        if args.model.lower() == "coco":
            model_path = COCO_MODEL_PATH
        elif args.model.lower() == "last":
            # Find the last trained weights
            model_path = model.find_last()[1]
        elif args.model.lower() == "imagenet":
            # Start from ImageNet-trained weights
            model_path = config.IMAGENET_MODEL_PATH
        else:
            model_path = args.model
    else:
        model_path = ""

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])

    # Note: this overrides whatever --model selected above
    model_path = 'mask_rcnn_coco.pth'
    print("Loading weights ", model_path)
    model.module.load_pre_weights(model_path)

    lr = float(args.lr)
    batchsize = int(args.batchsize)
    steps = int(args.steps)

    print('batchsize', batchsize)
    print('lr', lr)
    print('steps', steps)

    # Train or evaluate
    if args.command == "train":
        dataset_train = CocoDataset()
        dataset_train.load_coco(args.dataset, "train", year=args.year, auto_download=args.download)
        dataset_train.prepare()

        dataset_val = CocoDataset()
        dataset_val.load_coco(args.dataset, "val", year=args.year, auto_download=args.download)
        dataset_val.prepare()

        # Training in three stages: heads, ResNet stage 4+, then all layers
        print("Training network heads")
        model.module.train_model(dataset_train, dataset_val,
                                 learning_rate=config.LEARNING_RATE,
                                 epochs=1,
                                 BatchSize=batchsize,
                                 steps=steps,
                                 layers='heads')

        print("Fine tune ResNet stage 4 and up")
        model.module.train_model(dataset_train, dataset_val,
                                 learning_rate=config.LEARNING_RATE,
                                 epochs=1,
                                 BatchSize=batchsize,
                                 steps=steps,
                                 layers='4+')

        print("Fine tune all layers")
        model.module.train_model(dataset_train, dataset_val,
                                 learning_rate=config.LEARNING_RATE / 10,
                                 epochs=2,
                                 BatchSize=batchsize,
                                 steps=steps,
                                 layers='all')

    elif args.command == "evaluate":
        dataset_val = CocoDataset()
        coco = dataset_val.load_coco(args.dataset, "val", year=args.year, return_coco=True, auto_download=args.download)
        dataset_val.prepare()
        print("Running COCO evaluation on {} images.".format(args.limit))
        evaluate_coco(model, dataset_val, coco, "bbox", limit=int(args.limit))
    else:
        print("'{}' is not recognized. "
              "Use 'train' or 'evaluate'".format(args.command))

```
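
A side note on the script above: it loads weights through the repo's `load_pre_weights` after wrapping the model, which works, but a related pitfall when moving plain `state_dict` checkpoints between wrapped and unwrapped models is that `DataParallel` prefixes every parameter key with `module.`, so `load_state_dict` fails with missing/unexpected keys. A small generic helper, assuming a plain PyTorch checkpoint (the name `strip_module_prefix` is mine, not this repo's):

```python
import torch

def strip_module_prefix(state_dict):
    """Remove the 'module.' prefix that nn.DataParallel adds to parameter keys."""
    return {key[len("module."):] if key.startswith("module.") else key: value
            for key, value in state_dict.items()}

# Usage sketch: load a checkpoint saved from a DataParallel model into a plain model.
# state = torch.load("mask_rcnn_coco.pth", map_location="cpu")
# model.load_state_dict(strip_module_prefix(state))
```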

That's weird, since I have checked it on my computer and it works well... I guess you could find some help from this.
I will probably only have time to look into this after December.

commented

> @AaronLeong Note that if you use `DataParallel`, the model is wrapped in `DataParallel()`. This means you need to change `model.function()` to `model.module.function()` in your code.
> For example:
> `model.train_model` --> `model.module.train_model`

I tried, but it still doesn't work: it just opens multiple Python processes on the GPUs, but only one GPU does any work.
So it looks like `model.module.xxx` fixes the errors caused by `DataParallel`, but it brings the problem back to where it started: from multi-GPU with `DataParallel` back to single-GPU through `.module`.
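
For what it's worth, this is expected `DataParallel` behavior rather than a bug specific to this repo: the wrapper only parallelizes the `forward()` call, i.e. `model(inputs)` scatters the batch across GPUs, runs the replicas, and gathers the outputs. Calling a custom method such as `model.module.train_model(...)` goes straight to the underlying module and therefore runs on a single device. A minimal sketch of the usual pattern that does use all GPUs, assuming a hypothetical `LossModel` wrapper (not part of this repo) that computes the loss inside `forward()`:

```python
import torch
import torch.nn as nn

class LossModel(nn.Module):
    """Hypothetical wrapper: computing the loss inside forward() lets
    DataParallel replicate the whole training computation."""
    def __init__(self, net, criterion):
        super(LossModel, self).__init__()
        self.net = net
        self.criterion = criterion

    def forward(self, x, target):
        # Runs once per GPU on a slice of the batch
        return self.criterion(self.net(x), target)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(LossModel(nn.Linear(8, 2), nn.CrossEntropyLoss()).to(device))
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

x = torch.randn(16, 8, device=device)              # DataParallel splits this batch across GPUs
target = torch.randint(0, 2, (16,), device=device)

loss = model(x, target).mean()                     # gather per-GPU losses, reduce to a scalar
optimizer.zero_grad()
loss.backward()                                    # gradients accumulate on the default GPU
optimizer.step()
```

For serious multi-GPU training, `torch.nn.parallel.DistributedDataParallel` is generally recommended over `DataParallel`.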

@zhangliyun9120 Hi, did you solve the problem? I am in the same situation.

commented

Hey guys,
I ran into the same situation while working with Kaggle's 2 GPUs, but I can see that the two GPUs are not sharing the memory.