ufoym / imbalanced-dataset-sampler

A (PyTorch) imbalanced dataset sampler for oversampling low-frequency classes and undersampling high-frequency ones.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

'MyDataset' object has no attribute 'get_labels'

kuri54 opened this issue · comments

When I try to use my own Dataset class, I get the error 'MyDataset' object has no attribute 'get_labels' and cannot proceed.

The content of the Dataloader is as follows, and there is nothing strange about it.
It processes the image data and label data in .npz format.

class MyDataset(data.Dataset):
    """Dataset that pairs each entry of ``images`` with the matching entry of ``labels``.

    Args:
        images: Indexable collection of images; its length is the dataset length.
        labels: Indexable collection of labels, looked up with the same index.
        transform: Optional callable invoked as ``transform(image=...)``; the
            value stored under the ``"image"`` key of its result replaces the
            raw image.
    """

    def __init__(self, images, labels, transform=None):
        self.images, self.labels, self.transform = images, labels, transform

    def __len__(self):
        """Return the number of images held by the dataset."""
        return len(self.images)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair stored at ``index``."""
        sample = self.images[index]
        target = self.labels[index]

        # Without a transform the raw image is handed back untouched.
        if self.transform is None:
            return sample, target

        augmented = self.transform(image=sample)
        return augmented["image"], target
# Wrap the training arrays in the dataset, then hand it to a loader whose
# indices are drawn by ImbalancedDatasetSampler.
train_dataset = MyDataset(images=train_imgs, labels=train_labels, transform=transform)
# NOTE(review): this call passes both `sampler` and `shuffle=True`; the PyTorch
# DataLoader documentation describes these options as mutually exclusive --
# confirm before relying on this configuration.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    sampler=ImbalancedDatasetSampler(train_dataset),
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

Is there something wrong with the code?
I don't think it's a typo.

How can I fix it so that it works correctly?

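Add this method to your dataset class, and turn off shuffling (drop `shuffle=True` from the DataLoader call, because the sampler already decides the order):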

def get_labels(self):
    """Return the labels stored on this dataset (its ``labels`` attribute)."""
    return self.labels

@jaebbb
Thank you!
Thanks to you, it is now working properly.

Until then, I had been using my own version of the ImbalancedDatasetSampler class, written to match my dataset:

class ImbalancedDatasetSampler(torch.utils.data.sampler.Sampler):
    """Draw dataset indices so that every class is picked about equally often.

    Each candidate index is weighted by ``1 / count``, where ``count`` is the
    number of candidate indices that share its label. Indices are then drawn
    with replacement from the multinomial distribution defined by those
    weights, so rare classes are oversampled and frequent ones undersampled.

    Args:
        dataset: Dataset to sample from. Labels are read from
            ``train_labels`` (torchvision MNIST), ``imgs`` (torchvision
            ImageFolder), or otherwise from element 1 of ``dataset[idx]``.
        indices: Optional collection of dataset indices to draw from;
            defaults to every index of ``dataset``.
        num_samples: Optional number of indices yielded per iteration;
            defaults to ``len(indices)``.
    """

    def __init__(self, dataset, indices=None, num_samples=None):
        # Consider every element of the dataset unless a subset is given.
        self.indices = list(range(len(dataset))) if indices is None else indices

        # By default one pass yields as many indices as there are candidates.
        self.num_samples = len(self.indices) if num_samples is None else num_samples

        # Read every label exactly once: the generic path in _get_label goes
        # through the dataset's __getitem__, which may be expensive.
        labels = [self._get_label(dataset, idx) for idx in self.indices]

        label_to_count = {}
        for label in labels:
            label_to_count[label] = label_to_count.get(label, 0) + 1

        # Inverse class frequency: every class ends up with the same total
        # weight. (Constant weights here would make the sampling uniform and
        # leave the class imbalance untouched.)
        weights = [1.0 / label_to_count[label] for label in labels]
        self.weights = torch.DoubleTensor(weights)

    def _get_label(self, dataset, idx):
        """Return the hashable label of the sample stored at ``idx``."""
        import sys

        # A dataset can only be a torchvision dataset if torchvision has been
        # imported somewhere, so look it up instead of requiring it; custom
        # datasets then work even when torchvision is not installed.
        torchvision = sys.modules.get("torchvision")
        if torchvision is not None:
            if isinstance(dataset, torchvision.datasets.MNIST):
                return dataset.train_labels[idx].item()
            if isinstance(dataset, torchvision.datasets.ImageFolder):
                return dataset.imgs[idx][1]

        label = dataset[idx][1]
        # Tensors hash by identity, so equal tensor labels would be counted as
        # distinct classes; unwrap single-element tensors to Python scalars.
        if isinstance(label, torch.Tensor) and label.numel() == 1:
            label = label.item()
        return label

    def __iter__(self):
        """Return an iterator over ``num_samples`` weighted random indices."""
        drawn = torch.multinomial(self.weights, self.num_samples, replacement=True)
        return (self.indices[position] for position in drawn.tolist())

    def __len__(self):
        """Return the number of indices yielded per iteration."""
        return self.num_samples

Add this method to your dataset class, and turn off shuffling:

def get_labels(self):
    """Return the labels stored on this dataset (its ``labels`` attribute)."""
    return self.labels

Hi, I modified the dataset class (source code), but it still doesn't work. Why?
this is my dataset:

class CincDataset(Dataset):
    """Dataset over per-record NumPy archives stored under ``root_dir/phase``.

    Every file listed in that directory is one record. A record is opened
    with ``np.load`` and must provide the arrays named in ``_FIELDS``.

    Args:
        root_dir: Directory that contains one sub-directory per phase.
        transform: Optional callable invoked with every field as a keyword
            argument; it must return the fields in ``_FIELDS`` order.
        phase: Name of the sub-directory of ``root_dir`` to read from.
    """

    # Arrays read from every record, in the order they are returned. The last
    # entry is the label; every other entry is converted to float32.
    _FIELDS = (
        'input_0', 'input_1', 'input_2', 'input_3',
        'beat_0', 'beat_1', 'beat_2', 'beat_3',
        'rhythm_0', 'rhythm_1', 'rhythm_2', 'rhythm_3',
        'freq', 'label',
    )

    def __init__(self, root_dir, transform=None, phase='train2017'):
        super().__init__()
        self.root_dir = root_dir
        self.transform = transform
        self.phase = phase
        self.list_data = os.listdir(os.path.join(root_dir, phase))

    def __len__(self):
        """Return the number of record files found for this phase."""
        return len(self.list_data)

    def __getitem__(self, idx):
        """Return the record at ``idx`` as a tuple of 14 tensors.

        The tuple follows ``_FIELDS`` order: thirteen float32 tensors followed
        by the label as an int64 tensor.

        Raises:
            ValueError: If ``transform`` does not return one value per field.
        """
        filename = os.path.join(self.root_dir, self.phase, self.list_data[idx])

        # Read every array while the archive is open, then close it so file
        # handles do not pile up across samples.
        # NOTE(review): assumes np.load returns an .npz archive here -- confirm.
        with np.load(filename) as data:
            arrays = [data[name] for name in self._FIELDS]

        if self.transform is not None:
            # Capture the whole result before unpacking it. Spreading the
            # target names over several lines without parentheses assigned
            # only the last two names and failed on a 14-value result.
            arrays = list(self.transform(**dict(zip(self._FIELDS, arrays))))
            if len(arrays) != len(self._FIELDS):
                raise ValueError(
                    'transform must return %d values, got %d'
                    % (len(self._FIELDS), len(arrays)))

        *signals, label = arrays
        tensors = [torch.from_numpy(signal.astype(np.float32)) for signal in signals]
        tensors.append(torch.from_numpy(label.astype(np.int32)).long())
        return tuple(tensors)

hi, I faced the same problem.

You need to include a function to get the label in that class.
As you can see in imbalanced-dataset-sampler/torchsampler/imbalanced.py, the code to get the labels looks like this

def _get_labels(self, dataset):
    """Return the labels of ``dataset``, one entry per sample.

    A user-supplied ``callback_get_label`` takes precedence. Otherwise the
    labels are read from the attribute that matches the dataset type; any
    other ``torch.utils.data.Dataset`` must provide a ``get_labels()`` method.

    Raises:
        NotImplementedError: If ``dataset`` is none of the supported types.
    """
    if self.callback_get_label:
        return self.callback_get_label(dataset)
    if isinstance(dataset, torchvision.datasets.MNIST):
        return dataset.train_labels.tolist()
    if isinstance(dataset, torchvision.datasets.ImageFolder):
        return [x[1] for x in dataset.imgs]
    if isinstance(dataset, torchvision.datasets.DatasetFolder):
        # `samples` is a list of (path, class_index) pairs. Indexing the
        # copied list with [1] would return only the second pair, so collect
        # the class index of every pair instead.
        return [sample[1] for sample in dataset.samples]
    if isinstance(dataset, torch.utils.data.Subset):
        # Resolve the labels of the wrapped dataset, whatever its type, and
        # keep only the entries selected by the subset, in subset order.
        base_labels = self._get_labels(dataset.dataset)
        return [base_labels[i] for i in dataset.indices]
    if isinstance(dataset, torch.utils.data.Dataset):
        return dataset.get_labels()
    raise NotImplementedError

If you want to get labels from a custom Dataset class, you need to define a `get_labels` method yourself, because this code does not do it for you.

In the end, I added def get_labels(self): return self.labels to the Dataset class at the top of this page, and it works fine now.

You need to include a function to get the label in that class. As you can see in imbalanced-dataset-sampler/torchsampler/imbalanced.py, the code to get the labels looks like this
Hi,
Should we set shuffle=False for training, validation, and testing?