lava-nc / lava-dl

Deep Learning library for Lava

Home Page: https://lava-nc.org

Exporting network to hdf5 after loading in from state-dict results in error.

arshdeepsch opened this issue · comments

Describe the bug
Exporting a network to hdf5 after loading it from a state-dict doesn't work.
Exporting the network to hdf5 during training works fine.

To reproduce current behavior
Steps to reproduce the behavior:
1. Save the state-dict during training.
2. Load the state-dict and try to export to hdf5.

When I run this code ...
import torch
import h5py
from lava.lib.dl import slayer as sl


class ConvClass(torch.nn.Module):
    def __init__(self):
        super(ConvClass, self).__init__()

        neuron_params = {
            'threshold': 1.0,
            'current_decay': 0.25,
            'voltage_decay': 0.03,
            'tau_grad': 0.03,
            'scale_grad': 3,
            'requires_grad': True,
        }

        neuron_params_drop = {**neuron_params, 'dropout': sl.neuron.Dropout(p=0.05)}

        self.blocks = torch.nn.ModuleList([
            sl.block.cuba.Conv(neuron_params, 1, 64, 3, stride=2, weight_norm=True),
            sl.block.cuba.Conv(neuron_params, 64, 128, 3, stride=1, weight_norm=True, groups=64),
            sl.block.cuba.Conv(neuron_params, 128, 256, 3, stride=2, weight_norm=True, groups=128),
            sl.block.cuba.Conv(neuron_params, 256, 256, 1, stride=1, weight_norm=True, groups=256),
            sl.block.cuba.Conv(neuron_params, 256, 64, 1, stride=2, weight_norm=True, groups=64),
            sl.block.cuba.Flatten(),
            sl.block.cuba.Dense(neuron_params_drop, 576, 100, weight_norm=True),
            sl.block.cuba.Dense(neuron_params_drop, 100, 10, weight_norm=True),
        ])

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x

    def export_hdf5(self, filename):
        h = h5py.File(filename, 'w')
        layer = h.create_group('layer')
        for i, b in enumerate(self.blocks):
            b.export_hdf5(layer.create_group(f'{i}'))


trained_folder = "./Trained"
net = ConvClass()
#train...
net.export_hdf5(trained_folder + '/network.net') # export, works fine
torch.save(net.state_dict(), trained_folder + '/network.pt') # save

#later in another file...
net = ConvClass()
net.load_state_dict(torch.load(trained_folder + '/network.pt'))
net.export_hdf5(trained_folder + '/network.net') #error
I get this error ...
  File "/file.py", line 223, in export_hdf5
    b.export_hdf5(layer.create_group(f'{i}'))
  File "/lib/python3.8/site-packages/lava/lib/dl/slayer/block/base.py", line 707, in export_hdf5
    handle.create_dataset('shape', data=np.array(self.neuron.shape))
  File "/lib/python3.8/site-packages/h5py/_hl/group.py", line 161, in create_dataset
    dsid = dataset.make_new_dset(group, shape, dtype, data, name, **kwds)
  File "/lib/python3.8/site-packages/h5py/_hl/dataset.py", line 88, in make_new_dset
    tid = h5t.py_create(dtype, logical=1)
  File "h5py/h5t.pyx", line 1663, in h5py.h5t.py_create
  File "h5py/h5t.pyx", line 1687, in h5py.h5t.py_create
  File "h5py/h5t.pyx", line 1747, in h5py.h5t.py_create
TypeError: Object dtype dtype('O') has no native HDF5 equivalent
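
For context, this is the error h5py raises when it is handed object-dtype data: np.array(None) produces an array with dtype('O'), so a neuron.shape of None cannot be written. A minimal sketch (outside lava-dl) that reproduces the same TypeError:

import numpy as np
import h5py

shape = None  # what neuron.shape looks like before any forward pass
with h5py.File('demo.h5', 'w') as h:
    # np.array(None) has dtype('O'), so h5py raises:
    # TypeError: Object dtype dtype('O') has no native HDF5 equivalent
    h.create_dataset('shape', data=np.array(shape))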

Expected behavior
Expect a .net file to be generated.

Screenshots
Saving directly during training has neuron.shape defined:
(screenshot: exporting during training)

Exporting after loading from the state-dict has neuron.shape set to None, which results in the error:
(screenshot: loadingChkpt)

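The same difference can be seen without screenshots by printing a block's neuron shape before and after a forward pass (a quick diagnostic sketch; the dummy input shape is just an example):

net = ConvClass()
net.load_state_dict(torch.load(trained_folder + '/network.pt'))
print(net.blocks[0].neuron.shape)    # None right after load_state_dict
net(torch.zeros([1, 1, 28, 28, 1]))  # a forward pass populates the shape
print(net.blocks[0].neuron.shape)    # now defined, so export_hdf5 works
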
Environment (please complete the following information):

  • Device: Laptop
  • OS: Ubuntu 20.04
  • Lava version: 0.6.0

Hi @arshdeepsch, can you check if this runs correctly for you? It runs fine for me.

import torch
import h5py
from lava.lib.dl import slayer as sl


class ConvClass(torch.nn.Module):
    def __init__(self):
        super(ConvClass, self).__init__()

        neuron_params = {
            'threshold': 1.0,
            'current_decay': 0.25,
            'voltage_decay': 0.03,
            'tau_grad': 0.03,
            'scale_grad': 3,
            'requires_grad': True,
        }

        neuron_params_drop = {**neuron_params, 'dropout': sl.neuron.Dropout(p=0.05)}

        self.blocks = torch.nn.ModuleList([
            sl.block.cuba.Conv(neuron_params, 1, 64, 3, stride=2, weight_norm=True),
            sl.block.cuba.Conv(neuron_params, 64, 128, 3, stride=1, weight_norm=True, groups=64),
            sl.block.cuba.Conv(neuron_params, 128, 256, 3, stride=2, weight_norm=True, groups=128),
            sl.block.cuba.Conv(neuron_params, 256, 256, 1, stride=1, weight_norm=True, groups=256),
            sl.block.cuba.Conv(neuron_params, 256, 64, 1, stride=2, weight_norm=True, groups=64),
            sl.block.cuba.Flatten(),
            sl.block.cuba.Dense(neuron_params_drop, 576, 100, weight_norm=True),
            sl.block.cuba.Dense(neuron_params_drop, 100, 10, weight_norm=True),
        ])

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x

    def export_hdf5(self, filename):
        h = h5py.File(filename, 'w')
        layer = h.create_group('layer')
        for i, b in enumerate(self.blocks):
            b.export_hdf5(layer.create_group(f'{i}'))


trained_folder = "./Trained"
net = ConvClass()
#train...
net(torch.zeros([1, 1, 28, 28, 1]))  # Run with a dummy workload to initialize neuron shape
torch.save(net.state_dict(), trained_folder + '/network.pt') # save
net.export_hdf5(trained_folder + '/network.net') # export, works fine

#later in another file...
net1 = ConvClass()
net1(torch.zeros([1, 1, 28, 28, 1])) # Run with a dummy workload to initialize neuron shape
net1.load_state_dict(torch.load(trained_folder + '/network.pt'))
net1.export_hdf5(trained_folder + '/network1.net')

print('SUCCESS!')

You can further compare the exported artifacts to verify that they are the same:

diff Trained/network.net Trained/network1.net
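
If a byte-level diff is too strict for your setup, a small h5py-based check can compare the datasets in the two exports instead (a sketch, not part of lava-dl; the helper name is just an example):

import h5py
import numpy as np

def nets_match(path_a, path_b):
    """Compare every dataset in two exported .net (HDF5) files."""
    with h5py.File(path_a, 'r') as fa, h5py.File(path_b, 'r') as fb:
        keys_a, keys_b = [], []
        fa.visit(keys_a.append)  # collect all group/dataset names
        fb.visit(keys_b.append)
        if keys_a != keys_b:
            return False
        for key in keys_a:
            a, b = fa[key], fb[key]
            if isinstance(a, h5py.Dataset) != isinstance(b, h5py.Dataset):
                return False
            if isinstance(a, h5py.Dataset) and not np.array_equal(a[()], b[()]):
                return False
    return True

print(nets_match('Trained/network.net', 'Trained/network1.net'))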

Note: In the first export, I save the torch model (the state-dict) before exporting the netx model. This is because of weight normalization: exporting the netx model temporarily disables weight normalization.