Kaslanarian / PyDyNet

A PyTorch-like dynamic computation graph and neural network framework (MLP, CNN, RNN, Transformer) implemented in NumPy


Some bugs related to GPU acceleration

2410183509 opened this issue · comments

Hi, I've recently been studying how the dynamic computation graph is implemented. When I run your code on the GPU, the accuracy in the cuCNN example does not change across iterations (on CPU it behaves normally). I observed that the network parameters were in fact not being updated, probably because they were never moved to the GPU. After adding .to('cuda') to each layer, the parameters were updated and the accuracy looked normal again. However, as training goes on, GPU memory keeps growing (without my change, memory usage stays constant). I don't know the exact cause of this; could you take a look?

import numpy as np
from pydynet.tensor import Tensor
import pydynet.nn.functional as F
import pydynet.nn as nn
from pydynet.optim import Adam, SGD
from pydynet.data import DataLoader, Dataset
from tqdm import tqdm


dev = ['cpu', 'cuda'][1]  # pick 'cuda' to reproduce the issue
np.random.seed(42)



from scipy.io import loadmat
data = loadmat('../mnist_uint8.mat')
train_x = np.reshape(data['train_x'], (60000, 1, 28, 28)) / 255.0
train_y = data['train_y']
test_x = np.reshape(data['test_x'], (10000, 1, 28, 28)) / 255.0
test_y = data['test_y']



class mnist_dataset(Dataset):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.data = X
        self.label = y

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)

train_loader = DataLoader(mnist_dataset(train_x, train_y), 32, True)
test_loader = DataLoader(mnist_dataset(test_x, test_y), 32, False)




class CNN2d(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # After adding .to to each layer the parameters update correctly, but GPU memory keeps growing
        self.conv1 = nn.Conv2d(1, 1, 3, padding=1).to(dev)
        self.fc1 = nn.Linear(49, 128).to(dev)
        self.fc2 = nn.Linear(128, 10).to(dev)

    def forward(self, x):
        x = self.conv1(x)
        x = F.max_pool2d(x, 4, 4)
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = F.leaky_relu(x, 0.1)
        return self.fc2(x)


net3 = CNN2d().to(dev)

optim3 = Adam(net3.parameters(), lr=0.01)
loss = nn.CrossEntropyLoss().to(dev)
EPOCHES = 50
BATCH_SIZE = 32


from time import time


t = time()
for epoch in range(EPOCHES):

    net3.train()
    train_out = []
    for batch_X, batch_y in tqdm(train_loader):
        batch_X, batch_y = Tensor(batch_X).to(dev), Tensor(batch_y).to(dev)
        # print(data)
        output3 = net3(batch_X)
        l3 = loss(output3, batch_y)
        optim3.zero_grad()
        l3.backward()
        optim3.step()

        acc = np.argmax(output3.numpy(), axis=1) == np.argmax(batch_y.numpy(), axis=1)
        train_out.append(acc)
        # mp.free_all_blocks()
        # pmp.free_all_blocks()
    train_out = np.concatenate(train_out)
    train_out = np.mean(train_out)

    net3.eval()
    test_out = []
    # test_label
    for batch_X, batch_y in tqdm(test_loader):
        node_y = Tensor(batch_y).to(dev)

        data = Tensor(batch_X).to(dev)
        # print(data)
        output3 = net3(data)
        l3 = loss(output3, node_y)
        # use a new name here so the timer variable `t` set before the epoch loop is not overwritten
        preds = output3.numpy()
        acc = np.argmax(preds, axis=1) == np.argmax(batch_y, axis=1)
        test_out.append(acc)
        # del data
    test_out = np.concatenate(test_out)
    test_out = np.mean(test_out)


    print("Epoch {:2d}:".format(epoch + 1))

    print('train acc: {}, test acc: {}'.format(train_out, test_out))
print(time() - t)
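
For reference, this is roughly what I would expect a module-level .to to do: recursively move every parameter of the module and of its sub-modules onto the target device. This is only a minimal sketch of the idea, not PyDyNet's actual implementation; the helper name move_module_to and the attribute walk via vars() are my own assumptions, and it reuses the Tensor / nn imports from the script above.

# Hypothetical sketch, NOT PyDyNet's real API: recursively push every
# Tensor attribute (parameter) and every child Module onto `device`.
def move_module_to(module, device):
    for name, value in vars(module).items():
        if isinstance(value, Tensor):
            # Replace the parameter with its on-device copy.
            setattr(module, name, value.to(device))
        elif isinstance(value, nn.Module):
            # Recurse into sub-modules such as conv1, fc1, fc2.
            move_module_to(value, device)
    return module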


Hi, I tried changing
net3 = CNN2d().to(dev)
to
net3 = CNN2d()
with everything else unchanged. With this change GPU memory no longer grows and training converges normally, so the problem is probably in the code that moves data between host memory and GPU memory; could you debug that part? Thanks again for providing this implementation.
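
If it helps with debugging, the growth can be confirmed by watching CuPy's default memory pool (I am assuming the GPU backend is CuPy, which the commented-out mp.free_all_blocks() calls in my script suggest). Something like the following, called once per epoch:

import cupy as cp

# Report how much device memory CuPy's pool is currently holding.
def report_gpu_memory(tag=''):
    pool = cp.get_default_memory_pool()
    used = pool.used_bytes() / 1024 ** 2
    total = pool.total_bytes() / 1024 ** 2
    print('[{}] cupy pool: {:.1f} MiB used / {:.1f} MiB reserved'.format(tag, used, total))
    # Release cached blocks back to the driver; if used_bytes keeps growing
    # even after this, some tensors are still being referenced somewhere.
    pool.free_all_blocks()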

This has been fixed and now passes the tests.