Out of memory during searching & training.
ganji15 opened this issue · comments
hi, @yuhuixu1993, thank you for sharing your amazing work.
I tried your code to search for a network on CIFAR-10, following the instruction ``python train_search.py'' to run it; however, the search procedure ended with a CUDA out-of-memory error, i.e.,
09/01 01:55:07 PM train 000 2.426532e-01 90.234375 100.000000
09/01 01:57:19 PM train 050 2.439179e-01 91.590074 99.862132
09/01 01:59:23 PM train_acc 91.776000
09/01 01:59:23 PM epoch 49 lr 1.000000e-03
09/01 01:59:23 PM genotype = Genotype(normal=[('sep_conv_3x3', 0), ('skip_connect', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 0), ('sep_conv_5x5', 1)], normal_concat=range(2, 6), reduce=[('skip_connect', 1), ('max_pool_3x3', 0), ('sep_conv_5x5', 1), ('dil_conv_5x5', 0), ('sep_conv_5x5', 1), ('sep_conv_5x5', 2), ('sep_conv_5x5', 2), ('sep_conv_5x5', 1)], reduce_concat=range(2, 6))
tensor([[0.1646, 0.1040, 0.1085, 0.1185, 0.1612, 0.1110, 0.1120, 0.1202],
[0.1498, 0.1149, 0.1231, 0.1378, 0.0881, 0.1234, 0.1322, 0.1307],
[0.1124, 0.1017, 0.1039, 0.1098, 0.1700, 0.1085, 0.1513, 0.1423],
[0.1205, 0.1086, 0.1074, 0.1125, 0.1670, 0.1348, 0.1133, 0.1358],
[0.1754, 0.0947, 0.0893, 0.1235, 0.1288, 0.1079, 0.1647, 0.1156],
[0.1414, 0.0955, 0.0984, 0.1059, 0.1967, 0.1468, 0.0984, 0.1169],
[0.1193, 0.1157, 0.1068, 0.1123, 0.2055, 0.1308, 0.1107, 0.0988],
[0.1533, 0.1183, 0.1268, 0.1451, 0.1239, 0.1101, 0.1156, 0.1068],
[0.1498, 0.1063, 0.1073, 0.1328, 0.1361, 0.1421, 0.1167, 0.1089],
[0.1132, 0.1281, 0.1156, 0.1162, 0.1608, 0.1445, 0.1159, 0.1056],
[0.1024, 0.1174, 0.1393, 0.1294, 0.1359, 0.1527, 0.1220, 0.1010],
[0.1306, 0.1090, 0.1147, 0.1288, 0.1520, 0.1055, 0.1442, 0.1152],
[0.1462, 0.0942, 0.0924, 0.1288, 0.1084, 0.1127, 0.1749, 0.1425],
[0.1416, 0.0991, 0.0974, 0.1282, 0.1396, 0.1392, 0.1244, 0.1306]],
device='cuda:1', grad_fn=<SoftmaxBackward>)
tensor([[0.1148, 0.1495, 0.1392, 0.1084, 0.1359, 0.1406, 0.1165, 0.0951],
[0.1327, 0.1076, 0.0993, 0.1602, 0.1379, 0.1272, 0.1087, 0.1264],
[0.1164, 0.1317, 0.1378, 0.1009, 0.1293, 0.1330, 0.1105, 0.1405],
[0.1259, 0.1005, 0.1057, 0.1288, 0.1212, 0.1532, 0.1372, 0.1273],
[0.1387, 0.0891, 0.0919, 0.1215, 0.1347, 0.1005, 0.1860, 0.1375],
[0.1181, 0.1241, 0.1199, 0.1367, 0.1470, 0.1228, 0.1066, 0.1249],
[0.1344, 0.1070, 0.1094, 0.1165, 0.1187, 0.1590, 0.1319, 0.1233],
[0.1372, 0.1035, 0.1067, 0.1308, 0.1076, 0.1623, 0.1355, 0.1164],
[0.1400, 0.1138, 0.1198, 0.1409, 0.1042, 0.1349, 0.1158, 0.1307],
[0.1309, 0.1175, 0.1291, 0.1270, 0.1265, 0.1102, 0.1504, 0.1083],
[0.1337, 0.1022, 0.1076, 0.1221, 0.1369, 0.1550, 0.1146, 0.1279],
[0.1339, 0.0890, 0.0939, 0.1212, 0.1117, 0.1830, 0.1362, 0.1313],
[0.1431, 0.1034, 0.1148, 0.1350, 0.1092, 0.1270, 0.1375, 0.1300],
[0.1396, 0.0920, 0.0998, 0.1267, 0.1404, 0.0974, 0.1373, 0.1669]],
device='cuda:1', grad_fn=<SoftmaxBackward>)
tensor([0.3777, 0.3653, 0.2571], device='cuda:1', grad_fn=<SoftmaxBackward>)
09/01 01:59:27 PM train 000 2.647865e-01 89.843750 100.000000
09/01 02:01:39 PM train 050 2.318434e-01 91.980699 99.892770
09/01 02:03:43 PM train_acc 91.660000
09/01 02:03:43 PM valid 000 5.367675e-01 83.593750 99.218750
Traceback (most recent call last):
File "train_search.py", line 206, in <module>
main()
File "train_search.py", line 130, in main
valid_acc, valid_obj = infer(valid_queue, model, criterion)
File "train_search.py", line 190, in infer
logits = model(input)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/model_search.py", line 159, in forward
s0, s1 = s1, cell(s0, s1, weights,weights2)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/model_search.py", line 85, in forward
s = sum(weights2[offset+j]*self._ops[offset+j](h, weights[offset+j]) for j, h in enumerate(states))
File "/home/ganji/Documents/work/pc-darts/model_search.py", line 85, in <genexpr>
s = sum(weights2[offset+j]*self._ops[offset+j](h, weights[offset+j]) for j, h in enumerate(states))
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/model_search.py", line 44, in forward
temp1 = sum(w * op(xtemp) for w, op in zip(weights, self._ops))
RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 1; 15.90 GiB total capacity; 15.29 GiB already allocated; 15.56 MiB free; 7.17 MiB cached)
Then, I continued to run the command ``python train.py --auxiliary --cutout'', and the training of the searched model also raised the OOM error, i.e.,
➜ pc-darts python train.py --auxiliary --cutout --gpu 1
Experiment dir : eval-EXP-20190901-143341
09/01 02:33:41 PM gpu device = 1
09/01 02:33:41 PM args = Namespace(arch='PCDARTS', auxiliary=True, auxiliary_weight=0.4, batch_size=96, cutout=True, cutout_length=16, data='../data', drop_path_prob=0.3, epochs=600, gpu=1, grad_clip=5, init_channels=36, layers=20, learning_rate=0.025, model_path='saved_models', momentum=0.9, report_freq=50, save='eval-EXP-20190901-143341', seed=0, set='cifar10', weight_decay=0.0003)
108 108 36
108 144 36
144 144 36
144 144 36
144 144 36
144 144 36
144 144 72
144 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 144
288 576 144
576 576 144
576 576 144
576 576 144
576 576 144
576 576 144
09/01 02:33:44 PM param size = 3.634678MB
Files already downloaded and verified
Files already downloaded and verified
/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/optim/lr_scheduler.py:82: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule.See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
09/01 02:33:46 PM epoch 0 lr 2.499983e-02
train.py:136: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.
nn.utils.clip_grad_norm(model.parameters(), args.grad_clip)
09/01 02:33:48 PM train 000 3.258214e+00 8.333333 50.000000
09/01 02:34:18 PM train 050 3.215875e+00 13.623365 56.638069
09/01 02:34:49 PM train 100 3.148910e+00 15.459983 61.984321
09/01 02:35:19 PM train 150 3.054110e+00 18.329194 67.335814
09/01 02:35:49 PM train 200 2.970589e+00 20.677860 71.035445
09/01 02:36:19 PM train 250 2.899201e+00 22.705012 73.725927
09/01 02:36:50 PM train 300 2.842999e+00 24.228266 75.633303
09/01 02:37:20 PM train 350 2.789153e+00 25.741927 77.148620
09/01 02:37:50 PM train 400 2.736966e+00 27.153469 78.514648
09/01 02:38:20 PM train 450 2.694277e+00 28.466832 79.557000
09/01 02:38:50 PM train 500 2.656195e+00 29.663588 80.561790
09/01 02:39:03 PM train_acc 30.111999
09/01 02:39:03 PM valid 000 1.350200e+00 52.083332 91.666664
Traceback (most recent call last):
File "train.py", line 177, in <module>
main()
File "train.py", line 113, in main
valid_acc, valid_obj = infer(valid_queue, model, criterion)
File "train.py", line 161, in infer
logits, _ = model(input)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/model.py", line 150, in forward
s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/model.py", line 51, in forward
h1 = op1(h1)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/operations.py", line 66, in forward
return self.op(x)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 343, in forward
return self.conv2d_forward(input, self.weight)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 340, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 1; 15.90 GiB total capacity; 14.26 GiB already allocated; 1.56 MiB free; 1.05 GiB cached)
In addition, my environment is ``Ubuntu 16.04 + CUDA 10.0 + Python 3.7 + PyTorch 1.2''.
Since I am new to NAS, I cannot figure out what causes the OOM error. Could you help fix this error or give some suggestions? Thanks.
@ganji15, which GPU is used in your experiment, a 1080 Ti or a V100? A 1080 Ti may hit an OOM error on newer PyTorch versions, just like the original DARTS. Besides, I note that the OOM occurs when testing the validation accuracy; you can simply comment out the validation part, and this will not affect the search result.
@yuhuixu1993 Thanks. My GPU is Quadro P5000, and I also met the OOM error before when using Titan X. I think you are right, and I will modify the code according to your good suggestion.
@yuhuixu1993 The problem is not solved, and I found that the OOM error occurs in the forward propagation. As a result, I cannot even evaluate the performance of the searched model.
Specifically, I deleted the training part of ``train.py'' and kept the inference part as follows:
for epoch in range(args.epochs):
scheduler.step()
logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
# train_acc, train_obj = train(train_queue, dp_model, model, criterion, optimizer)
# logging.info('train_acc %f', train_acc)
logging.info('enter infer')
valid_acc, valid_obj = infer(valid_queue, dp_model, model, criterion)
logging.info('exit infer')
if valid_acc > best_acc:
best_acc = valid_acc
logging.info('valid_acc %f, best_acc %f', valid_acc, best_acc)
# logging.info('valid_acc %f', valid_acc)
I also added log in the ``infer'' function as follows:
def infer(valid_queue, model, criterion):
...
for step, (input, target) in enumerate(valid_queue):
logging.info('step %d'%step) ## debug information
input = input.cuda()
target = target.cuda(non_blocking=True)
...
Then, I ran ``train.py'' and got the following error:
➜ pc-darts python train.py --auxiliary --cutout --gpu 1
Experiment dir : eval-EXP-20190902-110522
09/02 11:05:22 AM gpu device = 1
09/02 11:05:22 AM args = Namespace(arch='PCDARTS', auxiliary=True, auxiliary_weight=0.4, batch_size=96, cutout=True, cutout_length=16, data='../data', drop_path_prob=0.3, epochs=600, gpu='1', grad_clip=5, init_channels=36, layers=20, learning_rate=0.025, model_path='saved_models', momentum=0.9, report_freq=50, save='eval-EXP-20190902-110522', seed=0, set='cifar10', weight_decay=0.0003)
108 108 36
108 144 36
144 144 36
144 144 36
144 144 36
144 144 36
144 144 72
144 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 144
288 576 144
576 576 144
576 576 144
576 576 144
576 576 144
576 576 144
09/02 11:05:24 AM param size = 3.634678MB
Files already downloaded and verified
Files already downloaded and verified
09/02 11:05:26 AM epoch 0 lr 2.500000e-02
09/02 11:05:26 AM enter infer
09/02 11:05:26 AM step 0
09/02 11:05:26 AM valid 000 2.301318e+00 8.333333
09/02 11:05:26 AM step 1
Traceback (most recent call last):
File "train.py", line 193, in <module>
main()
File "train.py", line 125, in main
valid_acc, valid_obj = infer(valid_queue, dp_model, model, criterion)
File "train.py", line 178, in infer
logits, _ = dp_model(input)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/model.py", line 150, in forward
s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/model.py", line 51, in forward
h1 = op1(h1)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/ganji/Documents/work/pc-darts/operations.py", line 66, in forward
return self.op(x)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/cvmt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 320, in forward
self.padding, self.dilation, self.groups)
RuntimeError: CUDA out of memory. Tried to allocate 6.75 MiB (GPU 0; 15.90 GiB total capacity; 14.32 GiB already allocated; 3.56 MiB free; 3.09 MiB cached)
Therefore, I guess that there is something wrong with the searched model (too large? too deep? a cyclic path?). How can I visualize the searched model? How can I fix this error? Thanks!
P.S. I downgraded PyTorch from 1.2 to 1.0, and the problem remains.
@ganji15, yes; could you reduce the batch size and try again? PyTorch version 0.3 is suggested.
@yuhuixu1993 It works when I change the batch size from 96 to 48. The model is so large, which consumes over 16 GB GPU memory with only the forward propagation.
Can you evaluate the DARTS architecture? Trying that would tell us whether the problem is the PyTorch version or the model size, since I can evaluate DARTS on a 1080 Ti. If the model size is the cause, you may search for a smaller architecture by adding a FLOPs constraint as in SNAS, or use a smaller batch size or a bigger GPU.
@yuhuixu1993 I think it is the problem of PyTorch version. The original code of ``infer'' function is as follows:
def infer(valid_queue, model, criterion):
objs = utils.AvgrageMeter()
top1 = utils.AvgrageMeter()
top5 = utils.AvgrageMeter()
model.eval()
for step, (input, target) in enumerate(valid_queue):
#input = input.cuda()
#target = target.cuda(non_blocking=True)
input = input.cuda()
target = target.cuda(non_blocking=True)
logits = model(input)
loss = criterion(logits, target)
prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
n = input.size(0)
objs.update(loss.data.item(), n) ## this may lead to GPU memory leak in pytorch 1.0+
top1.update(prec1.data.item(), n) ## this may lead to GPU memory leak in pytorch 1.0+
top5.update(prec5.data.item(), n) ## this may lead to GPU memory leak in pytorch 1.0+
if step % args.report_freq == 0:
logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
return top1.avg, objs.avg
So, I changed the code as follows:
def infer(valid_queue, model, criterion):
objs = utils.AvgrageMeter()
top1 = utils.AvgrageMeter()
top5 = utils.AvgrageMeter()
model.eval()
with torch.no_grad(): # no grad for inference
for step, (input, target) in enumerate(valid_queue):
input = input.cuda()
target = target.cuda(non_blocking=True)
logits = model(input)
loss = criterion(logits, target)
prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
n = input.size(0)
objs.update(loss.item(), n) # deleting .data to fix OOM
top1.update(prec1.item(), n) # deleting .data to fix OOM
top5.update(prec5.item(), n) # deleting .data to fix OOM
if step % args.report_freq == 0:
logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
return top1.avg, objs.avg
After the above modification, I can evaluate the model with a larger batch size with PyTorch 1.2.
By the way, how can I visualize the searched model and see its detailed configurations? Thanks.
Yes, I have already updated my code too. The visualization code is inherited from DARTS. First, copy the searched architecture into genotypes.py, and then run ``python visualize.py PC-DARTS''.
@yuhuixu1993 Thank you for your kind help. Since the problem has been solved, I will close this issue.