BUG: AttributeError: 'float' object has no attribute 'clone'

Question

BUG: AttributeError: 'float' object has no attribute 'clone'

ddemillard opened this issue 4 months ago · comments

On the current master I am getting the following error consistently:

Traceback (most recent call last):
File "/workspace/joliGEN/train.py", line 445, in <module>
launch_training(opt)
File "/workspace/joliGEN/train.py", line 419, in launch_training
mp.spawn(
File "/usr/local/lib/python3.10/dist-packages/torch/multiprocessing/spawn.py", line 246, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
File "/usr/local/lib/python3.10/dist-packages/torch/multiprocessing/spawn.py", line 202, in start_processes
while not context.join():
File "/usr/local/lib/python3.10/dist-packages/torch/multiprocessing/spawn.py", line 163, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/torch/multiprocessing/spawn.py", line 74, in wrap
fn(i, *args)
File "/workspace/joliGEN/train.py", line 197, in train_gpu
model.optimize_parameters() # calculate loss functions, get gradients, update network weights
File "/workspace/joliGEN/models/base_model.py", line 1138, in optimize_parameters
self.compute_step(group.optimizer, loss_names)
File "/workspace/joliGEN/models/base_model.py", line 1029, in compute_step
getattr(self, "loss" + loss_name).clone()
AttributeError: 'float' object has no attribute 'clone'

root@3c5c0934a4de:/workspace/joliGEN#

AttributeError: 'float' object has no attribute 'clone'

Here is my config:
{
"D": {
"dropout": false,
"n_layers": 3,
"ndf": 64,
"netDs": [
"projected_d",
"basic"
],
"norm": "instance",
"proj_interp": 1024,
"proj_network_type": "efficientnet"
},
"G": {
"attn_nb_mask_attn": 10,
"attn_nb_mask_input": 1,
"dropout": false,
"nblocks": 9,
"netG": "mobile_resnet_attn",
"ngf": 64,
"norm": "instance",
"padding_type": "reflect"
},
"alg": {
"gan": {
"lambda": 1.0
},
"cut": {
"HDCE_gamma": 1.0,
"HDCE_gamma_min": 1.0,
"MSE_idt": false,
"flip_equivariance": false,
"lambda_MSE_idt": 1.0,
"lambda_NCE": 1.0,
"lambda_SRC": 0.0,
"nce_T": 0.07,
"nce_idt": true,
"nce_includes_all_negatives_from_minibatch": false,
"nce_layers": "0,4,8,12,16",
"nce_loss": "monce",
"netF": "mlp_sample",
"netF_dropout": false,
"netF_nc": 256,
"netF_norm": "instance",
"num_patches": 256
}
},
"data": {
"crop_size": 256,
"dataset_mode": "unaligned",
"direction": "AtoB",
"load_size": 256,
"max_dataset_size": 1000000000,
"num_threads": 4,
"preprocess": "resize_and_crop"
},
"output": {
"display": {
"freq": 400,
"id": 1,
"ncols": 0,
"type": [
"visdom"
],
"visdom_port": 8097,
"visdom_server": "http://localhost",
"winsize": 256
},
"no_html": false,
"print_freq": 100,
"update_html_freq": 1000,
"verbose": false
},
"model": {
"init_gain": 0.02,
"init_type": "normal",
"input_nc": 3,
"multimodal": false,
"output_nc": 3
},
"train": {
"D_lr": 0.0001,
"G_ema": false,
"G_ema_beta": 0.999,
"G_lr": 0.0002,
"batch_size": 4,
"beta1": 0.9,
"beta2": 0.999,
"continue": false,
"epoch": "latest",
"epoch_count": 1,
"export_jit": false,
"gan_mode": "lsgan",
"iter_size": 8,
"load_iter": 0,
"metrics_every": 1000,
"n_epochs": 200,
"n_epochs_decay": 100,
"nb_img_max_fid": 1000000000,
"optim": "adam",
"pool_size": 50,
"save_by_iter": false,
"save_epoch_freq": 1,
"save_latest_freq": 5000
},
"dataaug": {
"APA": false,
"APA_every": 4,
"APA_nimg": 50,
"APA_p": 0,
"APA_target": 0.6,
"D_diffusion": false,
"D_diffusion_every": 4,
"D_label_smooth": false,
"D_noise": 0.0,
"affine": 0.0,
"affine_scale_max": 1.2,
"affine_scale_min": 0.8,
"affine_shear": 45,
"affine_translate": 0.2,
"diff_aug_policy": "",
"diff_aug_proba": 0.5,
"imgaug": false,
"no_flip": false,
"no_rotate": true
},
"checkpoints_dir": "/root/joliGEN/checkpoints",
"dataroot": "/root/joliGEN/datasets/ecommerce",
"ddp_port": "12355",
"gpu_ids": "0",
"model_type": "cut",
"name": "ecommerce",
"phase": "train",
"test_batch_size": 1,
"warning_mode": false,
"with_amp": false,
"with_tf32": false,
"with_torch_compile": false
}

I have tested in multiple environments with both an rtx 3090 and an rtx 4090 and the error is consistent. If I roll back to this commit: 811ba3d, then I can train just fine.

Emmanuel Benazera · Answer 1 · Fri Mar 08 2024 05:39:38 GMT+0800 (China Standard Time)

Hi, can you print the loss_name value at models/base_model.py:1028 by any chance? I doubt this is related to AMP, since you do not appear to have activated it. The error is due to a loss not being a tensor, for some unknown reason that I've never encountered and that the tests do not trigger...

On the side, you'd want D_proj_interp to be 256; there should be no benefit to setting it to a higher value.

ddemillard · Answer 2 · Fri Mar 08 2024 05:45:55 GMT+0800 (China Standard Time)

Hi, yes, I will get this printed for you with results asap. I increased as I would like to train on higher resolutions. Your training tips here say that I should set projected D to the anticipated value; is that not correct?

ddemillard · Answer 3 · Fri Mar 08 2024 05:56:23 GMT+0800 (China Standard Time)

Here are the loss names before the script errors:

Loss name: G_tot
Loss name: G_NCE
Loss name: G_supervised

Emmanuel Benazera · Answer 4 · Fri Mar 08 2024 20:26:24 GMT+0800 (China Standard Time)

I increased as I would like to train on higher resolutions

You are correct then :)

Emmanuel Benazera · Answer 5 · Fri Mar 08 2024 20:56:23 GMT+0800 (China Standard Time)

Loss name: G_supervised

Thanks for catching this. I've been able to reproduce it: the bug was introduced very recently and was not yet covered by the unit tests. The PR above should fix it once merged.