ucbrise / actnn

ActNN: Reducing Training Memory Footprint via 2-Bit Activation Compressed Training


There is something wrong with loss.backward()

Harr7y opened this issue

I just modified the model with

model = actnn.QModule(model)
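
For context, the relevant part of my train.py looks roughly like this (a simplified sketch; build_model, train_loader, criterion, and optimizer are placeholders for my actual code):

import actnn

model = build_model(args)        # the VisionTransformer printed below
model = actnn.QModule(model)     # the only change I made
model = model.cuda()

for images, labels in train_loader:
    logits = model(images.cuda())
    loss = criterion(logits, labels.cuda())
    optimizer.zero_grad()
    loss.backward()              # fails here
    optimizer.step()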

After that, the following error occurred:

Traceback (most recent call last):
  File "train.py", line 336, in <module>
    main()
  File "train.py", line 332, in main
    train(args, model)
  File "train.py", line 212, in train
    loss.backward()
  File "/home/hku/anaconda3/envs/torch17/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/hku/anaconda3/envs/torch17/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: Function linearBackward returned an invalid gradient at index 0 - got [25216, 3072] but expected shape compatible with [128, 197, 3072]

Hi, we cannot identify the bug from the traceback alone.
Could you provide the architecture of your model? For example, the output of print(model).
A reproducible script would be even better.


Thanks for your reply! Before applying ActNN, the model's structure is as follows:

VisionTransformer(
  (transformer): Transformer(
    (embeddings): Embeddings(
      (patch_embeddings): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Encoder(
      (layer): ModuleList(
        (0): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (1): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (2): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (3): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (4): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (5): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (6): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (7): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (8): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (9): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (10): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (11): Block(
          (attention_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (ffn): Mlp(
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (attn): Attention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (out): Linear(in_features=768, out_features=768, bias=True)
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (proj_dropout): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
      )
      (encoder_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    )
  )
  (head): Linear(in_features=768, out_features=10, bias=True)
)
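
If it helps: in the error message, 25216 = 128 × 197, and the tensor with shape [128, 197, 3072] looks like the input to one of the fc2 layers (in_features=3072), i.e. a 3D (batch, tokens, features) activation, with the returned gradient flattened over the first two dimensions. A minimal script along these lines might hit the same kind of error (just a sketch, not tested; the shapes simply mirror my run):

import torch
import torch.nn as nn
import actnn

mlp = nn.Sequential(nn.Linear(768, 3072), nn.GELU(), nn.Linear(3072, 768))
mlp = actnn.QModule(mlp).cuda()

x = torch.randn(128, 197, 768, device="cuda", requires_grad=True)  # batch, tokens, hidden
loss = mlp(x).sum()
loss.backward()  # presumably where the same invalid-gradient error would appear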