Hi @Greg_S, I am using an AWS DL1 instance with the AMI "Deep Learning AMI Habana PyTorch 1.10.1 SynapseAI 1.3.0 (Ubuntu 20.04) 20220304".
The model is a custom ResNet-based model, but I can replicate the error with the following code:
import os
from habana_frameworks.torch.utils.library_loader import load_habana_module
import habana_frameworks.torch.core.hccl
import habana_frameworks.torch.core as htcore
import torch
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
def permute_params(model, to_filters_last, lazy_mode):
    """Permute every 4-D parameter of ``model`` between KCRS and RSCK order.

    Parameters whose ``ndim`` is not 4 are left untouched. The permutation
    is applied to ``param.data`` in place of the old storage view, under
    ``torch.no_grad()``.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. a
            ``torch.nn.Module``).
        to_filters_last: If truthy, permute KCRS -> RSCK with
            ``(2, 3, 1, 0)``; otherwise permute RSCK -> KCRS with
            ``(3, 2, 0, 1)``.
        lazy_mode: If truthy, call ``htcore.mark_step()`` after the
            parameters have been permuted.
    """
    # The axis order depends only on the direction, so pick it once.
    if to_filters_last:
        axis_order = (2, 3, 1, 0)  # permute KCRS to RSCK
    else:
        axis_order = (3, 2, 0, 1)  # permute RSCK to KCRS

    with torch.no_grad():
        for _, param in model.named_parameters():
            if param.ndim != 4:
                continue
            param.data = param.data.permute(axis_order)

    # NOTE(review): the pasted original lost its indentation; mark_step is
    # issued once after the loop here -- confirm against the original file.
    if lazy_mode:
        htcore.mark_step()
class Net(torch.nn.Module):
    """Minimal conv -> max-pool -> linear network.

    ``forward`` flattens the pooled feature map per sample and feeds it to
    a linear layer with 3 outputs.
    """

    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(3, 5, 2, stride=2),
            torch.nn.MaxPool2d(8, 4),
        )
        # 1445 = 5 channels * 17 * 17: a 150x150 input becomes 75x75 after
        # the stride-2 conv and 17x17 after the 8/4 max-pool.
        self.linear = torch.nn.Linear(1445, 3)

    def forward(self, inp):
        """Return the linear layer's output for the input batch ``inp``."""
        x = self.conv(inp)
        # Collapse everything except the batch dimension.
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        return x
# Environment for a single-process run; the rank and world size set here
# match the explicit arguments passed to init_process_group below.
os.environ['LOCAL_RANK'] = '0'
os.environ['WORLD_SIZE'] = '1'
# NOTE(review): nothing visible in this script reads 'ID' -- confirm it is needed.
os.environ['ID'] = '0'
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '34245'
batch_size = 4
load_habana_module()
torch.distributed.init_process_group(backend="hccl", rank=0, world_size=1)
# Random input batch and a target of class 1 for every sample.
x = torch.rand((batch_size,3,150,150))
target = torch.tensor([1] * batch_size, dtype=torch.long)
model = Net()
model.to('hpu')
model = DDP(model, broadcast_buffers=False, gradient_as_bucket_view=False)
# NOTE(review): the parameters are permuted AFTER the model is wrapped in DDP.
# The reported error complains about a conv-weight gradient of shape
# [2, 2, 3, 5] versus an expected [5, 3, 2, 2], i.e. the permuted versus the
# un-permuted layout -- this ordering is a likely suspect; confirm.
permute_params(model, True, True)
# One forward/backward pass on the HPU.
y = model(x.to('hpu'))
loss = F.cross_entropy(y, target.to('hpu'))
loss.backward()
htcore.mark_step()
Running it gives the following output:
synapse_logger INFO. pid=5799 at /home/jenkins/workspace/cdsoftwarebuilder/create-pytorch---bpt-d/repos/pytorch-integration/pytorch_helpers/synapse_logger/synapse_logger.cpp:340 Done command: restart
Loading Habana modules from /home/ubuntu/.local/lib/python3.8/site-packages/habana_frameworks/torch/lib
Traceback (most recent call last):
File "test_conv.py", line 65, in <module>
htcore.mark_step()
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
RuntimeError: Function ConvolutionOverrideableBackward0 returned an invalid gradient at index 1 - got [2, 2, 3, 5] but expected shape compatible with [5, 3, 2, 2]