Using DDP with fork fails

I am using the DDP example from "DDP-based Scaling of Gaudi on PyTorch" in the Gaudi Documentation 1.6.0.

I tried to replace spawn with fork; the following changes resulted in an error:

import os
import sys
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP

import habana_frameworks.torch.core as htcore

device = torch.device('hpu')

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    os.environ["ID"] = str(rank)
    # distributed package for HCCL
    import habana_frameworks.torch.distributed.hccl
    dist.init_process_group(backend='hccl', rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))

def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    model = ToyModel().to(device)
    ddp_model = DDP(model)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10).to(device))
    labels = torch.randn(20, 5).to(device)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()

def run_demo(demo_fn, world_size):
    mp.start_processes(demo_fn,
                       args=(world_size,),
                       nprocs=world_size,
                       join=True,
                       daemon=False,
                       start_method='fork')

if __name__ == "__main__":
    world_size = 8
    run_demo(demo_basic, world_size)

I got the following error:

terminate called after throwing an instance of 'c10::Error'
what(): Host barrier Key error
Exception raised from hostBarrier at /tmp/pip-req-build-51fpew6w/habana_frameworks/torch/distributed/hccl/ProcessGroupHCCL.cpp:254 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6c (0x7f3ff826312c in /usr/local/lib/python3.8/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, char const*) + 0xf5 (0x7f3ff8240e1f in /usr/local/lib/python3.8/dist-packages/torch/lib/libc10.so)
frame #2: c10d::ProcessGroupHCCL::hostBarrier() + 0x3e7 (0x7f3f31148897 in /usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so)
frame #3: + 0x352a9 (0x7f3f3114a2a9 in /usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so)
frame #4: habana_helpers::JobThread::threadFunction() + 0xda (0x7f3f31159aba in /usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so)
frame #5: + 0xd6de4 (0x7f400dae0de4 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #6: + 0x8609 (0x7f400f302609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #7: clone + 0x43 (0x7f400f43c133 in /lib/x86_64-linux-gnu/libc.so.6)

Hi,

Thanks for the post. Are you using the 1.5 or the 1.6 release?

I am using the 1.5 release. Is the code different from 1.6?

I tried it on 1.6 and it seems to work. Can you please try it on 1.6 and see if that helps?
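If upgrading to 1.6 is not immediately possible, reverting to the documented spawn-based launcher is worth trying, since fork is only the modification that fails here. Below is a minimal sketch of such a launcher, assuming the demo_basic, setup, and cleanup definitions from the script above are unchanged; the function name run_demo_spawn is only illustrative.

import torch.multiprocessing as mp

def run_demo_spawn(demo_fn, world_size):
    # mp.spawn always uses the 'spawn' start method, which is what the
    # documented Gaudi DDP example relies on.
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    run_demo_spawn(demo_basic, 8)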