import time
from datetime import datetime

import numpy as np
import sklearn.metrics
import torch as th
import dgl

model = th.nn.parallel.DistributedDataParallel(model)
sampler = dgl.dataloading.MultiLayerNeighborSampler([25, 10])
train_dataloader = dgl.dataloading.DistNodeDataLoader(
    g, train_nid, sampler, batch_size=1024,
    shuffle=True, drop_last=False)
valid_dataloader = dgl.dataloading.DistNodeDataLoader(
    g, valid_nid, sampler, batch_size=1024,
    shuffle=False, drop_last=False)

for epoch in range(5):
    t_sampling, t_fwd, t_bwd, t_valid = 0, 0, 0, 0
    t1 = time.time()
    # Loop over the dataloader to sample mini-batches.
    losses = []
    for step, (input_nodes, seeds, blocks) in enumerate(train_dataloader):
        t2 = time.time()
        # Sampling time: from the end of the previous step (or the epoch start)
        # until the sampled blocks are returned.
        if step == 0:
            t_sampling += t2 - t1
        else:
            t_sampling += t2 - t4
        # Load the input features as well as the output labels.
        batch_inputs = g.ndata['feat'][input_nodes]
        batch_labels = (g.ndata['labels'][seeds]).type(th.LongTensor)
        # Compute the predictions and the loss.
        batch_pred = model(blocks, batch_inputs)
        loss = loss_fcn(batch_pred, batch_labels)
        optimizer.zero_grad()
        t3 = time.time()
        t_fwd += t3 - t2
        loss.backward()
        losses.append(loss.detach().cpu().numpy())
        optimizer.step()
        t4 = time.time()
        t_bwd += t4 - t3

    # Validation.
    predictions = []
    labels = []
    t5 = time.time()
    with th.no_grad():
        for step, (input_nodes, seeds, blocks) in enumerate(valid_dataloader):
            inputs = g.ndata['feat'][input_nodes]
            labels.append(g.ndata['labels'][seeds].numpy())
            predictions.append(model(blocks, inputs).argmax(1).numpy())
        predictions = np.concatenate(predictions)
        labels = np.concatenate(labels)
        accuracy = sklearn.metrics.accuracy_score(labels, predictions)
        print('Time {} - Epoch {}: Validation Accuracy {:.3f}'.format(
            datetime.now().time(), epoch, accuracy))
    t6 = time.time()
    t_valid = t6 - t5
    total = (round(t_sampling, 3), round(t_fwd, 3), round(t_bwd, 3), round(t_valid, 3),
             round((t6 - t1) - (t_sampling + t_fwd + t_bwd + t_valid), 3))
    print(f'Epoch - {epoch}, Host - {g.rank()}, Time - {total}')
I made the change you suggested (moved t2 up, as shown in the code above) and noticed a significant drop in t_sampling (from 10 seconds to 2 in the example I am working on). Is it safe to say that the actual sampling time is 2 seconds and that the feature and label fetch accounts for the remaining 8 seconds? Is the feature and label fetch in the lines below the same as reading node data from the KV store, and is that why it takes 8 seconds? Is there a way to break down the local and remote fetch time?
batch_inputs = g.ndata['feat'][input_nodes]
batch_labels = (g.ndata['labels'][seeds]).type(th.LongTensor)
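For reference, here is a rough sketch of how the fetch could be instrumented to separate local from remote pulls. It assumes g is a dgl.distributed.DistGraph and uses the partition book (get_partition_book, nid2partid, partid) to decide which input nodes live in the local partition; those calls and the exact ownership check are my own assumptions about the distributed API, so please verify them against your DGL version before trusting the numbers.

import time
import torch as th

def timed_fetch(g, input_nodes, seeds):
    # Split input_nodes by owning partition. Assumed API: get_partition_book(),
    # nid2partid(), and the partid property; note partid is the partition of this
    # machine, which may differ from g.rank() when a machine runs several trainers.
    pb = g.get_partition_book()
    part_ids = pb.nid2partid(input_nodes)
    local_mask = part_ids == pb.partid
    local_nodes = input_nodes[local_mask]
    remote_nodes = input_nodes[~local_mask]

    t0 = time.time()
    local_feats = g.ndata['feat'][local_nodes]    # should be served from the local partition
    t1 = time.time()
    remote_feats = g.ndata['feat'][remote_nodes]  # should go over the network to the KV-store servers
    t2 = time.time()
    labels = g.ndata['labels'][seeds].type(th.LongTensor)
    t3 = time.time()

    # Reassemble the features in the original input_nodes order.
    feats = th.empty((len(input_nodes), local_feats.shape[1]), dtype=local_feats.dtype)
    feats[local_mask] = local_feats
    feats[~local_mask] = remote_feats

    print('local fetch {:.3f}s, remote fetch {:.3f}s, label fetch {:.3f}s'.format(
        t1 - t0, t2 - t1, t3 - t2))
    return feats, labels

The split is only an approximation: a single g.ndata['feat'][input_nodes] call lets DGL batch the remote requests, so issuing two separate pulls can itself shift the timing somewhat.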