Hi folks,
I was trying to figure out the latency breakdown within each training epoch, and below is the demo code I was using:
import time

import dgl
import torch
import torch.nn.functional as F
from ogb.nodeproppred import DglNodePropPredDataset


def train(graph, idx_split, device, model):
    # set the batch size
    batch_size = 1024
    # set the number of epochs
    num_epoch = 20
    # obtain the node IDs of the training set
    train_nids = idx_split['train']
    # define the optimizer over the model parameters
    opt = torch.optim.Adam(model.parameters())
    # start the epoch loop
    for epoch in range(num_epoch):
        model.train()
        print(f"Epoch {epoch+1}/{num_epoch} start")
        # define the sampler and dataloader
        sampler = dgl.dataloading.NeighborSampler([10, 15, 10])
        dataloader = dgl.dataloading.DataLoader(
            graph,
            train_nids,
            sampler,
            device=device,
            batch_size=batch_size,
            shuffle=True,
            drop_last=False,
            num_workers=0
        )
        # time the generation of the first mini-batch
        print("start sampling")
        t_start = time.time()
        input_nodes, output_nodes, mfgs = next(iter(dataloader))
        t_end = time.time()
        print(f"finish sampling, cost {(t_end - t_start):.2f}s, "
              f"to train {len(output_nodes)} nodes, "
              f"this mini-batch needs {len(input_nodes)} input nodes in all")
        # calculate the number of iterations per epoch
        num_iter = len(dataloader)
        # iterations
        for step, (input_nodes, output_nodes, mfgs) in enumerate(dataloader):
            # feature copy from CPU to GPU takes place here
            print(f"Iteration {step+1}/{num_iter} start copying data to GPU:")
            inputs = mfgs[0].srcdata['feat']
            labels = mfgs[-1].dstdata['label']
            # training
            print(f"Iteration {step+1}/{num_iter} start training:")
            predictions = model(mfgs, inputs)
            loss = F.cross_entropy(predictions, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

def main():
    # load the dataset from disk into memory
    dataset = DglNodePropPredDataset('ogbn-products')
    device = 'cuda'
    # obtain the DGLGraph and the node label tensor
    # (the dataset only has one graph)
    graph, node_labels = dataset[0]
    # add reverse edges to make the graph bidirectional
    graph = dgl.add_reverse_edges(graph)
    # assign labels to graph nodes
    graph.ndata['label'] = node_labels[:, 0]
    # derive basic dataset statistics
    node_features = graph.ndata['feat']
    number_nodes = graph.number_of_nodes()
    num_features = node_features.shape[1]
    num_classes = (node_labels.max() + 1).item()
    # obtain the training/validation/test split of the nodes
    idx_split = dataset.get_idx_split()
    # define the model (ModelSAGEConv is my own 3-layer GraphSAGE wrapper;
    # its definition is omitted here)
    model = ModelSAGEConv(in_size=num_features, hid_size=256, out_size=num_classes, num_layer=3).to(device)
    # training
    train(graph=graph, model=model, idx_split=idx_split, device=device)


if __name__ == '__main__':
    main()
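
For context, this is the kind of finer-grained per-iteration timing I was planning to add next. The stage split (sampling vs. feature/label access vs. compute), the torch.cuda.synchronize() calls, and the helper name timed_epoch are my own additions rather than anything from the tutorial:

import time

import torch
import torch.nn.functional as F


def timed_epoch(dataloader, model, opt):
    """Rough per-stage timing sketch: sampling, feature/label access, training."""
    t_sample = t_copy = t_train = 0.0
    it = iter(dataloader)
    while True:
        # stage 1: build the next mini-batch of MFGs
        t0 = time.time()
        try:
            input_nodes, output_nodes, mfgs = next(it)
        except StopIteration:
            break
        t1 = time.time()

        # stage 2: feature/label access (depending on the DataLoader's device
        # setting, CPU-to-GPU copies may show up here or already in stage 1)
        inputs = mfgs[0].srcdata['feat']
        labels = mfgs[-1].dstdata['label']
        torch.cuda.synchronize()
        t2 = time.time()

        # stage 3: forward/backward/step
        predictions = model(mfgs, inputs)
        loss = F.cross_entropy(predictions, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()
        torch.cuda.synchronize()
        t3 = time.time()

        t_sample += t1 - t0
        t_copy += t2 - t1
        t_train += t3 - t2
    print(f"sampling {t_sample:.2f}s, feature access {t_copy:.2f}s, training {t_train:.2f}s")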
My experiment was run with a 3-layer GraphSAGE model on the ogbn-products dataset. The results showed that in the first epoch the latency of mini-batch generation with neighborhood sampling was noticeably longer than in the following epochs: about 4.69 s versus about 0.39 s. These numbers are the measured durations of the next(iter(dataloader)) call in each epoch. I am not sure about the reason for this latency difference. Does it involve some data movement between host memory and GPU memory in the first epoch?
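
To narrow it down, I was also thinking of a small check like the one below (my own diagnostic sketch, not from the tutorial): time the first few next() calls on a freshly built dataloader and see whether only the very first mini-batch pays the extra cost.

import time


def time_first_batches(dataloader, n=3):
    # time the first n mini-batches of a fresh iterator separately
    it = iter(dataloader)
    for i in range(n):
        t0 = time.time()
        input_nodes, output_nodes, mfgs = next(it)
        print(f"batch {i}: {time.time() - t0:.2f}s, "
              f"{len(input_nodes)} input nodes, {len(output_nodes)} output nodes")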
Thanks in advance for any replies and suggestions.