Question about the iteration over DGL Dataloader

Hi folks,

I was trying to figure out the latency breakdown within each training epoch, and below is the demo code I was using:

def train(graph, idx_split, device, model):
    # set batch size
    batch_size = 1024
    
    # set num of epoch
    num_epoch = 20

    # obtain nids for the training set
    train_nids = idx_split['train']

    # initializes model parameters and define the optimizer.
    opt = torch.optim.Adam(model.parameters())

    # start the epoch loop
    for epoch in range(num_epoch):
        model.train()

        print(f"Epoch {epoch+1}/{num_epoch} start")

        # define sampler and dataloader
        sampler = dgl.dataloading.NeighborSampler([10, 15, 10])
        dataloader = dgl.dataloading.DataLoader(
                graph,
                train_nids,
                sampler,
                device=device,
                batch_size=batch_size,
                shuffle=True,
                drop_last=False,
                num_workers=0
        )

        # start sampling
        print("start sampling")
        t_start = time.time()
        input_nodes, output_nodes, mfgs = example_minibatch = next(iter(dataloader))
        t_end = time.time()
        print(f"finish sampling, cost {(t_end-t_start):.2f}s, to train {len(output_nodes)} nodes, current epoch needs {len(input_nodes)} nodes in all")

        # calculate number of iteration per epoch
        num_iter = len(dataloader)

        # iterations
        for step, (input_nodes, output_nodes, mfgs) in enumerate(dataloader):
            # feature copy from CPU to GPU takes place here
            print(f"Iteration {step+1}/{num_iter} start copying data to GPU:")
            inputs = mfgs[0].srcdata['feat']
            labels = mfgs[-1].dstdata['label']

            # training
            print(f"Iteration {step+1}/{num_iter} start training:")
            predictions = model(mfgs, inputs)
            loss = F.cross_entropy(predictions, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

def main():
    # load dataset from disk to memory
    dataset = DglNodePropPredDataset('ogbn-products')
    device = 'cuda'

    # obtain DGLHeteroGraph and node_lebels list
    # (the dataset only has one graph)
    graph, node_labels = dataset[0]

    # add reverse edges since ogbn-arxiv is unidirectional.
    graph = dgl.add_reverse_edges(graph)

    # assign labels to graph nodes
    graph.ndata['label'] = node_labels[:, 0]

    # calculate the number things
    node_features = graph.ndata['feat']
    number_nodes = graph.number_of_nodes()
    num_features = node_features.shape[1]
    num_classes = (node_labels.max() + 1).item()

    # obtain the training-validation-test split of the nodes 
    idx_split = dataset.get_idx_split()

    # define the model
    model = ModelSAGEConv(in_size=num_features, hid_size=256, out_size=num_classes, num_layer=3).to(device)

    # trainning
    train(graph=graph, model=model, idx_split=idx_split, device=device)

My experiment was conducted with 3-layer GraphSAGE model and ogbn-products dataset. My result showed that in the first epoch the latency of mini-batch generation with neighborhood samping was relatively longer than the ones in following epoches, which is about 4.69s, while the others are about 0.39s. These results are the measurements of the duration of next(iter(dataloader)) call in each epoch. I am not sure about the reason for this latency difference. Is it involves some data movement between host memory and GPU memory in the first epoch?

Thanks for any reply and suggestions.

I think it’s not an issue on the difference between the first or the second epoch. If you log the average time cost for each step in the 1st and the 2nd epoch, they should be almost the same. The difference you mentioned may come from some set-up operations when you turn the dataloader to a iterator (iter(dataloader)) at the first time.