GPU sampling on ogbn-papers100M dataset causing core dump

Hello everyone! I am trying to do GPU sampling on the ogbn-papers100M dataset and train a GraphSAGE model, but the run fails with Aborted (core dumped). I suspect something goes wrong during the GPU sampling phase, because if I use UVA together with GPU sampling, everything works fine.
Device: NVIDIA A800 (80 GB)
My code:

import time
import dgl
import dgl.nn as dglnn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
from dgl.data import AsNodePredDataset
from dgl.dataloading import (
    DataLoader,
    NeighborSampler,
)
from ogb.nodeproppred import DglNodePropPredDataset

dataset_folder = '/path/to/folder'

class SAGE(nn.Module):
    def __init__(self, in_size, hid_size, out_size):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean"))
        self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean"))
        self.hid_size = hid_size
        self.out_size = out_size

    def forward(self, blocks, x):
        h = x
        for l, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if l != len(self.layers) - 1:
                h = F.relu(h)
        return h


def evaluate(model, graph, dataloader, num_classes):
    model.eval()
    ys = []
    y_hats = []
    for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
        with torch.no_grad():
            x = blocks[0].srcdata["feat"]
            ys.append(blocks[-1].dstdata["label"])
            y_hats.append(model(blocks, x))
    return MF.accuracy(
        torch.cat(y_hats),
        torch.cat(ys),
        task="multiclass",
        num_classes=num_classes,
    )


def train(device, g, dataset, model, num_classes):
    # create sampler & dataloader
    train_idx = dataset.train_idx.to(device)
    val_idx = dataset.val_idx.to(device)
    sampler = NeighborSampler([10, 10])  # fan-out: 10 neighbors per layer, 2 layers

    train_dataloader = DataLoader(
        g,
        train_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        num_workers=0,
        # use_uva=True,
    )

    val_dataloader = DataLoader(
        g,
        val_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        num_workers=0,
        # use_uva=True,
    )

    opt = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(20):
        model.train()
        total_loss = 0
        for it, (input_nodes, output_nodes, blocks) in enumerate(
            train_dataloader
        ):
            x = blocks[0].srcdata["feat"]
            y = blocks[-1].dstdata["label"].type(torch.LongTensor).to(device)
            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        acc = evaluate(model, g, val_dataloader, num_classes)
        print(
            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
                epoch + 1, total_loss / (it + 1), acc.item()
            )
        )


if __name__ == "__main__":
    print(dgl.__path__)
    # print(dgl.__version__)

    dataset = AsNodePredDataset(DglNodePropPredDataset('ogbn-papers100M', dataset_folder))

    g = dataset[0]
    num_classes = dataset.num_classes
    device = torch.device('cuda:0')
    # create GraphSAGE model
    in_size = g.ndata["feat"].shape[1]
    out_size = dataset.num_classes
    model = SAGE(in_size, 256, out_size).to(device)

    # when using UVA, comment out this line and set use_uva=True in the DataLoaders
    g = g.to(device)

    # model training
    print("Training...")
    start = time.perf_counter()
    train(device, g, dataset, model, num_classes)
    torch.cuda.synchronize()
    train_time = time.perf_counter() - start
    print("Training time per epoch:{:.6f}(s)".format(train_time / 20))

Hi, your code looks fine. Can you post the detailed error message?

Thanks for your reply! I only got the message Aborted (core dumped) and a xxx.core file (100 GB+).
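For reference, one standard way to get more detail than the bare Aborted (core dumped) message is Python's built-in faulthandler module, enabled at the top of the script:

import faulthandler
faulthandler.enable()  # dump the Python traceback on fatal signals such as SIGABRT or SIGSEGV

With this in place, an abort inside a native sampling kernel will at least print which Python line was executing when the signal fired.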

It is possible that the issue stems from running out of GPU memory (OOM): pure GPU sampling requires the entire graph (structure and features) to reside in GPU memory, while UVA keeps it in pinned CPU memory and only moves the sampled blocks and their features to the device.
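For a rough sense of scale, here is a back-of-the-envelope sketch, assuming the commonly published ogbn-papers100M sizes (about 111M nodes, 1.6B edges, 128-dimensional float32 features); the exact numbers are an assumption, not taken from your run:

num_nodes = 111_059_956                    # assumed published node count
num_edges = 1_615_685_872                  # assumed published edge count
feat_dim = 128                             # float32 node features

feat_gib = num_nodes * feat_dim * 4 / 2**30          # ~53 GiB of features
csc_gib = (num_edges + num_nodes + 1) * 8 / 2**30    # ~13 GiB for one int64 CSC/CSR structure
print(f"~{feat_gib + csc_gib:.0f} GiB before labels, masks, the model, and activations")

If that estimate is roughly right, g.to(device) already fills most of the 80 GB, and DGL may additionally materialize a second sparse format (plus edge IDs) for sampling, which would push usage past the limit; that would be consistent with the crash disappearing under UVA.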

I think so… but I don't know why I got a core dump instead of a PyTorch OOM error.
