CPU sampling on ogbn-papers100M dataset causing errors

I adapted the code from the post "GPU sampling on ogbn-papers100M dataset causing core dump", but ran into the following errors.

Code:

import time
import dgl
import dgl.nn as dglnn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
from dgl.data import AsNodePredDataset
from dgl.dataloading import (
    DataLoader,
    NeighborSampler,
)
from ogb.nodeproppred import DglNodePropPredDataset

class SAGE(nn.Module):
    """Two-layer GraphSAGE model with mean aggregation."""

    def __init__(self, in_size, hid_size, out_size):
        super().__init__()
        convs = [
            dglnn.SAGEConv(in_size, hid_size, "mean"),
            dglnn.SAGEConv(hid_size, out_size, "mean"),
        ]
        self.layers = nn.ModuleList(convs)
        self.hid_size = hid_size
        self.out_size = out_size

    def forward(self, blocks, x):
        """Run the sampled blocks through the layers, with ReLU between them."""
        h = x
        last = len(self.layers) - 1
        for idx, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if idx != last:
                h = F.relu(h)
        return h


def evaluate(model, graph, dataloader, num_classes):
    """Return multiclass accuracy of `model` over all batches of `dataloader`."""
    model.eval()
    labels = []
    preds = []
    with torch.no_grad():
        for _input_nodes, _output_nodes, blocks in dataloader:
            feats = blocks[0].srcdata["feat"]
            labels.append(blocks[-1].dstdata["label"])
            preds.append(model(blocks, feats))
    return MF.accuracy(
        torch.cat(preds),
        torch.cat(labels),
        task="multiclass",
        num_classes=num_classes,
    )


def train(device, g, dataset, model, num_classes, num_epochs=20):
    """Train `model` on `g` with neighbor sampling and report per-epoch loss/accuracy.

    Args:
        device: torch device the indices, features and model live on.
        g: the DGL graph to sample from.
        dataset: node-prediction dataset providing train_idx / val_idx.
        model: the GraphSAGE model to optimize (trained in place).
        num_classes: number of label classes, forwarded to evaluation.
        num_epochs: number of training epochs (default 20, matching the
            original hard-coded value, so existing callers are unaffected).
    """
    # Create sampler & dataloaders. Two-layer fanout of 10 neighbors each.
    train_idx = dataset.train_idx.to(device)
    val_idx = dataset.val_idx.to(device)
    sampler = NeighborSampler(
        [10, 10],
    )

    train_dataloader = DataLoader(
        g,
        train_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        num_workers=0,
    )

    # No need to shuffle for validation: accuracy is order-independent.
    val_dataloader = DataLoader(
        g,
        val_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=False,
        num_workers=0,
    )

    opt = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        num_batches = 0
        for it, (input_nodes, output_nodes, blocks) in enumerate(
            train_dataloader
        ):
            x = blocks[0].srcdata["feat"]
            # Cast labels to int64 on the target device in one step; the
            # original `.type(torch.LongTensor).to(device)` forced a CPU
            # round-trip before moving the tensor back.
            y = blocks[-1].dstdata["label"].to(device, dtype=torch.long)
            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
            num_batches += 1
        acc = evaluate(model, g, val_dataloader, num_classes)
        print(
            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
                epoch + 1, total_loss / max(num_batches, 1), acc.item()
            )
        )


if __name__ == "__main__":
    print(dgl.__path__)
    print(dgl.__version__)

    # Load ogbn-papers100M (stored under ../dataset) as a node-prediction dataset.
    dataset = AsNodePredDataset(
        DglNodePropPredDataset('ogbn-papers100M', '../dataset')
    )
    graph = dataset[0]
    num_classes = dataset.num_classes
    device = torch.device('cpu')

    # Build a two-layer GraphSAGE model sized to the node feature dimension.
    feat_dim = graph.ndata["feat"].shape[1]
    model = SAGE(feat_dim, 256, num_classes).to(device)

    # Model training, timed over all 20 epochs.
    print("Training...")
    start = time.perf_counter()
    train(device, graph, dataset, model, num_classes)
    elapsed = time.perf_counter() - start
    print("Training time per epoch:{:.6f}(s)".format(elapsed / 20))

error:

Traceback (most recent call last):
  File "/data/GAEA/ns/papers100m.py", line 108, in <module>
    dataset = AsNodePredDataset(DglNodePropPredDataset('ogbn-papers100M', dataset_folder))
  File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/adapter.py", line 84, in __init__
    super().__init__(
  File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/dgl_dataset.py", line 112, in __init__
    self._load()
  File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/dgl_dataset.py", line 204, in _load
    self.save()
  File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/adapter.py", line 177, in save
    utils.save_graphs(
  File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/graph_serialize.py", line 142, in save_graphs
    save_heterographs(filename, g_list, labels, formats)
  File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/heterograph_serialize.py", line 36, in save_heterographs
    _CAPI_SaveHeteroGraphData(
  File "dgl/_ffi/_cython/./function.pxi", line 295, in dgl._ffi._cy3.core.FunctionBase.__call__
  File "dgl/_ffi/_cython/./function.pxi", line 241, in dgl._ffi._cy3.core.FuncCall
dgl._ffi.base.DGLError: [18:29:08] /opt/dgl/third_party/dmlc-core/src/io/local_filesys.cc:38: Check failed: std::fwrite(ptr, 1, size, fp_) == size: FileStream.Write incomplete
Stack trace:
  [bt] (0) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x744a67) [0x7f355d344a67]
  [bt] (1) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x745d72) [0x7f355d345d72]
  [bt] (2) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Save(dmlc::Stream*) const+0x2c1) [0x7f355d1cefc1]
  [bt] (3) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(dgl::serialize::SaveHeteroGraphs(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, dgl::runtime::List<dgl::serialize::HeteroGraphData, void>, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, dgl::runtime::NDArray>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, dgl::runtime::NDArray> > > const&, unsigned char)+0x626) [0x7f355d296396]
  [bt] (4) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x69aa48) [0x7f355d29aa48]
  [bt] (5) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x69abc4) [0x7f355d29abc4]
  [bt] (6) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(DGLFuncCall+0x4c) [0x7f355d1b9e2c]
  [bt] (7) /users/tingsun/.local/lib/python3.10/site-packages/dgl/_ffi/_cy3/core.cpython-310-x86_64-linux-gnu.so(+0x1bde4) [0x7f355c41bde4]
  [bt] (8) /users/tingsun/.local/lib/python3.10/site-packages/dgl/_ffi/_cy3/core.cpython-310-x86_64-linux-gnu.so(+0x1c14f) [0x7f355c41c14f]

The error might be caused by insufficient disk space. Could you check whether that is the case?

Thanks for your help. I finally found the problem: by default, DGL stores some data under the home directory, even though I process all my data under /data. As a result, the home partition ran out of disk space even though /data had plenty of free space.

This topic was automatically closed 30 days after the last reply. New replies are no longer allowed.