I adapted the code from the post "GPU sampling on ogbn-papers100M dataset causing core dump", but ran into errors.
Code:
import time
import dgl
import dgl.nn as dglnn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
from dgl.data import AsNodePredDataset
from dgl.dataloading import (
DataLoader,
NeighborSampler,
)
from ogb.nodeproppred import DglNodePropPredDataset
class SAGE(nn.Module):
    """Two-layer GraphSAGE network with mean aggregation.

    Consumes a list of sampled message-flow blocks (one per layer) and the
    input features of the first block's source nodes.
    """

    def __init__(self, in_size, hid_size, out_size):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                dglnn.SAGEConv(in_size, hid_size, "mean"),
                dglnn.SAGEConv(hid_size, out_size, "mean"),
            ]
        )
        self.hid_size = hid_size
        self.out_size = out_size

    def forward(self, blocks, x):
        """Propagate `x` through the layers; ReLU between all but the last."""
        h = x
        last = len(self.layers) - 1
        for idx, (conv, block) in enumerate(zip(self.layers, blocks)):
            h = conv(block, h)
            if idx != last:
                h = F.relu(h)
        return h
def evaluate(model, graph, dataloader, num_classes):
    """Compute multiclass accuracy of `model` over `dataloader`.

    Args:
        model: the SAGE model (set to eval mode here).
        graph: unused; kept for interface compatibility with callers.
        dataloader: yields (input_nodes, output_nodes, blocks) mini-batches.
        num_classes: number of label classes for torchmetrics accuracy.

    Returns:
        A scalar tensor with the accuracy over all batches.
    """
    model.eval()
    ys = []
    y_hats = []
    # Hoisted out of the loop: no batch needs gradients during evaluation.
    with torch.no_grad():
        for input_nodes, output_nodes, blocks in dataloader:
            x = blocks[0].srcdata["feat"]
            # Cast labels to long for consistency with train(): ogbn labels
            # are stored as float, and torchmetrics' multiclass accuracy
            # expects integer targets.
            ys.append(blocks[-1].dstdata["label"].long())
            y_hats.append(model(blocks, x))
    return MF.accuracy(
        torch.cat(y_hats),
        torch.cat(ys),
        task="multiclass",
        num_classes=num_classes,
    )
def train(device, g, dataset, model, num_classes):
    """Train `model` on `g` for 20 epochs with neighbor sampling.

    Args:
        device: torch device for indices, labels, and sampled batches.
        g: the DGL graph holding "feat"/"label" node data.
        dataset: provides `train_idx` / `val_idx` index tensors.
        model: the SAGE model to optimize (in place).
        num_classes: forwarded to evaluate() for the accuracy metric.
    """
    # Create sampler & dataloaders. Fanout list length must match the
    # model's layer count (2 here).
    train_idx = dataset.train_idx.to(device)
    val_idx = dataset.val_idx.to(device)
    sampler = NeighborSampler(
        [10, 10],
    )
    train_dataloader = DataLoader(
        g,
        train_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        num_workers=0,
    )
    val_dataloader = DataLoader(
        g,
        val_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        num_workers=0,
    )
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(20):
        model.train()
        total_loss = 0.0
        # Count batches explicitly instead of relying on the loop variable
        # leaking out of the loop (which raised NameError / divided by zero
        # when the dataloader was empty).
        num_batches = 0
        for input_nodes, output_nodes, blocks in train_dataloader:
            x = blocks[0].srcdata["feat"]
            # .to(device, dtype=torch.long) replaces the original
            # .type(torch.LongTensor).to(device), which forced a CPU
            # round-trip even when the labels already lived on `device`.
            y = blocks[-1].dstdata["label"].to(device, dtype=torch.long)
            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
            num_batches += 1
        acc = evaluate(model, g, val_dataloader, num_classes)
        print(
            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
                epoch + 1, total_loss / max(num_batches, 1), acc.item()
            )
        )
if __name__ == "__main__":
    # Print the installed DGL location and version for debugging/repro info.
    print(dgl.__path__)
    print(dgl.__version__)
    # AsNodePredDataset caches its processed form to disk on first load.
    # NOTE(review): the attached traceback ("Check failed: std::fwrite(...)
    # == size: FileStream.Write incomplete", raised from utils.save_graphs
    # inside AsNodePredDataset.save) shows the write to this folder's
    # filesystem stopped mid-stream — typically the disk filling up.
    # ogbn-papers100M is very large; verify free space on the volume
    # backing dataset_folder (or point it at a larger one) before rerunning.
    dataset_folder = '../dataset'
    dataset = AsNodePredDataset(DglNodePropPredDataset('ogbn-papers100M', dataset_folder))
    g = dataset[0]
    num_classes = dataset.num_classes
    # CPU-only run; change to torch.device('cuda') for GPU training.
    device = torch.device('cpu')
    # create the GraphSAGE model
    in_size = g.ndata["feat"].shape[1]
    out_size = dataset.num_classes
    model = SAGE(in_size, 256, out_size).to(device)
    # model training
    print("Training...")
    start = time.perf_counter()
    train(device, g, dataset, model, num_classes)
    train_time = time.perf_counter() - start
    # Divides by 20 because train() runs a fixed 20 epochs.
    print("Training time per epoch:{:.6f}(s)".format(train_time / 20))
Error (full traceback):
Traceback (most recent call last):
File "/data/GAEA/ns/papers100m.py", line 108, in <module>
dataset = AsNodePredDataset(DglNodePropPredDataset('ogbn-papers100M', dataset_folder))
File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/adapter.py", line 84, in __init__
super().__init__(
File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/dgl_dataset.py", line 112, in __init__
self._load()
File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/dgl_dataset.py", line 204, in _load
self.save()
File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/adapter.py", line 177, in save
utils.save_graphs(
File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/graph_serialize.py", line 142, in save_graphs
save_heterographs(filename, g_list, labels, formats)
File "/users/tingsun/.local/lib/python3.10/site-packages/dgl/data/heterograph_serialize.py", line 36, in save_heterographs
_CAPI_SaveHeteroGraphData(
File "dgl/_ffi/_cython/./function.pxi", line 295, in dgl._ffi._cy3.core.FunctionBase.__call__
File "dgl/_ffi/_cython/./function.pxi", line 241, in dgl._ffi._cy3.core.FuncCall
dgl._ffi.base.DGLError: [18:29:08] /opt/dgl/third_party/dmlc-core/src/io/local_filesys.cc:38: Check failed: std::fwrite(ptr, 1, size, fp_) == size: FileStream.Write incomplete
Stack trace:
[bt] (0) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x744a67) [0x7f355d344a67]
[bt] (1) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x745d72) [0x7f355d345d72]
[bt] (2) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Save(dmlc::Stream*) const+0x2c1) [0x7f355d1cefc1]
[bt] (3) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(dgl::serialize::SaveHeteroGraphs(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, dgl::runtime::List<dgl::serialize::HeteroGraphData, void>, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, dgl::runtime::NDArray>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, dgl::runtime::NDArray> > > const&, unsigned char)+0x626) [0x7f355d296396]
[bt] (4) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x69aa48) [0x7f355d29aa48]
[bt] (5) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(+0x69abc4) [0x7f355d29abc4]
[bt] (6) /users/tingsun/.local/lib/python3.10/site-packages/dgl/libdgl.so(DGLFuncCall+0x4c) [0x7f355d1b9e2c]
[bt] (7) /users/tingsun/.local/lib/python3.10/site-packages/dgl/_ffi/_cy3/core.cpython-310-x86_64-linux-gnu.so(+0x1bde4) [0x7f355c41bde4]
[bt] (8) /users/tingsun/.local/lib/python3.10/site-packages/dgl/_ffi/_cy3/core.cpython-310-x86_64-linux-gnu.so(+0x1c14f) [0x7f355c41c14f]