Hello everyone! I am trying to do GPU sampling on the ogbn-papers100M dataset and train a GraphSAGE model, but I get Aborted (core dumped). I suspect something goes wrong during GPU sampling when the whole graph is resident on the GPU, because if I use UVA-based GPU sampling instead, everything works fine (the UVA variant I use is sketched after the script below).
Device: NVIDIA A800 (80 GB)
My code:
import time
import dgl
import dgl.nn as dglnn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
from dgl.data import AsNodePredDataset
from dgl.dataloading import (
    DataLoader,
    NeighborSampler,
)
from ogb.nodeproppred import DglNodePropPredDataset
dataset_folder = '/path/to/folder'
class SAGE(nn.Module):
    def __init__(self, in_size, hid_size, out_size):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean"))
        self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean"))
        self.hid_size = hid_size
        self.out_size = out_size

    def forward(self, blocks, x):
        h = x
        for l, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if l != len(self.layers) - 1:
                h = F.relu(h)
        return h
def evaluate(model, graph, dataloader, num_classes):
    model.eval()
    ys = []
    y_hats = []
    for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
        with torch.no_grad():
            x = blocks[0].srcdata["feat"]
            ys.append(blocks[-1].dstdata["label"])
            y_hats.append(model(blocks, x))
    return MF.accuracy(
        torch.cat(y_hats),
        torch.cat(ys),
        task="multiclass",
        num_classes=num_classes,
    )
def train(device, g, dataset, model, num_classes):
    # create sampler & dataloader
    train_idx = dataset.train_idx.to(device)
    val_idx = dataset.val_idx.to(device)
    sampler = NeighborSampler(
        [10, 10],  # fanout of 10 neighbors for each of the 2 layers
    )
    train_dataloader = DataLoader(
        g,
        train_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        num_workers=0,
        # use_uva=True,
    )
    val_dataloader = DataLoader(
        g,
        val_idx,
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        num_workers=0,
        # use_uva=True,
    )
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(20):
        model.train()
        total_loss = 0
        for it, (input_nodes, output_nodes, blocks) in enumerate(
            train_dataloader
        ):
            x = blocks[0].srcdata["feat"]
            y = blocks[-1].dstdata["label"].type(torch.LongTensor).to(device)
            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        acc = evaluate(model, g, val_dataloader, num_classes)
        print(
            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
                epoch + 1, total_loss / (it + 1), acc.item()
            )
        )
if __name__ == "__main__":
    print(dgl.__path__)
    # print(dgl.__version__)
    dataset = AsNodePredDataset(DglNodePropPredDataset('ogbn-papers100M', dataset_folder))
    g = dataset[0]
    num_classes = dataset.num_classes
    device = torch.device('cuda:0')
    # create GraphSAGE model
    in_size = g.ndata["feat"].shape[1]
    out_size = dataset.num_classes
    model = SAGE(in_size, 256, out_size).to(device)
    # when using UVA, I comment out this line and keep the graph on the CPU
    g = g.to(device)
    # model training
    print("Training...")
    start = time.perf_counter()
    train(device, g, dataset, model, num_classes)
    torch.cuda.synchronize()
    train_time = time.perf_counter() - start
    print("Training time per epoch: {:.6f} (s)".format(train_time / 20))