I face a hard net to crack when using Neighborsampler to train a GNN.
When I try to backward, “loss.backward()” will cause the following error, which really perplex me.
Traceback (most recent call last):
File "t.py", line 90, in <module>
loss.backward()
File "/home/xxx/anaconda3/lib/python3.6/site-packages/torch/tensor.py", line 102, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/xxx/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py", line 90, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: shape mismatch: value tensor of shape [5, 128] cannot be broadcast to indexing result of shape [50, 128]
Below is an example of my code. I create a small graph with 50 nodes and some random edges, and I try to use the form of NeighborSampler to train an ordinary GNN. I will do a binary classification with the node features.
The forward phase is normal, and the first backward is also ok. But the second backward will cause the above error.
Besides, when I experiment on a relative large graph (10W nodes, and 170W edges), it will take 3 minutes on “loss.backward()” in the first time, which I think is too slow. And the second backward will also cause the “shape mismatch” error. This error occurs both in CPU and GPU modes.
Is my code correct for using NeighborSampler to train a GNN?
Could someone help me? Thanks in advance!
DGL version : 0.4.1
Pytorch version: 1.0.0
import torch
import torch.nn as nn
import numpy as np
import dgl
import dgl.function as fn
from dgl.nn.pytorch.conv import GraphConv
from utils import *
from dgl.contrib.sampling.sampler import NeighborSampler
from IPython import embed
# build a random graph
g = dgl.DGLGraph()
g.add_nodes(50)
num_edges = 200
for _ in range(num_edges):
a = random.randint(0, 49)
b = random.randint(0, 49)
if a==b or g.has_edge_between(a,b):
continue
g.add_edge(a, b)
g.add_edge(b, a)
g1 = dgl.DGLGraph(g,readonly = True)
g1.readonly()
g = g1
g.ndata['h'] = torch.randn(50, 128)
# My GNN
class ReduceLayer(nn.Module):
def __init__(self, in_feat, out_feat):
super(ReduceLayer, self).__init__()
self.fc = nn.Linear(in_feat, out_feat)
def forward(self, nodes): # without using edge information
h = torch.mean(nodes.mailbox['m'], dim = 1)
h = torch.cat((h, nodes.data['h']), dim = 1)
h = self.fc(h)
return {'h': h}
class Net(nn.Module):
def __init__(self, in_feat, out_feat):
super(Net, self).__init__()
self.reduce_func = ReduceLayer(2*in_feat,out_feat)
def forward(self, nf):
nf.copy_from_parent()
# compute by blocks
nf.register_reduce_func(ReduceLayer, 0)
nf.block_compute(0, message_func = fn.copy_src(src = 'h', out = 'm'), reduce_func = self.reduce_func)
nf.copy_to_parent()
net = Net(128,128)
loss_func = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(net.parameters())
# go forward
for nf in NeighborSampler(g, batch_size = 5, expand_factor = 3, shuffle = True, num_hops = 1):
net(nf)
# try to backward
for i in range(5):
optimizer.zero_grad()
label = [0]*5 + [1] * 5
label = torch.tensor(label).float()
out = g.ndata['h'][i*10:(i+1)*10]
out = torch.sum(out, dim = 1)
loss = loss_func(out, label)
print(loss)
# an error will occur at the second loop.
loss.backward()
optimizer.step()
print('bp ok')
input()