I am new to DGL and am using the following code to implement a GNN; it is copied from an answer posted on Zhihu:
# Names exported by `from <module> import *`.
__all__ = ["GNN"]
import os
# Select MXNet as the DGL backend through the environment.
# NOTE(review): presumably this must be set before `dgl` is imported — confirm.
os.environ["DGLBACKEND"] = "mxnet"
import mxnet as mx
from mxnet import gluon
# NOTE(review): `DGLGraph` is used further down but no import for it is visible here — confirm it is imported elsewhere.
def gcn_msg(edges):
    """Message function: forward each edge's source feature ``'h'`` as message ``'m'``."""
    source_feature = edges.src['h']
    return {'m': source_feature}
def gcn_reduce(nodes):
    """Reduce function: sum the mailbox entry ``'m'`` along axis 1 and store it as ``'h'``.

    NOTE(review): axis 1 is presumably the incoming-message axis — confirm against DGL.
    """
    inbox = nodes.mailbox['m']
    aggregated = mx.nd.sum(inbox, axis=1)
    return {'h': aggregated}
class NodeUpdateModule(gluon.Block):
    """Per-node update of one GCN layer: a dense projection, then optional dropout.

    Parameters
    ----------
    out_feats
        Number of units given to the ``gluon.nn.Dense`` projection.
    activation
        Activation forwarded to ``gluon.nn.Dense`` (``None`` by default).
    dropout
        Probability handed to ``mx.nd.Dropout``; a falsy value skips dropout.
    """

    def __init__(self, out_feats, activation=None, dropout=0):
        super(NodeUpdateModule, self).__init__()
        self.dropout = dropout
        self.linear = gluon.nn.Dense(out_feats, activation=activation)

    def forward(self, node):
        """Project ``node.data['h']`` and return the result under key ``'h'``."""
        projected = self.linear(node.data['h'])
        if not self.dropout:
            return {'h': projected}
        return {'h': mx.nd.Dropout(projected, p=self.dropout)}
class GNN(gluon.Block):
    """Stack of message-passing layers over a fixed graph, followed by a dense output layer.

    Parameters
    ----------
    graph
        Graph object the features are written to and propagated on.
    in_feats
        Width used for every ``NodeUpdateModule`` in the stack.
    out_feats
        Number of units of the final ``gluon.nn.Dense`` layer.
    gnn_layers
        Number of ``NodeUpdateModule`` layers to stack.
    activation
        Activation passed to each ``NodeUpdateModule``.
    dropout
        Dropout value passed to each ``NodeUpdateModule``.
    """

    def __init__(self, graph, in_feats, out_feats, gnn_layers, activation, dropout):
        super(GNN, self).__init__()
        self.graph = graph
        self.dropout = dropout

        # Message / reduce callables handed to ``update_all`` in ``forward``.
        self.gcn_msg = gcn_msg
        self.gcn_reduce = gcn_reduce

        # Build the layer stack first and the output layer second, so the
        # child blocks are created in the same order as before.
        stack = gluon.nn.Sequential()
        for _ in range(gnn_layers):
            stack.add(NodeUpdateModule(in_feats, activation, dropout))
        self.conv_layers = stack
        self.out_layer = gluon.nn.Dense(out_feats)

    def forward(self, features):
        """Write ``features`` to node field ``'h'``, run every layer, and project the result."""
        g = self.graph
        g.ndata['h'] = features
        for node_update in self.conv_layers:
            # One propagation round: send, reduce, then apply the layer per node.
            g.update_all(self.gcn_msg, self.gcn_reduce, node_update)
        # ``pop`` removes the temporary ``'h'`` field from the graph again.
        return self.out_layer(g.ndata.pop('h'))

    def reset(self):
        """Do nothing; present so callers can invoke ``reset()`` on the model."""
The graph is created by the following code:
def load_dgl_graph(filename, graph_nodes_num):
    """Build a ``DGLGraph`` from a comma-separated edge-list file.

    Each non-blank line of ``filename`` must hold exactly two integers,
    ``pre,suc``, describing one directed edge from node ``pre`` to node ``suc``.

    Parameters
    ----------
    filename
        Path of the edge-list file.
    graph_nodes_num
        Number of nodes to create in the graph before edges are added.

    Returns
    -------
    The populated graph.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two integers.
    """
    sources, targets = [], []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            pre, suc = map(int, line.split(','))
            sources.append(pre)
            targets.append(suc)

    graph = DGLGraph()
    graph.add_nodes(graph_nodes_num)
    # Insert all edges in one batched call instead of one ``add_edge`` call per
    # line; the edges are added in file order, so edge ids are unchanged.
    if sources:
        graph.add_edges(sources, targets)
    return graph
Everything works when I run on the CPU, but when I run on the GPU, a `Segmentation fault: 11` error appears:
Segmentation fault: 11
Stack trace:
[bt] (0) /home/tongshiwei/.local/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2e50110) [0x7fb6ab9e8110]
[bt] (1) /lib64/libc.so.6(+0x36280) [0x7fb713cce280]
[bt] (2) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(void std::vector<long, std::allocator<long> >::_M_emplace_back_aux<long const&>(long const&)+0x49) [0x7fb66783b929]
[bt] (3) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(dgl::sched::DegreeBucketing(dgl::runtime::NDArray const&, dgl::runtime::NDArray const&, dgl::runtime::NDArray const&)+0xff) [0x7fb667ea936f]
[bt] (4) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(+0x8e3b42) [0x7fb667eaab42]
[bt] (5) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(DGLFuncCall+0x52) [0x7fb667e1eb22]
[bt] (6) /home/python3.6/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c) [0x7fb70d25ec62]
[bt] (7) /home/python3.6/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(ffi_call+0x224) [0x7fb70d25a6a4]
[bt] (8) /home/python3.6/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x237) [0x7fb70d254d67]
==================================================================
Additional information
I use the gcn_mp code inside an LSTM block, i.e. the graph convolution is performed at every step, and I found that the problem occurs as soon as I call the GNN from within the LSTM block:
def forward(self, questions, inputs, states):
    """Run one LSTM step whose input is the GNN-propagated form of ``inputs``.

    For every entry of ``questions`` a zero matrix of shape
    ``(self.ku_num, inputs.shape[1])`` is created, ``inputs[i]`` is placed in it
    through ``index_copy`` at the index given by that entry, and the matrix is
    passed through ``self.gnn``. The per-sample results are stacked and handed
    to the parent cell's ``forward``.

    Parameters
    ----------
    questions
        Array of indices, one per sample; read via ``asnumpy()``.
    inputs
        Per-sample input rows; its ``context`` decides where new arrays live.
    states
        Recurrent states forwarded unchanged to the parent ``forward``.

    Returns
    -------
    ``(next_h, [next_h, next_c])`` as produced by the parent ``forward``.
    """
    # Pass the device explicitly instead of wrapping the loop in
    # ``with inputs.context as ctx:``. The ``with`` form switches MXNet's
    # default context for everything executed inside it, including the
    # ``self.gnn`` call.
    # NOTE(review): that is the likely cause of the reported GPU segfault —
    # arrays created internally by the graph library would land on the GPU
    # while its CPU-side scheduler reads them — confirm on a GPU machine.
    ctx = inputs.context
    influence_graphs = []
    for i, _id in enumerate(questions.asnumpy().tolist()):
        influence_graph = mx.nd.zeros((self.ku_num, inputs.shape[1]), ctx=ctx)
        influence_graph = index_copy(
            influence_graph, mx.nd.array([int(_id)], ctx=ctx), mx.nd.expand_dims(inputs[i], 0)
        )
        influence_graphs.append(self.gnn(influence_graph))
    influence_graphs = mx.nd.stack(*influence_graphs)
    next_h, [next_h, next_c] = super(GLSTMBlock, self).forward(influence_graphs, states)
    return next_h, [next_h, next_c]
If I invoke the GNN outside the LSTM forward, the problem does not occur; for example, the call inside print() in the following code works:
def unroll(self, length, questions, inputs, begin_state=None, layout='NTC', merge_outputs=None,
           valid_length=None):
    """Apply the cell step by step for ``length`` steps and collect the outputs.

    ``inputs`` and ``questions`` are first normalised with ``format_sequence``
    and then indexed per step. When ``valid_length`` is given, the states of
    every step are recorded so the final states can be selected with
    ``SequenceLast``, and the outputs are passed through
    ``mask_sequence_variable_length``.

    Returns
    -------
    ``(outputs, states)`` where ``outputs`` has been passed through
    ``format_sequence`` with ``merge_outputs``.
    """
    # pylint: disable=too-many-locals
    self.reset()

    inputs, axis, F, batch_size = format_sequence(length, inputs, layout, False)
    questions, _, _, _ = format_sequence(length, questions, layout, False)
    states = get_begin_state(self, F, begin_state, inputs, batch_size)

    keep_history = valid_length is not None
    outputs = []
    all_states = []
    for step in range(length):
        # Diagnostic call from the report: the GNN invoked outside the cell's forward.
        print(self.gnn(mx.nd.ones((835, 10), ctx=mx.gpu(0))))
        output, states = self(questions[step], inputs[step], states)
        outputs.append(output)
        if keep_history:
            all_states.append(states)

    if keep_history:
        # For each state slot, stack its per-step values and select one per
        # sequence according to ``valid_length``.
        states = [
            F.SequenceLast(
                F.stack(*per_slot, axis=0),
                sequence_length=valid_length,
                use_sequence_length=True,
                axis=0,
            )
            for per_slot in zip(*all_states)
        ]
        outputs = mask_sequence_variable_length(F, outputs, length, valid_length, axis, True)

    outputs, _, _, _ = format_sequence(length, outputs, layout, merge_outputs)
    return outputs, states
==================================================================
I have no idea how to solve this. Can anybody help me?