Segmentation fault 11 when using GPU

I'm new to DGL and am using the following code to implement a GNN, copied from an answer posted on Zhihu:

__all__ = ["GNN"]

import os

os.environ["DGLBACKEND"] = "mxnet"

import mxnet as mx
from mxnet import gluon


def gcn_msg(edges):  # message passing
    return {'m': edges.src['h']}


def gcn_reduce(nodes):  # sum the features of incoming nodes
    return {'h': mx.nd.sum(nodes.mailbox['m'], 1)}


class NodeUpdateModule(gluon.Block):  # define the GCN layer
    def __init__(self, out_feats, activation=None, dropout=0):
        super(NodeUpdateModule, self).__init__()
        self.linear = gluon.nn.Dense(out_feats, activation=activation)
        self.dropout = dropout

    def forward(self, node):
        h = self.linear(node.data['h'])
        if self.dropout:
            h = mx.nd.Dropout(h, p=self.dropout)
        return {'h': h}


class GNN(gluon.Block):  # define the model
    def __init__(self,
                 graph,
                 in_feats,
                 out_feats,
                 gnn_layers,
                 activation,
                 dropout,
                 ):
        super(GNN, self).__init__()
        self.graph = graph
        self.dropout = dropout
        self.conv_layers = gluon.nn.Sequential()
        for i in range(gnn_layers):
            self.conv_layers.add(NodeUpdateModule(in_feats, activation, dropout))

        self.out_layer = gluon.nn.Dense(out_feats)
        self.gcn_msg = gcn_msg
        self.gcn_reduce = gcn_reduce

    def forward(self, features):
        self.graph.ndata['h'] = features
        for layer in self.conv_layers:
            self.graph.update_all(self.gcn_msg, self.gcn_reduce, layer)  # update the graph
        return self.out_layer(self.graph.ndata.pop('h'))

    def reset(self):
        pass

The graph is created by the following code:

from dgl import DGLGraph


def load_dgl_graph(filename, graph_nodes_num):
    graph = DGLGraph()
    graph.add_nodes(graph_nodes_num)
    with open(filename) as f:
        for line in f:
            if line.strip():
                pre, suc = list(map(int, line.strip().split(',')))
                graph.add_edge(pre, suc)

    return graph
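
For reference, this is roughly how everything is wired together (the file name and hyperparameters below are placeholders, not my real setup):

graph = load_dgl_graph("edges.csv", graph_nodes_num=835)  # placeholder edge list
model = GNN(graph, in_feats=10, out_feats=10, gnn_layers=2,
            activation='relu', dropout=0.5)
model.initialize(ctx=mx.cpu())

features = mx.nd.ones((835, 10), ctx=mx.cpu())
print(model(features))  # runs fine on CPU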

Everything works fine when I use the CPU, but when I switch to the GPU, the following Segmentation fault 11 appears:

Segmentation fault: 11

Stack trace:
[bt] (0) /home/tongshiwei/.local/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2e50110) [0x7fb6ab9e8110]
[bt] (1) /lib64/libc.so.6(+0x36280) [0x7fb713cce280]
[bt] (2) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(void std::vector<long, std::allocator<long> >::_M_emplace_back_aux<long const&>(long const&)+0x49) [0x7fb66783b929]
[bt] (3) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(dgl::sched::DegreeBucketing(dgl::runtime::NDArray const&, dgl::runtime::NDArray const&, dgl::runtime::NDArray const&)+0xff) [0x7fb667ea936f]
[bt] (4) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(+0x8e3b42) [0x7fb667eaab42]
[bt] (5) /home/tongshiwei/.local/lib/python3.6/site-packages/dgl/libdgl.so(DGLFuncCall+0x52) [0x7fb667e1eb22]
[bt] (6) /home/python3.6/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c) [0x7fb70d25ec62]
[bt] (7) /home/python3.6/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(ffi_call+0x224) [0x7fb70d25a6a4]
[bt] (8) /home/python3.6/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x237) [0x7fb70d254d67]

==================================================================
Some additional information

I use the gcn_mp code inside an LSTM block, i.e. the graph convolution is performed at each step, and I found that the problem happens as soon as I call the GCN within the LSTM block:

def forward(self, questions, inputs, states):
    influence_graphs = []
    with inputs.context as ctx:
        for i, _id in enumerate(questions.asnumpy().tolist()):
            influence_graph = mx.nd.zeros((self.ku_num, inputs.shape[1]))
            influence_graph = index_copy(
                influence_graph, mx.nd.array([int(_id)], ctx=ctx), mx.nd.expand_dims(inputs[i], 0)
            )
            influence_graph = self.gnn(influence_graph)
            influence_graphs.append(influence_graph)

    influence_graphs = mx.nd.stack(*influence_graphs)
    next_h, [next_h, next_c] = super(GLSTMBlock, self).forward(influence_graphs, states)
    return next_h, [next_h, next_c]

If I invoke the GNN outside the LSTM forward, the problem does not happen; for example, the print() call in the following code works fine:

def unroll(self, length, questions, inputs, begin_state=None, layout='NTC', merge_outputs=None,
           valid_length=None):
    # pylint: disable=too-many-locals
    self.reset()

    inputs, axis, F, batch_size = format_sequence(length, inputs, layout, False)
    questions, _, _, _ = format_sequence(length, questions, layout, False)
    begin_state = get_begin_state(self, F, begin_state, inputs, batch_size)

    states = begin_state
    outputs = []
    all_states = []
    for i in range(length):
        print(self.gnn(mx.nd.ones((835, 10), ctx=mx.gpu(0))))
        output, states = self(questions[i], inputs[i], states)
        outputs.append(output)
        if valid_length is not None:
            all_states.append(states)
    if valid_length is not None:
        states = [F.SequenceLast(F.stack(*ele_list, axis=0),
                                 sequence_length=valid_length,
                                 use_sequence_length=True,
                                 axis=0)
                  for ele_list in zip(*all_states)]
        outputs = mask_sequence_variable_length(F, outputs, length, valid_length, axis, True)
    outputs, _, _, _ = format_sequence(length, outputs, layout, merge_outputs)

    return outputs, states

==================================================================

I have no idea how to solve this. Can anybody help?

Hi,

Which MXNet version are you using? You can get it by running mxnet.__version__.
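
For example:

import mxnet
import dgl

print(mxnet.__version__, dgl.__version__)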

I am using MXNet 1.5.0 and CUDA 9.0.

I solved the problem by removing the with inputs.context as ctx block.
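
For anyone else who hits this: my understanding (assuming MXNet's usual default-context behavior, so treat this as a sketch rather than a confirmed root cause) is that entering with inputs.context as ctx makes the GPU the current default device, so arrays created without an explicit ctx, like the mx.nd.zeros((self.ku_num, inputs.shape[1])) above, are allocated on the GPU and the GNN then goes through DGL's GPU scheduling path. A minimal illustration:

import mxnet as mx

x = mx.nd.zeros((2, 3))
print(x.context)  # cpu(0): the default device is the CPU

with mx.gpu(0):  # same effect as "with inputs.context" when inputs live on the GPU
    y = mx.nd.zeros((2, 3))
print(y.context)  # gpu(0): created inside the block, so it lands on the GPU

After removing the with block, influence_graph is created on the CPU, so the graph update runs on the CPU path that was already working.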