CUDA related error in multi_update_all

Traceback:

DGLError                                  Traceback (most recent call last)
<ipython-input-17-e828bb70f04f> in <module>
---> 19     embed = model(train_graph)
     20     scores, loss = model.get_loss(train_graph, embed, ids, labels, types, score_fn)
     21     loss.backward()

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

~/grakn-KG/grakn_kg/rgcn.py in forward(self, G)
    105         for i, layer in enumerate(self.layers):
    106             if i == 0:
--> 107                 _, h_dict = layer(G, self.embed)
    108             else:
    109                 w, h_dict = layer(G, h_dict)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

~/grakn-KG/grakn_kg/rgcn.py in forward(self, G, feat_dict)
     65         # The second one is the type wise reducer, could be "sum", "max",
     66         # "min", "mean", "stack"
---> 67         G.multi_update_all(funcs, 'sum')
     68         # return the updated node feature dictionary
     69         return self.weight, {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes}

/opt/conda/lib/python3.7/site-packages/dgl/heterograph.py in multi_update_all(self, etype_dict, cross_reducer, apply_node_func)
   3646                 all_out[dtid].append(outframe)
   3647                 merge_order[dtid].append(etid)  # use edge type id as merge order hint
-> 3648             Runtime.run(prog)
   3649         for dtid, frames in all_out.items():
   3650             # merge by cross_reducer

/opt/conda/lib/python3.7/site-packages/dgl/runtime/runtime.py in run(prog)
      9         for exe in prog.execs:
     10             # prog.pprint_exe(exe)
---> 11             exe.run()

/opt/conda/lib/python3.7/site-packages/dgl/runtime/ir/executor.py in run(self)
   1076         self.ret.data = F.binary_reduce(
   1077             self.reducer, self.binary_op, graph, self.lhs, self.rhs,
-> 1078             lhs_data, rhs_data, self.out_size, lhs_map, rhs_map, out_map)
   1079 
   1080 

/opt/conda/lib/python3.7/site-packages/dgl/backend/pytorch/tensor.py in binary_reduce(reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, out_size, lhs_map, rhs_map, out_map)
    378     return BinaryReduce.apply(
    379             reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, out_data,
--> 380             out_size, lhs_map, rhs_map, out_map)
    381 
    382 

/opt/conda/lib/python3.7/site-packages/dgl/backend/pytorch/tensor.py in forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, out_data, out_size, lhs_map, rhs_map, out_map)
    302             reducer if reducer != 'mean' else 'sum',
    303             binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd,
--> 304             out_data_nd, lhs_map[0], rhs_map[0], out_map[0])
    305         # normalize if mean reducer
    306         # NOTE(zihao): this is a temporary hack and we should have better solution in the future.

/opt/conda/lib/python3.7/site-packages/dgl/kernel.py in binary_op_reduce(reducer, op, G, A_target, B_target, A, B, out, A_rows, B_rows, out_rows)
    146         int(A_target), int(B_target),
    147         A, B, out,
--> 148         A_rows, B_rows, out_rows)
    149 
    150 # pylint: disable=invalid-name

dgl/_ffi/_cython/./function.pxi in dgl._ffi._cy3.core.FunctionBase.__call__()

dgl/_ffi/_cython/./function.pxi in dgl._ffi._cy3.core.FuncCall()

dgl/_ffi/_cython/./base.pxi in dgl._ffi._cy3.core.CALL()

DGLError: [00:48:15] /opt/dgl/src/kernel/binary_reduce.cc:186: Check failed: ctx == arrays[i]->ctx (GPU:0 vs. CPU:0) : Expected device context GPU:0. But got CPU:0 for rhs_data.
Stack trace:
  [bt] (0) /opt/conda/lib/python3.7/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x22) [0x7f0332d3d782]
  [bt] (1) /opt/conda/lib/python3.7/site-packages/dgl/libdgl.so(+0x703f01) [0x7f0332d8ff01]
  [bt] (2) /opt/conda/lib/python3.7/site-packages/dgl/libdgl.so(dgl::kernel::BinaryOpReduce(std::string const&, std::string const&, dgl::kernel::CSRWrapper const&, dgl::kernel::binary_op::Target, dgl::kernel::binary_op::Target, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray)+0x2cd) [0x7f0332d9305d]
  [bt] (3) /opt/conda/lib/python3.7/site-packages/dgl/libdgl.so(+0x7084a9) [0x7f0332d944a9]
  [bt] (4) /opt/conda/lib/python3.7/site-packages/dgl/libdgl.so(dgl::kernel::csrwrapper_switch(dgl::runtime::DGLArgValue, std::function<void (dgl::kernel::CSRWrapper const&)>)+0x6d3) [0x7f0332d98383]
  [bt] (5) /opt/conda/lib/python3.7/site-packages/dgl/libdgl.so(+0x70cb66) [0x7f0332d98b66]
  [bt] (6) /opt/conda/lib/python3.7/site-packages/dgl/libdgl.so(DGLFuncCall+0x52) [0x7f03333456e2]
  [bt] (7) /opt/conda/lib/python3.7/site-packages/dgl/_ffi/_cy3/core.cpython-37m-x86_64-linux-gnu.so(+0x19b2b) [0x7f03310b0b2b]
  [bt] (8) /opt/conda/lib/python3.7/site-packages/dgl/_ffi/_cy3/core.cpython-37m-x86_64-linux-gnu.so(+0x1a1db) [0x7f03310b11db]

I’m trying to train a heterogeneous graph model on a GPU, but something downstream of multi_update_all crashes when running on CUDA. Is the function not supported on CUDA 10.1? I have the nightly build of dgl-cu101 installed.

Thanks for your help.

Hi, the error message (`Expected device context GPU:0. But got CPU:0 for rhs_data`) indicates that some of your features are on the GPU while others are still on the CPU. Can you try moving all features (and the model parameters) to the GPU and see if the issue still exists?
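Something along these lines should work as a minimal sketch. It reuses the names `model`, `train_graph`, `ids`, `labels`, and `types` from your snippet; whether `train_graph.to(device)` also moves the feature tensors for you depends on the DGL version, so the loops below move them explicitly:

```python
import torch

device = torch.device('cuda:0')

# Move the model, including any nn.Parameter / nn.Embedding weights
# (e.g. the self.embed used in your first RGCN layer), to the GPU.
model = model.to(device)

# Move every node and edge feature tensor of the heterograph to the GPU.
# Depending on the DGL version, train_graph.to(device) may handle this;
# otherwise move the frames manually:
for ntype in train_graph.ntypes:
    for key in list(train_graph.nodes[ntype].data.keys()):
        train_graph.nodes[ntype].data[key] = \
            train_graph.nodes[ntype].data[key].to(device)
for etype in train_graph.canonical_etypes:
    for key in list(train_graph.edges[etype].data.keys()):
        train_graph.edges[etype].data[key] = \
            train_graph.edges[etype].data[key].to(device)

# Tensors used by the loss must live on the same device as well.
ids, labels, types = ids.to(device), labels.to(device), types.to(device)
```

If the error persists after this, please check where `self.embed` is created; if it is a plain tensor rather than a registered parameter, `model.to(device)` will not move it.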