Getting different device context while Constructing Negative Graph

kunal-bhadra · April 28, 2022, 11:16am

I am trying to do Link Prediction with heterographs. Earlier, there was an error which I resolved after adding reverse edges but now I am getting this error, saying

DGLError: [11:11:38] /opt/dgl/include/dgl/./aten/coo.h:115: Check failed: (row)->ctx == (col)->ctx: Expected col to have the same device context as row(cuda:0). But got cpu:0.

I have loaded both my data and graph to my GPU so I’m really not sure why it is happening. Even checked all the new data in the Negative Graph function and they’re all on Cuda as well, so there shouldn’t be a problem. Can anybody help me with this?

Graph Dimensions:

Graph(num_nodes={‘customer’: 74845, ‘product’: 1835},
num_edges={(‘customer’, ‘customerBoughtProducts’, ‘product’): 266617, (‘product’, ‘customerBoughtProductsBack’, ‘customer’): 266617},
metagraph=[(‘customer’, ‘product’, ‘customerBoughtProducts’), (‘product’, ‘customer’, ‘customerBoughtProductsBack’)])

GNN Code:

class HeteroDotProductPredictor(nn.Module):
    def forward(self, graph, h, etype):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
            return graph.edges[etype].data['score']
   

class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')

    def forward(self, graph, inputs):
        h = self.conv1(graph, inputs)
        h = {k: F.relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h
        
    
def construct_negative_graph(graph, k, etype):
    utype, _, vtype = etype
    src, dst = graph.edges(etype=etype)
    neg_src = src.repeat_interleave(k)
    neg_dst = torch.randint(0, graph.num_nodes(vtype), (len(src) * k,))
    return dgl.heterograph(
        {etype: (neg_src, neg_dst)},
        num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})

def compute_loss(pos_score, neg_score):
    n_edges = pos_score.shape[0]
    return (1 - pos_score.unsqueeze(1) + neg_score.view(n_edges, -1)).clamp(min=0).mean()



class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, rel_names):
        super().__init__()
        self.sage = RGCN(in_features, hidden_features, out_features, rel_names)
        self.pred = HeteroDotProductPredictor()
    def forward(self, g, neg_g, x, etype):
        h = self.sage(g, x)
        return self.pred(g, h, etype), self.pred(neg_g, h, type)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

k = 5
graph = linkpred_graph
graph = graph.to(device)
model = Model(512, 1024, 256, graph.etypes)
model.to(device)
user_feats = graph.nodes['customer'].data['embedding'].float()
item_feats = graph.nodes['product'].data['embedding'].float()
node_features = {'customer': user_feats, 'product': item_feats}
opt = torch.optim.Adam(model.parameters())


for epoch in range(10):
    negative_graph = construct_negative_graph(graph, k, ('customer', 'customerBoughtProducts', 'product'))
    pos_score, neg_score = model(graph, negative_graph, node_features, ('customer', 'customerBoughtProducts', 'product'))
    loss = compute_loss(pos_score, neg_score)
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(loss.item())

Error Stacktrace:

DGLError                                  Traceback (most recent call last)
<ipython-input-40-e86228f4d586> in <module>
     13 
     14 for epoch in range(10):
---> 15     negative_graph = construct_negative_graph(graph, k, ('customer', 'customerBoughtProducts', 'product'))
     16     pos_score, neg_score = model(graph, negative_graph, node_features, ('customer', 'customerBoughtProducts', 'product'))
     17     loss = compute_loss(pos_score, neg_score)

<ipython-input-39-59df3828f303> in construct_negative_graph(graph, k, etype)
     32     return dgl.heterograph(
     33         {etype: (neg_src, neg_dst)},
---> 34         num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})
     35 
     36 def compute_loss(pos_score, neg_score):

~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/convert.py in heterograph(data_dict, num_nodes_dict, idtype, device)
    371         sparse_fmt, arrays = node_tensor_dict[(srctype, etype, dsttype)]
    372         g = create_from_edges(sparse_fmt, arrays, srctype, etype, dsttype,
--> 373                               num_nodes_dict[srctype], num_nodes_dict[dsttype])
    374         rel_graphs.append(g)
    375 

~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/convert.py in create_from_edges(sparse_fmt, arrays, utype, etype, vtype, urange, vrange, row_sorted, col_sorted)
   1672         hgidx = heterograph_index.create_unitgraph_from_coo(
   1673             num_ntypes, urange, vrange, u, v, ['coo', 'csr', 'csc'],
-> 1674             row_sorted, col_sorted)
   1675     else:   # 'csr' or 'csc'
   1676         indptr, indices, eids = arrays

~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/heterograph_index.py in create_unitgraph_from_coo(num_ntypes, num_src, num_dst, row, col, formats, row_sorted, col_sorted)
   1143         int(num_ntypes), int(num_src), int(num_dst),
   1144         F.to_dgl_nd(row), F.to_dgl_nd(col),
-> 1145         formats, row_sorted, col_sorted)
   1146 
   1147 def create_unitgraph_from_csr(num_ntypes, num_src, num_dst, indptr, indices, edge_ids,

dgl/_ffi/_cython/./function.pxi in dgl._ffi._cy3.core.FunctionBase.__call__()

dgl/_ffi/_cython/./function.pxi in dgl._ffi._cy3.core.FuncCall()

dgl/_ffi/_cython/./base.pxi in dgl._ffi._cy3.core.CALL()

DGLError: [11:11:38] /opt/dgl/include/dgl/./aten/coo.h:115: Check failed: (row)->ctx == (col)->ctx: Expected col to have the same device context as row(cuda:0). But got cpu:0.
Stack trace:
  [bt] (0) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7f79a28aee0f]
  [bt] (1) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(dgl::aten::COOMatrix::CheckValidity() const+0x254) [0x7f79a28b6d64]
  [bt] (2) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(dgl::UnitGraph::COO::COO(std::shared_ptr<dgl::GraphInterface>, long, long, dgl::runtime::NDArray, dgl::runtime::NDArray, bool, bool)+0x336) [0x7f79a2d51c16]
  [bt] (3) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(dgl::UnitGraph::CreateFromCOO(long, long, long, dgl::runtime::NDArray, dgl::runtime::NDArray, bool, bool, unsigned char)+0x13a) [0x7f79a2d3e6fa]
  [bt] (4) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(dgl::CreateFromCOO(long, long, long, dgl::runtime::NDArray, dgl::runtime::NDArray, bool, bool, unsigned char)+0x6e) [0x7f79a2c11b5e]
  [bt] (5) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(+0x6b44d6) [0x7f79a2c574d6]
  [bt] (6) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(+0x6b4754) [0x7f79a2c57754]
  [bt] (7) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x7f79a2bd4358]
  [bt] (8) /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/_ffi/_cy3/core.cpython-36m-x86_64-linux-gnu.so(+0x165ba) [0x7f79901725ba]

Rhett-Ying · April 29, 2022, 3:30am

I think neg_dst is created on cpu in default. Please make sure neg_src, neg_dst are on same device. And you could specify the target device when creating graph via dgl.heterograph(..., device=device)

def construct_negative_graph(graph, k, etype):
    utype, _, vtype = etype
    src, dst = graph.edges(etype=etype)
    neg_src = src.repeat_interleave(k)
    neg_dst = torch.randint(0, graph.num_nodes(vtype), (len(src) * k,))
    return dgl.heterograph(
        {etype: (neg_src, neg_dst)},
        num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})

kunal-bhadra · April 29, 2022, 7:32am

Yes, I added a .to(device) for the neg_dst and I didn’t get the error anymore. The graph was already loaded onto GPU so that should have been fine. Thank you!

system · May 29, 2022, 7:32am

This topic was automatically closed 30 days after the last reply. New replies are no longer allowed.