Hi, I’m having memory errors when training a GAT model on a GPU. Here are the details:
The batched graph contains 3 types of edges (“parent”, “child”, “sibling”), about 2.1 million edges and 16,800 nodes. Each node has a 300-dimensional feature “h”. Here’s the code:
class GNN_HETER(nn.Module):
    """One heterogeneous graph-attention layer over three edge types
    ("parent", "child", "sibling").

    For every edge (i -> j) an attention logit is computed with the
    bilinear form [hi;1]^T * A * [hj;1] (bias terms folded into A via an
    extra row/column), the logits are softmax-normalised per destination
    node, and neighbour features are summed with those weights into the
    "sum_<etype>" node fields.
    """

    def __init__(self, n_in, dropout, device, n_out=1):
        super(GNN_HETER, self).__init__()
        # (n_in+1) x (n_in+1): the extra row/column hold the bias terms
        # of the attention score (hi^T*b1, hj^T*b2, scalar offset).
        self.A = nn.Parameter(th.Tensor(n_in + 1, n_in + 1))
        # BUG FIX: th.Tensor() returns *uninitialised* memory — without an
        # explicit init the attention logits are garbage (often inf/NaN,
        # which also makes the softmax blow up).
        nn.init.xavier_uniform_(self.A)
        # NOTE(review): `dropout`, `device` and `n_out` are accepted but
        # never stored or used in the visible code — confirm with callers.

    def reduce_func(self, nodes, edge_type):
        """Attention-weighted sum of one edge type's mailbox messages.

        Assumes DGL degree-bucketed shapes (TODO confirm):
        mailbox["attention_<etype>"]: (bucket, deg, 1) raw logits,
        mailbox["m_<etype>"]:         (bucket, deg, n_in) source features.
        """
        # Softmax over the neighbour dimension (dim=1) of each bucket.
        alpha = F.softmax(nodes.mailbox["attention_" + edge_type], dim=1)
        reduce_result = th.sum(nodes.mailbox["m_" + edge_type] * alpha, dim=1)
        return {("sum_" + edge_type): reduce_result}

    def call_parent_reduce(self, nodes):
        return self.reduce_func(nodes, "parent")

    def call_child_reduce(self, nodes):
        return self.reduce_func(nodes, "child")

    def call_sibling_reduce(self, nodes):
        return self.reduce_func(nodes, "sibling")

    def parent_message(self, edges):
        return {"m_parent": edges.src['h'], "attention_parent": edges.data["attention"]}

    def child_message(self, edges):
        return {"m_child": edges.src['h'], "attention_child": edges.data["attention"]}

    def sibling_message(self, edges):
        return {"m_sibling": edges.src['h'], "attention_sibling": edges.data["attention"]}

    def gnn_proceed_one_layer(self, g):
        """Run attention scoring + message passing once per edge type.

        MEMORY NOTE: the UDF message functions above copy the full 300-dim
        source feature of every edge into the mailbox (~2.1M x 300 floats
        per etype here), which is the dominant allocation in the reported
        OOM.  If memory remains tight, prefer DGL's built-in message
        functions (dgl.function.copy_u / u_mul_e) together with
        edge_softmax — they fuse the reduction and avoid materialising the
        per-edge feature copies.
        """
        g.apply_edges(func=self.edge_attention, etype="parent")
        g["parent"].update_all(self.parent_message, self.call_parent_reduce)
        g.apply_edges(func=self.edge_attention, etype="child")
        g["child"].update_all(self.child_message, self.call_child_reduce)
        g.apply_edges(func=self.edge_attention, etype="sibling")
        g["sibling"].update_all(self.sibling_message, self.call_sibling_reduce)
        return g

    def edge_attention(self, edges):
        """Per-edge attention logit: [hi;1]^T * A * [hj;1].

        MEMORY FIX: the original concatenated a ones-column onto hi and hj,
        allocating two extra (E, n_in+1) tensors (~2.5 GB each at 2.1M
        edges x 301 floats).  Expanding the bilinear form instead,
            [hi;1]^T A [hj;1] = hi^T A11 hj + hi^T a12 + a21^T hj + a22,
        is mathematically identical and needs no concatenated copies.
        """
        hi = edges.src['h']
        hj = edges.dst['h']
        A11 = self.A[:-1, :-1]   # bilinear hi-hj term
        a12 = self.A[:-1, -1]    # bias on hi
        a21 = self.A[-1, :-1]    # bias on hj
        a22 = self.A[-1, -1]     # scalar offset
        attn = (
            th.einsum('eb,bc,ec->e', hi, A11, hj)
            + hi @ a12
            + hj @ a21
            + a22
        ).unsqueeze(1)
        return {"attention": attn}

    def forward(self, graphs, num_layers=1):
        """Batch the input graphs and apply `num_layers` attention layers.

        BUG FIX: the original returned nothing, so callers got None;
        return the batched graph carrying the "sum_<etype>" results.
        NOTE(review): dgl.batch_hetero is the DGL 0.4.x API — newer DGL
        merged it into dgl.batch; confirm the installed version.
        """
        graphs = dgl.batch_hetero(graphs, node_attrs={"node": ['h', '_ID']},
                                  edge_attrs=None)
        for _ in range(num_layers):
            self.gnn_proceed_one_layer(graphs)
        return graphs
Here’s the error:
File "/root/try/bishe/parser/modules/gnn_heter.py", line 198, in forward
self.gnn_proceed_one_layer(graphs)
File "/root/try/bishe/parser/modules/gnn_heter.py", line 147, in gnn_proceed_one_layer
g["sibling"].update_all(self.sibling_message, self.call_sibling_reduce)
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/heterograph.py", line 3196, in update_all
Runtime.run(prog)
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/runtime/runtime.py", line 11, in run
exe.run()
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/runtime/ir/executor.py", line 132, in run
udf_ret = fn_data(node_data, mail_data)
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/runtime/degree_bucketing.py", line 153, in _rfunc_wrapper
return reduce_udf(nbatch)
File "/root/try/bishe/parser/modules/gnn_heter.py", line 122, in call_sibling_reduce
return self.reduce_func(nodes, "sibling")
File "/root/try/bishe/parser/modules/gnn_heter.py", line 112, in reduce_func
reduce_result = th.sum(nodes.mailbox["m_" + edge_type] * alpha, dim = 1)
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/utils.py", line 285, in __getitem__
return self._fn(key)
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/runtime/degree_bucketing.py", line 148, in _reshaped_getter
msg = mail_data[key]
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/utils.py", line 285, in __getitem__
return self._fn(key)
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/frame.py", line 655, in <lambda>
return utils.LazyDict(lambda key: self._frame[key][rows], keys=self.keys())
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/frame.py", line 97, in __getitem__
return F.gather_row(self.data, user_idx)
File "/root/Anacondas/anaconda3/lib/python3.7/site-packages/dgl/backend/pytorch/tensor.py", line 152, in gather_row
return th.index_select(data, 0, row_index)
RuntimeError: CUDA out of memory. Tried to allocate 446.00 MiB (GPU 0; 11.17 GiB total capacity; 10.31 GiB already allocated; 112.75 MiB free; 10.75 GiB reserved in total by PyTorch)
I noticed that GPU memory usage increases drastically as gnn_proceed_one_layer calls apply_edges and update_all.
Can someone help me with this? Any suggestions or tips would be appreciated.