I recently encountered a CUDA out-of-memory problem during a single training epoch. In each iteration of the epoch, I feed a batch of graphs into the model and predict the answer. However, I found that GPU memory usage grows with every iteration, and after a few batches training fails with "CUDA out of memory". When I tried an alternative method without RGCN, this problem did not occur. The code below is borrowed and modified from https://docs.dgl.ai/en/0.4.x/tutorials/models/1_gnn/4_rgcn.html .
class RGCNLayer(nn.Module):
    """One relational graph-convolution (R-GCN) layer.

    Each relation type gets its own (feat_size, out_size) projection matrix;
    messages along an edge are the source features projected by the weight of
    that edge's relation, scaled by the edge norm and (optionally) a learned
    scalar gate. The layer reads and writes node features in ``g.ndata['h']``
    in place via ``g.update_all``.

    Args:
        feat_size: Input feature dimension of node states ``h``.
        num_rels: Number of distinct relation types.
        out_size: Output feature dimension (default 512).
        activation: Optional elementwise activation applied to the
            aggregated node state (e.g. ``F.relu``); skipped when ``None``.
        gated: If True, learn a per-relation scalar gate on each message.
    """

    def __init__(self, feat_size, num_rels, out_size=512, activation=None, gated=True):
        super(RGCNLayer, self).__init__()
        self.feat_size = feat_size
        self.num_rels = num_rels
        self.activation = activation
        self.gated = gated
        # One projection matrix per relation type: (num_rels, feat, out).
        self.weight = nn.Parameter(torch.Tensor(self.num_rels, self.feat_size, out_size))
        nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
        if self.gated:
            # Per-relation gate: projects features to a single logit.
            self.gate_weight = nn.Parameter(torch.Tensor(self.num_rels, self.feat_size, 1))
            nn.init.xavier_uniform_(self.gate_weight, gain=nn.init.calculate_gain('sigmoid'))

    def forward(self, g):
        """Run one round of message passing on DGLGraph ``g``.

        Expects ``g.ndata['h']`` (node features), ``g.edata['rel_type']``
        (int relation index per edge) and ``g.edata['norm']`` (scalar per
        edge). Updates ``g.ndata['h']`` in place; returns ``None``.
        """
        weight = self.weight
        # BUG FIX: the original read ``self.gate_weight`` unconditionally,
        # which raises AttributeError when gated=False (the parameter is
        # only created in the gated branch of __init__).
        gate_weight = self.gate_weight if self.gated else None

        def message_func(edges):
            # Pick the projection of each edge's relation: (E, feat, out).
            w = weight[edges.data['rel_type']]
            # BUG FIX: squeeze(1) instead of squeeze() — a bare squeeze()
            # also drops the batch dim when the graph has exactly one edge.
            msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze(1)
            # Generalized from a hard-coded .cuda(): follow the message's
            # own device so the layer also runs on CPU.
            msg = msg * edges.data['norm'].unsqueeze(1).to(msg.device)
            if self.gated:
                gate_w = gate_weight[edges.data['rel_type']]
                gate = torch.bmm(edges.src['h'].unsqueeze(1), gate_w).reshape(-1, 1)
                gate = torch.sigmoid(gate)
                msg = msg * gate
            # Debug prints removed: per-edge printing in the message
            # function runs on every batch and floods stdout.
            return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            # BUG FIX: activation defaults to None; the original called it
            # unconditionally and crashed with the default constructor args.
            if self.activation is not None:
                h = self.activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)
class RGCNModel(nn.Module):
    """A stack of ``RGCNLayer`` modules applied to a batched DGL graph.

    Args:
        h_dim: Node feature dimension fed to every layer.
        num_rels: Number of relation types (forwarded to each layer).
        num_hidden_layers: How many RGCN layers to stack (default 1).
        gated: Whether each layer uses the message gate (default True).
    """

    def __init__(self, h_dim, num_rels, num_hidden_layers=1, gated=True):
        super(RGCNModel, self).__init__()
        self.h_dim = h_dim
        self.num_rels = num_rels
        self.num_hidden_layers = num_hidden_layers
        self.gated = gated
        # create rgcn layers
        self.build_model()

    def build_model(self):
        # ModuleList registers each layer's parameters with this module.
        self.layers = nn.ModuleList()
        for _ in range(self.num_hidden_layers):
            self.layers.append(
                RGCNLayer(self.h_dim, self.num_rels, activation=F.relu, gated=self.gated)
            )

    def forward(self, g):
        """Apply all layers to batched graph ``g``; return one node-feature
        tensor per component graph (list of ``sub_g.ndata['h']``).

        Debug prints removed from the per-batch path.
        """
        for layer in self.layers:
            layer(g)  # updates g.ndata['h'] in place
        # NOTE(review): each returned tensor still carries its autograd
        # history. If the caller accumulates these tensors (or a loss built
        # from them) across iterations without .detach()/.item(), the whole
        # computation graph of every batch stays alive on the GPU — a likely
        # cause of the reported growing memory / CUDA OOM. Confirm at the
        # call site.
        return [sub_g.ndata['h'] for sub_g in dgl.unbatch(g)]