CUDA out of memory error on minibatching

I’m trying to do link prediction on a large graph with 1.2B edges and 8M nodes. The following is the code snippet:

import numpy as np
import torch
import torch.nn as nn
import dgl

# from import Data, DataLoader, DataListLoader, Batch
from dgl.data.utils import load_graphs
from dgl.nn.pytorch.conv import GraphConv
from tqdm import tqdm as tqdm

# Use the second GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:1'
else:
    device = 'cpu'

# Take element [0][0] of what load_graphs returns for the saved binary file.
graph = load_graphs("./concept_tagger_graph.bin")[0][0]

class NegativeSampler(object):
    """Draw negative edges whose destinations follow an in-degree-based distribution.

    For every positive edge, the source node is kept and ``k`` destination
    nodes are sampled with probability proportional to ``in_degree ** 0.75``.
    """

    def __init__(self, g, k):
        # Number of negative destinations generated per positive edge.
        self.k = k
        # Unnormalized sampling weights, computed once from the graph's
        # in-degrees and reused on every call.
        in_deg = g.in_degrees().float()
        self.weights = in_deg ** 0.75

    def __call__(self, g, eids):
        """Return ``(src, dst)`` node ID tensors describing the negative edges.

        ``src`` repeats the source node of each edge in ``eids`` ``k`` times;
        ``dst`` holds one sampled node index per entry of ``src``.
        """
        heads, _ = g.find_edges(eids)
        src = heads.repeat_interleave(self.k)
        num_negatives = len(src)
        # Sample with replacement so the same destination may appear repeatedly.
        dst = self.weights.multinomial(num_negatives, replacement=True)
        return src, dst

# Train on every edge in the graph. torch.arange builds the int64 edge-ID
# tensor directly, instead of first materializing a Python list with one
# entry per edge (roughly 1.2B entries for this graph).
train_eids = torch.arange(graph.num_edges())
# Full-neighborhood sampling for each of the 2 layers.
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
dataloader = dgl.dataloading.EdgeDataLoader(
    graph, train_eids, sampler,
    # Draw 3 negative destination nodes for every positive edge.
    negative_sampler=NegativeSampler(graph, 3),
    # NOTE(review): the remaining keyword arguments of this call (e.g. the
    # batch size) were cut off in the post; restore them here.
)

class StochasticTwoLayerGCN(nn.Module):
    """Two-layer GCN that runs on a list of sampled blocks, one block per layer.

    Args:
        in_features: Size of the input node features.
        hidden_features: Size of the representation after the first layer.
        out_features: Size of the representation after the second layer.
    """

    def __init__(self, in_features, hidden_features, out_features):
        # nn.Module must be initialized before submodules can be assigned;
        # without this call the assignments below raise AttributeError.
        super().__init__()
        self.conv1 = GraphConv(in_features, hidden_features)
        self.conv2 = GraphConv(hidden_features, out_features)

    def forward(self, blocks, x):
        """Apply both graph convolutions, each followed by a ReLU.

        Args:
            blocks: Exactly two blocks; ``blocks[0]`` feeds the first layer
                and ``blocks[1]`` the second.
            x: Features passed to the first layer together with ``blocks[0]``.

        Returns:
            The output of the second layer after the ReLU.

        Raises:
            ValueError: If the number of blocks differs from the number of
                layers (for example, a 3-layer sampler with this 2-layer model).
        """
        # Fail early with a clear message instead of a shape mismatch later on.
        if len(blocks) != 2:
            raise ValueError(
                'StochasticTwoLayerGCN expects exactly 2 blocks, got %d; '
                'the sampler must use the same number of layers as the model.'
                % len(blocks))
        # torch.relu is used because no `F` alias is imported in this script.
        x = torch.relu(self.conv1(blocks[0], x))
        x = torch.relu(self.conv2(blocks[1], x))
        return x

class ScorePredictor(nn.Module):
    """Score each edge of a subgraph from its two endpoint representations."""

    def forward(self, edge_subgraph, x):
        """Return the per-edge ``'score'`` data computed from node features ``x``.

        ``x`` is stored as node data ``'x'`` and combined per edge with
        ``u_dot_v``; the work happens inside ``local_scope()``.
        """
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['x'] = x
            dot_product = dgl.function.u_dot_v('x', 'x', 'score')
            edge_subgraph.apply_edges(dot_product)
            scores = edge_subgraph.edata['score']
            return scores

class Model(nn.Module):
    """Link-prediction model: a two-layer GCN encoder plus an edge scorer.

    Args:
        in_features: Size of the input node features.
        hidden_features: Size of the hidden representation.
        out_features: Size of the final node representation.
    """

    def __init__(self, in_features, hidden_features, out_features):
        # nn.Module must be initialized before submodules can be assigned.
        super().__init__()
        self.gcn = StochasticTwoLayerGCN(
            in_features, hidden_features, out_features)
        # forward() uses self.predictor, so it has to be created here.
        self.predictor = ScorePredictor()

    def forward(self, positive_graph, negative_graph, blocks, x):
        """Encode the nodes, then score the positive and the negative edges.

        Args:
            positive_graph: Subgraph holding the positive edges of the batch.
            negative_graph: Subgraph holding the sampled negative edges.
            blocks: Sampled blocks passed to the GCN.
            x: Input features passed to the GCN.

        Returns:
            A ``(pos_score, neg_score)`` pair of per-edge scores.
        """
        x = self.gcn(blocks, x)
        pos_score = self.predictor(positive_graph, x)
        neg_score = self.predictor(negative_graph, x)
        return pos_score, neg_score

# Feature sizes for the two GCN layers.
in_features = 512
hidden_features = 1024
out_features = 512
model = Model(in_features, hidden_features, out_features)
# Place the model on the device selected at the top of the script. A bare
# model.cuda() would use the default GPU rather than 'cuda:1', and would fail
# outright on a machine without CUDA.
model = model.to(device)
opt = torch.optim.Adam(model.parameters())

for input_nodes, positive_graph, negative_graph, blocks in tqdm(dataloader):
    # Move the sampled structures of this minibatch onto the training device.
    blocks = [b.to(device) for b in blocks]
    positive_graph = positive_graph.to(device)
    negative_graph = negative_graph.to(device)
    # Input features are read from the source nodes of the first block.
    input_features = blocks[0].srcdata['x']
    pos_score, neg_score = model(positive_graph, negative_graph, blocks, input_features)
    # NOTE(review): compute_loss is not defined anywhere in this snippet;
    # it must be provided before the loop runs.
    loss = compute_loss(pos_score, neg_score)
    # Backpropagate and update the parameters; without these steps the
    # optimizer is never used and the model never trains.
    opt.zero_grad()
    loss.backward()
    opt.step()

I get the following out of memory error

Using backend: pytorch
  0%|                                                                                                                                                                                | 0/161747369 [00:00<?, ?it/s][Block(num_src_nodes=8457156, num_dst_nodes=1056177, num_edges=280331287), Block(num_src_nodes=1056177, num_dst_nodes=40, num_edges=1062790)]
  0%|                                                                                                                                                                                | 0/161747369 [06:32<?, ?it/s]
Traceback (most recent call last):
  File "", line 80, in <module>
    input_features = blocks[0].srcdata['x']
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/", line 66, in __getitem__
    return self._graph._get_n_repr(self._ntid, self._nodes)[key]
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/", line 393, in __getitem__
    return self._columns[name].data
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/", line 127, in data = F.copy_to(, self.device[0], **self.device[1])
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/backend/pytorch/", line 113, in copy_to
    return input.cuda(**kwargs)
RuntimeError: CUDA out of memory. Tried to allocate 16.13 GiB (GPU 1; 15.75 GiB total capacity; 4.01 MiB already allocated; 1.80 GiB free; 22.00 MiB reserved in total by PyTorch)

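You can either replace MultiLayerFullNeighborSampler with MultiLayerNeighborSampler or use a smaller batch size. With full-neighbor sampling, the first block already contains 8,457,156 source nodes (essentially the whole graph), and copying their 512-dimensional float32 features to the GPU takes 8,457,156 × 512 × 4 bytes ≈ 16.13 GiB, which is more than the 15.75 GiB card can hold.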

@mufeili I switched to MultiLayerNeighborSampler.
Now I get the following error in the predictor method:

[Block(num_src_nodes=188512, num_dst_nodes=44386, num_edges=221905), Block(num_src_nodes=44386, num_dst_nodes=4824, num_edges=48218), Block(num_src_nodes=4824, num_dst_nodes=319, num_edges=4785)]
  0%|                                                                                                                                              | 0/20218422 [04:10<?, ?it/s]
Traceback (most recent call last):
  File "", line 95, in <module>
    pos_score, neg_score = model(positive_graph, negative_graph, blocks, input_features)
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/torch/nn/modules/", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "", line 78, in forward
    pos_score = self.predictor(positive_graph, x)
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/torch/nn/modules/", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "", line 65, in forward
    edge_subgraph.ndata['x'] = x
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/", line 81, in __setitem__
    self._graph._set_n_repr(self._ntid, self._nodes, {key : val})
  File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/", line 3809, in _set_n_repr
    ' Got %d and %d instead.' % (nfeats, num_nodes))
dgl._ffi.base.DGLError: Expect number of features to match number of nodes (len(u)). Got 4824 and 319 instead.

It seems that the neighbor sampler samples 3-hop neighbors while the model assumes 2-hop neighbors. How did you initialize MultiLayerNeighborSampler?