I’m trying to do link prediction on a large graph with 1.2B edges and 8M nodes. Following is the code snippet:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F  # needed for F.relu below
import dgl
# from torch_geometric.data import Data, DataLoader, DataListLoader, Batch
from dgl.data.utils import load_graphs
from dgl.nn.pytorch.conv import GraphConv
from tqdm import tqdm as tqdm
device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
graph = load_graphs("./concept_tagger_graph.bin")[0][0]
class NegativeSampler(object):
    def __init__(self, g, k):
        # caches the probability distribution
        self.weights = g.in_degrees().float() ** 0.75
        self.k = k

    def __call__(self, g, eids):
        src, _ = g.find_edges(eids)
        src = src.repeat_interleave(self.k)
        dst = self.weights.multinomial(len(src), replacement=True)
        return src, dst
train_eids = torch.arange(graph.num_edges())
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
dataloader = dgl.dataloading.EdgeDataLoader(
    graph, train_eids, sampler,
    negative_sampler=NegativeSampler(graph, 3),
    batch_size=64,
    shuffle=True,
    drop_last=False,
    pin_memory=True,
    num_workers=0)
class StochasticTwoLayerGCN(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.conv1 = GraphConv(in_features, hidden_features)
        self.conv2 = GraphConv(hidden_features, out_features)

    def forward(self, blocks, x):
        x = F.relu(self.conv1(blocks[0], x))
        x = F.relu(self.conv2(blocks[1], x))
        return x
class ScorePredictor(nn.Module):
    def forward(self, edge_subgraph, x):
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['x'] = x
            # score each edge by the dot product of its endpoint embeddings
            edge_subgraph.apply_edges(dgl.function.u_dot_v('x', 'x', 'score'))
            return edge_subgraph.edata['score']
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.gcn = StochasticTwoLayerGCN(
            in_features, hidden_features, out_features)
        self.predictor = ScorePredictor()

    def forward(self, positive_graph, negative_graph, blocks, x):
        x = self.gcn(blocks, x)
        pos_score = self.predictor(positive_graph, x)
        neg_score = self.predictor(negative_graph, x)
        return pos_score, neg_score
in_features = 512
hidden_features = 1024
out_features = 512
model = Model(in_features, hidden_features, out_features)
model = model.to(device)
opt = torch.optim.Adam(model.parameters())
for input_nodes, positive_graph, negative_graph, blocks in tqdm(dataloader):
    blocks = [b.to(device) for b in blocks]
    positive_graph = positive_graph.to(device)
    negative_graph = negative_graph.to(device)
    input_features = blocks[0].srcdata['x']
    pos_score, neg_score = model(positive_graph, negative_graph, blocks, input_features)
    loss = compute_loss(pos_score, neg_score)
    print(loss.item())
    opt.zero_grad()
    loss.backward()
    opt.step()
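(`compute_loss` isn’t shown above; it’s just a standard link-prediction loss over the positive and negative edge scores. A minimal sketch, assuming a margin-based ranking loss as in the DGL link-prediction examples — the exact choice shouldn’t matter for the error:)

def compute_loss(pos_score, neg_score):
    # hinge/margin loss: push each positive edge's score above its k negatives by a margin of 1
    n = pos_score.shape[0]
    return (neg_score.view(n, -1) - pos_score.view(n, -1) + 1).clamp(min=0).mean()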
I get the following out-of-memory error:
Using backend: pytorch
0%| | 0/161747369 [00:00<?, ?it/s][Block(num_src_nodes=8457156, num_dst_nodes=1056177, num_edges=280331287), Block(num_src_nodes=1056177, num_dst_nodes=40, num_edges=1062790)]
0%| | 0/161747369 [06:32<?, ?it/s]
Traceback (most recent call last):
File "graph_train.py", line 80, in <module>
input_features = blocks[0].srcdata['x']
File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/view.py", line 66, in __getitem__
return self._graph._get_n_repr(self._ntid, self._nodes)[key]
File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/frame.py", line 393, in __getitem__
return self._columns[name].data
File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/frame.py", line 127, in data
self.storage = F.copy_to(self.storage, self.device[0], **self.device[1])
File "/home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/dgl/backend/pytorch/tensor.py", line 113, in copy_to
return input.cuda(**kwargs)
RuntimeError: CUDA out of memory. Tried to allocate 16.13 GiB (GPU 1; 15.75 GiB total capacity; 4.01 MiB already allocated; 1.80 GiB free; 22.00 MiB reserved in total by PyTorch)