The link prediction tutorial gives 0.81 AUC on testset for link prediction, whereas a lightning dataloader implementation isn’t moving beyond 0.55 AUC for almost similar code logic on CORA dataset. Can anyone please notice where could be the mistake? Thanks
reference Link prediction code from tutorial for comparision:
import numpy as np
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl.data import CoraGraphDataset
from dgl.nn import GATConv
import glob
import os
import itertools
from torchmetrics import Accuracy
import scipy.sparse as sp
class argparserclass():
def __init__(self):
self.gpu=1
self.dataset='cora'
self.epochs= 250
self.num_heads=8
self.num_out_heads=1
self.num_hidden=64
self.num_layers=2
self.residual = False
self.in_drop = .6
self.attn_drop = .6
self.lr = .005
self.weight_decay = 5e-4
self.negative_slope = 0.2
self.early_stop=False
self.fastmode=False
self.num_workers=0
self.batch_size = 256
self.shuffle=False
self.raw_dir="/DATA/"
self.data_cpu = False
self.in_dim = 1433
self.num_classes = 7
args = argparserclass()
if args.gpu < 0:
args.device = torch.device('cpu')
else:
args.device = torch.device('cuda:'+str(args.gpu))
dataset = dgl.data.CoraGraphDataset(raw_dir = args.raw_dir)
g = dataset[0]
# Split edge set for training and testing
u, v = g.edges()
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)
neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes()).to(args.device)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes()).to(args.device)
test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes()).to(args.device)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes()).to(args.device)
class GAT(nn.Module):
def __init__(self,
g,
num_layers,
in_dim,
num_hidden,
num_classes,
heads,
activation,
feat_drop,
attn_drop,
negative_slope,
residual,
allow_self_loops = False):
super(GAT, self).__init__()
self.g = g
self.num_layers = num_layers
self.gat_layers = nn.ModuleList()
self.activation = activation
if num_layers > 1:
# input projection (no residual)
self.gat_layers.append(GATConv(
in_dim, num_hidden, heads[0],
feat_drop, attn_drop, negative_slope, False, self.activation, allow_self_loops))
# hidden layers
for l in range(1, num_layers-1):
# due to multi-head, the in_dim = num_hidden * num_heads
self.gat_layers.append(GATConv(
num_hidden * heads[l-1], num_hidden, heads[l],
feat_drop, attn_drop, negative_slope, residual, self.activation, allow_self_loops))
# output projection
self.gat_layers.append(GATConv(
num_hidden * heads[-2], num_hidden, heads[-1],
feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))
else:
self.gat_layers.append(GATConv(
in_dim, num_hidden, heads[0],
feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))
def forward(self, inputs):
h = inputs
for l in range(self.num_layers):
h = self.gat_layers[l](self.g, h)
h = h.flatten(1) if l != self.num_layers - 1 else h.mean(1)
return h
train_g = dgl.remove_edges(g, eids[:test_size]).to(args.device)
module = GAT(train_g,
args.num_layers,
args.in_dim,
args.num_hidden,
args.num_classes,
heads,
F.elu,
args.in_drop,
args.attn_drop,
args.negative_slope,
args.residual
, True
)
module = module.to(args.device)
pred = MLPPredictor(args.num_hidden).to(args.device)
optimizer = torch.optim.Adam(itertools.chain(module.parameters(), pred.parameters()), lr=0.01)
all_logits = []
for e in range(args.epochs):
# forward
h = module(train_g.ndata['feat'])
pos_score = pred(train_pos_g, h)
neg_score = pred(train_neg_g, h)
loss = compute_loss(pos_score, neg_score)
# backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
if e % 5 == 0:
print('In epoch {}, loss: {}'.format(e, loss))
from sklearn.metrics import roc_auc_score
with torch.no_grad():
pos_score = pred(test_pos_g, h)
neg_score = pred(test_neg_g, h)
print('AUC', compute_auc(pos_score, neg_score))
and the lightning dataloader code for link prediction with AUC issues:
import numpy as np
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl.data import CoraGraphDataset
from dgl.nn import GATConv
import glob
import os
import itertools
from torchmetrics import Accuracy
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
class GAT(nn.Module):
def __init__(self,
# g,
num_layers,
in_dim,
num_hidden,
num_classes,
heads,
activation,
feat_drop,
attn_drop,
negative_slope,
residual,
allow_self_loops = False):
super(GAT, self).__init__()
# self.g = g
self.num_layers = num_layers
self.gat_layers = nn.ModuleList()
self.activation = activation
if num_layers > 1:
self.gat_layers.append(GATConv(
in_dim, num_hidden, heads[0],
feat_drop, attn_drop, negative_slope, False, self.activation, allow_self_loops))
for l in range(1, num_layers-1):
# due to multi-head, the in_dim = num_hidden * num_heads
self.gat_layers.append(GATConv(
num_hidden * heads[l-1], num_hidden, heads[l],
feat_drop, attn_drop, negative_slope, residual, self.activation, allow_self_loops))
self.gat_layers.append(GATConv(
num_hidden * heads[-2], num_hidden, heads[-1],
feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))
else:
self.gat_layers.append(GATConv(
in_dim, num_hidden, heads[0],
feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))
def forward(self, mfgs, h):
for l, (layer, block) in enumerate(zip(self.gat_layers, mfgs)):
h_dst = h[:block.num_dst_nodes()]
h = layer(block, (h, h_dst))
h = h.flatten(1) if l < self.num_layers - 1 else h.mean(1)
return h
class DotPredictor(nn.Module):
def forward(self, edge_subgraph, h):
with edge_subgraph.local_scope():
edge_subgraph.ndata['h'] = h
# evaluating on non-self loop edges
# edge_subgraph = edge_subgraph.remove_self_loop()
edge_subgraph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
return edge_subgraph.edata['score'][:, 0]
class MLPPredictor(nn.Module):
def __init__(self, h_feats):
super().__init__()
self.W1 = nn.Linear(h_feats * 2, h_feats)
self.W2 = nn.Linear(h_feats, 1)
def apply_edges(self, edges):
h = torch.cat([edges.src['h'], edges.dst['h']], 1)
return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}
def forward(self, g, h):
with g.local_scope():
g.ndata['h'] = h
g.apply_edges(self.apply_edges)
return g.edata['score']
def compute_loss(pos_score, neg_score):
scores = torch.cat([pos_score, neg_score])
labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).to(args.device)
return F.binary_cross_entropy_with_logits(scores, labels)
def compute_auc(pos_score, neg_score):
scores = torch.cat([pos_score, neg_score]).cpu().numpy()
labels = torch.cat(
[torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
return roc_auc_score(labels, scores)
class NeighborSelfLoopSampler(dgl.dataloading.BlockSampler):
def __init__(self, fanouts, output_device='cpu', edge_dir='in', prob=None, replace=False,
prefetch_node_feats=None, prefetch_labels=None, prefetch_edge_feats=None,
):
super().__init__(prefetch_node_feats=prefetch_node_feats,
prefetch_labels=prefetch_labels,
prefetch_edge_feats=prefetch_edge_feats,
output_device=output_device)
self.fanouts = fanouts
self.edge_dir = edge_dir
self.prob = prob
self.replace = replace
def sample_blocks(self, g, seed_nodes, exclude_eids=None):
output_nodes = seed_nodes
blocks = []
for fanout in reversed(self.fanouts):
frontier = g.sample_neighbors(
seed_nodes, fanout, edge_dir=self.edge_dir, prob=self.prob,
replace=self.replace, output_device=self.output_device,
exclude_edges=exclude_eids)
# TODO
# frontier = frontier.remove_self_loop().add_self_loop()
block = dgl.transforms.to_block(frontier, seed_nodes)
seed_nodes = block.srcdata[dgl.NID]
blocks.insert(0, block)
return seed_nodes, output_nodes, blocks
class DataModule(LightningDataModule):
def __init__(self, dataset_name, raw_dir, data_cpu=False,
device=torch.device('cpu'), batch_size=1000, num_workers=0):
super().__init__()
self.dataset_name = dataset_name
self.batch_size = batch_size
self.num_workers = num_workers
self.raw_dir = raw_dir
data = CoraGraphDataset(raw_dir = self.raw_dir)
g = data[0]
features = g.ndata['feat']
num_feats = features.shape[1]
n_classes = data.num_classes
self.g = g.to(device)
self.in_feats = num_feats
self.n_classes = n_classes
eids = np.arange(g.number_of_edges())
eids = np.random.RandomState(42).permutation(eids)
val_size = int(len(eids) * 0.2)
test_size = int(len(eids) * 0.1)
self.nei_sampler = NeighborSelfLoopSampler([10, 10], args.device)
self.negative_sampler = dgl.dataloading.negative_sampler.GlobalUniform(1)
self.sampler = dgl.dataloading.as_edge_prediction_sampler(
sampler = self.nei_sampler,
negative_sampler=self.negative_sampler
)
self.eids = eids
self.train_eids = eids[val_size:]
self.val_eids = eids[test_size:val_size]
self.test_eids = eids[:test_size]
def train_dataloader(self):
return dgl.dataloading.DataLoader(
self.g,
torch.tensor(self.train_eids, device = args.device),
self.sampler,
device = args.device,
batch_size = self.batch_size,
shuffle=True,
drop_last=False,
num_workers=self.num_workers
)
def val_dataloader(self):
return dgl.dataloading.DataLoader(
self.g,
torch.tensor(self.val_eids, device = args.device),
self.sampler,
device = args.device,
batch_size=self.batch_size,
shuffle=False,
drop_last=False,
num_workers=self.num_workers
)
def test_dataloader(self):
return dgl.dataloading.DataLoader(
self.g,
torch.tensor(self.test_eids, device = args.device),
self.sampler,
device = args.device,
batch_size=self.batch_size,
shuffle=False,
drop_last=False,
num_workers=self.num_workers
)
class argparserclass():
def __init__(self):
self.gpu=1
self.dataset='cora'
self.epochs= 250
self.num_heads=8
self.num_out_heads=1
self.num_hidden=64
self.num_layers=2
self.residual = False
self.in_drop = .6
self.attn_drop = .6
self.lr = .005
self.weight_decay = 5e-4
self.negative_slope = 0.2
self.early_stop=False
self.fastmode=False
self.num_workers=0
self.batch_size = 256
self.shuffle=False
self.raw_dir="/DATA/"
self.data_cpu = False
self.in_dim = 1433
self.num_classes = 7
args = argparserclass()
if args.gpu < 0:
args.device = torch.device('cpu')
else:
args.device = torch.device('cuda:'+str(args.gpu))
heads = ([args.num_heads] * (args.num_layers-1)) + [args.num_out_heads]
datamodule = DataModule(args.dataset, args.raw_dir, args.data_cpu,
args.device, args.batch_size, args.num_workers)
module = GAT(
args.num_layers,
args.in_dim,
args.num_hidden,
args.num_classes,
heads,
F.elu,
args.in_drop,
args.attn_drop,
args.negative_slope,
args.residual
# , True
)
pred = MLPPredictor(args.num_hidden)
# pred = DotPredictor()
module = module.to(args.device)
pred = pred.to(args.device)
train_acc = Accuracy()
val_acc = Accuracy()
test_acc = Accuracy()
optimizer = torch.optim.Adam(itertools.chain(module.parameters(), pred.parameters()), lr=args.lr, weight_decay=args.weight_decay)
batch_pred_cnt = 0
train_loader = datamodule.train_dataloader()
for epoch in range(args.epochs):
train_auc_ = []
for batch in train_loader:
# batch = next(iter(train_loader))
# for i in range(500): # overfit
input_nodes, train_pos_g, train_neg_g, mfgs = batch
mfgs = [mfg.int().to(args.device) for mfg in mfgs]
batch_inputs = mfgs[0].srcdata['feat']
batch_pred = module(mfgs, batch_inputs)
pos_score = pred(train_pos_g, batch_pred)
neg_score = pred(train_neg_g, batch_pred)
loss = compute_loss(pos_score, neg_score)
train_acc = compute_auc(pos_score.detach(), neg_score.detach())
train_auc_.append(train_acc)
batch_pred_cnt += batch_pred.size(0)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if not epoch % 11:
print('loss', loss.detach().item())
print('epoch acc', sum(train_auc_)/len(train_auc_))
# break
def inference(module, graph):
with torch.no_grad():
sampler = dgl.dataloading.NeighborSampler([4, 4])
train_dataloader = dgl.dataloading.DataLoader(
graph,
torch.arange(graph.number_of_nodes(), device = args.device),
sampler,
batch_size=500,
shuffle=False,
drop_last=False,
device=args.device)
result = []
for input_nodes, output_nodes, mfgs in train_dataloader:
inputs = mfgs[0].srcdata['feat']
result.append(module(mfgs, inputs))
return torch.cat(result)
def eval_auc(pos_g, neg_g, emb):
with torch.no_grad():
module.eval()
pos_score = pred(pos_g.to(args.device), emb)
neg_score = pred(neg_g.to(args.device), emb)
print('AUC', compute_auc(pos_score, neg_score))
module.train()
def test_evaluation():
with torch.no_grad():
module.eval()
emb = inference(module, datamodule.g)
neg_test_edges = dgl.sampling.global_uniform_negative_sampling(datamodule.g, num_samples = datamodule.test_eids.shape[0])
neg_test_g = dgl.graph(num_nodes = datamodule.g.num_nodes(), data = neg_test_edges)
test_eids = datamodule.eids[datamodule.test_eids]
pos_test_g = dgl.remove_edges(datamodule.g, test_eids)
eval_auc(pos_test_g, neg_test_g, emb)
test_evaluation()