Accuracy drop with dataloaders for Link Prediction

The link prediction tutorial gives 0.81 AUC on testset for link prediction, whereas a lightning dataloader implementation isn’t moving beyond 0.55 AUC for almost similar code logic on CORA dataset. Can anyone please notice where could be the mistake? Thanks

reference Link prediction code from tutorial for comparision:

import numpy as np
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl.data import CoraGraphDataset
from dgl.nn import GATConv
import glob
import os
import itertools
from torchmetrics import Accuracy
import scipy.sparse as sp

class argparserclass():
    def __init__(self):
        self.gpu=1
        self.dataset='cora'
        self.epochs= 250
        self.num_heads=8
        self.num_out_heads=1
        self.num_hidden=64
        self.num_layers=2
        self.residual = False
        self.in_drop = .6
        self.attn_drop = .6
        self.lr = .005
        self.weight_decay = 5e-4
        self.negative_slope = 0.2
        self.early_stop=False
        self.fastmode=False
        self.num_workers=0
        self.batch_size = 256
        self.shuffle=False
        self.raw_dir="/DATA/"
        self.data_cpu = False
        self.in_dim = 1433
        self.num_classes = 7

args = argparserclass()

if args.gpu < 0:
    args.device = torch.device('cpu')
else:
    args.device = torch.device('cuda:'+str(args.gpu))


dataset = dgl.data.CoraGraphDataset(raw_dir = args.raw_dir)
g = dataset[0]
# Split edge set for training and testing
u, v = g.edges()
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)
neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes()).to(args.device)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes()).to(args.device)
test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes()).to(args.device)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes()).to(args.device)

class GAT(nn.Module):
    def __init__(self,
                 g,
                 num_layers,
                 in_dim,
                 num_hidden,
                 num_classes,
                 heads,
                 activation,
                 feat_drop,
                 attn_drop,
                 negative_slope,
                 residual,
                 allow_self_loops = False):
        super(GAT, self).__init__()
        self.g = g
        self.num_layers = num_layers
        self.gat_layers = nn.ModuleList()
        self.activation = activation
        if num_layers > 1:
        # input projection (no residual)
            self.gat_layers.append(GATConv(
                in_dim, num_hidden, heads[0],
                feat_drop, attn_drop, negative_slope, False, self.activation, allow_self_loops))
            # hidden layers
            for l in range(1, num_layers-1):
                # due to multi-head, the in_dim = num_hidden * num_heads
                self.gat_layers.append(GATConv(
                    num_hidden * heads[l-1], num_hidden, heads[l],
                    feat_drop, attn_drop, negative_slope, residual, self.activation, allow_self_loops))
            # output projection
            self.gat_layers.append(GATConv(
                num_hidden * heads[-2], num_hidden, heads[-1],
                feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))
        else:
            self.gat_layers.append(GATConv(
                in_dim, num_hidden, heads[0],
                feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))

    def forward(self, inputs):
        h = inputs
        for l in range(self.num_layers):
            h = self.gat_layers[l](self.g, h)
            h = h.flatten(1) if l != self.num_layers - 1 else h.mean(1)
        return h

train_g = dgl.remove_edges(g, eids[:test_size]).to(args.device)

module = GAT(train_g, 
            args.num_layers,
            args.in_dim, 
            args.num_hidden, 
            args.num_classes,   
            heads,
           F.elu, 
           args.in_drop, 
             args.attn_drop, 
             args.negative_slope, 
             args.residual
            , True
              )
module = module.to(args.device)
pred = MLPPredictor(args.num_hidden).to(args.device)

optimizer = torch.optim.Adam(itertools.chain(module.parameters(), pred.parameters()), lr=0.01)

all_logits = []
for e in range(args.epochs):
    # forward
    h = module(train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

and the lightning dataloader code for link prediction with AUC issues:

import numpy as np
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl.data import CoraGraphDataset
from dgl.nn import GATConv

import glob
import os
import itertools

from torchmetrics import Accuracy
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

class GAT(nn.Module):
    def __init__(self,
                #  g,
                 num_layers,
                 in_dim,
                 num_hidden,
                 num_classes,
                 heads,
                 activation,
                 feat_drop,
                 attn_drop,
                 negative_slope,
                 residual,
                 allow_self_loops = False):
        super(GAT, self).__init__()
        # self.g = g
        self.num_layers = num_layers
        self.gat_layers = nn.ModuleList()
        self.activation = activation
        if num_layers > 1:
            self.gat_layers.append(GATConv(
                in_dim, num_hidden, heads[0],
                feat_drop, attn_drop, negative_slope, False, self.activation, allow_self_loops))
            for l in range(1, num_layers-1):
                # due to multi-head, the in_dim = num_hidden * num_heads
                self.gat_layers.append(GATConv(
                    num_hidden * heads[l-1], num_hidden, heads[l],
                    feat_drop, attn_drop, negative_slope, residual, self.activation, allow_self_loops))
            self.gat_layers.append(GATConv(
                num_hidden * heads[-2], num_hidden, heads[-1],
                feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))
        else:
            self.gat_layers.append(GATConv(
                in_dim, num_hidden, heads[0],
                feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))

    def forward(self, mfgs, h):
        for l, (layer, block) in enumerate(zip(self.gat_layers, mfgs)):
            h_dst = h[:block.num_dst_nodes()]
            h = layer(block, (h, h_dst))
            h = h.flatten(1) if l < self.num_layers - 1 else h.mean(1)
        return h

class DotPredictor(nn.Module):
    def forward(self, edge_subgraph, h):
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['h'] = h
            # evaluating on non-self loop edges
            # edge_subgraph = edge_subgraph.remove_self_loop()
            edge_subgraph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return edge_subgraph.edata['score'][:, 0]

class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).to(args.device)
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

class NeighborSelfLoopSampler(dgl.dataloading.BlockSampler):
    def __init__(self, fanouts, output_device='cpu', edge_dir='in', prob=None, replace=False,
                 prefetch_node_feats=None, prefetch_labels=None, prefetch_edge_feats=None,
                 ):
        super().__init__(prefetch_node_feats=prefetch_node_feats,
                         prefetch_labels=prefetch_labels,
                         prefetch_edge_feats=prefetch_edge_feats,
                         output_device=output_device)
        self.fanouts = fanouts
        self.edge_dir = edge_dir
        self.prob = prob
        self.replace = replace

    def sample_blocks(self, g, seed_nodes, exclude_eids=None):
        output_nodes = seed_nodes
        blocks = []
        for fanout in reversed(self.fanouts):
            frontier = g.sample_neighbors(
                seed_nodes, fanout, edge_dir=self.edge_dir, prob=self.prob,
                replace=self.replace, output_device=self.output_device,
                exclude_edges=exclude_eids)
            # TODO
            # frontier = frontier.remove_self_loop().add_self_loop()
            block = dgl.transforms.to_block(frontier, seed_nodes)
            seed_nodes = block.srcdata[dgl.NID]
            blocks.insert(0, block)

        return seed_nodes, output_nodes, blocks

class DataModule(LightningDataModule):
    def __init__(self, dataset_name, raw_dir, data_cpu=False,
                 device=torch.device('cpu'), batch_size=1000, num_workers=0):
        super().__init__()
        self.dataset_name = dataset_name
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.raw_dir = raw_dir
        data = CoraGraphDataset(raw_dir = self.raw_dir)
        g = data[0]
        features = g.ndata['feat']
        num_feats = features.shape[1]
        n_classes = data.num_classes
        self.g = g.to(device)
        self.in_feats = num_feats
        self.n_classes = n_classes

        eids = np.arange(g.number_of_edges())
        eids = np.random.RandomState(42).permutation(eids)
        val_size = int(len(eids) * 0.2)
        test_size = int(len(eids) * 0.1)

        self.nei_sampler = NeighborSelfLoopSampler([10, 10], args.device)
        self.negative_sampler = dgl.dataloading.negative_sampler.GlobalUniform(1)
        
        self.sampler = dgl.dataloading.as_edge_prediction_sampler(
                                sampler = self.nei_sampler, 
                                negative_sampler=self.negative_sampler
                                )
        self.eids = eids
        self.train_eids = eids[val_size:]
        self.val_eids = eids[test_size:val_size]
        self.test_eids = eids[:test_size]

    def train_dataloader(self):
        
        return dgl.dataloading.DataLoader(
                self.g,
                torch.tensor(self.train_eids, device = args.device),
                self.sampler,
                device = args.device,
                batch_size = self.batch_size,
                shuffle=True,
                drop_last=False,
                num_workers=self.num_workers
                )
    
    def val_dataloader(self):
        return dgl.dataloading.DataLoader(
                self.g,
                torch.tensor(self.val_eids, device = args.device),
                self.sampler,
                device = args.device,
                batch_size=self.batch_size,
                shuffle=False,
                drop_last=False,
                num_workers=self.num_workers
            )
    
    def test_dataloader(self):
        return dgl.dataloading.DataLoader(
                self.g,
                torch.tensor(self.test_eids, device = args.device),
                self.sampler,
                device = args.device,
                batch_size=self.batch_size,
                shuffle=False,
                drop_last=False,
                num_workers=self.num_workers
            )

class argparserclass():
    def __init__(self):
        self.gpu=1
        self.dataset='cora'
        self.epochs= 250
        self.num_heads=8
        self.num_out_heads=1
        self.num_hidden=64
        self.num_layers=2
        self.residual = False
        self.in_drop = .6
        self.attn_drop = .6
        self.lr = .005
        self.weight_decay = 5e-4
        self.negative_slope = 0.2
        self.early_stop=False
        self.fastmode=False
        self.num_workers=0
        self.batch_size = 256
        self.shuffle=False
        self.raw_dir="/DATA/"
        self.data_cpu = False
        self.in_dim = 1433
        self.num_classes = 7

args = argparserclass()

if args.gpu < 0:
    args.device = torch.device('cpu')
else:
    args.device = torch.device('cuda:'+str(args.gpu))

heads = ([args.num_heads] * (args.num_layers-1)) + [args.num_out_heads]

datamodule = DataModule(args.dataset, args.raw_dir, args.data_cpu, 
                        args.device, args.batch_size, args.num_workers)

module = GAT(
            args.num_layers,
            args.in_dim, 
            args.num_hidden, 
            args.num_classes,   
            heads,
            F.elu, 
            args.in_drop, 
            args.attn_drop, 
            args.negative_slope, 
            args.residual
            #  , True
            )


pred = MLPPredictor(args.num_hidden)
# pred = DotPredictor()
module = module.to(args.device)
pred = pred.to(args.device)

train_acc = Accuracy()
val_acc = Accuracy()
test_acc = Accuracy()

optimizer = torch.optim.Adam(itertools.chain(module.parameters(), pred.parameters()), lr=args.lr,  weight_decay=args.weight_decay)

batch_pred_cnt = 0

train_loader = datamodule.train_dataloader()
for epoch in range(args.epochs):
    train_auc_ = []
    for batch in train_loader:
    # batch = next(iter(train_loader))
    # for i in range(500): # overfit
        input_nodes, train_pos_g, train_neg_g, mfgs = batch
        mfgs = [mfg.int().to(args.device) for mfg in mfgs]
        batch_inputs = mfgs[0].srcdata['feat']
        batch_pred = module(mfgs, batch_inputs)
        pos_score = pred(train_pos_g, batch_pred)
        neg_score = pred(train_neg_g, batch_pred)

        loss = compute_loss(pos_score, neg_score)
        train_acc = compute_auc(pos_score.detach(), neg_score.detach())
        train_auc_.append(train_acc)
        batch_pred_cnt += batch_pred.size(0)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if not epoch % 11:
        print('loss', loss.detach().item())
        print('epoch acc', sum(train_auc_)/len(train_auc_))
    # break

def inference(module, graph):
    with torch.no_grad():
        sampler = dgl.dataloading.NeighborSampler([4, 4])

        train_dataloader = dgl.dataloading.DataLoader(
            graph, 
            torch.arange(graph.number_of_nodes(), device = args.device), 
            sampler,
            batch_size=500,
            shuffle=False,
            drop_last=False,
            device=args.device)

        result = []
        for input_nodes, output_nodes, mfgs in train_dataloader:
            inputs = mfgs[0].srcdata['feat']
            result.append(module(mfgs, inputs))

        return torch.cat(result)


def eval_auc(pos_g, neg_g, emb):
    with torch.no_grad():
        module.eval()
        pos_score = pred(pos_g.to(args.device), emb)
        neg_score = pred(neg_g.to(args.device), emb)
        print('AUC', compute_auc(pos_score, neg_score))
        module.train()

def test_evaluation():       
    with torch.no_grad():
        module.eval()
        emb = inference(module, datamodule.g)
    neg_test_edges = dgl.sampling.global_uniform_negative_sampling(datamodule.g, num_samples = datamodule.test_eids.shape[0])
    neg_test_g = dgl.graph(num_nodes = datamodule.g.num_nodes(), data = neg_test_edges)
    test_eids = datamodule.eids[datamodule.test_eids]
    pos_test_g = dgl.remove_edges(datamodule.g, test_eids)
    eval_auc(pos_test_g, neg_test_g, emb)

test_evaluation()

I noticed that the reference code and PL code are not equivalent. For instance, PL has NeighborSelfLoopSampler while the reference code doesn’t. Could that be a reason? (I didn’t check for all discrepancies)

Thank you for checking it.
Yes, PL code above has Sampler, which is just same as the NeighborSampler provided by dgl. Although, for strict comparison I agree that both the codes should be equivalent, I went with the sampler approach in relation to how other dgl tutorials with batching is presented.
Could you suggest an alternate change for strict comparision or could you notice other implementation mistake(which I believe is the case although I cannot figure) I am doing in PL? as I think both should behave nearly similar ± 5% even with samplers.
I also tried MultiLayerFullNeighborSampler(2) and removing validation edges in pl code but no improvement. The overfit of single batch on PL is working though.

ok, I tried avoiding samplers and pl, but borrowing the preprocessed edges of first reference code and then passing them through dataloaders, but still the same issue.

here is the new trial:

import numpy as np
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl.data import CoraGraphDataset
from dgl.nn import GATConv

import glob
import os
import itertools

from torchmetrics import Accuracy
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

class GAT(nn.Module):
    def __init__(self,
                #  g,
                 num_layers,
                 in_dim,
                 num_hidden,
                 num_classes,
                 heads,
                 activation,
                 feat_drop,
                 attn_drop,
                 negative_slope,
                 residual,
                 allow_self_loops = False):
        super(GAT, self).__init__()
        # self.g = g
        self.num_layers = num_layers
        self.gat_layers = nn.ModuleList()
        self.activation = activation
        if num_layers > 1:
            self.gat_layers.append(GATConv(
                in_dim, num_hidden, heads[0],
                feat_drop, attn_drop, negative_slope, False, self.activation, allow_self_loops))
            for l in range(1, num_layers-1):
                # due to multi-head, the in_dim = num_hidden * num_heads
                self.gat_layers.append(GATConv(
                    num_hidden * heads[l-1], num_hidden, heads[l],
                    feat_drop, attn_drop, negative_slope, residual, self.activation, allow_self_loops))
            self.gat_layers.append(GATConv(
                num_hidden * heads[-2], num_hidden, heads[-1],
                feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))
        else:
            self.gat_layers.append(GATConv(
                in_dim, num_hidden, heads[0],
                feat_drop, attn_drop, negative_slope, residual, None, allow_self_loops))

    def forward(self, mfgs, h):
        for l, layer in enumerate(self.gat_layers):
            h = layer(mfgs, h)
            h = h.flatten(1) if l < self.num_layers - 1 else h.mean(1)
        return h

class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).to(args.device)
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

class argparserclass():
    def __init__(self):
        self.gpu=1
        self.dataset='cora'
        self.epochs= 250
        self.num_heads=8
        self.num_out_heads=1
        self.num_hidden=64
        self.num_layers=2
        self.residual = False
        self.in_drop = .6
        self.attn_drop = .6
        self.lr = .005
        self.weight_decay = 5e-4
        self.negative_slope = 0.2
        self.early_stop=False
        self.fastmode=False
        self.num_workers=0
        self.batch_size = 256
        self.shuffle=False
        self.raw_dir="/DATA/"
        self.data_cpu = False
        self.in_dim = 1433
        self.num_classes = 7

args = argparserclass()

if args.gpu < 0:
    args.device = torch.device('cpu')
else:
    args.device = torch.device('cuda:'+str(args.gpu))

heads = ([args.num_heads] * (args.num_layers-1)) + [args.num_out_heads]

module = GAT(
            args.num_layers,
            args.in_dim, 
            args.num_hidden, 
            args.num_classes,   
            heads,
            F.elu, 
            args.in_drop, 
            args.attn_drop, 
            args.negative_slope, 
            args.residual
            #  , True
            )


pred = MLPPredictor(args.num_hidden)
# pred = DotPredictor()
module = module.to(args.device)
pred = pred.to(args.device)

optimizer = torch.optim.Adam(itertools.chain(module.parameters(), pred.parameters()), lr=args.lr,  weight_decay=args.weight_decay)

batch_pred_cnt = 0

import scipy.sparse as sp
dataset = dgl.data.CoraGraphDataset(raw_dir = args.raw_dir)
g = dataset[0]
u, v = g.edges()
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)
neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes()).to(args.device)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes()).to(args.device)
test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes()).to(args.device)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes()).to(args.device)
train_g = dgl.remove_edges(g, eids[:test_size]).to(args.device)

from torch.utils.data import DataLoader
train_loader = DataLoader([[train_g,train_pos_g,train_neg_g]],batch_size=None)
test_loader =DataLoader([[test_pos_g,test_neg_g]],batch_size=None)

for epoch in range(args.epochs):
    train_auc_ = []
    for batch in train_loader:
    # batch = next(iter(train_loader))
    # for i in range(500): # overfit
        input_nodes, train_pos_g, train_neg_g = batch
        g  = g.to(args.device)
        batch_pred = module(train_pos_g, g.ndata['feat'])
        pos_score = pred(train_pos_g, batch_pred)
        neg_score = pred(train_neg_g, batch_pred)

        loss = compute_loss(pos_score, neg_score)
        train_acc = compute_auc(pos_score.detach(), neg_score.detach())
        train_auc_.append(train_acc)
        batch_pred_cnt += batch_pred.size(0)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if not epoch % 11:
        print('loss', loss.detach().item())
        print('epoch acc', sum(train_auc_)/len(train_auc_))

def eval_auc(pos_g, neg_g, emb):
    with torch.no_grad():
        module.eval()
        pos_score = pred(pos_g.to(args.device), emb)
        neg_score = pred(neg_g.to(args.device), emb)
        print('AUC', compute_auc(pos_score, neg_score))
        module.train()

def test_evaluation(test_pos_g, test_neg_g, g):       
    with torch.no_grad():
        module.eval()
        emb = inference(module, g) # needs to be changed
    return eval_auc(test_pos_g, test_neg_g, emb)

# test_evaluation(test_pos_g, test_neg_g, g)

ok, it was the addition of relative high weight_decay, from my habit of using that in node classification, that caused the wide changes. Phew! :sweat_smile: