Use GCN to conduct link predictions

Hi, I am quite new to the DGL graph training and I am doing a link prediction project, predicting whether there should be a connection between two types of nodes. I construct a undirected graph. The node types and number of nodes are given below:

  • Node type 1: 48103
  • Node type 2: 805

Moreover, the edge types and number of edges are:

    1. Node type1-Node type1 connection: 41981
    1. Node type1-Node type2 connection: 5350

My task is to predict whether there should be a link between Node type1 and Node type2. The size of the training data and testing data is:

  • Training edges: 8560
  • Testing edges: 2140

In both the training edges and testing edges, I have ensured that the same number of positive(existing) edges and negative(non-existing, random sampled from the created graph) edges.

Firstly, I use Node2Vec to generate pre-trained node embedding of each node in my created graph. The dimension of node embedding is (128, 1). Then I use the following codes to create link prediction model:

import dgl.function as fn
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

gcn_msg = fn.copy_src(src='h', out='m')
gcn_reduce = fn.sum(msg='m', out='h')
# Check the device we are using
device = 'cuda' if th.cuda.is_available() else 'cpu'


class NodeApplyModule(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats, bias=True) # in_feats means size of input; out_feats means size of output
        self.activation = activation

    def forward(self, node):
        h = self.linear(node.data['h']) # create a linear class object h
        if self.activation is not None:
            h = self.activation(h)
        return {'h' : h}


class GCN(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(GCN, self).__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        g.ndata['h'] = feature
        g.update_all(gcn_msg, gcn_reduce)
        g.apply_nodes(func=self.apply_mod)
        return g.ndata.pop('h')


class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H_1, D_out, dropout_p):
        """
        Create two-layer neural net
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H_1, bias=True)
        self.linear2 = nn.Linear(H_1, D_out, bias=True)
        self.relu = nn.ReLU()
        self.drop_layer = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h1_linear1 = self.linear1(x)
        h1_after_dropout = self.drop_layer(h1_linear1)
        h1_relu = self.relu(h1_after_dropout)
        h2_linear2 = self.linear2(h1_relu)
        h2_relu = self.relu(h2_linear2)
        y_pred = F.sigmoid(h2_relu)
        return y_pred


class Net(nn.Module):
    """
    Build a neural net: two GCN as encoder and a two-layer MLP as decoder
    """
    def __init__(self):
        super(Net, self).__init__()
        self.gcn1 = GCN(128, 100, F.relu)
        self.gcn2 = GCN(100, 20, None)
        self.twolayer = TwoLayerNet(D_in=40, H_1=100, D_out=2, dropout_p=0.5)

    def forward(self, g, feature, edge_list):
        x1 = self.gcn1(g, feature)
        x2 = self.gcn2(g, x1)
        train_data = Net.build_data_for_train_valid(x2, edge_list = edge_list)
        # g.ndata['h'] = x1  # Update the graph's node embeddings
        # train_data = Net.get_data_from_graph_train_valid(graph=g, edge_list=edge_list)
        output = self.twolayer(train_data)
        return output

    @staticmethod
    def build_data_for_train_valid(processed_features, edge_list):
        """
        Based on the edge list, generate corresponding edge representation
        :param processed_features: the features of nodes
        :param edge_list: edge list. for one edge, edge[0] represents a source node and edge[1] is the target node
        the edge is undirected
        :return: the representation of edge
        """
        data = th.zeros((len(edge_list), 40), device=device)
        for index, edge in enumerate(edge_list):
            node_feature_1 = processed_features[edge[0]]
            node_feature_2 = processed_features[edge[1]]
            edge_emb = th.cat([node_feature_1, node_feature_2])
            data[index] = edge_emb
        return data

And I use the following codes for training and testing:

def evaluate(model, g, features, test_loader):

    model.eval()
    criterion = th.nn.BCELoss()
    with th.no_grad():
        for test_tensor_edges, test_tensor_labels in test_loader:
            test_tensor_edges = test_tensor_edges.to(device)
            test_tensor_labels = test_tensor_labels.float().to(device)
            logits = model(g=g, feature=features, edge_list=test_tensor_edges)
            p = logits[:, 1]
            # test_tensor_labels_reshaped = test_tensor_labels.view((test_tensor_labels.shape[0], 1))
            test_loss = criterion(p, test_tensor_labels)
            predictions_array = p.cpu().numpy()
            test_array = test_tensor_labels.cpu().numpy()
            roc_auc_score_value = roc_auc_score(y_true=test_array, y_score=predictions_array)
        return test_loss, roc_auc_score_value


def train_evaluate_gcn(graph_net, graph, embed, train_loader, test_loader, epoch_num = 10):

    graph_net.train()
    optimizer = th.optim.Adam(itertools.chain(graph_net.parameters(), embed.parameters()), lr=1e-3)
    criterion = th.nn.BCELoss()

    dur = []
    train_loss_list = []
    test_loss_list = []

    for epoch in range(epoch_num):

        print('-----------------------------')
        print('In Epoch: {}'.format(epoch))

        t0 = time.time()

        train_loss = 0
        one_epoch_loss_list = []

        for index, (x_batch, y_batch) in enumerate(train_loader):

            print('Conducting the {} batch...'.format(index))

            x_batch = x_batch.to(device)
            y_batch = y_batch.float().to(device)

            graph_net.train()
            logits = graph_net(g=graph, feature=embed.weight, edge_list=x_batch)
            p = logits[:, 1]
            # y_batch_reshaped = y_batch.view((y_batch.shape[0], 1))
            train_loss = criterion(p, y_batch)
            train_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        dur.append(time.time() - t0)
        one_epoch_loss_list.append(train_loss)
        one_epoch_loss_values = [loss.cpu().detach().numpy() for loss in one_epoch_loss_list]
        train_loss_list.append(np.sum(one_epoch_loss_values))

        test_loss, roc_value = evaluate(model=gcn_net, g = graph, features=embed.weight, test_loader=test_loader)
        test_loss_list.append(test_loss)

        print("Epoch {:05d} | Train Loss {:.4f} | Test Loss {:.4f} | Test ROC value {:.4f} | Time(s) {:.4f}".format(
            epoch, np.sum(one_epoch_loss_values), test_loss, roc_value, np.mean(dur)))
        print('-----------------------------')

    return train_loss_list, test_loss_list

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=test_data.tensors[0].shape[0], shuffle=True)
node_embedding = th.nn.Embedding(len(list(graph_for_training.nodes())), 128)
node_embedding = node_embedding.to(device)
# th_emb_features saves the node2vec embedding of each node
# For instance, the first row of th_emb_features saves the Node2Vec embedding of node '0'
node_embedding.weight.data.copy_(th_emb_features)

# Train and evaluate the model
train_loss_list, test_loss_list = train_evaluate_gcn(graph_net = gcn_net, graph=dgl_graph, embed=node_embedding,
                       train_loader=train_loader, test_loader=test_loader, epoch_num=10)

However, the results do not look very well. The training result always shows:

Epoch XXXX | Train Loss 0.6931 | Test Loss 0.6931 | Test ROC value 0.5000 | Time(s) 191.6882

Both the training loss and testing loss do not change. I wonder whether there are mistakes in my model, training and validation.

Thank you very much for your help! :laughing:

Can you show us how you generate x_batch and y_batch?

Thank you for your reply! The x_batch and y_batch are generated by using the DataLoader.

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=test_data.tensors[0].shape[0], shuffle=True)

The train_data saves the edges I use to train the link prediction model. Sample train data is given below:

tensor([[ 2360, 18222],
        [13068, 20297],
        [ 1693, 21247], dtype=torch.int32)

The numbers here represent the nodes in the graph.

def forward(self, g, feature): 
    g.ndata['h'] = feature
    g.update_all(gcn_msg, gcn_reduce) 
    g.apply_nodes(func=self.apply_mod) 
    return g.ndata.pop('h')

In this code, the apply_nodes will generate {‘h’ :h} which should be returned here, other than g.ndata.pop(‘h’).