Hi, I am quite new to the DGL graph training and I am doing a link prediction project, predicting whether there should be a connection between two types of nodes. I construct a undirected graph. The node types and number of nodes are given below:
- Node type 1: 48103
- Node type 2: 805
Moreover, the edge types and number of edges are:
-
- Node type1-Node type1 connection: 41981
-
- Node type1-Node type2 connection: 5350
My task is to predict whether there should be a link between Node type1 and Node type2. The size of the training data and testing data is:
- Training edges: 8560
- Testing edges: 2140
In both the training edges and testing edges, I have ensured that the same number of positive(existing) edges and negative(non-existing, random sampled from the created graph) edges.
Firstly, I use Node2Vec to generate pre-trained node embedding of each node in my created graph. The dimension of node embedding is (128, 1). Then I use the following codes to create link prediction model:
import dgl.function as fn
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
gcn_msg = fn.copy_src(src='h', out='m')
gcn_reduce = fn.sum(msg='m', out='h')
# Check the device we are using
device = 'cuda' if th.cuda.is_available() else 'cpu'
class NodeApplyModule(nn.Module):
def __init__(self, in_feats, out_feats, activation):
super(NodeApplyModule, self).__init__()
self.linear = nn.Linear(in_feats, out_feats, bias=True) # in_feats means size of input; out_feats means size of output
self.activation = activation
def forward(self, node):
h = self.linear(node.data['h']) # create a linear class object h
if self.activation is not None:
h = self.activation(h)
return {'h' : h}
class GCN(nn.Module):
def __init__(self, in_feats, out_feats, activation):
super(GCN, self).__init__()
self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)
def forward(self, g, feature):
g.ndata['h'] = feature
g.update_all(gcn_msg, gcn_reduce)
g.apply_nodes(func=self.apply_mod)
return g.ndata.pop('h')
class TwoLayerNet(nn.Module):
def __init__(self, D_in, H_1, D_out, dropout_p):
"""
Create two-layer neural net
"""
super(TwoLayerNet, self).__init__()
self.linear1 = nn.Linear(D_in, H_1, bias=True)
self.linear2 = nn.Linear(H_1, D_out, bias=True)
self.relu = nn.ReLU()
self.drop_layer = nn.Dropout(p=dropout_p)
def forward(self, x):
"""
In the forward function we accept a Tensor of input data and we must return
a Tensor of output data. We can use Modules defined in the constructor as
well as arbitrary operators on Tensors.
"""
h1_linear1 = self.linear1(x)
h1_after_dropout = self.drop_layer(h1_linear1)
h1_relu = self.relu(h1_after_dropout)
h2_linear2 = self.linear2(h1_relu)
h2_relu = self.relu(h2_linear2)
y_pred = F.sigmoid(h2_relu)
return y_pred
class Net(nn.Module):
"""
Build a neural net: two GCN as encoder and a two-layer MLP as decoder
"""
def __init__(self):
super(Net, self).__init__()
self.gcn1 = GCN(128, 100, F.relu)
self.gcn2 = GCN(100, 20, None)
self.twolayer = TwoLayerNet(D_in=40, H_1=100, D_out=2, dropout_p=0.5)
def forward(self, g, feature, edge_list):
x1 = self.gcn1(g, feature)
x2 = self.gcn2(g, x1)
train_data = Net.build_data_for_train_valid(x2, edge_list = edge_list)
# g.ndata['h'] = x1 # Update the graph's node embeddings
# train_data = Net.get_data_from_graph_train_valid(graph=g, edge_list=edge_list)
output = self.twolayer(train_data)
return output
@staticmethod
def build_data_for_train_valid(processed_features, edge_list):
"""
Based on the edge list, generate corresponding edge representation
:param processed_features: the features of nodes
:param edge_list: edge list. for one edge, edge[0] represents a source node and edge[1] is the target node
the edge is undirected
:return: the representation of edge
"""
data = th.zeros((len(edge_list), 40), device=device)
for index, edge in enumerate(edge_list):
node_feature_1 = processed_features[edge[0]]
node_feature_2 = processed_features[edge[1]]
edge_emb = th.cat([node_feature_1, node_feature_2])
data[index] = edge_emb
return data
And I use the following codes for training and testing:
def evaluate(model, g, features, test_loader):
model.eval()
criterion = th.nn.BCELoss()
with th.no_grad():
for test_tensor_edges, test_tensor_labels in test_loader:
test_tensor_edges = test_tensor_edges.to(device)
test_tensor_labels = test_tensor_labels.float().to(device)
logits = model(g=g, feature=features, edge_list=test_tensor_edges)
p = logits[:, 1]
# test_tensor_labels_reshaped = test_tensor_labels.view((test_tensor_labels.shape[0], 1))
test_loss = criterion(p, test_tensor_labels)
predictions_array = p.cpu().numpy()
test_array = test_tensor_labels.cpu().numpy()
roc_auc_score_value = roc_auc_score(y_true=test_array, y_score=predictions_array)
return test_loss, roc_auc_score_value
def train_evaluate_gcn(graph_net, graph, embed, train_loader, test_loader, epoch_num = 10):
graph_net.train()
optimizer = th.optim.Adam(itertools.chain(graph_net.parameters(), embed.parameters()), lr=1e-3)
criterion = th.nn.BCELoss()
dur = []
train_loss_list = []
test_loss_list = []
for epoch in range(epoch_num):
print('-----------------------------')
print('In Epoch: {}'.format(epoch))
t0 = time.time()
train_loss = 0
one_epoch_loss_list = []
for index, (x_batch, y_batch) in enumerate(train_loader):
print('Conducting the {} batch...'.format(index))
x_batch = x_batch.to(device)
y_batch = y_batch.float().to(device)
graph_net.train()
logits = graph_net(g=graph, feature=embed.weight, edge_list=x_batch)
p = logits[:, 1]
# y_batch_reshaped = y_batch.view((y_batch.shape[0], 1))
train_loss = criterion(p, y_batch)
train_loss.backward()
optimizer.step()
optimizer.zero_grad()
dur.append(time.time() - t0)
one_epoch_loss_list.append(train_loss)
one_epoch_loss_values = [loss.cpu().detach().numpy() for loss in one_epoch_loss_list]
train_loss_list.append(np.sum(one_epoch_loss_values))
test_loss, roc_value = evaluate(model=gcn_net, g = graph, features=embed.weight, test_loader=test_loader)
test_loss_list.append(test_loss)
print("Epoch {:05d} | Train Loss {:.4f} | Test Loss {:.4f} | Test ROC value {:.4f} | Time(s) {:.4f}".format(
epoch, np.sum(one_epoch_loss_values), test_loss, roc_value, np.mean(dur)))
print('-----------------------------')
return train_loss_list, test_loss_list
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=test_data.tensors[0].shape[0], shuffle=True)
node_embedding = th.nn.Embedding(len(list(graph_for_training.nodes())), 128)
node_embedding = node_embedding.to(device)
# th_emb_features saves the node2vec embedding of each node
# For instance, the first row of th_emb_features saves the Node2Vec embedding of node '0'
node_embedding.weight.data.copy_(th_emb_features)
# Train and evaluate the model
train_loss_list, test_loss_list = train_evaluate_gcn(graph_net = gcn_net, graph=dgl_graph, embed=node_embedding,
train_loader=train_loader, test_loader=test_loader, epoch_num=10)
However, the results do not look very well. The training result always shows:
Epoch XXXX | Train Loss 0.6931 | Test Loss 0.6931 | Test ROC value 0.5000 | Time(s) 191.6882
Both the training loss and testing loss do not change. I wonder whether there are mistakes in my model, training and validation.
Thank you very much for your help!