Hello,

I have successfully trained a model using link prediction on a Heterograph. However, when I compute the embeddings and compare the cosine similarity between them, I find something odd which is that nodes that have nodes in the graph that are connected have lower cosine similarity than nodes that are not connected by edges! This is exactly the opposite of what is supposed to happen, so I’m wondering if I made a mistake.

My graph is defined like so:

```
data_dict = {('source', 'has_follower', 'user'): (torch.tensor([0, 0]), torch.tensor([0, 0])), ('user', 'follows', 'source'): (torch.tensor([0, 0]), torch.tensor([0, 0])) }
dgl_graph = dgl.heterograph(data_dict)
```

Here is how I train the model:

```
class TestRGCN(nn.Module):
def __init__(self, in_feats, hid_feats, out_feats, canonical_etypes):
super(TestRGCN, self).__init__()
self.conv1 = dglnn.HeteroGraphConv({
etype : dglnn.GraphConv(in_feats[utype], hid_feats, norm='right')
for utype, etype, vtype in canonical_etypes
})
self.conv2 = dglnn.HeteroGraphConv({
etype : dglnn.GraphConv(hid_feats, out_feats, norm='right')
for _, etype, _ in canonical_etypes
})
def forward(self, blocks, inputs):
x = self.conv1(blocks[0], inputs)
x = self.conv2(blocks[1], x)
return x
def inference(self, curr_g, x, batch_size):
nodes = torch.arange(curr_g.number_of_nodes())
curr_g = curr_g.to('cpu')
for l, layer in enumerate([self.conv1, self.conv2]):
y = {k: torch.zeros(curr_g.number_of_nodes(k), self.hid_feats if l != self.n_layers - 1 else self.out_feats) for k in curr_g.ntypes}
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
curr_g, {k: torch.arange(curr_g.number_of_nodes(k)) for k in curr_g.ntypes}, sampler, batch_size=batch_size, shuffle=True, drop_last=False, num_workers=self.num_workers)
for input_nodes, output_nodes, blocks in tqdm(dataloader):
block = blocks[0].to(torch.device('cuda'))
h = {k: x[k][input_nodes[k].type(torch.LongTensor)].to(torch.device('cuda')) for k in input_nodes.keys()}
h = layer(block, h)
for k in h.keys():
y[k][output_nodes[k].type(torch.LongTensor)] = h[k].cpu()
x = y
return y
class HeteroScorePredictor(nn.Module):
def forward(self, edge_subgraph, x):
with edge_subgraph.local_scope():
edge_subgraph.ndata['h'] = x
for etype in edge_subgraph.canonical_etypes:
edge_subgraph.apply_edges(dgl.function.u_dot_v('h', 'h', 'score'), etype=etype)
# edge_subgraph.apply_edges(self.apply_edges, etype=etype)
return edge_subgraph.edata['score']
class TestModel(nn.Module):
# here we have a model that first computes the representation and then predicts the scores for the edges
def __init__(self, in_features, hidden_features, out_features, canonical_etypes):
super().__init__()
self.sage = TestRGCN(in_features, hidden_features, out_features, canonical_etypes)
self.pred = HeteroScorePredictor()
def forward(self, g, neg_g, blocks, x):
x = self.sage(blocks, x)
pos_score = self.pred(g, x)
neg_score = self.pred(neg_g, x)
return pos_score, neg_score
def compute_loss(pos_score, neg_score, canonical_etypes):
# Margin loss
all_losses = []
for given_type in canonical_etypes:
n_edges = pos_score[given_type].shape[0]
if n_edges == 0:
continue
all_losses.append((1 - neg_score[given_type].view(n_edges, -1) + pos_score[given_type].unsqueeze(1)).clamp(min=0).mean())
return torch.stack(all_losses, dim=0).mean()
model = TestModel(in_features={'source':700, 'user':800}, hidden_features=512, out_features=256, canonical_etypes=g.canonical_etypes)
train_eid_dict = {('source', 'has_follower', 'user'): torch.arange(g.num_edges('has_follower')), ('user', 'follows', 'source'): torch.arange(g.num_edges('has_follower'))}
dataloader = dgl.dataloading.EdgeDataLoader(g, train_eid_dict, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(5), batch_size=args.batch_size, shuffle=True, drop_last=False, pin_memory=True, num_workers=args.num_workers)
...
for epoch in range(args.n_epochs):
for input_nodes, positive_graph, negative_graph, blocks in dataloader:
model.train()
blocks = [b.to(torch.device('cuda')) for b in blocks]
positive_graph = positive_graph.to(torch.device('cuda'))
negative_graph = negative_graph.to(torch.device('cuda'))
node_features = {'source': blocks[0].srcdata['source_embedding']['source'], 'user': blocks[0].srcdata['user_embedding']['user']}
pos_score, neg_score = model(positive_graph, negative_graph, blocks, node_features)
loss = compute_loss(pos_score, neg_score, g.canonical_etypes)
optimizer.zero_grad()
loss.backward()
optimizer.step()
def compute_loss(pos_score, neg_score, canonical_etypes):
# Margin loss
all_losses = []
for given_type in canonical_etypes:
if given_type not in pos_score:
continue
n_edges = pos_score[given_type].shape[0]
if n_edges == 0:
continue
all_losses.append((1 - neg_score[given_type].view(n_edges, -1) + pos_score[given_type].unsqueeze(1)).clamp(min=0).mean())
return torch.stack(all_losses, dim=0).mean()
```

And then, I compute the inference, embeddings, and cosine similarity between them like so:

```
source_feats = g.nodes['source'].data['source_embedding'].to(torch.device('cuda'))
user_feats = g.nodes['user'].data['user_embedding'].to(torch.device('cuda'))
node_features_for_inference = {'source': source_feats, 'user': user_feats}
model.eval()
with torch.no_grad():
# single gpu
# if isinstance(model, FakeNewsModel):
pred = model.inference(g, node_features_for_inference, args.batch_size)
embedding_1 = pred['source'][given_source_id-1]
embedding_2 = pred['user'][given_user_id-1]
embedding_3 = pred['user'][given_user_id_2-1]
print(F.cosine_similarity(embedding_1, embedding_2, dim=0).item())
# the nodes of embedding1 and embedding_3 are not connected but this number below is higher!
print(F.cosine_similarity(embedding_1, embedding_3, dim=0).item())
```

For some reason, the cosine similarity between nodes that are connected ends up being negative (like -0.4) and the one between nodes that aren’t connected ends up being positive (like 0.15)