Hi there,
Using dgl version 2.4.0+cu121
I noticed that several nodes in the Cora dataset have identical feature vectors. This seems very strange, so I’m wondering whether I’m missing something obvious or whether there’s a bug. See below for code that reproduces it.
#For calculating pairwise cosine sim on features
def cosine_sim(emb):
    """Return the pairwise cosine similarity between the rows of ``emb``.

    Args:
        emb: 2-D floating-point tensor; each row is treated as one vector.

    Returns:
        Square tensor whose entry ``[i, j]`` is the cosine similarity of
        rows ``i`` and ``j``. Entries involving an all-zero row are 0.0.
    """
    # keepdim=True keeps the row norms as a column so they broadcast per row.
    emb_norm = emb / emb.norm(dim=1, keepdim=True)
    res = torch.mm(emb_norm, emb_norm.transpose(0, 1))
    # An all-zero row divides 0 by 0 above and fills its similarities with
    # NaN; report those entries as 0.0 instead.
    res = torch.nan_to_num(res, nan=0.0)
    return res
# Load Cora and take the node features of the first graph in the dataset.
dataset = dgl.data.CoraGraphDataset()
A = dataset[0]
features = A.ndata['feat']
pairwise_sim = cosine_sim(features)
# fill_diagonal_ works in place: mask each node's self-similarity so that
# only pairs of different nodes can match below.
pairwise_sim.fill_diagonal_(-1.0)
# Looking for 1.0's (the original snippet used the undefined name `sim` here).
# NOTE(review): exact float equality may miss pairs whose similarity rounds
# to just below 1.0 - consider torch.isclose if more duplicates are expected.
print(torch.where(pairwise_sim == 1.0))
>>>(tensor([ 137, 709, 1031, 1033, 1586, 1897, 2144, 2205]), tensor([2144, 1897, 2205, 1586, 1033, 709, 137, 1031]))
# Spot-check one flagged pair: compare the feature rows of nodes 137 and 2144
# element-wise and confirm that every entry matches.
(features[137] == features[2144]).all()
>>>tensor(True)