Hi!
I am using the PinSAGE sampler to build the frontier for each convolution layer. However, I seem to have run into a reproducibility issue: even though I set dgl.seed(47), two frontiers generated from the same seed nodes yield different subgraphs. Please let me know if I am somehow mistaken. Thank you in advance!
import torch
import dgl
#set seed
dgl.seed(47)
#generate synthetic graph
num_playlists = 100
num_tracks = 10_000
num_edges = 12_000
src = torch.randint(0, num_playlists, (num_edges,))
dst = torch.randint(0, num_tracks, (num_edges,))
data_dict = {
    ('playlist', 'contains', 'track'): (src, dst),
    ('track', 'contained_by', 'playlist'): (dst, src),
}
g = dgl.heterograph(data_dict)
g
>>> Graph(num_nodes={'playlist': 100, 'track': 10000},
      num_edges={('playlist', 'contains', 'track'): 12000, ('track', 'contained_by', 'playlist'): 12000},
      metagraph=[('playlist', 'track', 'contains'), ('track', 'playlist', 'contained_by')])
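(Side note: the toy graph above is built with torch.randint, which, as far as I understand, dgl.seed does not cover, so reproducing the graph itself across runs would also need something like the line below. That is separate from the issue here, though, since the mismatch below happens between two consecutive calls within a single run.)
torch.manual_seed(47)  #seed torch's RNG for the synthetic graph; my understanding is that dgl.seed only seeds DGL's own RNG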
#set hyperparams
n_params = {'RANDOM_WALK_LENGTH': 2,
            'RANDOM_WALK_RESTART_PROB': 0.5,
            'NUM_RANDOM_WALKS': 10,
            'NUM_NEIGHBORS': 2}
#sample a batch of 32 seed track nodes
seeds = torch.randint(0, num_tracks, (32,))
#helper function for transferring NIDs to subgraph
def compact_and_copy(frontier, seeds):
    block = dgl.to_block(frontier, seeds)
    #copy edge features (e.g. the sampler's edge weights) from the frontier onto the block
    for col, data in frontier.edata.items():
        if col == dgl.EID:
            continue
        block.edata[col] = data[block.edata[dgl.EID]]
    return block
#initialize sampler
sampler = dgl.sampling.PinSAGESampler(g, 'track', 'playlist',
                                      n_params["RANDOM_WALK_LENGTH"],
                                      n_params["RANDOM_WALK_RESTART_PROB"],
                                      n_params["NUM_RANDOM_WALKS"],
                                      n_params["NUM_NEIGHBORS"])
#generate frontiers
frontier1 = sampler(seeds)
block1 = compact_and_copy(frontier1, seeds)
block1.ndata[dgl.NID]['track']
>>> tensor([ 97, 2227, 1656, 4462, 7843, 7091, 3473, 6914, 2797, 4342, 9333, 6093,
5736, 7377, 1065, 2359, 7149, 3757, 7067, 6887, 5156, 3322, 30, 607,
5926, 7596, 117, 6910, 3329, 4929, 6566, 6609, 9554, 9222, 8764, 8195,
9532, 8726, 5486, 4103, 9586, 8113, 7547, 9555, 9432, 8848, 9264, 8633,
9862, 9593, 9757, 9262, 6972, 2441, 9673, 9356, 8859, 8181, 9292, 8135,
9321, 2960, 9217, 7991, 9392, 8555, 1976, 9544, 9944, 9737])
frontier2 = sampler(seeds)
block2 = compact_and_copy(frontier2, seeds)
block2.ndata[dgl.NID]['track']
>>> tensor([ 97, 2227, 1656, 4462, 7843, 7091, 3473, 6914, 2797, 4342, 9333, 6093,
5736, 7377, 1065, 2359, 7149, 3757, 7067, 6887, 5156, 3322, 30, 607,
5926, 7596, 117, 6910, 3329, 4929, 6566, 6609, 5340, 9554, 9297, 9136,
9866, 9532, 8070, 7530, 5780, 8126, 9694, 9496, 9581, 9189, 9841, 9691,
9573, 9406, 9262, 1253, 2985, 9321, 9870, 8664, 9291, 9286, 9639, 9578,
8721, 8474, 9186, 8448, 9964, 9201, 9894, 9348, 8277, 6649])
block1.ndata[dgl.NID]['track'] == block2.ndata[dgl.NID]['track']
>>> tensor([ True, True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True, True,
True, True, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False, False])
(block1.ndata[dgl.NID]['track'] == block2.ndata[dgl.NID]['track']).all()
>>> tensor(False)
As you can see, even though the seed nodes did not change, the two frontiers are not equal: the first 32 entries match only because (as far as I can tell) they are the seed nodes themselves, and every sampled neighbor beyond them differs. This breaks reproducibility and leads to instability in my final performance results. Is there a way to seed (or otherwise control) the RNG used by the sampler?
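For reference, this is the check I was hoping would pass, continuing from the snippet above. It is only a sketch of my expectation, and it assumes that dgl.seed() also controls the RNG behind the random walks inside PinSAGESampler, so that re-seeding right before each call puts the sampler back into the same state:
#re-seed right before each call (assumption: dgl.seed() resets the sampler's random-walk RNG)
dgl.seed(47)
frontier_a = sampler(seeds)
block_a = compact_and_copy(frontier_a, seeds)
dgl.seed(47)
frontier_b = sampler(seeds)
block_b = compact_and_copy(frontier_b, seeds)
#if the sampler were fully controlled by dgl.seed(), this would print True
print(torch.equal(block_a.ndata[dgl.NID]['track'], block_b.ndata[dgl.NID]['track']))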
Thank you!