Information leakage in EdgeDataLoader

I found information leakage in EdgeDataLoader. How can I avoid this problem?
In detail, it seems that EdgeDataLoader does not drop the edges to be predicted during block/subgraph sampling.

code:

import torch
import dgl
dgl.seed(0)
from tqdm import tqdm


data_dict = {
    ('user', 'like', 'post'): (torch.tensor([0, 0, 2]), torch.tensor([0, 1, 2])),
    ('post', 'liked_by', 'user'): (torch.tensor([0, 1, 2]), torch.tensor([0, 0, 2])),
    
    ('user', 'send', 'post'): (torch.tensor([1, 1, 3]), torch.tensor([1, 2, 0])),
    ('post', 'sent_by', 'user'): (torch.tensor([1, 2, 0]), torch.tensor([1, 1, 3]))
}

g = dgl.heterograph(data_dict)

target_eids = g.edges(etype=('user', 'like', 'post'), form='eid')

val_eid_dict = {('user', 'like', 'post'): target_eids}

sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)

val_dataloader = dgl.dataloading.EdgeDataLoader(
    g, val_eid_dict, sampler,
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(1),
    batch_size=1,
    shuffle=False,
    drop_last=False,
    num_workers=1)

for input_nodes, positive_graph, negative_graph, blocks in tqdm(val_dataloader):
    break  # inspect the first batch only

print(input_nodes)

print(blocks)

print(blocks[0].ndata)

print(blocks[0].edata)

print(blocks[0].edges(etype='like'), blocks[0].edges(etype='liked_by'),
      blocks[0].edges(etype='send'), blocks[0].edges(etype='sent_by'))

print(blocks[1].ndata)

print(blocks[1].edata)

print(blocks[1].edges(etype='like'), blocks[1].edges(etype='liked_by'),
      blocks[1].edges(etype='send'), blocks[1].edges(etype='sent_by'))

print(positive_graph)

print(positive_graph.edata)

print(positive_graph.ndata)

output

{'post': tensor([0, 1]), 'user': tensor([0, 3, 1])}
[Block(num_src_nodes={'post': 2, 'user': 3},
      num_dst_nodes={'post': 2, 'user': 2},
      num_edges={('post', 'liked_by', 'user'): 2, ('post', 'sent_by', 'user'): 1, ('user', 'like', 'post'): 2, ('user', 'send', 'post'): 2},
      metagraph=[('post', 'user', 'liked_by'), ('post', 'user', 'sent_by'), ('user', 'post', 'like'), ('user', 'post', 'send')]), Block(num_src_nodes={'post': 2, 'user': 2},
      num_dst_nodes={'post': 1, 'user': 1},
      num_edges={('post', 'liked_by', 'user'): 2, ('post', 'sent_by', 'user'): 0, ('user', 'like', 'post'): 1, ('user', 'send', 'post'): 1},
      metagraph=[('post', 'user', 'liked_by'), ('post', 'user', 'sent_by'), ('user', 'post', 'like'), ('user', 'post', 'send')])]
defaultdict(<class 'dict'>, {'_ID': {'post': tensor([0, 1]), 'user': tensor([0, 3, 1])}})
defaultdict(<class 'dict'>, {'_ID': {('post', 'liked_by', 'user'): tensor([0, 1]), ('post', 'sent_by', 'user'): tensor([0]), ('user', 'like', 'post'): tensor([0, 1]), ('user', 'send', 'post'): tensor([0, 1])}})
(tensor([0, 0]), tensor([0, 1])) (tensor([0, 1]), tensor([0, 0])) (tensor([1, 2]), tensor([0, 1])) (tensor([0]), tensor([1]))
defaultdict(<class 'dict'>, {'_ID': {'post': tensor([0, 1]), 'user': tensor([0, 3])}})
defaultdict(<class 'dict'>, {'_ID': {('post', 'liked_by', 'user'): tensor([0, 1]), ('post', 'sent_by', 'user'): tensor([], dtype=torch.int64), ('user', 'like', 'post'): tensor([0]), ('user', 'send', 'post'): tensor([0])}})
(tensor([0]), tensor([0])) (tensor([0, 1]), tensor([0, 0])) (tensor([1]), tensor([0])) (tensor([], dtype=torch.int64), tensor([], dtype=torch.int64))
Graph(num_nodes={'post': 1, 'user': 1},
      num_edges={('post', 'liked_by', 'user'): 0, ('post', 'sent_by', 'user'): 0, ('user', 'like', 'post'): 1, ('user', 'send', 'post'): 0},
      metagraph=[('post', 'user', 'liked_by'), ('post', 'user', 'sent_by'), ('user', 'post', 'like'), ('user', 'post', 'send')])
defaultdict(<class 'dict'>, {'_ID': {('post', 'liked_by', 'user'): tensor([], dtype=torch.int64), ('post', 'sent_by', 'user'): tensor([], dtype=torch.int64), ('user', 'like', 'post'): tensor([0]), ('user', 'send', 'post'): tensor([], dtype=torch.int64)}})
defaultdict(<class 'dict'>, {'_ID': {'post': tensor([0]), 'user': tensor([0])}})

In the above case, I'd like to predict the edge (user 0, like, post 0). However, that edge still exists in the sampled blocks. In other words, any model would learn the representations of user 0 and post 0 with the target relation preserved.
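A quick way to confirm the leak, continuing from the script above, is to map the block's relabeled endpoints back to the original IDs and look for the target edge (a sketch; user 0 and post 0 are the endpoints of the seed edge):

src, dst = blocks[1].edges(etype='like')
# block endpoints are local IDs; map them back via dgl.NID
src_orig = blocks[1].srcnodes['user'].data[dgl.NID][src]
dst_orig = blocks[1].dstnodes['post'].data[dgl.NID][dst]
leaked = ((src_orig == 0) & (dst_orig == 0)).any()
print(leaked)  # tensor(True): the seed edge is inside the block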

Please check the exclude option of EdgeDataLoader in dgl.dataloading — DGL 0.8 documentation, and also the example in Edge dataloader exclude parameter not working · Issue #3408 · dmlc/dgl · GitHub.

Note that there was a bug around this in previous versions that only got fixed today. So you need to install a nightly version later than today to get the patch, or you can manually apply it to your DGL installation following Fix edge ID exclusion not working in EdgeDataLoader by BarclayII · Pull Request #3412 · dmlc/dgl · GitHub.
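With the patch, the call would look roughly like this (a sketch of the exclude option; reverse_etypes maps every edge type to its reverse so both directions of each seed edge are dropped):

val_dataloader = dgl.dataloading.EdgeDataLoader(
    g, val_eid_dict, sampler,
    exclude='reverse_types',
    reverse_etypes={'like': 'liked_by', 'liked_by': 'like',
                    'send': 'sent_by', 'sent_by': 'send'},
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(1),
    batch_size=1, shuffle=False, drop_last=False, num_workers=1)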

Thanks a lot, I will give it a try!

Hi, I tried dgl-cu102 0.8a211014 with torch 1.8.1; however, it seems that nothing changed.
code and output:

import torch
import dgl
dgl.seed(0)
from tqdm import tqdm


data_dict = {
    ('user', 'like', 'post'): (torch.tensor([0, 0, 2]), torch.tensor([0, 1, 2])),
    ('post', 'liked_by', 'user'): (torch.tensor([0, 1, 2]), torch.tensor([0, 0, 2])),
    
    ('user', 'send', 'post'): (torch.tensor([1, 1, 3]), torch.tensor([1, 2, 0])),
    ('post', 'sent_by', 'user'): (torch.tensor([1, 2, 0]), torch.tensor([1, 1, 3]))
}

g = dgl.heterograph(data_dict)

target_eids = g.edges(etype=('user', 'like', 'post'), form='eid')

val_eid_dict = {'like': target_eids}

sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)

val_dataloader = dgl.dataloading.EdgeDataLoader(
    g, val_eid_dict, sampler,
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(1),
    batch_size=1,
    shuffle=False,
    drop_last=False,
    num_workers=1,
    exclude='reverse_types',
    reverse_etypes={
        'like': 'liked_by',
        'send': 'sent_by',
        'liked_by': 'like',
        'sent_by': 'send',
    })

for input_nodes, positive_graph, negative_graph, blocks in tqdm(val_dataloader):
    break  # inspect the first batch only

print('blocks[0] ndata')
print(blocks[0].ndata)
print('like')
print(blocks[0].edges(etype='like'))
print('liked_by')
print(blocks[0].edges(etype='liked_by'))


print('positive_graph')
print('ndata')
print(positive_graph.ndata)
print('like')
print(positive_graph.edges(etype='like'))
blocks[0] ndata
defaultdict(<class 'dict'>, {'_ID': {'post': tensor([0, 1]), 'user': tensor([0, 3])}})
like
(tensor([0]), tensor([0]))
liked_by
(tensor([0, 1]), tensor([0, 0]))
positive_graph
ndata
defaultdict(<class 'dict'>, {'_ID': {'post': tensor([0]), 'user': tensor([0])}})
like
(tensor([0]), tensor([0]))

In this case, the sampled edge is (user 0, like, post 0); however, this edge still exists in the block.

Could you try [Bug] Fix edge exclusion still not working for full neighbor sampling by BarclayII · Pull Request #3424 · dmlc/dgl · GitHub?

OK, I will try. Does using this version require installing from source?

It was merged on Friday. You can use a nightly version later than that.
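For example, at the time of writing the nightly wheels could be installed with something like pip install --pre dgl-cu102 -f https://data.dgl.ai/wheels-test/repo.html (double-check the installation page for the exact package name matching your CUDA version).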

Sorry to bother you again. I want to ask what the logic in EdgeDataLoader is: it seems that the sampled subgraphs are not centered on the endpoints of the sampled edges. Can you try the toy example below?

import torch
import dgl
dgl.seed(0)
from tqdm import tqdm
import copy

data_dict = {
    ('user', 'like', 'post'): (torch.tensor([0, 0, 2]), torch.tensor([0, 1, 2])),
    ('post', 'liked_by', 'user'): (torch.tensor([0, 1, 2]), torch.tensor([0, 0, 2])),
    
    ('user', 'send', 'post'): (torch.tensor([1, 2]), torch.tensor([1, 0])),
    ('post', 'sent_by', 'user'): (torch.tensor([1, 0]), torch.tensor([1, 2]))
}

g = dgl.heterograph(data_dict)

g_sampling = copy.deepcopy(g)
g_sampling.remove_edges(torch.tensor([0]), etype='like')
g_sampling.remove_edges(torch.tensor([0]), etype='liked_by')

sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
test_dataloader = dgl.dataloading.EdgeDataLoader(
    g=g, eids={'like': torch.tensor([0])}, block_sampler=sampler,
    g_sampling=g_sampling,
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(16),
    batch_size=1,
    shuffle=False,
    drop_last=False,
    num_workers=1)

for input_nodes, positive_graph, negative_graph, blocks in tqdm(test_dataloader):
    for i, block in enumerate(blocks):
        print(f'blocks[{i}] ndata')
        print(block.ndata)
        print('like')
        print(block.edges(etype='like'))
        print('liked_by')
        print(block.edges(etype='liked_by'))
        print('send')
        print(block.edges(etype='send'))
        print('sent_by')
        print(block.edges(etype='sent_by'))
        print()

    print('positive_graph')
    print('ndata')
    print(positive_graph.ndata)
    print('like')
    print(positive_graph.edges(etype='like'))
    break

output

blocks[0] ndata
defaultdict(<class 'dict'>, {'_ID': {'post': tensor([0, 1, 2]), 'user': tensor([0, 2, 1])}})
like
(tensor([0, 1]), tensor([1, 2]))
liked_by
(tensor([1]), tensor([0]))
send
(tensor([1, 2]), tensor([0, 1]))
sent_by
(tensor([], dtype=torch.int64), tensor([], dtype=torch.int64))

positive_graph
ndata
defaultdict(<class 'dict'>, {'_ID': {'post': tensor([0, 1, 2]), 'user': tensor([0])}})
like
(tensor([0]), tensor([0]))

The sampled edge is (user 0, like, post 0); however, the sampled subgraph does not seem to follow any logic.

The last block returned by EdgeDataLoader contains the edges connecting to the seed nodes chosen by your neighbor sampler. In your case, the seed nodes contain not only the positive example you sampled (i.e. user 0 and post 0), but also the negative examples (very often user 0 and posts 0, 1, 2, since you draw 16 negative examples per positive edge). This is why edges of the like relation are also sampled.
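You can verify this by printing the negative pairs for the batch (a sketch, continuing from the toy example; the negative graph is compacted together with positive_graph, so it carries the same dgl.NID mapping):

neg_src, neg_dst = negative_graph.edges(etype='like')
# local IDs; translate back to the original graph's IDs
print(negative_graph.ndata[dgl.NID]['user'][neg_src],
      negative_graph.ndata[dgl.NID]['post'][neg_dst])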

Also, the node IDs in the blocks are relabeled. So when block.edges(etype='like') returns (tensor([0, 1]), tensor([1, 2])), you will need to index into blocks[0].ndata[dgl.NID] to get the actual node IDs (user 0 → post 1, user 2 → post 2 in your case).
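Concretely, continuing the toy example, the translation back to original IDs looks like this (a sketch):

src, dst = blocks[0].edges(etype='like')
# src indexes the block's 'user' source nodes, dst its 'post' destination nodes
src_orig = blocks[0].srcnodes['user'].data[dgl.NID][src]
dst_orig = blocks[0].dstnodes['post'].data[dgl.NID][dst]
print(src_orig, dst_orig)  # tensor([0, 2]) and tensor([1, 2]): user 0 → post 1, user 2 → post 2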
