Model not learning individual node embeddings

Hi,

I implemented a modification of PinSAGE for my Master’s thesis, but the model is not converging to any useful result. I would like to know whether I’m making a mistake in my use of DGL or whether I need to check other aspects of my formulation.

I suspect that, for some reason, my implementation is using the initial feature vectors of the nodes in the convolutions rather than the learned embeddings.

I used the following code:

Scorer

class BAMGScorer(nn.Module):
    def __init__(self, num_users, num_items, user_hidden_dim, item_hidden_dim, final_hidden_dim, num_layers=1):
        super().__init__() 
        self.layers = nn.ModuleList([
            BAMGLayer(user_hidden_dim, item_hidden_dim) for _ in range(num_layers)])

        self.num_users = num_users
        self.num_items = num_items

        # Node-specific learnable embeddings
        self.user_embeddings = nn.Embedding(self.num_users, user_hidden_dim)
        self.item_embeddings = nn.Embedding(self.num_items, item_hidden_dim)
    
    def get_representation(self):
        return self.user_embeddings, self.item_embeddings

    def forward(self, blocks):
        user_embeddings = self.user_embeddings(blocks[0].srcnodes['user'].data[dgl.NID])
        item_embeddings = self.item_embeddings(blocks[0].srcnodes['item'].data[dgl.NID])

        for block, layer in zip(blocks, self.layers):
            user_embeddings, item_embeddings = layer(block, user_embeddings, item_embeddings)

        return user_embeddings, item_embeddings
        
    def compute_score(self, pair_graph, user_embeddings, item_embeddings):
        with pair_graph.local_scope():
            pair_graph.nodes['user'].data['h'] = user_embeddings
            pair_graph.nodes['item'].data['h'] = item_embeddings
            ngh_scorer = NeighborhoodScore(pair_graph)
            scores = ngh_scorer.get_similarity_scores()
            return scores

Layer

class BAMGLayer(nn.Module):
    def __init__(self, user_hidden_dim, item_hidden_dim):
        super().__init__()
        
        self.heteroconv = dglnn.HeteroGraphConv(
            {'watched': BAMGConv(item_hidden_dim, user_hidden_dim), 'watched-by': BAMGConv(user_hidden_dim, item_hidden_dim)})
        
    def forward(self, block, input_user_features, input_item_features):
        with block.local_scope():
            h_user = input_user_features
            h_item = input_item_features
            
            src_features = {'user': h_user, 'item': h_item}
            dst_features = {'user': h_user[:block.number_of_dst_nodes('user')], 'item': h_item[:block.number_of_dst_nodes('item')]}
            result = self.heteroconv(block, (src_features, dst_features))
            return result['user'], result['item']

Convolution

class BAMGConv(nn.Module):
    def __init__(self, src_dim, dest_dim):
        super().__init__()

        self.linear_src = nn.Linear(in_features=src_dim, out_features=dest_dim, bias=True)
        self.linear_dst = nn.Linear(in_features=dest_dim, out_features=dest_dim, bias=True)
        
    def compute_message(self, edges):
        affinity = edges.data['weight'] / torch.sum(edges.data['weight'])
        return affinity

    def forward(self, graph, node_features):
        with graph.local_scope():
            src_features, dst_features = node_features
            graph.srcdata['h'] = src_features
            graph.dstdata['h'] = dst_features
      
            graph.apply_edges(lambda edges: {'a': self.compute_message(edges)})

            graph.update_all(message_func=fn.u_mul_e('h', 'a', 'h_ngh'),
                             reduce_func=fn.sum('h_ngh', 'neighbors_avg'))

            result = F.relu(self.linear_src(graph.dstdata['h']) + self.linear_dst(graph.dstdata['neighbors_avg']))
            return result

Loss

def compute_margin_loss(scores, margin=0.1):
    loss = 0
    for score in scores:
        loss += (-score['positive_score'] + score['negative_score'] + margin).clamp(min=0)
    return loss

Training Loop

NUM_LAYERS = 1
user_hidden_dim = 7
item_hidden_dim = 10
final_hidden_dim = 70
model = BAMGScorer(graph.number_of_nodes('user'), graph.number_of_nodes('item'), user_hidden_dim, item_hidden_dim, final_hidden_dim, NUM_LAYERS)
opt = torch.optim.Adam(model.parameters())
NUM_EPOCHS = 15

for _ in range(NUM_EPOCHS):
    model.train()
    with tqdm.tqdm(dataloader) as t:
        # for pos_pair_graph, neg_pair_graph, blocks in t: # sampler return
        for input_nodes, pair_graph, blocks in t:
            user_emb, item_emb = model(blocks)
            score = model.compute_score(pair_graph, user_emb, item_emb)
            loss = compute_margin_loss(score)
            opt.zero_grad()
            loss.backward()
            opt.step()
            t.set_postfix({'loss': '%.4f' % loss.item()}, refresh=False)
    model.eval()

Is this implementation making use of the embeddings learned with nn.Embedding during message passing and aggregation?

A quick way to verify whether your user and item embeddings participate in the computation at all is to check their gradients. Could you check whether their gradients are zero at all times?

Also, how did you define the dataloader? As far as I understand, PinSAGE does not involve user embeddings; the neighbors of items are also items, and the loss function also compares the relevance between two items. If you wish to learn user embeddings as well with PinSAGE, you probably need one dataloader for users and another for items (and probably two training loops as well).

@BarclayII How can I check the gradients of these variables? Sorry if this is a basic question, but I’m not a heavy user of PyTorch.

Regarding the dataloader, this is my code:

sampler = dgl.dataloading.MultiLayerNeighborSampler([100])
dataloader = dgl.dataloading.EdgeDataLoader(
    graph,
    {'watched': torch.arange(graph.num_edges('watched')), 'watched-by': torch.arange(graph.num_edges('watched-by'))},
    sampler,
    exclude=None,
    batch_size=1024,
    drop_last=False,
    num_workers=4
)

My model is a variation in which I’m trying to perform the training and learn the embeddings in a single loop (without using network projections). Nodes of type A receive messages only from nodes of type B and vice versa. Then I compare nodes of the same type using a hinge loss.

Try something like:

{n: p.grad for n, p in model.named_parameters()}

and look for entries like user_embeddings.weight and item_embeddings.weight.
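
For example, right after loss.backward() you could print something along these lines (just a sketch) to confirm the embedding tables receive nonzero gradients:

for name, param in model.named_parameters():
    if 'embeddings' in name:
        # param.grad is None if the parameter never took part in the computation
        print(name, None if param.grad is None else param.grad.abs().sum().item())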

The gradients of these two variables are not zero and have shape (n_nodes, embedding_dim) for each node type. Does that mean everything is OK with my DGL implementation and that what is happening is a model convergence problem?

It at least signifies that your model is using the embeddings in computation. It may be a model convergence problem, but other bugs may also exist.

For instance, your compute_message function normalizes the edge weights across all edges in the entire graph, instead of across the incoming edges of each node. I’m not sure if this is the intended behavior.

Got it! Is there an example where I can see how to implement this “weighted message passing” from neighbors? As you noticed, I need to normalize each edge weight only with respect to the other edges incident to the same node.

Also, I’m testing removing the nn.Embedding that learns an embedding for each node, and learning only the aggregation matrices (self.linear_src and self.linear_dst). Does that make sense? Furthermore, with this strategy, what’s the best way to take an input node X and return its final embedding (after multiplication by the learned matrices)?

You can probably do this to normalize edge weights:

g.edata['weight'] = ...
g.update_all(fn.copy_e('weight', 'm'), fn.sum('m', 'sum_weight'))
g.apply_edges(fn.e_div_v('weight', 'sum_weight', 'normalized_weight'))
... = g.edata['normalized_weight']
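
Adapted to your BAMGConv.forward (a rough sketch; the field names 'w_sum' and 'a' are arbitrary placeholders), the message passing would become:

# sum of incoming edge weights per destination node
graph.update_all(fn.copy_e('weight', 'm'), fn.sum('m', 'w_sum'))
# divide each edge weight by its destination node's total incoming weight
graph.apply_edges(fn.e_div_v('weight', 'w_sum', 'a'))
# weighted aggregation of source features with the normalized weights
graph.update_all(fn.u_mul_e('h', 'a', 'h_ngh'), fn.sum('h_ngh', 'neighbors_avg'))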

What features do you assign to the nodes then (provided that they don’t already have features)?

I generated the initial features randomly, but with a method that gives each block a different vector pattern (I use the Stochastic Block Model to create the network).

graph.nodes['item'].data['h'] = torch.cat(
    (
        torch.add(torch.randn(sizes[5], item_hidden_dim), 1),
        torch.add(torch.randn(sizes[6], item_hidden_dim), 1.2),
        torch.add(torch.randn(sizes[7], item_hidden_dim), 1.4),
        torch.add(torch.randn(sizes[8], item_hidden_dim), 1.6),
        torch.add(torch.randn(sizes[9], item_hidden_dim), 1.8),
    )
)
graph.nodes['item'].data['h'] -= graph.nodes['item'].data['h'].min()
graph.nodes['item'].data['h'] /= graph.nodes['item'].data['h'].max()
graph.nodes['item'].data['h'] = 2*graph.nodes['item'].data['h'] - 1

graph.nodes['user'].data['h'] = torch.cat(
    (
        torch.add(torch.randn(sizes[0], user_hidden_dim), 1),
        torch.add(torch.randn(sizes[1], user_hidden_dim), 1.2),
        torch.add(torch.randn(sizes[2], user_hidden_dim), 1.4),
        torch.add(torch.randn(sizes[3], user_hidden_dim), 1.6),
        torch.add(torch.randn(sizes[4], user_hidden_dim), 1.8),
    )
)

graph.nodes['user'].data['h'] -= graph.nodes['user'].data['h'].min()
graph.nodes['user'].data['h'] /= graph.nodes['user'].data['h'].max()
graph.nodes['user'].data['h'] = 2*graph.nodes['user'].data['h'] - 1

I’m no longer learning a new embedding for each node during training. Instead, the model only learns suitable aggregation matrices to mix the initial features h. So, after training, my question is how to take a given node N as input and output that node’s features after aggregation with the learned matrices.

If you are referring to aggregation with the GNN, then I assume you can get the user and item embeddings from the GNN output with user_emb, item_emb = model(blocks)?

If you wish to obtain all the node embeddings independently (like not sampling and training on the edges), you can use NodeDataLoader:

dataloader = dgl.dataloading.NodeDataLoader(
    g,
    {'user': ..., 'item': ...},
    sampler,
    ...)
for input_nodes, output_nodes, blocks in dataloader:
    output_dict = model(blocks)
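
Note that each iteration only returns the embeddings of that batch’s output_nodes, so to obtain embeddings for every node you would accumulate them across iterations. A minimal sketch, assuming your model returns a (user, item) tuple and that the output dimensions match user_hidden_dim and item_hidden_dim:

# preallocate one row per node; fill it batch by batch
all_user_emb = torch.zeros(g.num_nodes('user'), user_hidden_dim)
all_item_emb = torch.zeros(g.num_nodes('item'), item_hidden_dim)
model.eval()
with torch.no_grad():
    for input_nodes, output_nodes, blocks in dataloader:
        user_emb, item_emb = model(blocks)
        # output_nodes maps node type to the original IDs of this batch's seed nodes
        all_user_emb[output_nodes['user']] = user_emb
        all_item_emb[output_nodes['item']] = item_emb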

Thanks for the answer, @BarclayII. I’m not sure I fully understood what you mean.

I’m training my network with the following DataLoader:

sampler = dgl.dataloading.MultiLayerNeighborSampler([1000])
dataloader = dgl.dataloading.EdgeDataLoader(
    graph,
    {'watched': torch.arange(graph.num_edges('watched')), 'watched-by': torch.arange(graph.num_edges('watched-by'))},
    sampler,
    exclude=None,
    batch_size=1024,
    drop_last=False,
    num_workers=4
)

Using this dataloader, my goal is to train a network that takes the initial features h of a node and learns how to aggregate this h with the neighborhood’s h so as to maximize my objective function.

At the end of training, I want to obtain the aggregated embeddings (h plus the neighborhood aggregation using the learned matrices) for all nodes in my network.

With either the first or the second method you suggested, I get a different number of vectors than there are nodes in my original network. Example: having 100 nodes (type user) with dimension 10, my output has a shape like [21, 10].

Here is what I get from your second suggested approach:

sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 50])
dataloader_n = dgl.dataloading.NodeDataLoader(
    graph,
    {'item':  torch.arange(graph.num_nodes('item')) ,'user': torch.arange(graph.num_nodes('user'))},
    sampler,
    batch_size=1024,
    shuffle=False,
    drop_last=False,
    num_workers=4
)
for input_nodes, output_nodes, blocks in dataloader_n:
    emb_dict = model(blocks)

---

variable: emb_dict[0].shape
output: torch.Size([461, 10])

variable: graph
output: Graph(num_nodes={'item': 250, 'user': 1000},
      num_edges={('item', 'watched-by', 'user'): 48100, ('user', 'watched', 'item'): 48100},
      metagraph=[('item', 'user', 'watched-by'), ('user', 'item', 'watched')])
