Is there any generally accepted way to convert a semi-supervised graph neural network into a supervised temporal graph neural network?

Using DGL, I found the following implementation of RGCN. It’s copied from

import dgl
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
from dgl.nn.functional import edge_softmax

class HeteroRGCNLayer(nn.Module):
    """One relational graph-convolution layer for a heterogeneous graph.

    Each edge type (relation) owns a separate linear projection ``W_r``.
    Source-node features are projected with the relation's weight, averaged
    over the incoming edges of that relation, and the per-relation results
    are summed on every destination node.

    Args:
        in_size: Size of the input node features.
        out_size: Size of the output node features.
        etypes: Iterable of edge-type names; one ``nn.Linear`` is created
            per name.
    """

    def __init__(self, in_size, out_size, etypes):
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
            name: nn.Linear(in_size, out_size) for name in etypes
        })

    def forward(self, G, feat_dict):
        """Run one round of relation-wise message passing on ``G``.

        Args:
            G: Heterogeneous graph; intermediate features ``'Wh_<etype>'``
                and the result ``'h'`` are written into its node data.
            feat_dict: Dictionary mapping node type to that type's input
                feature tensor.

        Returns:
            Dictionary mapping every node type in ``G.ntypes`` to its
            updated ``'h'`` feature.
        """
        # NOTE(review): the return statement reads 'h' for every node type,
        # so presumably each type must be the destination of at least one
        # relation - confirm for graphs other than the one used here.
        funcs = {}
        for srctype, etype, dsttype in G.canonical_etypes:
            # Compute W_r * h
            Wh = self.weight[etype](feat_dict[srctype])
            # Save it in graph for message passing
            G.nodes[srctype].data['Wh_%s' % etype] = Wh
            # Specify per-relation message passing functions: (message_func, reduce_func).
            # Note that the results are saved to the same destination feature 'h', which
            # hints the type wise reducer for aggregation.
            funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))

        # Trigger message passing of multiple types.
        # The first argument is the message passing functions for each relation.
        # The second one is the type wise reducer, could be "sum", "max",
        # "min", "mean", "stack"
        G.multi_update_all(funcs, 'sum')

        # return the updated node feature dictionary
        return {ntype: G.nodes[ntype].data['h'] for ntype in G.ntypes}

class HeteroRGCN(nn.Module):
    """Two stacked ``HeteroRGCNLayer`` modules with a leaky-ReLU in between.

    Args:
        G: Heterogeneous graph; only its edge-type names are read here.
        in_size: Size of the input node features.
        hidden_size: Size of the features between the two layers.
        out_size: Size of the features produced by the second layer.
    """

    def __init__(self, G, in_size, hidden_size, out_size):
        super().__init__()
        # Both layers share the same set of relations.
        relation_names = G.etypes
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, relation_names)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, relation_names)

    def forward(self, G, out_key):
        """Return the second-layer output for node type ``out_key``.

        The input features are read from the ``'inp'`` node data of every
        node type in ``G``.
        """
        features = {}
        for ntype in G.ntypes:
            features[ntype] = G.nodes[ntype].data['inp']

        hidden = self.layer1(G, features)

        activated = {}
        for ntype, h in hidden.items():
            activated[ntype] = F.leaky_relu(h)

        # Only the requested node type's output is handed back.
        return self.layer2(G, activated)[out_key]
import argparse
import math
import urllib.request

import dgl
import numpy as np
import scipy.io

from model import *

# Download the ACM dataset and load it as a dict of named matrices.
# NOTE(review): the URL and the loader call were missing from this listing
# and have been restored from the DGL heterogeneous-graph tutorial - confirm.
data_url = 'https://data.dgl.ai/dataset/ACM.mat'
data_file_path = '/tmp/ACM.mat'

urllib.request.urlretrieve(data_url, data_file_path)
# The code below indexes `data` by keys such as 'PvsA', 'PvsP', 'PvsL', 'PvsC'.
data = scipy.io.loadmat(data_file_path)

# Command-line options for the training run.
parser = argparse.ArgumentParser(description='Training GNN on ogbn-products benchmark')

parser.add_argument('--n_epoch', type=int, default=200)
parser.add_argument('--n_hid',   type=int, default=256)
parser.add_argument('--n_inp',   type=int, default=256)
# Gradient-norm clipping threshold. Parsed as float so that fractional
# values such as `--clip 0.5` are accepted (the default is already 1.0).
parser.add_argument('--clip',    type=float, default=1.0)
parser.add_argument('--max_lr',  type=float, default=1e-3)

args = parser.parse_args()

def get_n_params(model):
    """Return the total number of scalar entries across ``model.parameters()``.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors.

    Returns:
        The summed element count of all parameter tensors (0 if there are
        no parameters).
    """
    # numel() is the product of a tensor's dimensions, which is what the
    # hand-written loop used to compute (while shadowing the name `nn`).
    return sum(p.numel() for p in model.parameters())

def train(model, G):
    """Train ``model`` on ``G`` for ``args.n_epoch`` full-graph epochs.

    Uses the module-level ``args``, ``optimizer``, ``scheduler``, ``labels``,
    ``train_idx``, ``val_idx``, ``test_idx`` and ``device``. Every fifth
    epoch the accuracy on the three splits is printed, together with the
    best validation accuracy so far and the test accuracy at that point.

    Args:
        model: Callable as ``model(G, 'paper')`` returning per-paper logits.
        G: The heterogeneous graph passed through to ``model``.
    """
    best_val_acc = torch.tensor(0)
    best_test_acc = torch.tensor(0)
    for epoch in np.arange(args.n_epoch) + 1:
        model.train()
        logits = model(G, 'paper')
        # The loss is computed only for labeled nodes.
        loss = F.cross_entropy(logits[train_idx], labels[train_idx].to(device))

        # Without these steps the loss is computed but the weights never
        # change, and clip_grad_norm_ has no gradients to clip.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        # The scheduler is built with total_steps=args.n_epoch, i.e. one
        # step per epoch.
        scheduler.step()

        if epoch % 5 == 0:
            model.eval()
            # Evaluation needs no autograd graph.
            with torch.no_grad():
                logits = model(G, 'paper')
            pred = logits.argmax(1).cpu()
            train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
            val_acc = (pred[val_idx] == labels[val_idx]).float().mean()
            test_acc = (pred[test_idx] == labels[test_idx]).float().mean()
            # Model selection is done on validation accuracy only.
            if best_val_acc < val_acc:
                best_val_acc = val_acc
                best_test_acc = test_acc
            print('Epoch: %d LR: %.5f Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
                epoch,
                optimizer.param_groups[0]['lr'],
                loss.item(),
                train_acc.item(),
                val_acc.item(),
                best_val_acc.item(),
                test_acc.item(),
                best_test_acc.item(),
            ))

# Prefer the first GPU, but fall back to the CPU so the script still runs
# on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the heterogeneous graph. Every relation is added in both directions
# (as two edge types) so that each node type also receives messages.
G = dgl.heterograph({
    ('paper', 'written-by', 'author'): data['PvsA'].nonzero(),
    ('author', 'writing', 'paper'): data['PvsA'].transpose().nonzero(),
    ('paper', 'citing', 'paper'): data['PvsP'].nonzero(),
    ('paper', 'cited', 'paper'): data['PvsP'].transpose().nonzero(),
    ('paper', 'is-about', 'subject'): data['PvsL'].nonzero(),
    ('subject', 'has', 'paper'): data['PvsL'].transpose().nonzero(),
})

# Paper-vs-conference matrix, kept in both CSR and COO form.
pvc = data['PvsC'].tocsr()
p_selected = pvc.tocoo()

# Labels are the CSR column indices of the stored entries.
# NOTE(review): this lines up with paper ids only if every paper has exactly
# one entry in 'PvsC' - confirm against the dataset.
labels = torch.tensor(pvc.indices).long()

# Shuffle the paper ids once, then cut the permutation into
# train (first 800), validation (next 100) and test (the rest).
pid = p_selected.row
shuffle = np.random.permutation(pid)
train_idx, val_idx, test_idx = (
    torch.tensor(part).long()
    for part in (shuffle[0:800], shuffle[800:900], shuffle[900:])
)

# Assign a running integer id to every node type and every edge type.
node_dict = {}
edge_dict = {}

for ntype in G.ntypes:
    node_dict[ntype] = len(node_dict)
for etype in G.etypes:
    edge_dict[etype] = len(edge_dict)
    # Tag every edge with the integer id of its edge type.
    G.edges[etype].data['id'] = torch.ones(G.number_of_edges(etype), dtype=torch.long) * edge_dict[etype]

#     Random initialize input feature
for ntype in G.ntypes:
    # Feature width follows --n_inp (default 256) instead of a literal 256.
    emb = nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), args.n_inp), requires_grad=False)
    # torch.Tensor(rows, cols) only allocates memory; fill it explicitly so
    # the features really are random rather than whatever was in the buffer.
    nn.init.xavier_uniform_(emb)
    G.nodes[ntype].data['inp'] = emb

# NOTE(review): the right-hand sides of the first two statements were
# truncated in this listing; they are reconstructed from the HeteroRGCN
# signature and the CLI arguments - confirm.
G = G.to(device)
# One output logit per class; classes are labelled 0..labels.max().
model = HeteroRGCN(G,
                   in_size=args.n_inp,
                   hidden_size=args.n_hid,
                   out_size=labels.max().item() + 1).to(device)
optimizer = torch.optim.AdamW(model.parameters())
# One scheduler step is taken per epoch, hence total_steps=args.n_epoch.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, total_steps=args.n_epoch, max_lr=args.max_lr)
print('Training RGCN with #param: %d' % (get_n_params(model)))
train(model, G)

From the training code I gather this has been written to predict new labels for the unlabeled nodes in the graph.

I’m quite new to working with graphs and graph neural networks, and was wondering if anyone could point me to resources that explain how I can rewrite this as a node regression model.

@mufeili Do you know if there are any examples for node regression?

In general, node regression is quite similar to node classification. The only difference is the target: for classification, it is the node labels representing the classes, while for regression it is some value (usually floating point) to optimize for. Therefore, you can usually reuse a large part of a node classification model for a regression task. For example, you can try modifying the train function above to use a different loss function, e.g., replacing cross_entropy, which is usually used for multi-class classification, with MSELoss.

I just noticed that your title asks about temporal GNNs, but your question is about a node regression model.

Ah yes, maybe I wasn’t specific enough. My graph consists of 2 types of nodes with 3 types of edge connections (all with different features). The graph is temporal in the sense that it varies over time, and I want to use the temporally changing features as input features as well.

I’m having difficulty finding work or ways of working with temporally changing heterogeneous graph neural networks.

I don’t know of any examples of node regression.

A simple baseline would be running an LSTM for each node’s temporally varying features to get a sequential representation, and then running a GNN on top of that. Any kind of heterogeneous GNN (e.g. HGT, RGCN) should work.

Thanks for the response.

Do you have any examples of this possibly? I am very new to graph neural networks and I wouldn’t know how to do this free-hand.

And furthermore, I don’t see an example of how to actually work with spatial-temporal graphs. So in my case I have a graph for each day from 1965-2020, the graph has the same structure, but the features of 4 nodes do change from day to day.

The goal is then to regressively predict the value of one of these nodes. Do you know of any work that does this using DGL?

You could check out our STGCN example.