Reproducibility of the results in RGCN

Hi everyone!

I’m working on a node classification problem on graphs. I’m new to DGL and am struggling with reproducibility: repeated runs with the same seed give different results. My code is below; I’m wondering whether some part of the computation is non-deterministic.

I’m using dgl==0.6.1 and pytorch==1.9.0. The experiments run on an RTX 3090 (24GB).
My seed is fixed as follows:

    # Seed every random number generator the project may use: Python's
    # `random`, NumPy, PyTorch (CPU and all CUDA devices) and TensorFlow.
    # NOTE(review): seeding fixes the random streams only; it presumably does
    # not by itself make CUDA kernels that accumulate with atomics
    # deterministic -- confirm against the PyTorch reproducibility notes.
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        tf.random.set_seed(seed)
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from typing import List, Union
import torch
from dgl import DGLGraph
 
# Built-in DGL message/reduce pair: copy the source-node feature "h" into the
# message field "m", then sum the incoming "m" values back into "h".
gcn_msg = fn.copy_src(src="h", out="m")
gcn_reduce = fn.sum(msg="m", out="h")


class RGCNLayer(nn.Module):
    """Relational graph-convolution layer with an optional per-edge gate.

    Each relation type owns one ``feat_size x feat_size`` weight matrix. A
    message is ``h_src @ W[rel_type] * norm`` and, when ``gated`` is true, is
    additionally scaled by ``sigmoid(h_src @ W_gate[rel_type])``. Messages are
    summed per destination node and the result overwrites ``ndata['h']``.

    Args:
        feat_size: Size of the node feature ``h`` (input and output).
        num_rels: Number of distinct values in ``edata['rel_type']``.
        activation: Optional callable applied to the aggregated features;
            ``None`` leaves them unchanged.
        gated: Whether to learn and apply the scalar gate per edge.

    The graph passed to ``forward`` must carry ``ndata['h']``,
    ``edata['rel_type']`` (integer ids used to index the weights) and
    ``edata['norm']`` (multiplied into the message; a ``(E, 1)`` column
    broadcasts over the feature dimension).
    """

    def __init__(self, feat_size, num_rels, activation=None, gated = True):
        super(RGCNLayer, self).__init__()
        self.feat_size = feat_size
        self.num_rels = num_rels
        self.activation = activation
        self.gated = gated

        # One square projection per relation type.
        self.weight = nn.Parameter(torch.empty(self.num_rels, self.feat_size, self.feat_size))
        nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))

        if self.gated:
            # One column vector per relation type; yields a scalar gate per edge.
            self.gate_weight = nn.Parameter(torch.empty(self.num_rels, self.feat_size, 1))
            nn.init.xavier_uniform_(self.gate_weight, gain=nn.init.calculate_gain('sigmoid'))
        else:
            # Keep the attribute defined so ungated layers never hit an
            # AttributeError; a ``None`` parameter is not part of state_dict.
            self.register_parameter('gate_weight', None)

    def _message(self, edges):
        """Compute the (optionally gated) message carried by every edge."""
        rel_type = edges.data['rel_type']
        src_h = edges.src['h'].unsqueeze(1)  # (E, 1, F)

        # squeeze(1) removes only the singleton inserted above; a bare
        # squeeze() would also drop the edge or feature dim when it is 1.
        msg = torch.bmm(src_h, self.weight[rel_type]).squeeze(1)  # (E, F)
        msg = msg * edges.data['norm']

        if self.gated:
            gate = torch.bmm(src_h, self.gate_weight[rel_type]).squeeze(1)  # (E, 1)
            msg = msg * torch.sigmoid(gate)
        return {'msg': msg}

    def _apply(self, nodes):
        """Apply the activation (if any) to the aggregated node features."""
        h = nodes.data['h']
        if self.activation is not None:
            h = self.activation(h)
        return {'h': h}

    def forward(self, g):
        """Run one round of message passing on ``g``, updating ``ndata['h']`` in place.

        Returns ``None``; the result is read back from ``g.ndata['h']``.

        NOTE(review): the backward pass of ``self.weight[rel_type]`` and the
        sum reduction may rely on atomic accumulation on CUDA, which would
        make GPU runs non-reproducible even with fixed seeds -- confirm with
        ``torch.use_deterministic_algorithms(True)``.
        """
        g.update_all(self._message, fn.sum(msg='msg', out='h'), self._apply)


class RGCNModel(nn.Module):
    """A stack of gated ``RGCNLayer`` modules sharing one feature size.

    Args:
        h_dim: Node feature size used by every layer.
        num_rels: Number of relation (edge) types.
        num_hidden_layers: How many ``RGCNLayer`` instances to stack.
        gated: Forwarded to every ``RGCNLayer``.
    """

    def __init__(self, h_dim, num_rels, num_hidden_layers=1, gated = True):
        super().__init__()
        self.h_dim, self.num_rels = h_dim, num_rels
        self.num_hidden_layers, self.gated = num_hidden_layers, gated

        self.build_model()

    def build_model(self):
        """(Re)create ``self.layers`` from the stored hyper-parameters."""
        stack = [
            RGCNLayer(self.h_dim, self.num_rels, activation=F.relu, gated=self.gated)
            for _ in range(self.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(stack)

    def forward(self, g):
        """Apply every layer to ``g`` in order and return per-graph node features.

        Each layer is called for its effect on ``g`` (its return value is
        ignored). The batched graph is then split with ``dgl.unbatch`` and the
        ``ndata['h']`` tensor of each component is returned as a list.
        """
        for rgcn in self.layers:
            rgcn(g)
        return [component.ndata['h'] for component in dgl.unbatch(g)]

class transformerEntailmentWhole(nn.Module):
    """Combine a relational-graph view and a decoupled-attention view of a sample.

    For every sample in the batch ``forward``:

    1. builds a graph whose nodes are the rule positions, one extra "global"
       node and one node per distinct relation type, runs ``self.GCN`` on it
       and keeps the updated rule-node features;
    2. runs local (same rule segment) and global (other segments) multi-head
       attention over the tokens before the first user position, fuses them
       with ``self.fuse`` and picks the rule positions.

    Both sequences are extended with the user-position hidden states, padded,
    passed through the shared ``transformer_encoder`` and summed.

    NOTE(review): ``ElectraConfig``, ``FuseLayer`` and ``MHA`` are not defined
    or imported in the visible part of this file; they are assumed to be
    provided elsewhere.
    """

    def __init__(self, config,transformer_encoder):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(0.1)
        # Relation name -> row of ``relation_embeds`` used for its graph node.
        self.relation_key_pair = {'Comment': 0, 'Clarification_question': 1, 'Elaboration': 2, 'Acknowledgement': 3,
                                  'Continuation': 4, 'Explanation': 5, 'Conditional': 6, 'Question-answer_pair': 7,
                                  'Alternation': 8, 'Q-Elab': 9, 'Result': 10, 'Background': 11, 'Narration': 12,
                                  'Correction': 13, 'Parallel': 14, 'Contrast': 15}
        self.num_decoupling = 1

        if config.d_model == 1024:
            electra_config = ElectraConfig.from_pretrained("google/electra-large-discriminator", cache_dir=None)
        else:
            electra_config = ElectraConfig.from_pretrained("google/electra-base-discriminator", cache_dir=None)

        self.fuse = FuseLayer(config)

        # One local/global attention pair per decoupling step, so that the
        # loop in ``forward`` can never index past the end of these lists.
        self.localMHA = nn.ModuleList([MHA(electra_config) for _ in range(self.num_decoupling)])
        self.globalMHA = nn.ModuleList([MHA(electra_config) for _ in range(self.num_decoupling)])

        self.transformer_encoder = transformer_encoder

        self.relation_embeds = nn.Embedding(18, config.d_model)
        # Not read by ``forward``; kept so parameter initialisation order and
        # existing checkpoints stay unchanged.
        self.edge_embeds = nn.Embedding(6, config.d_model)

        self.GCN = RGCNModel(config.d_model, 6, 1, True)

    def _reset_transformer_parameters(self):
        r"""Initiate parameters in the transformer model."""
        for name, param in self.named_parameters():
            if 'transformer' in name and param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def _build_graph(self, states, rule_pos, user_pos, relations, has_scenario):
        """Build the per-sample relation graph with features, edge types and norms.

        Node layout: ``0 .. R-1`` are the rule positions, ``R`` is the global
        node and ``R+1 ..`` are the distinct relation types in order of first
        appearance. Edge types: 0 ``y -> rel``, 1 ``rel -> x``, 2 ``rel -> y``,
        3 ``x -> rel``, 4 self loop, 5 ``global -> other node``.

        Args:
            states: Hidden states of one sample, indexed along dim 0.
            rule_pos: 1-D index tensor of rule positions on ``states.device``.
            user_pos: 1-D index tensor of user positions on ``states.device``.
            relations: Iterable of dicts with keys ``'type'``, ``'x'``, ``'y'``
                (``x``/``y`` presumably integer node ids -- verify against caller).
            has_scenario: Truthy when the global node takes the hidden state at
                ``user_pos[1]``; otherwise it takes ``relation_embeds`` row 16.
        """
        device = states.device
        num_rules = rule_pos.shape[0]
        global_node = num_rules
        first_rel_node = num_rules + 1

        # Distinct relation types in first-seen order (dicts keep insertion order).
        rel_slot = {}
        for item in relations:
            rel_slot.setdefault(item['type'], len(rel_slot))
        num_nodes = first_rel_node + len(rel_slot)

        src, dst, edge_type = [], [], []
        for item in relations:
            rel_node = first_rel_node + rel_slot[item['type']]
            for u, v, kind in ((item['y'], rel_node, 0),
                               (rel_node, item['x'], 1),
                               (rel_node, item['y'], 2),
                               (item['x'], rel_node, 3)):
                src.append(u)
                dst.append(v)
                edge_type.append(kind)
        for node in range(num_nodes):  # self loops
            src.append(node)
            dst.append(node)
            edge_type.append(4)
        for node in range(num_nodes):  # global node reaches every other node
            if node != global_node:
                src.append(global_node)
                dst.append(node)
                edge_type.append(5)

        # Add all edges in one call; edge ids follow list order, exactly as if
        # the edges had been added one at a time.
        G = dgl.DGLGraph()
        G.add_nodes(num_nodes)
        G.add_edges(src, dst)
        G = G.to(device)

        # Node features, assembled once instead of row by row.
        feats = [torch.index_select(states, 0, rule_pos)]
        if has_scenario:
            feats.append(torch.index_select(states, 0, user_pos[1].reshape(1)))
        else:
            feats.append(self.relation_embeds(torch.tensor([16], dtype=torch.long, device=device)))
        if rel_slot:
            rel_ids = torch.tensor([self.relation_key_pair[name] for name in rel_slot],
                                   dtype=torch.long, device=device)
            feats.append(self.relation_embeds(rel_ids))
        G.ndata['h'] = torch.cat(feats, dim=0)

        # Self loops keep weight 1; any other edge is scaled by the number of
        # non-self incoming edges of its destination. Every node has a self
        # loop, so a destination of a non-self edge has in-degree >= 2.
        in_deg = [0] * num_nodes
        for v in dst:
            in_deg[v] += 1
        edge_norm = [1 if u == v else 1 / (in_deg[v] - 1) for u, v in zip(src, dst)]

        G.edata['rel_type'] = torch.tensor(edge_type).to(device)
        G.edata['norm'] = torch.tensor(edge_norm).unsqueeze(1).float().to(device)
        return G

    @staticmethod
    def _decoupling_masks(rule_idxTemp, span_end, device):
        """Return additive ``(local, global)`` attention masks of shape (1, 1, n, n).

        Tokens ``[rule[i], rule[i+1])`` (the last segment ends at ``span_end``)
        form one segment. The local mask puts -10000 on pairs from different
        segments; the global mask puts -10000 on pairs from the same segment.
        """
        same_segment = torch.zeros(span_end, span_end, device=device)
        bounds = rule_idxTemp.tolist() + [span_end]
        for lo, hi in zip(bounds, bounds[1:]):
            same_segment[lo:hi, lo:hi] = 1.0
        local_mask = (1.0 - same_segment) * -10000.0
        global_mask = same_segment * -10000.0
        return local_mask.unsqueeze(0).unsqueeze(1), global_mask.unsqueeze(0).unsqueeze(1)

    def forward(self, input_ids,hidden_states, rule_idx, relationInput, user_idx, scenario, entailment_len):
        """Encode every sample through the graph and attention branches and sum them.

        Args:
            input_ids: Unused; kept for interface compatibility.
            hidden_states: Encoder output, indexed as ``hidden_states[sample]``
                (assumes (batch, seq, d_model) -- TODO confirm).
            rule_idx: Per-sample 1-D index tensors of rule positions.
            relationInput: Per-sample lists of relation dicts (``type``/``x``/``y``).
            user_idx: Per-sample 1-D index tensors of user positions;
                ``user_idx[i][0]`` also marks the end of the rule text.
            scenario: Per-sample flag; ``-1`` means "no scenario".
            entailment_len: Unused; kept for interface compatibility.

        Returns:
            ``tenc_out_gcn + tenc_out_rule``, each being the transposed output
            of ``transformer_encoder`` on the padded per-sample sequences.

        NOTE(review): ``index_select`` and embedding lookups presumably have
        non-deterministic CUDA backward kernels; that (rather than seeding)
        may explain run-to-run differences on GPU -- confirm with
        ``torch.use_deterministic_algorithms(True)``.
        """
        device = hidden_states.device
        tenc_mask, tenc_input_gcn, tenc_input_rule = [], [], []

        for idx in range(len(rule_idx)):
            states = hidden_states[idx]
            rule_idxTemp = rule_idx[idx]
            user_idxTemp = user_idx[idx]
            rule_pos = rule_idxTemp.to(device)
            user_pos = user_idxTemp.to(device)
            num_rules = rule_idxTemp.shape[0]

            # Hidden states at the user positions are appended to both branches.
            user_feats = torch.index_select(states, 0, user_pos)

            # ---- graph branch ----
            G = self._build_graph(states, rule_pos, user_pos, relationInput[idx], scenario[idx] != -1)
            X = self.GCN(G)[0]  # node features of this sample's graph
            tenc_input_gcn.append(torch.cat([X[:num_rules], user_feats], dim=0))

            # ---- decoupled attention branch ----
            span_end = int(user_idxTemp[0])
            local_mask, global_mask = self._decoupling_masks(rule_idxTemp, span_end, device)
            rule_selected = states[:span_end].unsqueeze(0)

            local_word_level = global_word_level = rule_selected
            for t in range(self.num_decoupling):
                local_word_level = \
                    self.localMHA[t](local_word_level, local_word_level, attention_mask=local_mask)[0]
                global_word_level = \
                    self.globalMHA[t](global_word_level, global_word_level, attention_mask=global_mask)[0]

            context_word_level = self.fuse(rule_selected, local_word_level, global_word_level)
            rule_feats = torch.index_select(context_word_level.squeeze(0), 0, rule_pos)

            # Concatenate on-device and keep the autograd graph. Rebuilding the
            # tensor through ``.cpu().detach().numpy()`` would stop gradients
            # from ever reaching localMHA / globalMHA / fuse.
            tenc_input_rule.append(torch.cat([rule_feats, user_feats], dim=0))

            tenc_mask.append(
                torch.zeros(num_rules + user_idxTemp.shape[0], dtype=torch.bool, device=device))

        tenc_input_gcn_padded = torch.nn.utils.rnn.pad_sequence(tenc_input_gcn).to(device)  # [seqlen, N, dim]
        tenc_input_rule_padded = torch.nn.utils.rnn.pad_sequence(tenc_input_rule).to(device)

        # True marks padding positions that the encoder must ignore.
        tenc_mask_padded = torch.nn.utils.rnn.pad_sequence(tenc_mask, batch_first=True, padding_value=True).to(
            device)

        tenc_out_gcn = self.transformer_encoder(tenc_input_gcn_padded, src_key_padding_mask=tenc_mask_padded).transpose(0,1)
        tenc_out_rule = self.transformer_encoder(tenc_input_rule_padded, src_key_padding_mask=tenc_mask_padded).transpose(0,1)

        return tenc_out_gcn+tenc_out_rule

Thanks a lot!

I’ve also tried additionally setting
```
torch.backends.cudnn.deterministic = True
dgl.seed(seed)
```
but it still doesn’t work.

Do you have to use 0.6.1? Could you try our newer versions?

Basically, I think our kernels are deterministic. Do you also see non-deterministic results on CPU?

I’ve tried 0.7.2, but the problem remains. I haven’t tried on the CPU because it is too slow.