GatedGraphConv error: the grad of ggnn.ggcn.linears.5.weight is None when the GGNN's n_etypes is 8

If I set n_etypes (the number of edge types) to 8, the gradients of the weight and bias of ggnn.ggcn.linears.5 and ggnn.ggcn.linears.3 are None when I inspect the model's gradients during training.

This is the test code:

import torch
import torch.nn.functional as F
from dgl.nn.pytorch import GatedGraphConv
import dgl
import numpy as np


class MyGGCN(torch.nn.Module):
    """Thin wrapper around DGL's ``GatedGraphConv``.

    Args:
        in_feats: Input node-feature size forwarded to ``GatedGraphConv``.
        out_feats: Output node-feature size forwarded to ``GatedGraphConv``.
        n_etypes: Number of edge types forwarded to ``GatedGraphConv``.
            Defaults to 8, the value that was previously hard-coded.
        n_steps: Number of steps forwarded to ``GatedGraphConv``.
            Defaults to 5, the value that was previously hard-coded.
    """

    def __init__(self, in_feats, out_feats, n_etypes=8, n_steps=5):
        super().__init__()
        # The attribute name ``ggcn`` is part of the parameter names
        # (e.g. ``ggnn.ggcn.linears.5.weight``), so it must not change.
        self.ggcn = GatedGraphConv(
            in_feats=in_feats,
            out_feats=out_feats,
            n_etypes=n_etypes,
            n_steps=n_steps,
        )

    def forward(self, g, feat, edge_types):
        """Apply the gated graph convolution and return its result unchanged.

        Args:
            g: Graph passed through to ``GatedGraphConv``.
            feat: Node features passed through to ``GatedGraphConv``.
            edge_types: Per-edge type ids passed through to ``GatedGraphConv``.
        """
        return self.ggcn(g, feat, edge_types)


class ConvBlock(torch.nn.Module):
    """Conv1d -> ReLU -> max-pool -> squeeze block with 10 output channels.

    Args:
        kernel_h: Kernel size of the 1-D convolution.
        emb_size: Number of input channels of the convolution.
        max_line: Sequence length used to size the pooling window
            (``max_line - kernel_h + 1``), i.e. the convolution's output
            length when the input sequence has exactly ``max_line`` steps.
    """

    def __init__(self, kernel_h, emb_size, max_line):
        super().__init__()
        pool_window = max_line - kernel_h + 1
        self.cnn = torch.nn.Conv1d(
            in_channels=emb_size,
            out_channels=10,
            kernel_size=kernel_h,
        )
        self.max_pool = torch.nn.MaxPool1d(kernel_size=pool_window)

    def forward(self, X):
        """Return the pooled convolution features for ``X``.

        ``X`` has its dimension 1 squeezed and is then permuted with
        ``(0, 2, 1)`` so that the last axis becomes the channel axis expected
        by ``Conv1d``. The trailing axis left by the pooling is squeezed away.
        """
        channels_first = X.squeeze(1).permute(0, 2, 1)
        activated = F.relu(self.cnn(channels_first))
        pooled = self.max_pool(activated)
        return pooled.squeeze(-1)


class MyTextCNN(torch.nn.Module):
    """Three parallel ``ConvBlock`` branches whose outputs are concatenated.

    The branches use kernel sizes 3, 4 and 5 and share the same ``emb_size``
    and ``max_line``.

    Args:
        emb_size: Forwarded to every ``ConvBlock``.
        max_line: Forwarded to every ``ConvBlock``.
    """

    def __init__(self, emb_size, max_line):
        super().__init__()
        # Attribute names are kept as-is because they appear in parameter names.
        self.block2 = ConvBlock(3, emb_size, max_line)
        self.block3 = ConvBlock(4, emb_size, max_line)
        self.block4 = ConvBlock(5, emb_size, max_line)

    def forward(self, X):
        """Insert a singleton axis at dim 1, run each branch, and concatenate on dim 1."""
        expanded = X.unsqueeze(1)
        branches = (self.block2, self.block3, self.block4)
        return torch.cat([branch(expanded) for branch in branches], dim=1)


class AssembleModel(torch.nn.Module):
    """Combine two text CNNs and one gated graph network into a 2-way classifier.

    The features of ``cnn_1``, ``cnn_2`` and ``ggnn`` are concatenated along
    dim 1 and fed to a single linear layer with 2 outputs.
    """

    # Number of leading rows kept from every branch before concatenation.
    _KEEP_ROWS = 10

    def __init__(self):
        super().__init__()

        self.cnn_1 = MyTextCNN(300, 200)
        self.cnn_2 = MyTextCNN(300, 500)
        self.ggnn = MyGGCN(300, 300)

        self.dropout = torch.nn.Dropout(0.3)

        # Each MyTextCNN yields 3 branches x 10 channels = 30 features;
        # the remaining 300 columns come from the GGNN branch.
        self.fc = torch.nn.Linear(30 * 2 + 300, 2)

    def forward(self, X_1, X_2, X_3):
        """Compute the classifier output.

        Args:
            X_1: Array-like converted with ``np.array`` and fed to ``cnn_1``.
            X_2: Array-like converted with ``np.array`` and fed to ``cnn_2``.
            X_3: Graph object exposing ``ndata["h"]`` and ``edata["e"]``,
                which are passed to the GGNN as node features and edge types.

        Returns:
            The output of the final linear layer, one row per kept sample.
        """
        X_1 = torch.tensor(np.array(X_1), dtype=torch.float)
        X_2 = torch.tensor(np.array(X_2), dtype=torch.float)

        X_1 = self.dropout(self.cnn_1(X_1))

        # NOTE(review): dropout is applied to the cnn_1 and GGNN branches but
        # not to cnn_2; kept as in the original -- confirm this is intended.
        X_2 = self.cnn_2(X_2)

        X_3 = self.ggnn(X_3, X_3.ndata["h"], X_3.edata["e"])
        X_3 = self.dropout(X_3[0:self._KEEP_ROWS])

        # Keep the same number of leading rows from both CNN branches. This is
        # a plain prefix slice; the count follows X_1's batch size, capped at
        # _KEEP_ROWS, exactly as the former index loop did.
        keep = min(X_1.shape[0], self._KEEP_ROWS)
        X_1 = X_1[:keep]
        X_2 = X_2[:keep]

        X = torch.cat([X_1, X_2, X_3], dim=1)

        return self.fc(X)


# NOTE(review): `device` is not referenced by any of the visible code below;
# the model and the inputs are never moved to it.
device = torch.device("cuda")

model = AssembleModel()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Build 20 random samples for each text input. The two draws are made in the
# same alternating order per sample (first the (200, 300) array, then the
# (500, 300) array).
samples = [
    (np.random.random((200, 300)), np.random.random((500, 300)))
    for _ in range(20)
]
X_1 = [first for first, _ in samples]
X_2 = [second for _, second in samples]

# Graph input for the GGNN branch, built from two equally long lists of node
# ids (presumably source and destination ids, per dgl.graph -- confirm).
X_3 = dgl.graph(([34, 96, 79, 13, 46, 11, 98, 68, 23, 50, 57, 4, 42, 85, 38, 19, 52, 96, 97, 42, 3, 79, 21, 4, 72, 97, 38, 34, 4, 33, 2, 67, 4, 4, 4, 24, 19, 57, 75, 4, 2, 41, 39, 26, 34, 35, 4, 54, 26, 2, 75, 49, 34, 77, 30, 24, 31, 60, 50, 40, 41, 4, 40, 79, 2, 14, 43, 34, 68, 52, 60, 4, 26, 31, 79, 19, 34, 24, 42, 34, 2, 2, 54, 17, 77, 34, 21, 51, 23, 41, 43, 16, 41, 51, 89, 34, 34, 34, 4, 23, 23, 23, 34, 49, 4, 95, 9, 4, 23, 73, 43, 76, 49, 86, 72, 67, 51, 23, 0, 9, 17, 79, 9, 92, 26, 7, 38, 96, 17, 98, 71, 64, 31, 74, 21, 20, 44, 92, 96, 11, 76, 40, 76, 35, 26, 7, 3, 97, 4, 26, 26, 23, 38, 92, 39, 81, 34, 77, 43, 39, 30, 4, 26, 10, 4, 67, 91, 75, 44, 7, 3, 4, 50, 23, 38, 17, 45, 0, 57, 60, 68, 42, 49, 0, 85, 19, 74, 51, 7, 52, 41, 87, 3, 19, 30, 35, 70, 72, 2, 79, 4, 4, 2, 41, 79, 85, 2, 38, 4, 34, 79, 4, 34, 49], [0, 78, 68, 60, 68, 42, 34, 52, 28, 73, 27, 25, 28, 28, 3, 74, 28, 28, 2, 80, 28, 26, 32, 38, 77, 81, 98, 70, 79, 75, 91, 40, 41, 89, 19, 28, 43, 95, 66, 34, 87, 28, 25, 94, 13, 79, 28, 85, 57, 28, 33, 28, 64, 24, 91, 3, 86, 19, 22, 67, 21, 77, 30, 45, 56, 76, 31, 23, 46, 68, 18, 51, 33, 28, 63, 31, 60, 77, 2, 19, 62, 79, 26, 47, 8, 51, 41, 64, 31, 49, 5, 97, 53, 70, 41, 74, 7, 28, 7, 43, 60, 74, 31, 34, 96, 57, 46, 44, 93, 50, 92, 10, 71, 72, 83, 87, 43, 48, 6, 69, 28, 88, 45, 23, 75, 13, 11, 17, 42, 31, 0, 7, 39, 23, 10, 54, 28, 37, 50, 16, 29, 28, 14, 52, 95, 23, 73, 40, 2, 55, 28, 58, 17, 7, 51, 35, 43, 96, 28, 43, 59, 42, 85, 21, 43, 15, 30, 85, 34, 64, 24, 23, 96, 61, 49, 96, 9, 34, 33, 13, 84, 65, 76, 71, 75, 77, 19, 82, 12, 9, 77, 67, 72, 90, 87, 20, 51, 86, 40, 52, 14, 17, 67, 19, 46, 57, 30, 1, 26, 36, 9, 37, 86, 41]))
# Node features: 99 rows of 30 random values.
X_3.ndata['h'] = torch.randn(99, 30)
# Per-edge type ids. Only the values 0, 1, 2, 4, 6 and 7 occur here; types 3
# and 5 are absent, which is why the parameters for those two edge types end
# up with no gradient after backward().
X_3.edata["e"]=torch.tensor([0, 0, 7, 1, 1, 0, 0, 1, 2, 0, 0, 2, 6, 1, 0, 6, 1, 2, 0, 0, 2, 7, 0, 0, 1, 0, 0, 7, 2, 1, 7, 1, 2, 4, 2, 2, 2, 0, 0, 2, 7, 2, 0, 0, 7, 0, 0, 0, 7, 2, 0, 2, 7, 6, 0, 6, 1, 1, 0, 0, 0, 2, 0, 7, 0, 1, 6, 7, 0, 0, 0, 2, 7, 2, 0, 2, 7, 0, 7, 7, 0, 7, 0, 0, 0, 7, 1, 1, 2, 6, 0, 0, 0, 0, 2, 7, 7, 2, 2, 6, 0, 0, 7, 2, 2, 1, 1, 4, 0, 1, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 7, 1, 0, 6, 6, 0, 1, 1, 0, 6, 0, 0, 2, 0, 0, 0, 0, 6, 0, 0, 7, 2, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 7, 2, 2, 0, 0, 2, 7, 1, 2, 0, 1, 1, 2, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0, 7, 7, 1, 2, 7, 2, 7, 0, 7, 0, 2, 0, 7, 2, 1, 0])

# Random binary targets, one per row produced by the model (10 rows).
label = torch.randint(0, 2, (10,))

# Single optimisation step.
model.train()
optimizer.zero_grad()
preds = model(X_1, X_2, X_3)

loss = criterion(preds, label)

loss.backward()
optimizer.step()

# Report the summed absolute gradient of every parameter. A parameter that
# took no part in the forward pass keeps ``grad is None``; test for that
# explicitly instead of catching every exception, so that unrelated errors
# are not silently reported as "no grad".
for name, param in model.named_parameters():
    if param.grad is None:
        print("error: {} no grad".format(name))
    else:
        print(name, param.grad.abs().sum())

Can you help me? Thanks!

Your edge type array contains no entries with value 3 or 5, so the projection matrices for edge types 3 and 5 (linears.3 and linears.5) are never used in the forward pass and therefore indeed do not receive any gradient.

1 Like

Thank you! I get it!