I am trying to build a Graphormer model out of DGL's GraphormerLayer. The model class is below for reference. I'm stuck on writing the forward function: I don't understand how the attn_bias or attn_mask arguments are supposed to be used. I've checked a couple of blogs and repos but still couldn't get a clear picture. Can someone please help with this? Below is my current understanding of how a single layer is called on its own (the shapes are just my reading of the DGL docs, so I may well have them wrong).
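import torch as th
from dgl.nn import GraphormerLayer

# Toy standalone call -- all sizes here are made up, just to show the shapes I think the layer expects.
batch_size, num_nodes, feat_size, num_heads = 1, 8, 42, 7
nfeat = th.rand(batch_size, num_nodes, feat_size)                      # dense, batched node features (B, N, feat_size)
attn_bias = th.rand(batch_size, num_nodes, num_nodes, num_heads)       # per-head bias added to attention scores (B, N, N, H)?
attn_mask = th.zeros(batch_size, num_nodes, num_nodes, dtype=th.bool)  # I think True marks positions to ignore, so all-False masks nothing
layer = GraphormerLayer(feat_size=feat_size, hidden_size=84, num_heads=num_heads)
out = layer(nfeat, attn_bias=attn_bias, attn_mask=attn_mask)           # out should be (B, N, feat_size) if I read the docs right

In particular, I don't see how to go from a DGLGraph and its edge features to attn_bias / attn_mask inside forward. This is the model class and the forward I have so far: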
import numpy as np
import torch as th
import dgl
from dgl.nn import NNConv, GraphormerLayer
from torch import nn
class Graphormer(th.nn.Module):
    def __init__(self, gnn_layers, num_feats, n_classes, hidden, num_edge_feats, activation, num_heads, final_activation, dropout):
        super(Graphormer, self).__init__()
        self._gnn_layers = gnn_layers
        self._num_feats = num_feats
        self._n_classes = n_classes
        self._num_hidden_features = hidden
        self.activation = activation
        self._num_edge_feats = num_edge_feats
        self._final_activation = final_activation
        self._num_heads = num_heads
        self.dropout = dropout
        self.build_model()

    def build_model(self):
        self.layers = nn.ModuleList()
        # input to hidden
        i2h = self.build_input_layer()
        self.layers.append(i2h)
        # hidden to hidden
        for i in range(self._gnn_layers - 2):
            h2h = self.build_hidden_layer(i)
            self.layers.append(h2h)
        # hidden to output
        h2o = self.build_output_layer()
        self.layers.append(h2o)

    def build_input_layer(self):
        print('Building an INPUT layer of {}x{}'.format(self._num_feats, self._num_hidden_features[0]))
        return GraphormerLayer(self._num_feats, self._num_hidden_features[0], self._num_heads, self.dropout, activation=self.activation)

    def build_hidden_layer(self, i):
        print('Building a HIDDEN layer of {}x{}'.format(self._num_hidden_features[i], self._num_hidden_features[i+1]))
        return GraphormerLayer(self._num_hidden_features[i], self._num_hidden_features[i+1], self._num_heads, self.dropout, activation=self.activation)

    def build_output_layer(self):
        print('Building an OUTPUT layer of {}x{}'.format(self._num_hidden_features[-1], self._n_classes))
        return GraphormerLayer(self._num_hidden_features[-1], self._n_classes, self._num_heads, self.dropout, activation=self._final_activation)

    @staticmethod
    def edge_function(f_in, f_out):
        a = int(f_in*0.666 + f_out*0.334)
        b = int(f_in*0.334 + f_out*0.666)
        return th.nn.Sequential(
            th.nn.Linear(f_in, a),
            th.nn.ReLU(),
            th.nn.Linear(a, b),
            th.nn.ReLU(),
            th.nn.Linear(b, f_out)
        )
    def set_g(self, g):
        self.g = g
        for l in range(self._gnn_layers):
            self.layers[l].g = g

    def forward(self, graph, feat, efeat):
        self.set_g(feat)
        # x = self.input_layer(graph)
        print(type(efeat))
        a, b = feat.edges()
        feat_t = th.stack([a, b], dim=0)
        x = graph
        for idx, layer in enumerate(self.layers):
            x = layer(x, efeat, feat)
            x = x.flatten(1)
            x = self.activation(x)
            print(idx, ' iter done')
            print('before: ', x.shape)
            # x = x.reshape(1, -1, self._num_hidden_features[idx])
            # print('after: ', x.shape)
        # x = self.output_layer(x)
        if self._final_activation is not None:
            logits = self._final_activation(x)
        else:
            logits = x
        return logits
Right now the tensor coming out of each layer has exactly the same shape as the tensor going in, so the feature dimension never shrinks the way I expected.
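To double-check, I ran a single layer on a dummy tensor (assuming I'm calling it correctly); the output keeps the input feature size, and hidden_size only seems to set the width inside the FFN:

import torch as th
from dgl.nn import GraphormerLayer

x = th.rand(1, 5, 42)                                              # (batch, nodes, feat_size)
layer = GraphormerLayer(feat_size=42, hidden_size=35, num_heads=7)
y = layer(x)                                                       # no attn_bias / attn_mask
print(y.shape)                                                     # torch.Size([1, 5, 42]) -- still 42, not 35

So I don't see how the features are ever supposed to shrink from 42 to 35 before they hit the next layer. The full model printout looks like this: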
SELECT_GNN(
  (activation): ELU()
  (final_activation): ReLU()
  (gnn_object): Graphormer(
    (activation): ELU()
    (_final_activation): ReLU()
    (layers): ModuleList(
      (0): GraphormerLayer(
        (attn): BiasedMHA(
          (q_proj): Linear(in_features=42, out_features=42, bias=True)
          (k_proj): Linear(in_features=42, out_features=42, bias=True)
          (v_proj): Linear(in_features=42, out_features=42, bias=True)
          (out_proj): Linear(in_features=42, out_features=42, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): Sequential(
          (0): Linear(in_features=42, out_features=42, bias=True)
          (1): ELU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=42, out_features=42, bias=True)
          (4): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (attn_layer_norm): LayerNorm((42,), eps=1e-05, elementwise_affine=True)
        (ffn_layer_norm): LayerNorm((42,), eps=1e-05, elementwise_affine=True)
      )
      (1): GraphormerLayer(
        (attn): BiasedMHA(
          (q_proj): Linear(in_features=42, out_features=42, bias=True)
          (k_proj): Linear(in_features=42, out_features=42, bias=True)
          (v_proj): Linear(in_features=42, out_features=42, bias=True)
          (out_proj): Linear(in_features=42, out_features=42, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): Sequential(
          (0): Linear(in_features=42, out_features=35, bias=True)
          (1): ELU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=35, out_features=42, bias=True)
          (4): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (attn_layer_norm): LayerNorm((42,), eps=1e-05, elementwise_affine=True)
        (ffn_layer_norm): LayerNorm((42,), eps=1e-05, elementwise_affine=True)
      )
      (2): GraphormerLayer(
        (attn): BiasedMHA(
          (q_proj): Linear(in_features=35, out_features=35, bias=True)
          (k_proj): Linear(in_features=35, out_features=35, bias=True)
          (v_proj): Linear(in_features=35, out_features=35, bias=True)
          (out_proj): Linear(in_features=35, out_features=35, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): Sequential(
          (0): Linear(in_features=35, out_features=28, bias=True)
          (1): ELU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=28, out_features=35, bias=True)
          (4): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (attn_layer_norm): LayerNorm((35,), eps=1e-05, elementwise_affine=True)
        (ffn_layer_norm): LayerNorm((35,), eps=1e-05, elementwise_affine=True)
      )
      (3): GraphormerLayer(
        (attn): BiasedMHA(
          (q_proj): Linear(in_features=28, out_features=28, bias=True)
          (k_proj): Linear(in_features=28, out_features=28, bias=True)
          (v_proj): Linear(in_features=28, out_features=28, bias=True)
          (out_proj): Linear(in_features=28, out_features=28, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): Sequential(
          (0): Linear(in_features=28, out_features=14, bias=True)
          (1): ELU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=14, out_features=28, bias=True)
          (4): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (attn_layer_norm): LayerNorm((28,), eps=1e-05, elementwise_affine=True)
        (ffn_layer_norm): LayerNorm((28,), eps=1e-05, elementwise_affine=True)
      )
      (4): GraphormerLayer(
        (attn): BiasedMHA(
          (q_proj): Linear(in_features=7, out_features=7, bias=True)
          (k_proj): Linear(in_features=7, out_features=7, bias=True)
          (v_proj): Linear(in_features=7, out_features=7, bias=True)
          (out_proj): Linear(in_features=7, out_features=7, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): Sequential(
          (0): Linear(in_features=7, out_features=2, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=2, out_features=7, bias=True)
          (4): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (attn_layer_norm): LayerNorm((7,), eps=1e-05, elementwise_affine=True)
        (ffn_layer_norm): LayerNorm((7,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
)
This is what the logs from the forward call and the error look like; as far as I can tell, x is still 42-dimensional when it reaches layer 2, whose projections expect 35 features:
0 iter done
after: torch.Size([1, 5793, 42])
1 iter done
after: torch.Size([1, 5793, 42])
RuntimeError: mat1 and mat2 shapes cannot be multiplied (5793x42 and 35x35)