Hi, I am getting the 'Expect number of features to match number of nodes (len(u))' error mentioned in other questions. I am hitting it specifically while training a TagPPI model (which predicts protein interactions from protein .npz embeddings produced by SeqVec and AlphaFold).
The TagPPI GitHub is pretty quiet, so I am asking here in case anyone has fixed this specific problem.
I have tried environments as fresh as py-3.10/pytorch-2.0/cuda-11.8/dgl-1.0.2 and as old as py-3.6/pytorch-1.10/cuda-10.2/dgl-0.9.1, and each time it is the same problem, which seems to track back to TagPPI's graph_cmap_loader.py script. The error can pop up while processing either protein G1 or G2, and sometimes features > nodes while other times nodes > features. The specific feature/node mismatch values change on every attempt, but I believe that is just because the shuffled DataLoader changes which protein pair is hit first (the error is raised in the dataset's __getitem__, before dropout could play any role).
The error:
Running EPOCH 1
Traceback (most recent call last):
  File "/lustre/fs0/home/iwill/TAGPPI/TAGPPI-main/my_main.py", line 25, in <module>
    main()
  File "/lustre/fs0/home/iwill/TAGPPI/TAGPPI-main/my_main.py", line 22, in main
    train(trainArgs)
  File "/lustre/fs0/home/iwill/TAGPPI/TAGPPI-main/my_train_and_validation.py", line 54, in train
    for batch_idx,(G1,dmap1,G2,dmap2,y) in enumerate(train_loader):
  File "/home/iwill/my-envs/tagppi_6/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 634, in __next__
    data = self._next_data()
  File "/home/iwill/my-envs/tagppi_6/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 678, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/home/iwill/my-envs/tagppi_6/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/iwill/my-envs/tagppi_6/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/lustre/fs0/home/iwill/TAGPPI/TAGPPI-main/graph_cmap_loader.py", line 74, in __getitem__
    G2,embed2 = self.loader(cmaproot+p2+'.npz',p2)
  File "/lustre/fs0/home/iwill/TAGPPI/TAGPPI-main/graph_cmap_loader.py", line 42, in default_loader
    G.ndata['feat'] = g_embed
  File "/home/iwill/my-envs/tagppi_6/lib/python3.10/site-packages/dgl/view.py", line 99, in __setitem__
    self._graph._set_n_repr(self._ntid, self._nodes, {key: val})
  File "/home/iwill/my-envs/tagppi_6/lib/python3.10/site-packages/dgl/heterograph.py", line 4032, in _set_n_repr
    raise DGLError('Expect number of features to match number of nodes (len(u)).'
dgl._ffi.base.DGLError: Expect number of features to match number of nodes (len(u)). Got 295 and 292 instead.
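From the traceback, the failure is the G.ndata['feat'] = g_embed assignment in default_loader: the graph's node count comes from the contact map, while the feature count comes from the SeqVec embedding sliced to the sequence length, and for some proteins these disagree. In case it helps, here is a small consistency check I ran over the dataset (a sketch only; the paths, the 'yeast' default, and the .npz keys are taken from the loader below):

import os
import numpy as np

datasetname = 'yeast'  # default from the loader below
cmaproot = './data/' + datasetname + '/real_cmap/'
embed_data = np.load('./data/' + datasetname + '/dictionary/protein_embeddings.npz')

# Compare the three lengths default_loader implicitly assumes agree:
# sequence length, contact-map dimension, and embedding row count.
for pid in embed_data.files:
    fpath = cmaproot + pid + '.npz'
    if not os.path.exists(fpath):  # skip embeddings with no contact map on disk
        continue
    cmap_data = np.load(fpath)
    seq_len = len(str(cmap_data['seq']))      # the loader's nodenum
    cmap_dim = cmap_data['contact'].shape[0]  # becomes the graph's node count
    emb_rows = embed_data[pid].shape[0]       # becomes the feature row count
    if not (seq_len == cmap_dim == emb_rows):
        print(pid, 'seq:', seq_len, 'contact:', cmap_dim, 'embed:', emb_rows)

Any protein printed here is a candidate for the crash: the loader slices the embedding to seq_len, so the assignment fails whenever that sliced length differs from the contact-map dimension, in either direction.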
I am using the default data provided by the authors, and the only modification I made to this script to get it running on GPU was adding generator=torch.Generator(device='cuda') to the train DataLoader.
Thanks for the help! The full graph_cmap_loader.py is below:
import torch
import dgl
import scipy.sparse as spp
from seq2tensor import s2t
import os
import numpy as np
import re
import sys
from torch.utils.data import DataLoader, Dataset
from my_main import *

if len(sys.argv) > 1:
    datasetname, rst_file, pkl_path, batchsize = sys.argv[1:]
    batchsize = int(batchsize)
else:
    datasetname = 'yeast'
    rst_file = './results/yeast_pipr.tsv'
    pkl_path = './model_pkl/GAT'
    batchsize = 64

device = torch.device('cuda')

def collate(samples):
    graphs1, dmaps1, graphs2, dmaps2, labels = map(list, zip(*samples))
    return graphs1, dmaps1, graphs2, dmaps2, torch.tensor(labels)

cmaproot = './data/' + datasetname + '/real_cmap/'
embed_data = np.load('./data/' + datasetname + '/dictionary/protein_embeddings.npz')

def default_loader(cpath, pid):
    cmap_data = np.load(cpath)
    nodenum = len(str(cmap_data['seq']))   # node count taken from the sequence string
    cmap = cmap_data['contact']            # contact map used as the adjacency matrix
    g_embed = torch.tensor(embed_data[pid][:nodenum]).float().to(device)
    adj = spp.coo_matrix(cmap)
    G = dgl.DGLGraph(adj).to(device)
    G = G.to(torch.device('cuda'))
    G.ndata['feat'] = g_embed              # <- graph_cmap_loader.py:42, where the DGLError is raised
    if nodenum > 1000:
        textembed = embed_data[pid][:1000]
    elif nodenum < 1000:
        textembed = np.concatenate((embed_data[pid], np.zeros((1000 - nodenum, 1024))))
    else:                                  # nodenum == 1000; without this branch textembed would be undefined
        textembed = embed_data[pid]
    textembed = torch.tensor(textembed).float().to(device)
    return G, textembed

class MyDataset(Dataset):
    def __init__(self, type, transform=None, target_transform=None, loader=default_loader):
        super(MyDataset, self).__init__()
        pns = []
        with open('./data/' + datasetname + '/actions/' + type + '_cmap.actions.tsv', 'r') as fh:
            for line in fh:
                line = line.strip('\n')
                line = line.rstrip('\n')
                words = re.split(' |\t', line)
                pns.append((words[0], words[1], int(words[2])))
        self.pns = pns
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        p1, p2, label = self.pns[index]
        G1, embed1 = self.loader(cmaproot + p1 + '.npz', p1)
        G2, embed2 = self.loader(cmaproot + p2 + '.npz', p2)
        return G1, embed1, G2, embed2, label

    def __len__(self):
        return len(self.pns)

def pad_sequences(vectorized_seqs, seq_lengths, contactMaps, contact_sizes, properties):
    seq_tensor = torch.zeros((len(vectorized_seqs), seq_lengths.max())).long()
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
    contactMaps_tensor = torch.zeros((len(contactMaps), contact_sizes.max(), contact_sizes.max())).float()
    # contactMaps_tensor = torch.ones((len(contactMaps), contact_sizes.max(), contact_sizes.max())).float()*(-1.0)
    for idx, (con, con_size) in enumerate(zip(contactMaps, contact_sizes)):
        contactMaps_tensor[idx, :con_size, :con_size] = torch.FloatTensor(con)
    seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
    seq_tensor = seq_tensor[perm_idx]
    contactMaps_tensor = contactMaps_tensor[perm_idx]
    contact_sizes = contact_sizes[perm_idx]
    target = properties.double()
    if len(properties):
        target = target[perm_idx]
    contactMaps_tensor = contactMaps_tensor.unsqueeze(1)  # [batchsize, 1, max_length, max_length]
    return seq_tensor, seq_lengths, contactMaps_tensor, contact_sizes, target

def pad_dmap(dmaplist):
    pad_dmap_tensors = torch.zeros((len(dmaplist), 1000, 1024)).float()
    for idx, d in enumerate(dmaplist):
        d = d.float().cpu()
        pad_dmap_tensors[idx] = torch.FloatTensor(d)
    pad_dmap_tensors = pad_dmap_tensors.unsqueeze(1).cuda()
    return pad_dmap_tensors

train_dataset = MyDataset(type='train')
train_loader = DataLoader(dataset=train_dataset, batch_size=batchsize, shuffle=True, drop_last=True,
                          collate_fn=collate, generator=torch.Generator(device='cuda'))  # added generator=torch.Generator(device='cuda')
test_dataset = MyDataset(type='test')
test_loader = DataLoader(dataset=test_dataset, batch_size=batchsize, shuffle=True, drop_last=True, collate_fn=collate)
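For what it's worth, one band-aid I have considered (my own sketch, not from the TagPPI repo) is to have default_loader slice or pad the embedding to the graph's actual node count right before the assignment, so the DGLError can never trigger:

# hypothetical patch inside default_loader, after G is built (not TagPPI code)
n = G.num_nodes()
feat = embed_data[pid]
if feat.shape[0] >= n:
    feat = feat[:n]  # drop extra embedding rows
else:
    feat = np.concatenate((feat, np.zeros((n - feat.shape[0], feat.shape[1]))))  # zero-pad missing rows
G.ndata['feat'] = torch.tensor(feat).float().to(device)

But since the sequence, contact map, and embedding are all supposed to describe the same protein, I suspect this would just hide a residue misalignment between the SeqVec embeddings and the AlphaFold contact maps rather than fix it, so I would rather understand the root cause.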