As in the title, I copied the code from the DGL tutorial on the single-machine multi-card part for testing, and it worked fine on the obgn-products dataset. But when I modified the dataset to ogbn-papers100M, the following error occurred:
Traceback (most recent call last):
File "graphsage.py", line 147, in <module>
mp.spawn(run, args=(list(range(num_gpus)),data,), nprocs=num_gpus)
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 2 terminated with the following error:
Traceback (most recent call last):
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/root/data/gnnlab/graphsage.py", line 88, in run
model = Model(num_features, 256, num_classes).to(device)
File "/root/data/gnnlab/graphsage.py", line 33, in __init__
self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type='mean')
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/dgl/nn/pytorch/conv/sageconv.py", line 126, in __init__
self.fc_neigh = nn.Linear(self._in_src_feats, out_feats, bias=False)
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 96, in __init__
self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
* (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
* (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
After my query, it seems that there is a problem with num_classes. So I manually set this parameter to 172 (its correct value), but a new error occurred as follows:
Traceback (most recent call last):
File "graphsage.py", line 147, in <module>
mp.spawn(run, args=(list(range(num_gpus)),data,), nprocs=num_gpus)
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/root/data/gnnlab/graphsage.py", line 111, in run
loss = F.cross_entropy(predictions, labels)
File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/nn/functional.py", line 3014, in cross_entropy
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'
Iām wondering how I need to modify it to run correctly, or where the problem is. Here is the code part:
def dataLoader(dataName):
dataset = DglNodePropPredDataset(dataName)
graph, node_labels = dataset[0]
#graph = dgl.add_reverse_edges(graph)
graph.ndata['label'] = node_labels[:, 0]
node_features = graph.ndata['feat']
num_features = node_features.shape[1]
num_classes = (node_labels.max() + 1).item()
idx_split = dataset.get_idx_split()
train_nids,valid_nids,test_nids = idx_split['train'],idx_split['valid'],idx_split['test']
graph.create_formats_()
print(type(num_classes))
data = graph,num_features,num_classes,train_nids,valid_nids,test_nids
return data
class Model(nn.Module):
def __init__(self, in_feats, h_feats, num_classes):
super(Model, self).__init__()
self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type='mean')
self.h_feats = h_feats
def forward(self, mfgs, x):
h_dst = x[:mfgs[0].num_dst_nodes()]
h = self.conv1(mfgs[0], (x, h_dst))
h = F.relu(h)
h_dst = h[:mfgs[1].num_dst_nodes()]
h = self.conv2(mfgs[1], (h, h_dst))
return h
...
model = Model(num_features, 256, num_classes).to(device)
if device == torch.device('cpu'):
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=None, output_device=None)
else:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], output_device=device)
# Define optimizer
opt = torch.optim.Adam(model.parameters())
best_accuracy = 0
best_model_path = './model.pt'
print("begin train...")
for epoch in range(10):
model.train()
with tqdm.tqdm(train_dataloader) as tq:
for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
# feature copy from CPU to GPU takes place here
inputs = mfgs[0].srcdata['feat']
labels = mfgs[-1].dstdata['label']
predictions = model(mfgs, inputs)
loss = F.cross_entropy(predictions, labels)
opt.zero_grad()
loss.backward()
opt.step()
accuracy = sklearn.metrics.accuracy_score(labels.cpu().numpy(), predictions.argmax(1).detach().cpu().numpy())
tq.set_postfix({'loss': '%.03f' % loss.item(), 'acc': '%.03f' % accuracy}, refresh=False)