Running ogbn-papers100M with tutorial code fails

BearBiscuit05 · March 25, 2023, 3:01am

As in the title, I copied the code from the single-machine multi-GPU section of the DGL tutorial for testing, and it worked fine on the ogbn-products dataset. But when I changed the dataset to ogbn-papers100M, the following error occurred:

Traceback (most recent call last):
  File "graphsage.py", line 147, in <module>
    mp.spawn(run, args=(list(range(num_gpus)),data,), nprocs=num_gpus)
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
    while not context.join():
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/root/data/gnnlab/graphsage.py", line 88, in run
    model = Model(num_features, 256, num_classes).to(device)
  File "/root/data/gnnlab/graphsage.py", line 33, in __init__
    self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type='mean')
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/dgl/nn/pytorch/conv/sageconv.py", line 126, in __init__
    self.fc_neigh = nn.Linear(self._in_src_feats, out_feats, bias=False)
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 96, in __init__
    self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)

After some investigation, it seems that the problem is with num_classes. So I manually set this parameter to 172 (its correct value), but then a new error occurred:

Traceback (most recent call last):
  File "graphsage.py", line 147, in <module>
    mp.spawn(run, args=(list(range(num_gpus)),data,), nprocs=num_gpus)
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
    while not context.join():
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/root/data/gnnlab/graphsage.py", line 111, in run
    loss = F.cross_entropy(predictions, labels)
  File "/root/miniconda3/envs/DGL/lib/python3.8/site-packages/torch/nn/functional.py", line 3014, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'

I’m wondering how I need to modify it to run correctly, or where the problem is. Here is the code part:

def dataLoader(dataName):
    """Load an OGB node-property-prediction dataset for training.

    Args:
        dataName: Dataset name passed to ``DglNodePropPredDataset``.

    Returns:
        A tuple ``(graph, num_features, num_classes, train_nids,
        valid_nids, test_nids)``. ``graph.ndata['label']`` holds int64
        class indices and ``num_classes`` is a plain Python ``int``.
    """
    dataset = DglNodePropPredDataset(dataName)
    graph, node_labels = dataset[0]
    #graph = dgl.add_reverse_edges(graph)
    labels = node_labels[:, 0]
    if labels.is_floating_point():
        # Some datasets (ogbn-papers100M is the reported case) ship float
        # labels, presumably with NaN marking unlabeled nodes -- confirm
        # against the OGB dataset description. A NaN would poison max()
        # and make num_classes a non-int, and float targets are rejected
        # by F.cross_entropy, so derive the class count from labeled
        # entries only and store integer class indices.
        unlabeled = labels.isnan()
        num_classes = int(labels[~unlabeled].max().item()) + 1
        # -1 marks "no label"; such nodes are assumed not to appear in
        # the train/valid/test splits.
        labels = labels.masked_fill(unlabeled, -1).long()
    else:
        num_classes = int(labels.max().item()) + 1
        labels = labels.long()
    graph.ndata['label'] = labels
    node_features = graph.ndata['feat']
    num_features = node_features.shape[1]
    idx_split = dataset.get_idx_split()
    train_nids, valid_nids, test_nids = (
        idx_split['train'], idx_split['valid'], idx_split['test'])
    graph.create_formats_()
    data = graph, num_features, num_classes, train_nids, valid_nids, test_nids
    return data

class Model(nn.Module):
    """Two-layer SAGEConv network (mean aggregator) operating on MFGs."""

    def __init__(self, in_feats, h_feats, num_classes):
        """Build the two convolution layers.

        Args:
            in_feats: Size of each input node feature vector.
            h_feats: Size of the hidden representation.
            num_classes: Output size of the second layer.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type='mean')
        self.h_feats = h_feats

    def forward(self, mfgs, x):
        """Run both layers over the first two message-flow graphs.

        Args:
            mfgs: Sequence whose first two items each provide
                ``num_dst_nodes()`` and are accepted by ``SAGEConv``.
            x: Features of the source nodes of ``mfgs[0]``.

        Returns:
            The output of the second convolution layer.
        """
        # Destination features are taken as the leading rows of the
        # source features for each layer.
        block0 = mfgs[0]
        dst_in = x[:block0.num_dst_nodes()]
        hidden = F.relu(self.conv1(block0, (x, dst_in)))

        block1 = mfgs[1]
        dst_hidden = hidden[:block1.num_dst_nodes()]
        return self.conv2(block1, (hidden, dst_hidden))

...
    model = Model(num_features, 256, num_classes).to(device)
    if device == torch.device('cpu'):
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=None, output_device=None)
    else:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], output_device=device)
    
    # Define optimizer
    opt = torch.optim.Adam(model.parameters())
    
    best_accuracy = 0
    best_model_path = './model.pt'
    print("begin train...")
    for epoch in range(10):
        model.train()

        with tqdm.tqdm(train_dataloader) as tq:
            for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
                # feature copy from CPU to GPU takes place here
                inputs = mfgs[0].srcdata['feat']
                labels = mfgs[-1].dstdata['label']

                predictions = model(mfgs, inputs)

                loss = F.cross_entropy(predictions, labels)
                opt.zero_grad()
                loss.backward()
                opt.step()

                accuracy = sklearn.metrics.accuracy_score(labels.cpu().numpy(), predictions.argmax(1).detach().cpu().numpy())

                tq.set_postfix({'loss': '%.03f' % loss.item(), 'acc': '%.03f' % accuracy}, refresh=False)

dyru · March 26, 2023, 7:13pm

Hi, the input labels should be long type for the computation of cross entropy loss. Please check the data type of labels in your code.

BearBiscuit05 · March 27, 2023, 10:52am

Yes, I think you are right, the code can run now.

system · April 26, 2023, 10:52am

This topic was automatically closed 30 days after the last reply. New replies are no longer allowed.