Hi, I recently tried your advice about moving data to the GPU in the main process. It works fine when I set num_workers=0.
However, when I set num_workers > 0, I get a RuntimeError:
/run/cqlu/cqlu/DGL_mol/flow.py in train(self, epochs, batch_size, dataset, early_stop, lr, decay_rate, train_size, device, resume)
103 # train
104 self.model.train()
--> 105 for idx, (bg, label) in enumerate(train_loader):
106 for k in bg.node_attr_schemes().keys():
107 bg.ndata[k] = bg.ndata[k].to(device)
~/cq/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
580 self.reorder_dict[idx] = batch
581 continue
--> 582 return self._process_next_batch(batch)
583
584 next = __next__ # Python 2 compatibility
~/cq/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _process_next_batch(self, batch)
606 raise Exception("KeyError:" + batch.exc_msg)
607 else:
--> 608 raise batch.exc_type(batch.exc_msg)
609 return batch
610
RuntimeError: Traceback (most recent call last):
File "/data/home/cqlu/cq/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/run/cqlu/cqlu/DGL_mol/dataset.py", line 125, in <lambda>
collate_fn=lambda samples: collate(samples, self.device))
File "/run/cqlu/cqlu/DGL_mol/dataset.py", line 14, in collate
batched_graph = dgl.batch(graphs)
File "/data/home/cqlu/cq/lib/python3.6/site-packages/dgl/batched_graph.py", line 354, in batch
return BatchedDGLGraph(graph_list, node_attrs, edge_attrs)
File "/data/home/cqlu/cq/lib/python3.6/site-packages/dgl/batched_graph.py", line 192, in __init__
for key in node_attrs}
File "/data/home/cqlu/cq/lib/python3.6/site-packages/dgl/batched_graph.py", line 192, in <dictcomp>
for key in node_attrs}
File "/data/home/cqlu/cq/lib/python3.6/site-packages/dgl/backend/pytorch/tensor.py", line 100, in cat
return th.cat(seq, dim=dim)
RuntimeError: CUDA error: initialization error
Have you encountered a similar problem before?