While using an object of the GraphDataLoader class I keep getting an error with the traceback:
DGLError Traceback (most recent call last)
<ipython-input-27-2a54b9c16d62> in <module>
2 with torch.no_grad():
3 for x,_ in progress_bar(dl):
----> 4 x = x.to(torch.device("cuda"))
5 results.append(m(x))
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/heterograph.py in to(self, device, **kwargs)
5209
5210 # 1. Copy graph structure
-> 5211 ret._graph = self._graph.copy_to(utils.to_dgl_context(device))
5212
5213 # 2. Copy features
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/heterograph_index.py in copy_to(self, ctx)
233 The graph index on the given device context.
234 """
--> 235 return _CAPI_DGLHeteroCopyTo(self, ctx.device_type, ctx.device_id)
236
237 def shared_memory(self, name, ntypes=None, etypes=None, formats=('coo', 'csr', 'csc')):
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/_ffi/_ctypes/function.py in __call__(self, *args)
188 check_call(_LIB.DGLFuncCall(
189 self.handle, values, tcodes, ctypes.c_int(num_args),
--> 190 ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
191 _ = temp_args
192 _ = args
~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/_ffi/base.py in check_call(ret)
62 """
63 if ret != 0:
---> 64 raise DGLError(py_str(_LIB.DGLGetLastError()))
65
66
DGLError: [07:57:59] /opt/dgl/src/runtime/cuda/cuda_device_api.cc:196: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: device-side assert triggered
Stack trace:
[bt] (0) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7f5cc49f70cf]
[bt] (1) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::runtime::CUDADeviceAPI::CopyDataFromTo(void const*, unsigned long, void*, unsigned long, unsigned long, DLContext, DLContext, DLDataType, void*)+0x7b) [0x7f5cc5271cfb]
[bt] (2) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyFromTo(DLTensor*, DLTensor*, void*)+0x267) [0x7f5cc511d147]
[bt] (3) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DLContext const&) const+0xee) [0x7f5cc515409e]
[bt] (4) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::aten::COOMatrix::CopyTo(DLContext const&) const+0x7d) [0x7f5cc5253ebd]
[bt] (5) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DLContext const&)+0x292) [0x7f5cc52446f2]
[bt] (6) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DLContext const&)+0xf5) [0x7f5cc51652a5]
[bt] (7) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(+0xccfc3b) [0x7f5cc5171c3b]
[bt] (8) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x7f5cc5100d28]
This is my code:
# Build the inference dataset from the candidate combinations.
# (InferenceDataset is project-defined; presumably yields (graph, label) pairs — confirm.)
ds = InferenceDataset(combinations)
import dgl.dataloading.pytorch
def my_collate(samples):
    """Collate a list of (graph, label) pairs into a single batched graph and a label tensor.

    Returns a tuple ``(batched_graph, labels)`` suitable as one mini-batch.
    """
    graphs, labels = zip(*samples)
    batch = dgl.batch(graphs)
    # NOTE(review): attaches an ad-hoc `shape` attribute to the DGLGraph object —
    # presumably read downstream by the model; verify against the consumer.
    batch.shape = (len(graphs), 1)
    return batch, torch.stack(labels)
# Restore the trained classifier weights and move the model to the GPU.
m = ClassifierInference(item_embeddings)
m.load_state_dict(torch.load("models/unfreeze-1-epoch-naive.pth"))
m = m.cuda()
# NOTE(review): `my_collate` (defined above) is never passed to the loader —
# if the model depends on the `shape` attribute it sets, this was probably
# meant to be GraphDataLoader(..., collate_fn=my_collate); confirm.
dl = dgl.dataloading.pytorch.GraphDataLoader(ds, batch_size=512, num_workers=2)
results = []
# Fix: put the model in inference mode. torch.no_grad() only disables autograd;
# it does NOT switch off dropout / batch-norm running-stat updates — eval() does.
m.eval()
with torch.no_grad():
    for x, _ in progress_bar(dl):
        # NOTE(review): "CUDA: device-side assert triggered" is raised by the
        # *next* CUDA call after the real failure because kernel launches are
        # asynchronous — the assert most likely fired inside the previous
        # iteration's m(x) (e.g. an out-of-range index into item_embeddings),
        # not in this .to() call. Rerun with CUDA_LAUNCH_BLOCKING=1 to surface
        # the true failing line.
        x = x.to(torch.device("cuda"))
        results.append(m(x))
I’m running DGL built for CUDA 10.2 (the problem persists with the CUDA 11.0 build as well).
fastai 2.3.1
torch 1.8.1 (CUDA 11.0 build)