CUDA: device-side assert triggered

While using an object of the GraphDataLoader class, I keep getting an error with the following traceback:

DGLError                                  Traceback (most recent call last)
<ipython-input-27-2a54b9c16d62> in <module>
      2 with torch.no_grad():
      3     for x,_ in progress_bar(dl):
----> 4         x = x.to(torch.device("cuda"))
      5         results.append(m(x))

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/heterograph.py in to(self, device, **kwargs)
   5209 
   5210         # 1. Copy graph structure
-> 5211         ret._graph = self._graph.copy_to(utils.to_dgl_context(device))
   5212 
   5213         # 2. Copy features

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/heterograph_index.py in copy_to(self, ctx)
    233             The graph index on the given device context.
    234         """
--> 235         return _CAPI_DGLHeteroCopyTo(self, ctx.device_type, ctx.device_id)
    236 
    237     def shared_memory(self, name, ntypes=None, etypes=None, formats=('coo', 'csr', 'csc')):

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/_ffi/_ctypes/function.py in __call__(self, *args)
    188         check_call(_LIB.DGLFuncCall(
    189             self.handle, values, tcodes, ctypes.c_int(num_args),
--> 190             ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
    191         _ = temp_args
    192         _ = args

~/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/_ffi/base.py in check_call(ret)
     62     """
     63     if ret != 0:
---> 64         raise DGLError(py_str(_LIB.DGLGetLastError()))
     65 
     66 

DGLError: [07:57:59] /opt/dgl/src/runtime/cuda/cuda_device_api.cc:196: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: device-side assert triggered
Stack trace:
  [bt] (0) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7f5cc49f70cf]
  [bt] (1) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::runtime::CUDADeviceAPI::CopyDataFromTo(void const*, unsigned long, void*, unsigned long, unsigned long, DLContext, DLContext, DLDataType, void*)+0x7b) [0x7f5cc5271cfb]
  [bt] (2) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyFromTo(DLTensor*, DLTensor*, void*)+0x267) [0x7f5cc511d147]
  [bt] (3) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DLContext const&) const+0xee) [0x7f5cc515409e]
  [bt] (4) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::aten::COOMatrix::CopyTo(DLContext const&) const+0x7d) [0x7f5cc5253ebd]
  [bt] (5) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DLContext const&)+0x292) [0x7f5cc52446f2]
  [bt] (6) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DLContext const&)+0xf5) [0x7f5cc51652a5]
  [bt] (7) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(+0xccfc3b) [0x7f5cc5171c3b]
  [bt] (8) /home/ec2-user/SageMaker/custom-miniconda/miniconda/envs/okkular_training_gui/lib/python3.6/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x7f5cc5100d28]
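
From what I understand, a device-side assert is usually raised by an earlier kernel and only surfaces at the next synchronizing call, so the copy_to shown in the traceback may not be the real culprit. The next thing I plan to try is forcing synchronous launches to get a more precise location; a minimal sketch, assuming the environment variable is set before torch initializes CUDA:

# Sketch: make CUDA report the failing kernel at its call site.
# CUDA_LAUNCH_BLOCKING must be set before the first CUDA call in torch/dgl.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
import dgl
# ... rest of the inference script below, unchanged ...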

This is my code:

import dgl
import dgl.dataloading.pytorch
import torch
from fastprogress.fastprogress import progress_bar

# InferenceDataset, ClassifierInference, combinations and item_embeddings
# are defined elsewhere in my code.
ds = InferenceDataset(combinations)

def my_collate(t):
    # Batch the individual graphs and stack their labels.
    xs, ys = zip(*t)
    batched_g = dgl.batch(xs)
    batched_g.shape = (len(xs), 1)
    return batched_g, torch.stack(ys)

m = ClassifierInference(item_embeddings)
m.load_state_dict(torch.load("models/unfreeze-1-epoch-naive.pth"))
m = m.cuda()

dl = dgl.dataloading.pytorch.GraphDataLoader(ds, batch_size=512, num_workers=2)

results = []
with torch.no_grad():
    for x, _ in progress_bar(dl):
        x = x.to(torch.device("cuda"))  # ERROR OCCURS ON THIS LINE
        results.append(m(x))
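
One thing I still want to rule out is an out-of-range index somewhere in the batched graphs, since that seems to be the most common trigger for a device-side assert. Roughly, the check would look like this on the CPU; "item_id" is only a placeholder for whatever integer node feature my graphs actually carry, and I'm assuming item_embeddings is a (num_items, dim) tensor:

num_items = item_embeddings.shape[0]   # size of the embedding table (assumption above)
for x, _ in dl:
    ids = x.ndata["item_id"]           # placeholder name for the integer node feature
    if ids.min().item() < 0 or ids.max().item() >= num_items:
        print("out-of-range ids in batch:", ids.min().item(), ids.max().item(),
              "table size:", num_items)
        break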

I'm running DGL built for CUDA 10.2 (the problem is still there with the CUDA 11.0 build),
fastai 2.3.1,
and torch 1.8.1 (CUDA 11.0 build).
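
In case the mismatch between the DGL and torch CUDA builds matters here, this is roughly how I'm checking which builds are actually loaded (the DGL CUDA flavour comes from the installed package name, e.g. dgl-cu102, since I'm not aware of a direct query for it):

import torch, dgl
print(torch.__version__, torch.version.cuda)  # torch build and the CUDA toolkit it was compiled against
print(dgl.__version__)                        # DGL version; CUDA flavour is in the pip package name
print(torch.cuda.is_available())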

Possibly a duplicate of this question: RuntimeError: DataLoader worker (pid(s) 19716) exited unexpectedly