dgl._ffi.base.DGLError: [21:51:37] /opt/dgl/src/runtime/cuda/cuda_device_api.cc:117: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: unspecified launch failure

Hello, I have come across the error above and am very puzzled by it. From the traceback, it appears that the failure occurs while the graph is being transferred to the GPU.

Traceback (most recent call last):
File “/seu_share/home/lingchongyi/lingchongyi/workspace/xoxu/transfer_learning/m3gnet/trainer.py”, line 443, in <module>
main("./dimenet_pp.yaml")
File “/seu_share/home/lingchongyi/lingchongyi/workspace/xoxu/transfer_learning/m3gnet/trainer.py”, line 400, in main
train_loss = train(device, model, opt, loss_fn, train_loader, mean, std)
File “/seu_share/home/lingchongyi/lingchongyi/workspace/xoxu/transfer_learning/m3gnet/trainer.py”, line 230, in train
g = g.to(device)
File “/seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/heterograph.py”, line 5709, in to
ret._graph = self._graph.copy_to(utils.to_dgl_context(device))
File “/seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/heterograph_index.py”, line 255, in copy_to
return _CAPI_DGLHeteroCopyTo(self, ctx.device_type, ctx.device_id)
File “dgl/_ffi/_cython/./function.pxi”, line 295, in dgl._ffi._cy3.core.FunctionBase.call
File “dgl/_ffi/_cython/./function.pxi”, line 227, in dgl._ffi._cy3.core.FuncCall
File “dgl/_ffi/_cython/./function.pxi”, line 217, in dgl._ffi._cy3.core.FuncCall3
dgl._ffi.base.DGLError: [21:51:37] /opt/dgl/src/runtime/cuda/cuda_device_api.cc:117: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: unspecified launch failure
Stack trace:
[bt] (0) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(+0x8b0a35) [0x2b5bda785a35]
[bt] (1) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::CUDADeviceAPI::AllocDataSpace(DGLContext, unsigned long, unsigned long, DGLDataType)+0x17d) [0x2b5bda78735d]
[bt] (2) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Empty(std::vector<long, std::allocator<long> >, DGLDataType, DGLContext)+0x170) [0x2b5bda5ff720]
[bt] (3) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DGLContext const&) const+0xc3) [0x2b5bda639e93]
[bt] (4) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0x3ff) [0x2b5bda747ecf]
[bt] (5) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0xf6) [0x2b5bda646716]
[bt] (6) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(+0x780156) [0x2b5bda655156]
[bt] (7) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x2b5bda5e33f8]
[bt] (8) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/_ffi/_cy3/core.cpython-39-x86_64-linux-gnu.so(+0x16883) [0x2b5bfeb8c883]

def get_graph_from_processed_structure(
        structure,
        src_id,
        dst_id,
        images,
        lattice_matrix,
        element_types,
        frac_coords,
        is_atoms: bool = False,
) -> Tuple[dgl.DGLGraph, torch.Tensor, list]:
    """Assemble a DGL graph from a structure whose bonds are already known.

    Args:
        structure: Crystal or molecule to convert. It is iterated site by site
            (reading ``site.specie.symbol``) unless ``is_atoms`` is set, in which
            case ``structure.get_chemical_symbols()`` is called instead.
        src_id: Site index at which each bond starts.
        dst_id: Site index at which each bond ends.
        images: Periodic image offset of each bond; stored as edge data.
        lattice_matrix: Lattice information of the structure.
        element_types: Ordered element symbols; the position of a symbol in this
            sequence becomes the integer node type of the matching atoms.
        frac_coords: Fractional coordinates of all atoms (Cartesian coordinates
            for molecules); stored as node data.
        is_atoms: Whether ``structure`` is an ASE atoms object rather than a
            pymatgen object.

    Returns:
        A 3-tuple ``(g, lattice, state_attr)``: the graph carrying the edge field
        ``"pbc_offset"`` and the node fields ``"node_type"`` and
        ``"frac_coords"``, the lattice as a tensor, and a two-element array of
        zeros used as the state attribute.

    """
    # Edge endpoints are stored as 64-bit integers on both the tensors and the graph.
    bond_sources = torch.tensor(src_id, dtype=torch.long)
    bond_targets = torch.tensor(dst_id, dtype=torch.long)
    g = dgl.graph((bond_sources, bond_targets), num_nodes=len(structure), idtype=torch.int64)

    # TODO: decide whether offsets, lattice and coordinates should be double or
    # float; for now everything uses the package-wide float type.
    # NOTE(review): an earlier remark said offsets and positions must be float64
    # so that bonds lying exactly at the cutoff are handled - confirm.
    g.edata["pbc_offset"] = torch.tensor(images, dtype=matgl.float_th)
    lattice = torch.tensor(np.array(lattice_matrix), dtype=matgl.float_th)

    # Map every atom to the position of its element inside ``element_types``.
    symbol_to_index = {symbol: position for position, symbol in enumerate(element_types)}
    if is_atoms is False:
        type_ids = [element_types.index(site.specie.symbol) for site in structure]
    else:
        type_ids = [symbol_to_index[symbol] for symbol in structure.get_chemical_symbols()]
    g.ndata["node_type"] = torch.tensor(np.array(type_ids), dtype=matgl.int_th)
    g.ndata["frac_coords"] = torch.tensor(frac_coords, dtype=matgl.float_th)

    # Placeholder state attribute: two zeros in the package-wide numpy float type.
    state_attr = np.zeros(2).astype(matgl.float_np)
    return g, lattice, state_attr


class Structure2Graph:
    """Converter that turns a Pymatgen Structure into a DGL graph."""

    def __init__(
            self,
            element_types: Tuple[str, ...],
            cutoff: float = 5.0,
    ):
        """Parameters
        ----------
        element_types: Elements present in the dataset. Using one shared list for
            every conversion keeps the feature dimensionality identical across graphs.
        cutoff: Cutoff radius used when searching for bonded neighbours.
        """
        self.cutoff = cutoff
        # Stored as a tuple regardless of the iterable type that was passed in.
        self.element_types = tuple(element_types)

    def get_graph(self, structure: Structure) -> Tuple[dgl.DGLGraph, torch.Tensor, List]:
        """Build the DGL graph for one Structure.

        :param structure: pymatgen structure object
        :return:
            g: DGL graph
            lat: lattice for periodic systems
            state_attr: state features
        """
        tolerance = 1.0e-8
        lattice_matrix = structure.lattice.matrix
        positions = structure.cart_coords

        # Neighbour search of the structure against itself, with the periodic
        # flag set to 1 along all three axes.
        neighbor_data = find_points_in_spheres(
            positions,
            positions,
            r=self.cutoff,
            pbc=np.array([1, 1, 1], dtype=int),
            lattice=lattice_matrix,
            tol=tolerance,
        )
        src_id, dst_id, images, bond_dist = neighbor_data

        # Keep a pair when it is separated by more than the tolerance or links
        # two different sites; this drops a site paired with itself at
        # (numerically) zero distance.
        keep = (bond_dist > tolerance) | (src_id != dst_id)
        src_id, dst_id, images, bond_dist = (
            column[keep] for column in (src_id, dst_id, images, bond_dist)
        )

        # The lattice is wrapped in a one-element list before being handed over.
        return get_graph_from_processed_structure(
            structure,
            src_id,
            dst_id,
            images,
            [lattice_matrix],
            self.element_types,
            structure.frac_coords,
        )

The above is the main structure of my code to construct the graph. I hope you can give me some advice. Thank you very much!

Device info:
GPU: V100
torch: 2.1.1-cu11.8
dgl: 1.1.2-cu11.8

OK, I solved this problem by switching to version 1.0.4 of dgl…

However, when I debug this code on my laptop (GPU: 1650Ti), dgl 1.1.2-cu11.8 works…

This topic was automatically closed 30 days after the last reply. New replies are no longer allowed.