Hello, I have run into the following error and am very puzzled by it. From the traceback, it appears that the failure occurs while the graph is being transferred to the GPU.
Traceback (most recent call last):
File “/seu_share/home/lingchongyi/lingchongyi/workspace/xoxu/transfer_learning/m3gnet/trainer.py”, line 443, in
main("./dimenet_pp.yaml")
File “/seu_share/home/lingchongyi/lingchongyi/workspace/xoxu/transfer_learning/m3gnet/trainer.py”, line 400, in main
train_loss = train(device, model, opt, loss_fn, train_loader, mean, std)
File “/seu_share/home/lingchongyi/lingchongyi/workspace/xoxu/transfer_learning/m3gnet/trainer.py”, line 230, in train
g = g.to(device)
File “/seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/heterograph.py”, line 5709, in to
ret._graph = self._graph.copy_to(utils.to_dgl_context(device))
File “/seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/heterograph_index.py”, line 255, in copy_to
return _CAPI_DGLHeteroCopyTo(self, ctx.device_type, ctx.device_id)
File “dgl/_ffi/_cython/./function.pxi”, line 295, in dgl._ffi._cy3.core.FunctionBase.call
File “dgl/_ffi/_cython/./function.pxi”, line 227, in dgl._ffi._cy3.core.FuncCall
File “dgl/_ffi/_cython/./function.pxi”, line 217, in dgl._ffi._cy3.core.FuncCall3
dgl._ffi.base.DGLError: [21:51:37] /opt/dgl/src/runtime/cuda/cuda_device_api.cc:117: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: unspecified launch failure
Stack trace:
[bt] (0) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(+0x8b0a35) [0x2b5bda785a35]
[bt] (1) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::CUDADeviceAPI::AllocDataSpace(DGLContext, unsigned long, unsigned long, DGLDataType)+0x17d) [0x2b5bda78735d]
[bt] (2) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Empty(std::vector<long, std::allocator >, DGLDataType, DGLContext)+0x170) [0x2b5bda5ff720]
[bt] (3) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DGLContext const&) const+0xc3) [0x2b5bda639e93]
[bt] (4) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptrdgl::BaseHeteroGraph, DGLContext const&)+0x3ff) [0x2b5bda747ecf]
[bt] (5) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptrdgl::BaseHeteroGraph, DGLContext const&)+0xf6) [0x2b5bda646716]
[bt] (6) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(+0x780156) [0x2b5bda655156]
[bt] (7) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x2b5bda5e33f8]
[bt] (8) /seu_share/home/lingchongyi/lingchongyi/.conda/envs/transfer_learning/lib/python3.9/site-packages/dgl/_ffi/_cy3/core.cpython-39-x86_64-linux-gnu.so(+0x16883) [0x2b5bfeb8c883]
def get_graph_from_processed_structure(
    structure,
    src_id,
    dst_id,
    images,
    lattice_matrix,
    element_types,
    frac_coords,
    is_atoms: bool = False,
) -> Tuple[dgl.DGLGraph, torch.Tensor, np.ndarray]:
    """Construct a DGL graph from a processed structure and its bond information.

    Args:
        structure: Input crystal or molecule (pymatgen structure or molecule), or an
            ASE atoms object when ``is_atoms`` is not ``False``.
        src_id: Site indices for the starting point of each bond.
        dst_id: Site indices for the destination point of each bond.
        images: Periodic image offsets for the bonds.
        lattice_matrix: Lattice information of the structure.
        element_types: Element symbols of all atoms in the dataset. The position of a
            symbol in this sequence is the integer stored as its ``node_type``; if a
            symbol is repeated, its first position is used.
        frac_coords: Fractional coordinates of all atoms in the structure.
            Note: Cartesian coordinates for molecules.
        is_atoms: Whether the input structure object is an ASE atoms object or not.

    Returns:
        A tuple ``(g, lattice, state_attr)``: the DGL graph carrying the
        ``pbc_offset`` edge data and the ``node_type`` / ``frac_coords`` node data,
        the lattice as a torch tensor, and a two-element zero state-attribute array.

    Raises:
        ValueError: If a pymatgen input contains an element missing from
            ``element_types``.
        KeyError: If an ASE input contains an element missing from ``element_types``.
    """
    u = torch.tensor(src_id, dtype=torch.long)
    v = torch.tensor(dst_id, dtype=torch.long)
    g = dgl.graph((u, v), num_nodes=len(structure), idtype=torch.int64)

    # TODO: confirm whether these tensors need double precision (an earlier note said
    # the pbc offsets and positions need float64 to handle bonds lying exactly at the
    # cutoff); matgl.float_th is used for now.
    g.edata["pbc_offset"] = torch.tensor(images, dtype=matgl.float_th)
    lattice = torch.tensor(np.array(lattice_matrix), dtype=matgl.float_th)

    # One shared symbol -> index table for both input kinds. setdefault keeps the
    # first occurrence, which is what list.index would have returned.
    element_to_index = {}
    for idx, elem in enumerate(element_types):
        element_to_index.setdefault(elem, idx)

    if is_atoms is False:
        symbols = [site.specie.symbol for site in structure]
    else:
        symbols = structure.get_chemical_symbols()

    try:
        node_type = np.array([element_to_index[symbol] for symbol in symbols])
    except KeyError as err:
        if is_atoms is False:
            # The pymatgen path historically raised ValueError (from list.index);
            # keep that exception type for existing callers.
            raise ValueError(f"{err.args[0]!r} is not in element_types") from err
        raise

    g.ndata["node_type"] = torch.tensor(node_type, dtype=matgl.int_th)
    g.ndata["frac_coords"] = torch.tensor(frac_coords, dtype=matgl.float_th)
    state_attr = np.array([0.0, 0.0]).astype(matgl.float_np)
    return g, lattice, state_attr
class Structure2Graph:
    """Construct a DGL graph from Pymatgen Structure."""

    def __init__(
        self,
        element_types: Tuple[str, ...],
        cutoff: float = 5.0,
    ):
        """Parameters
        ----------
        element_types: List of elements present in dataset for graph conversion. This ensures all graphs are
            constructed with the same dimensionality of features.
        cutoff: Cutoff radius for graph representation
        """
        self.element_types = tuple(element_types)
        self.cutoff = cutoff

    def get_graph(self, structure: Structure) -> Tuple[dgl.DGLGraph, torch.Tensor, np.ndarray]:
        """Get a DGL graph from an input Structure.

        :param structure: pymatgen structure object
        :return:
            g: DGL graph
            lat: lattice for periodic systems
            state_attr: state features
        """
        numerical_tol = 1.0e-8
        # NOTE(review): the neighbour search is presumably a compiled routine that
        # expects a 64-bit integer pbc flag array and C-contiguous float inputs --
        # confirm against the pymatgen docs. A plain `int` dtype is only 32 bits on
        # some platforms, so the width is pinned explicitly here.
        pbc = np.array([1, 1, 1], dtype=np.int64)
        lattice_matrix = np.ascontiguousarray(structure.lattice.matrix, dtype=float)
        cart_coords = np.ascontiguousarray(structure.cart_coords, dtype=float)

        src_id, dst_id, images, bond_dist = find_points_in_spheres(
            cart_coords,
            cart_coords,
            r=self.cutoff,
            pbc=pbc,
            lattice=lattice_matrix,
            tol=numerical_tol,
        )

        # Drop only the zero-length "bond" from a site to itself; a site paired with
        # one of its own periodic images (same index, non-zero distance) is kept.
        keep = (src_id != dst_id) | (bond_dist > numerical_tol)
        src_id = src_id[keep]
        dst_id = dst_id[keep]
        images = images[keep]

        # The lattice is wrapped in a list so the resulting tensor gains a leading axis.
        return get_graph_from_processed_structure(
            structure,
            src_id,
            dst_id,
            images,
            [lattice_matrix],
            self.element_types,
            structure.frac_coords,
        )
The above is the main structure of my code for constructing the graph. I hope you can give me some advice. Thank you very much!
Device info:
GPU: V100
torch: 2.1.1-cu11.8
dgl: 1.1.2-cu11.8