I want to use dgl.nn.pytorch.conv.gatv2conv
to process my tensor, but when I run my model on GPUs, it crashes as shown below:
Torch is using cuda:2
using 0 workers
0%| | 0/9006 [01:20<?, ?it/s]
experiments/train_with_era5land_gnn.py:28 (test_run_model)
train_with_era5land_gnn.py:50:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../torchhydro/trainers/trainer.py:71: in train_and_evaluate
deephydro.model_train()
../torchhydro/trainers/deep_hydro.py:261: in model_train
total_loss, n_iter_ep = torch_single_train(
../torchhydro/trainers/train_utils.py:391: in torch_single_train
trg, output = model_infer(seq_first, device, model, src, trg)
../torchhydro/trainers/train_utils.py:69: in model_infer
output = model(*xs)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1511: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1520: in _call_impl
return forward_call(*args, **kwargs)
../torchhydro/models/seq2seq.py:403: in forward
encoder_outputs, hidden, cell = self.encoder(src1)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1511: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1520: in _call_impl
return forward_call(*args, **kwargs)
../torchhydro/models/seq2seq.py:337: in forward
output_g = self.gnn(graph=self.graph, feat=outputs[:, i, :].unsqueeze(1))
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1511: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1520: in _call_impl
return forward_call(*args, **kwargs)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/dgl/nn/pytorch/conv/gatv2conv.py:308: in forward
e = self.leaky_relu(
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1511: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/module.py:1520: in _call_impl
return forward_call(*args, **kwargs)
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/modules/activation.py:774: in forward
return F.leaky_relu(input, self.negative_slope, self.inplace)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
input = <[RuntimeError('CUDA error: an illegal memory access was encountered\nCUDA kernel errors might be asynchronously repor...pile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n') raised in repr()] Tensor object at 0x7f6bb7c8c8f0>
negative_slope = 0.2, inplace = False
def leaky_relu(input: Tensor, negative_slope: float = 0.01, inplace: bool = False) -> Tensor: # noqa: D400,D402
r"""
leaky_relu(input, negative_slope=0.01, inplace=False) -> Tensor
Applies element-wise,
:math:`\text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)`
See :class:`~torch.nn.LeakyReLU` for more details.
"""
if has_torch_function_unary(input):
return handle_torch_function(leaky_relu, (input,), input, negative_slope=negative_slope, inplace=inplace)
if inplace:
result = torch._C._nn.leaky_relu_(input, negative_slope)
else:
> result = torch._C._nn.leaky_relu(input, negative_slope)
E RuntimeError: CUDA error: an illegal memory access was encountered
E CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
E For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
E Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
../../.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/nn/functional.py:1648: RuntimeError
--- Logging error ---
Traceback (most recent call last):
File "/home/wangyang1/.conda/envs/torchhydro1/lib/python3.11/logging/__init__.py", line 1113, in emit
stream.write(msg + self.terminator)
ValueError: I/O operation on closed file.
Call stack:
File "/home/wangyang1/.conda/envs/torchhydro1/lib/python3.11/site-packages/torch/_dynamo/utils.py", line 302, in dump_compile_times
log.info(compile_times(repr="str", aggregate=True))
Message: 'TorchDynamo compilation metrics:\nFunction, Runtimes (s)'
Arguments: ()
And this is my forward
method:
def forward(self, x):
# Encoder forward pass: x goes through self.pre_fc, self.pre_relu and
# self.lstm; self.gnn is then applied once per index along dim 1 of the LSTM
# outputs, and the per-index results are concatenated along dim 1.
# Returns the tuple (gnn_outputs, gnn_hidden, gnn_cell); only gnn_outputs is
# passed through self.dropout and self.fc.
# NOTE(review): this paste has lost its indentation, so block nesting below
# cannot be read from the text — confirm against the real file.
# x: (batch_size=256, seq_length=240, features=24)
x = self.pre_fc(x)
x = self.pre_relu(x)
outputs, (hidden, cell) = self.lstm(x)
# dim_diff is the node count of self.graph minus dim 0 of outputs.
node_amount = self.graph.num_nodes()
dim_diff = node_amount - outputs.size(0)
if dim_diff >= 0:
# Graph has at least as many nodes as outputs has rows: append dim_diff
# all-zero rows so outputs.size(0) equals node_amount.
out_res_matrix = torch.zeros(dim_diff, outputs.size(1), outputs.size(2)).to(x.device)
outputs = torch.cat([outputs, out_res_matrix], dim=0)
else:
# outputs has more rows than the graph has nodes: add the missing nodes.
# The result is stored on self.graph, so the change persists across calls.
self.graph = dgl.add_nodes(self.graph, abs(dim_diff))
# NOTE(review): cannot tell from the paste whether this line is inside the
# else branch or runs on every call. Per the DGL docs add_self_loop does not
# de-duplicate existing self-loops — verify; if it runs repeatedly on the
# stored graph, self-loop edges would accumulate.
self.graph = dgl.add_self_loop(self.graph)
# Accumulators start as empty 1-D tensors moved to x's device.
gnn_outputs = torch.tensor([]).to(x.device)
gnn_hidden = torch.tensor([]).to(x.device)
gnn_cell = torch.tensor([]).to(x.device)
# i runs over dim 1 of outputs; the same self.gnn module and self.graph are
# used for outputs, hidden and cell.
for i in range(outputs.shape[1]):
# Slice index i of dim 1 and re-insert a singleton dim at position 1.
output_g = self.gnn(graph=self.graph, feat=outputs[:, i, :].unsqueeze(1))
# NOTE(review): hidden and cell are indexed on dim 1 with the same i, and
# only outputs was padded above. If self.lstm is torch.nn.LSTM, hidden/cell
# are presumably (num_layers, batch, hidden_size), so their dim 0 would not
# equal the node count and dim 1 may be shorter than outputs.shape[1] —
# TODO confirm.
hidden_g = self.gnn(graph=self.graph, feat=hidden[:, i, :].unsqueeze(1))
cell_g = self.gnn(graph=self.graph, feat=cell[:, i, :].unsqueeze(1))
# Grow each accumulator along dim 1 by this step's result.
gnn_outputs = torch.cat([gnn_outputs, output_g], dim=1)
gnn_hidden = torch.cat([gnn_hidden, hidden_g], dim=1)
gnn_cell = torch.cat([gnn_cell, cell_g], dim=1)
gnn_outputs = self.dropout(gnn_outputs)
gnn_outputs = self.fc(gnn_outputs)
return gnn_outputs, gnn_hidden, gnn_cell
I’m sure the graph and the tensors are on the same GPU. How can I locate and solve the problem?
Thanks for your reply.