Here is the code recall the bug…
import argparse
import os
import torch
import dgl
from multiprocessing import Process
from dgl.utils.shared_mem import create_shared_mem_array, get_shared_mem_array
# from mailbox_daemon import start_mailbox_daemon
omp_num_threads = '2'
os.environ['OMP_NUM_THREADS'] = omp_num_threads
os.environ['MKL_NUM_THREADS'] = omp_num_threads
def start_daemon(omp_num_threads):
os.environ['OMP_NUM_THREADS'] = str(omp_num_threads)
os.environ['MKL_NUM_THREADS'] = str(omp_num_threads)
import torch
import dgl
import random
import numpy as np
from dgl.utils.shared_mem import create_shared_mem_array, get_shared_mem_array
node_memory = get_shared_mem_array('node_memory', torch.Size([9999, 100]), dtype=torch.float32)
print("before zero")
node_memory.zero_()
print("after zero")
_a = torch.rand((157475, 172))
a = create_shared_mem_array('a', _a.shape, dtype=_a.dtype)
a.copy_(_a)
node_memory = create_shared_mem_array('node_memory', torch.Size([9999, 100]), dtype=torch.float32)
mailbox_daemon = Process(target=start_daemon, args=(omp_num_threads))
mailbox_daemon.start()
print("Done!")
As you see, the code now will stuck in node_memory.zero()
this line. And I found several ways that can make code run successfully:
- Set
omp_num_threads
to'1'
. - Remove the code line
a.copy_(_a)
. - Move the code line after
mailbox_daemon.start()
. - Using a smaller
_a
shape.
Every methods list above make the code run sucessfully. I’m so confused where the problem is.