Hi everyone! Is the copy time (`t4 - t3`) for the CPU-to-GPU data transfer measured correctly in this inference function? It would also be very helpful if someone could confirm whether the other timings are measured correctly, and whether there is a similar way to profile the CUDA memory allocated by each invocation of `inference(node_id)`.
```python
import time
import torch

# `g`, `sampler`, `model`, and `device` are created earlier in the script.

# Inference Function
def inference(node_id):
    # get computation graph
    t1 = time.time()
    _, _, computation_graphs = sampler.sample_blocks(g, node_id)
    t2 = time.time()

    # copy to device
    t3 = time.time()
    computation_graphs = [cg.to(device) for cg in computation_graphs]
    x = computation_graphs[0].srcdata['feat']
    t4 = time.time()

    # forward
    t5 = time.time()
    with torch.no_grad():
        y_pred = model(computation_graphs, x)
        pred = y_pred.argmax(1)
    t6 = time.time()

    # deallocate memory
    t7 = time.time()
    del computation_graphs
    torch.cuda.empty_cache()
    t8 = time.time()

    # return prediction plus per-stage timings in milliseconds:
    # sampling, copy, forward, deallocation, total
    return (pred,
            (t2 - t1) * 1000, (t4 - t3) * 1000, (t6 - t5) * 1000,
            (t8 - t7) * 1000, (t8 - t1) * 1000)
```
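
For context, my understanding is that CUDA operations are launched asynchronously, so `time.time()` may return before the copy or the forward pass has actually finished on the GPU. Below is a rough sketch of how I imagine timing the copy with CUDA events instead; the `timed_copy` helper is hypothetical, and I am not sure whether this is the right way to do it for DGL blocks:

```python
import torch

# Hypothetical sketch: timing the host-to-device copy with CUDA events,
# assuming `device` is a CUDA device and `computation_graphs` is the list
# of sampled blocks from inference() above.
def timed_copy(computation_graphs, device):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    computation_graphs = [cg.to(device) for cg in computation_graphs]
    x = computation_graphs[0].srcdata['feat']
    end.record()

    # Block until the recorded events have completed before reading the timer
    torch.cuda.synchronize()
    copy_ms = start.elapsed_time(end)  # elapsed time in milliseconds
    return computation_graphs, x, copy_ms
```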
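
For the memory part of the question, the only approach I have found so far is the `torch.cuda` memory statistics; here is a minimal sketch of how I would wrap each call, assuming a single GPU (the `inference_with_memory_stats` wrapper is just my guess):

```python
import torch

# Hypothetical sketch: per-call CUDA memory bookkeeping around inference(),
# assuming a single GPU. reset_peak_memory_stats() clears the peak counter
# so max_memory_allocated() reflects only this invocation.
def inference_with_memory_stats(node_id):
    torch.cuda.reset_peak_memory_stats(device)
    before = torch.cuda.memory_allocated(device)

    result = inference(node_id)

    peak = torch.cuda.max_memory_allocated(device)
    after = torch.cuda.memory_allocated(device)
    print(f"allocated before/after: {before} / {after} bytes, peak: {peak} bytes")
    return result
```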