Help with training on user data

Hi there! I am trying to perform a simple training run on a GCN, but it has been really challenging for me. I combined two code snippets from the DGL tutorials; here is what I have so far.

You can find my input CSVs at this link: DGL doubts - Google Sheets

import dgl
from dgl.data import DGLDataset
import torch
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

#root="aes_cipher_top/"
root="myDataSet/"
class fromYosysDS(DGLDataset):
	def __init__(self):
		super().__init__(name='mydata_from_yosys')

	def process(self):
		#nodes_data = pd.read_csv(root+'aes_DGLcells_labeled.csv')
		nodes_data = pd.read_csv('gatesToHeat.csv')
#		node_labels = torch.from_numpy(nodes_data['ConnCount'].to_numpy())
#		node_features = torch.from_numpy(nodes_data['type'].astype('category').cat.codes.to_numpy())

		#edges_data = pd.read_csv(root+'aes_DGLedges.csv')
		edges_data = pd.read_csv('DGLedges.csv')
		edges_src = torch.from_numpy(edges_data['Src'].to_numpy())
		edges_dst = torch.from_numpy(edges_data['Dst'].to_numpy())
		#edge_features = torch.from_numpy(edges_data['Weight'].to_numpy())

		print("nodes_data.shape[0]",nodes_data.shape[0])
		self.graph = dgl.graph((edges_src, edges_dst), num_nodes=nodes_data.shape[0])
		
		self.graph.ndata['type'] = torch.from_numpy(nodes_data['type'].astype('category').cat.codes.to_numpy())
		print("classes:",self.graph.ndata['type'])
		self.graph.ndata['conCount'] = torch.from_numpy(nodes_data['conCount'].to_numpy())
		#self.graph.ndata['label'] = torch.from_numpy(nodes_data['congestion'].to_numpy())
		self.graph.ndata['placementHeat'] = torch.from_numpy(nodes_data['placementHeat'].to_numpy())
		self.graph.ndata['powerHeat'] = torch.from_numpy(nodes_data['powerHeat'].to_numpy())
		self.graph.ndata['routingHeat'] = torch.from_numpy(nodes_data['routingHeat'].to_numpy())
		self.graph.ndata['irDropHeat'] = torch.from_numpy(nodes_data['irDropHeat'].to_numpy())
	
		#self.graph.edata['weight'] = edge_features

		# If your dataset is a node classification dataset, you will need to assign
		# masks indicating whether a node belongs to training, validation, and test set.
		n_nodes = nodes_data.shape[0]
		n_train = int(n_nodes * 0.6)
		n_val = int(n_nodes * 0.2)
		train_mask = torch.zeros(n_nodes, dtype=torch.bool)
		val_mask = torch.zeros(n_nodes, dtype=torch.bool)
		test_mask = torch.zeros(n_nodes, dtype=torch.bool)
		train_mask[:n_train] = True
		val_mask[n_train:n_train + n_val] = True
		test_mask[n_train + n_val:] = True
		self.graph.ndata['train_mask'] = train_mask
		self.graph.ndata['val_mask'] = val_mask
		self.graph.ndata['test_mask'] = test_mask

	def __getitem__(self, i):
		return self.graph

	def __len__(self):
		return 1



from dgl.nn import GraphConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

def train(g, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    g = dgl.add_self_loop(g)
    best_val_acc = 0
    best_test_acc = 0

    features = g.ndata['type']
    labels = g.ndata['placementHeat']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    for e in range(100):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc))

dataset = fromYosysDS()
print("dataset:",dataset)
graph = dataset[0]
print("graph:",graph,type(graph))
#print("graph.ndata:",graph.ndata)
print("len graph.ndata:",len(graph.ndata))
print("type:",type(graph.ndata))
print("\n\n\nmodel:")
# Create the model with given dimensions
#model = GCN( graph.ndata['type'].shape[0], 365, 31 )
model = GCN( 1, 1, 31 )
print("model type:",type(model))

print("\n\n\ntrain:")
train(graph, model)

I have a lot of questions I can’t figure out by myself:

I can't make sense of the feature size and the other parameters on the line "model = GCN(1, 1, 31)". The original code seems to use a different datatype for "graph.ndata". In my case I would like my features to come from the columns "type" and "conCount". I tried taking a slice with "features = g.ndata[:1]", but it seems this type cannot be sliced. For a first attempt I wanted to use only a single feature from the "type" column, but even that is not working.

Also, I would like to be able to choose between the four possible labels, from the columns with "Heat" in their names. To do so, I would have to set a different value for the last parameter of "GCN(..., ..., ...)", correct? In the original code it was "dataset.num_classes", but my generated dataset has no such attribute "num_classes".

The tutorial is awkward; I don't understand why there is no training code for the user-generated data: Make Your Own Dataset — DGL 0.9.1post1 documentation

Another awkward thing about the documentation: when I do "type(graph)" I get "<class 'dgl.view.HeteroNodeDataView'>", although there is no such type in the DGL documentation. This makes it really hard to learn how to code with the library by myself!

g.ndata is a dictionary. If you want your features to come from the columns "type" and "conCount", you can cast them to the same data type (e.g., float), add a feature dimension, and then concatenate them. Below is an example snippet:

x = torch.cat([g.ndata['type'].float()[:, None], g.ndata['conCount'].float()[:, None]], dim=1)
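
Since "type" holds category codes, a one-hot encoding may also work better than the raw codes. A sketch, assuming the codes are non-negative integers:

type_oh = F.one_hot(g.ndata['type'].long()).float()                   # (num_nodes, num_types)
x = torch.cat([type_oh, g.ndata['conCount'].float()[:, None]], dim=1)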

Also, I would like to be able to choose between the four possible labels, from the columns with "Heat" in their names. To do so, I would have to set a different value for the last parameter of "GCN(..., ..., ...)", correct? In the original code it was "dataset.num_classes", but my generated dataset has no such attribute "num_classes".

Your learning task seems to be more advanced than typical node classification, where each node belongs to exactly one class. Is this more like a node regression task, i.e., having the model produce heat values for each node that are as close as possible to the ones in the training set?
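
If so, a minimal change is to give the model a single output unit and train with a regression loss such as MSE instead of cross-entropy. A rough sketch against your code (the hidden size of 16 is just an arbitrary example):

model = GCN(1, 16, 1)                        # one output value per node
features = g.ndata['type'].float()[:, None]  # (num_nodes, 1)
labels = g.ndata['placementHeat'].float()
train_mask = g.ndata['train_mask']

pred = model(g, features).squeeze(1)         # (num_nodes,)
loss = F.mse_loss(pred[train_mask], labels[train_mask])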

Sorry for your bad experience. That tutorial is meant to cover only the dataset part, not end-to-end training. You can copy-paste the dataset code to replace the dataset code in any of the end-to-end tutorials (for example, this one) and it should work.

Another awkward thing about the documentation: when I do "type(graph)" I get "<class 'dgl.view.HeteroNodeDataView'>", although there is no such type in the DGL documentation. This makes it really hard to learn how to code with the library by myself!

Sorry again for the confusion. It seems that you are printing out type(graph.ndata) instead of type(graph), which gives the type of the node data view instead of the graph. Also, we recently fixed type(graph) in https://github.com/dmlc/dgl/pull/4833; now type(graph) gives <class 'dgl.DGLGraph'>.
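
To illustrate the difference:

print(type(graph))        # the graph object itself
print(type(graph.ndata))  # the node-data view: dgl.view.HeteroNodeDataView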

I am sorry if I was too rough on the documentation. My intention was to provide constructive criticism and possibly a tip to improve the library.

Yes, you are correct. At first I was trying to make everything else work. Is there any webpage you could point me to, or could you outline in your own words how a regression training could be put in place here?

I was able to overcome the previous errors by turning my single-column feature into a one-hot tensor. I am still not using the two available features; I will also try what you indicated with torch.cat. Right now I am getting this error:

Traceback (most recent call last):
  File "/home/gudeh/Desktop/OpenROAD-flow-scripts/flow/myStuff/mydata.py", line 157, in <module>
    train(graph, model)
  File "/home/gudeh/Desktop/OpenROAD-flow-scripts/flow/myStuff/mydata.py", line 112, in train
    loss = F.cross_entropy(logits[train_mask], labels[train_mask])
  File "/home/gudeh/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 3026, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: expected scalar type Long but found Float
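
Edit: figured this one out. F.cross_entropy expects integer (Long) class targets, so casting the labels fixed it:

node_labels = graph.ndata['placementHeat'].long()  # cross_entropy needs int64 targets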

I was able to make training work. My issue now is getting good accuracy values; the model does not seem to be learning with the current configuration. Any tips on how to improve learning?

This is my code, which is working now:

import dgl
from dgl.data import DGLDataset
import torch
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

print("dgl.__version_",dgl.__version__)

#root="aes_cipher_top/"
root="myDataSet/"
class fromYosysDS(DGLDataset):
	def __init__(self):
		super().__init__(name='mydata_from_yosys')

	def process(self):
		#nodes_data = pd.read_csv(root+'aes_DGLcells_labeled.csv')
		nodes_data = pd.read_csv('/home/gudeh/Desktop/OpenROAD-flow-scripts/flow/myStuff/gcd/gatesToHeat.csv')
#		node_labels = torch.from_numpy(nodes_data['ConnCount'].to_numpy())

		#edges_data = pd.read_csv(root+'aes_DGLedges.csv')
		edges_data = pd.read_csv('/home/gudeh/Desktop/OpenROAD-flow-scripts/flow/myStuff/gcd/DGLedges.csv')
		edges_src = torch.from_numpy(edges_data['Src'].to_numpy())
		edges_dst = torch.from_numpy(edges_data['Dst'].to_numpy())
		#edge_features = torch.from_numpy(edges_data['Weight'].to_numpy())

		self.graph = dgl.graph((edges_src, edges_dst), num_nodes=nodes_data.shape[0])
		self.graph.ndata['type'] = torch.from_numpy(nodes_data['type'].astype('category').cat.codes.to_numpy())
		print("self.graph.ndata['type']",type(self.graph.ndata['type']),self.graph.ndata['type'].shape,self.graph.ndata['type'].type())
		self.graph.ndata['conCount'] = torch.from_numpy(nodes_data['conCount'].to_numpy())
		
		self.graph.ndata['placementHeat'] = torch.from_numpy(nodes_data['placementHeat'].to_numpy())
		self.graph.ndata['powerHeat'] = torch.from_numpy(nodes_data['powerHeat'].to_numpy())
		self.graph.ndata['routingHeat'] = torch.from_numpy(nodes_data['routingHeat'].to_numpy())
		self.graph.ndata['irDropHeat'] = torch.from_numpy(nodes_data['irDropHeat'].to_numpy())
	
		#self.graph.edata['weight'] = edge_features

		# If your dataset is a node classification dataset, you will need to assign
		# masks indicating whether a node belongs to training, validation, and test set.
		n_nodes = nodes_data.shape[0]
		n_train = int(n_nodes * 0.6)
		n_val = int(n_nodes * 0.2)
		train_mask = torch.zeros(n_nodes, dtype=torch.bool)
		val_mask = torch.zeros(n_nodes, dtype=torch.bool)
		test_mask = torch.zeros(n_nodes, dtype=torch.bool)
		train_mask[:n_train] = True
		val_mask[n_train:n_train + n_val] = True
		test_mask[n_train + n_val:] = True
		self.graph.ndata['train_mask'] = train_mask
		self.graph.ndata['val_mask'] = val_mask
		self.graph.ndata['test_mask'] = test_mask

	def __getitem__(self, i):
		return self.graph

	def __len__(self):
		return 1




import dgl.nn as dglnn


class SAGE(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats):
        super().__init__()
        self.conv1 = dglnn.SAGEConv(
            in_feats=in_feats, out_feats=hid_feats, aggregator_type='mean')
        self.conv2 = dglnn.SAGEConv(
            in_feats=hid_feats, out_feats=out_feats, aggregator_type='mean')

    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = F.relu(h)
        h = self.conv2(graph, h)
        return h


def evaluate(model, graph, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(graph, features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)


def regressionTrain(graph):
#	node_features = graph.ndata['type'][None:1]
#	print("graph.ndata['type']",type(graph.ndata['type']),graph.ndata['type'].shape,graph.ndata['type'].type())
#	print("graph.ndata['conCount']",type(graph.ndata['conCount']),graph.ndata['conCount'].shape,graph.ndata['conCount'].type())
	node_features = torch.cat([graph.ndata['type'].float()[:,None], graph.ndata['conCount'].float()[:,None]], dim=1)
	print("node_features",type(node_features),node_features.shape)
	node_labels = graph.ndata['placementHeat'].long()  # cross_entropy needs int64 targets
	node_labels[ node_labels == -1 ] = 0               # fold the -1 entries into class 0
	train_mask = graph.ndata['train_mask']
	valid_mask = graph.ndata['val_mask']
	test_mask = graph.ndata['test_mask']
	n_features = node_features.shape[1]
	n_labels = int(node_labels.max().item() + 1)

	model = SAGE(in_feats=n_features, hid_feats=100, out_feats=n_labels)
	opt = torch.optim.Adam(model.parameters())
	for epoch in range(10000):
		model.train()
		# forward propagation by using all nodes
		logits = model(graph, node_features)
		# compute loss
		loss = F.cross_entropy(logits[train_mask], node_labels[train_mask])
		# compute validation accuracy
		acc = evaluate(model, graph, node_features, node_labels, valid_mask)
		# backward propagation
		opt.zero_grad()
		loss.backward()
		opt.step()
	print( "loss", loss.item(), "acc", acc )



dataset = fromYosysDS()
print("dataset:",dataset)
graph = dataset[0]
#graph.ndata['type'] = torch.nn.functional.one_hot(graph.ndata['type'].to(torch.int64))
#graph.ndata['type'] = torch.from_numpy(graph.ndata['type'].astype('category').cat.codes.to_numpy())

print("graph:",graph,type(graph))
print('We have %d nodes.' % graph.number_of_nodes())
print('We have %d edges.' % graph.number_of_edges())

#import networkx as nx
#import matplotlib.pyplot as plt
#nx_G = graph.to_networkx()
#pos = nx.kamada_kawai_layout(nx_G)
#nx.draw(nx_G, pos, with_labels=True, node_color=[[.7, .7, .7]])
#plt.show()

print("len graph.ndata:",len(graph.ndata))
print("type graph.ndata:",type(graph.ndata))

regressionTrain(graph)

Hi, it sometimes happens that the model does not train as smoothly as expected. Please try playing with the hyperparameters, such as the learning rate, weight decay, hidden size, number of layers, etc. For example, I saw you were using the default learning rate in your optimizer: opt = torch.optim.Adam(model.parameters()).
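
For example, something like this (the numbers are just starting points to sweep, not recommendations):

model = SAGE(in_feats=n_features, hid_feats=256, out_feats=n_labels)
opt = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=5e-4)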

Hello Minjie. Exactly: the training setup I have so far is really too limited to actually learn something.

What I am trying to do now is to use multiple graphs as my training set, and another few graphs as the test set. Would you know how I could achieve this?

Yes, it is a typical inductive training setting. See our GAT example on the PPI dataset.
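
The rough pattern is to wrap your training graphs in a GraphDataLoader, which merges several graphs into one batched graph per iteration. A sketch, where train_graphs (a list of DGLGraphs) and the 'feat'/'label' field names are placeholders for your own data:

from dgl.dataloading import GraphDataLoader

train_loader = GraphDataLoader(train_graphs, batch_size=4, shuffle=True)
for epoch in range(100):
    for bg in train_loader:          # bg is several graphs batched into one
        logits = model(bg, bg.ndata['feat'])
        loss = F.cross_entropy(logits, bg.ndata['label'])
        opt.zero_grad()
        loss.backward()
        opt.step()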
