Custom DGL dataset using hetero-graphs

TudorAndrei · July 6, 2021, 5:46pm

Hey, I’m trying to use ConceptNet (English only) for a link prediction task, and I’m struggling to implement a DGLDataset.

This is what I have so far, adapted from the Dataset Tutorial, but it doesn’t seem to work.

class ConceptNetDataset(DGLDataset):
    """ConceptNet triples loaded as one homogeneous DGL graph.

    The input file is read with pandas and must provide the columns
    ``e1`` (source entity), ``e2`` (target entity) and ``score`` (edge
    weight). Every distinct entity becomes one node; every row becomes one
    edge from ``e1`` to ``e2``.

    Args:
        path: Location of the delimited text file.
        sep: Column separator passed to ``pandas.read_csv``.
    """

    def __init__(self, path, sep):
        self.path = path
        self.sep = sep
        # Relations that hold in both directions.
        self.bidirections = ["RelatedTo", "Synonym", "Antonym", "DistinctFrom",
                             "LocatedNear", "SimilarTo",
                             "EtymologicallyRelatedTo"]
        # The DGLDataset constructor runs process(), so every attribute that
        # process() may read has to be assigned before this call.
        super().__init__(name='concept_net')

    def process(self):
        """Read the file and build ``self.graph``."""
        data = pd.read_csv(self.path, sep=self.sep)

        # Give every entity an integer ID in order of first appearance.
        nodes = pd.concat([data["e1"], data["e2"]], axis=0).unique()
        entities = {name: idx for idx, name in enumerate(nodes)}
        num_nodes = len(entities)

        edges_src = torch.from_numpy(
            data["e1"].map(entities).to_numpy(dtype=np.int64))
        edges_dst = torch.from_numpy(
            data["e2"].map(entities).to_numpy(dtype=np.int64))
        edge_features = torch.from_numpy(data["score"].to_numpy())

        # A single node/edge type is a homogeneous graph: dgl.graph takes the
        # (src, dst) pair directly, whereas dgl.heterograph expects a dict
        # keyed by (src_type, edge_type, dst_type).
        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=num_nodes)
        # The label of a node is simply its own integer ID.
        self.graph.ndata['label'] = torch.arange(num_nodes)
        self.graph.edata['weight'] = edge_features

    def __getitem__(self, i):
        """Return the only graph of the dataset.

        Raises:
            IndexError: If ``i`` is not 0.
        """
        if i != 0:
            raise IndexError("ConceptNetDataset contains a single graph")
        return self.graph

    def __len__(self):
        """Return the number of graphs in the dataset (always 1)."""
        return 1

I have also taken a look at the Hetero-graph Tutorial, but I don’t understand how I would go about creating the DGLDataset.
I thought about creating a dictionary similar to:

# Each key is a (source node type, edge type, destination node type) triple;
# each value is the pair (source node IDs, destination node IDs).
graph_data = {
    ('drug', 'interacts', 'drug'): (
        th.tensor([0, 1]),
        th.tensor([1, 2]),
    ),
    ('drug', 'interacts', 'gene'): (
        th.tensor([0, 1]),
        th.tensor([2, 3]),
    ),
    ('drug', 'treats', 'disease'): (
        th.tensor([1]),
        th.tensor([2]),
    ),
}

But I don’t know how to add the edge features.

Any hint/help/suggestion is welcome.

Thank you!

mufeili · July 11, 2021, 1:32pm

graph_data is required for creating a heterogeneous DGLGraph using dgl.heterograph. Once you have the graph created, you can assign edge features as in user guide 1.5.

TudorAndrei · July 11, 2021, 1:34pm

Thank you for the reply. I have managed to put something together.

class ConceptNetDataset(DGLDataset):
    """ConceptNet triples loaded as one DGL heterograph.

    The input file is read with pandas and must provide the columns
    ``e1`` (source entity), ``rel`` (relation name), ``e2`` (target entity)
    and ``score`` (edge weight). All entities share the single node type
    ``"_N"``; every distinct relation becomes its own edge type.

    Args:
        path: Location of the delimited text file.
        sep: Column separator passed to ``pandas.read_csv``.
    """

    def __init__(self, path, sep):
        # Set before calling the base constructor, which runs process().
        self.path = path
        self.sep = sep
        super().__init__(name='concept_net')

    def process(self):
        """Read the file and build ``self.graph`` with features and masks."""
        # Relations that hold in both directions; each of their edges also
        # gets a reversed copy.
        bidirections = ["RelatedTo", "Synonym", "Antonym", "DistinctFrom",
                        "LocatedNear", "SimilarTo", "EtymologicallyRelatedTo"]
        data = pd.read_csv(self.path, sep=self.sep)

        # Give every entity an integer ID in order of first appearance.
        nodes = pd.concat([data["e1"], data["e2"]], axis=0).unique()
        entities = {name: idx for idx, name in enumerate(nodes)}
        num_nodes = len(entities)
        data["e1"] = data["e1"].map(entities)
        data["e2"] = data["e2"].map(entities)
        edges_type = data["rel"].unique().tolist()

        # Append the reversed edges. Swapping e1/e2 by *name* keeps this
        # correct whatever the column order of the file is; pd.concat aligns
        # the two frames on column labels.
        symmetric = data[data["rel"].isin(bidirections)]
        backlinks = symmetric.rename(columns={"e1": "e2", "e2": "e1"})
        data = pd.concat([data, backlinks], axis=0, ignore_index=True)

        # Slice the table once per relation; the same row order is used for
        # the edge list and for the edge features below, so they line up.
        rows_by_rel = {rel: data[data["rel"] == rel] for rel in edges_type}

        node_type = "_N"  # '_N' can be replaced by an arbitrary name
        data_dict = {
            (node_type, rel, node_type): (
                torch.from_numpy(rows["e1"].to_numpy(dtype=np.int64)),
                torch.from_numpy(rows["e2"].to_numpy(dtype=np.int64)),
            )
            for rel, rows in rows_by_rel.items()
        }
        self.graph = dgl.heterograph(data_dict, {node_type: num_nodes})

        for rel, rows in rows_by_rel.items():
            # Edge weight taken from the score column.
            self.graph.edges[rel].data["weight"] = torch.from_numpy(
                rows["score"].to_numpy())
            # Random edge train mask: each edge is selected with p = 0.6.
            self.graph.edges[rel].data["train_mask"] = torch.zeros(
                len(rows), dtype=torch.bool).bernoulli(0.6)

        # Node feature: the index of the word in the vocabulary.
        self.graph.nodes[node_type].data["feature"] = torch.arange(num_nodes)

        # Train/val/test split for node classification: 60% / 20% / rest.
        # The split is positional (node IDs in order of first appearance),
        # not shuffled.
        n_train = int(num_nodes * 0.6)
        n_val = int(num_nodes * 0.2)
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train:n_train + n_val] = True
        test_mask[n_train + n_val:] = True
        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['val_mask'] = val_mask
        self.graph.ndata['test_mask'] = test_mask

    def __getitem__(self, i):
        """Return the only graph of the dataset.

        Raises:
            IndexError: If ``i`` is not 0.
        """
        if i != 0:
            raise IndexError("ConceptNetDataset contains a single graph")
        return self.graph

    def __len__(self):
        """Return the number of graphs in the dataset (always 1)."""
        return 1

system · August 10, 2021, 1:35pm

This topic was automatically closed 30 days after the last reply. New replies are no longer allowed.