# Source code for skmultilearn.embedding.openne

from copy import copy
from openne.gf import GraphFactorization
from openne.graph import Graph
from openne.grarep import GraRep
from openne.hope import HOPE
from openne.lap import LaplacianEigenmaps
from openne.line import LINE
from openne.lle import LLE
import networkx as nx
import numpy as np
import tensorflow as tf
import scipy.sparse as sp

class OpenNetworkEmbedder:
    """Embed the label space using a label network embedder from OpenNE

    Implements an OpenNE based LNEMLC: label network embeddings for multi-label classification.

    Parameters
    ----------
    graph_builder: a GraphBuilderBase inherited transformer
        the graph builder to provide the adjacency matrix and weight map for the underlying graph
    embedding : string, one of {'GraphFactorization', 'GraRep', 'HOPE', 'LaplacianEigenmaps', 'LINE', 'LLE'}
        the selected OpenNE_ embedding

        +----------------------+--------------------------------------------------------------------------------+
        | Method name string   |                             Description                                        |
        +----------------------+--------------------------------------------------------------------------------+
        | GraphFactorization_  | Graph factorization embeddings                                                 |
        +----------------------+--------------------------------------------------------------------------------+
        | GraRep_              | Graph representations with global structural information                       |
        +----------------------+--------------------------------------------------------------------------------+
        | HOPE_                | High-order Proximity Preserved Embedding                                       |
        +----------------------+--------------------------------------------------------------------------------+
        | LaplacianEigenmaps_  | Laplacian Eigenmaps embeddings                                                 |
        +----------------------+--------------------------------------------------------------------------------+
        | LINE_                | Large-scale information network embedding                                      |
        +----------------------+--------------------------------------------------------------------------------+
        | LLE_                 | Locally Linear Embedding                                                       |
        +----------------------+--------------------------------------------------------------------------------+

        .. _OpenNE: https://github.com/thunlp/OpenNE/
        .. _GraphFactorization: https://github.com/thunlp/OpenNE/blob/master/src/openne/gf.py
        .. _GraRep: https://github.com/thunlp/OpenNE/blob/master/src/openne/grarep.py
        .. _HOPE: https://github.com/thunlp/OpenNE/blob/master/src/openne/hope.py
        .. _LaplacianEigenmaps: https://github.com/thunlp/OpenNE/blob/master/src/openne/lap.py
        .. _LINE: https://github.com/thunlp/OpenNE/blob/master/src/openne/line.py
        .. _LLE: https://github.com/thunlp/OpenNE/blob/master/src/openne/lle.py


    dimension: int
        the dimension of the label embedding vectors
    aggregation_function: 'add', 'multiply', 'average' or Callable
        the function used to aggregate label vectors for all labels assigned to each of the samples
    normalize_weights: boolean
        whether to normalize weights in the label graph by the number of samples or not
    param_dict
        parameters passed to the embedder, don't use the dimension and graph parameters, this class will set them at fit

    If you use this embedder please cite the relevant embedding method paper
    and the label network embedding for multi-label classification paper:

    .. code :: bibtex

        @article{szymanski2018lnemlc,
          title={LNEMLC: Label Network Embeddings for Multi-Label Classification},
          author={Szymanski, Piotr and Kajdanowicz, Tomasz and Chawla, Nitesh},
          journal={arXiv preprint arXiv:1812.02956},
          year={2018}
        }

    If you use MLkNN as the classifier, as in the example below, also cite:

    .. code :: bibtex

        @article{zhang2007ml,
          title={ML-KNN: A lazy learning approach to multi-label learning},
          author={Zhang, Min-Ling and Zhou, Zhi-Hua},
          journal={Pattern recognition},
          volume={40},
          number={7},
          pages={2038--2048},
          year={2007},
          publisher={Elsevier}
        }

    Example code for using this embedder looks like this:

    .. code-block:: python

        from skmultilearn.embedding import OpenNetworkEmbedder, EmbeddingClassifier
        from sklearn.ensemble import RandomForestRegressor
        from skmultilearn.adapt import MLkNN
        from skmultilearn.cluster import LabelCooccurrenceGraphBuilder

        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
        openne_line_params = dict(batch_size=1000, negative_ratio=5)

        clf = EmbeddingClassifier(
            OpenNetworkEmbedder(graph_builder, 'LINE', 4, 'add', True, openne_line_params),
            RandomForestRegressor(n_estimators=10),
            MLkNN(k=5)
        )

        clf.fit(X_train, y_train)

        predictions = clf.predict(X_test)
    """

    # Maps the public embedding name to the OpenNE class implementing it and
    # to the keyword argument under which that class receives the dimension.
    _EMBEDDINGS = {
        'GraphFactorization': (GraphFactorization, 'rep_size'),
        'GraRep': (GraRep, 'dim'),
        'HOPE': (HOPE, 'd'),
        'LaplacianEigenmaps': (LaplacianEigenmaps, 'rep_size'),
        'LINE': (LINE, 'rep_size'),
        'LLE': (LLE, 'd'),
    }

    # Named aggregators; each one receives a list of label vectors and
    # combines them along the first axis into a single vector.
    _AGGREGATION_FUNCTIONS = {
        'add': np.add.reduce,
        'multiply': np.multiply.reduce,
        'average': lambda x: np.average(x, axis=0),
    }

    def __init__(self, graph_builder, embedding, dimension, aggregation_function, normalize_weights, param_dict=None):
        """Validate the configuration and store it on the instance.

        Raises
        ------
        ValueError
            if ``embedding`` is not a supported method name, or if
            ``aggregation_function`` is neither a known name nor a callable
        """
        if embedding not in self._EMBEDDINGS:
            raise ValueError('Embedding must be one of {}'.format(', '.join(self._EMBEDDINGS.keys())))

        # A known name takes precedence; any other callable is used as given.
        if aggregation_function in self._AGGREGATION_FUNCTIONS:
            self.aggregation_function = self._AGGREGATION_FUNCTIONS[aggregation_function]
        elif callable(aggregation_function):
            self.aggregation_function = aggregation_function
        else:
            raise ValueError('Aggregation function must be callable or one of {}'.format(
                ', '.join(self._AGGREGATION_FUNCTIONS.keys()))
            )

        self.embedding = embedding
        self.param_dict = param_dict if param_dict is not None else {}
        self.dimension = dimension
        self.graph_builder = graph_builder
        self.normalize_weights = normalize_weights

    def fit(self, X, y):
        """Fit the label embedding on ``y``.

        Delegates to :meth:`fit_transform` and discards the transformed data.

        Returns
        -------
        self
            the fitted embedder
        """
        self.fit_transform(X, y)
        return self

    def fit_transform(self, X, y):
        """Fit the label embedding on ``y`` and embed ``y``.

        Builds the label graph from ``y``, trains the selected OpenNE
        embedding on it and aggregates the label vectors of every sample.

        Returns
        -------
        tuple
            ``(X, y_embedded)`` where ``X`` is returned unchanged and
            ``y_embedded`` is a float64 array with one row per row of ``y``
        """
        tf.reset_default_graph()
        self._init_openne_graph(y)
        embedding_class, dimension_key = self._EMBEDDINGS[self.embedding]
        # Work on a copy so the user-supplied dictionary is never mutated.
        param_dict = copy(self.param_dict)
        param_dict['graph'] = self.graph_
        param_dict[dimension_key] = self.dimension
        self.embeddings_ = embedding_class(**param_dict)
        return X, self._embedd_y(y)

    def _init_openne_graph(self, y):
        """Build the OpenNE graph from the graph builder's edge map.

        Every ``(src, dst)`` edge returned by ``graph_builder.transform(y)``
        is inserted in both directions with the same weight; when
        ``normalize_weights`` is set the weight is divided by ``y.shape[0]``.
        """
        self.graph_ = Graph()
        self.graph_.G = nx.DiGraph()
        # NOTE(review): only labels that occur in at least one edge become
        # nodes here; a label absent from the edge map presumably gets no
        # embedding vector -- confirm against the graph builder in use.
        for (src, dst), w in self.graph_builder.transform(y).items():
            if self.normalize_weights:
                w = float(w) / y.shape[0]
            self.graph_.G.add_edge(src, dst, weight=w)
            self.graph_.G.add_edge(dst, src, weight=w)
        self.graph_.encode_node()

    def _embedd_y(self, y):
        """Aggregate the embedding vectors of the labels assigned to each sample.

        Parameters
        ----------
        y : sparse matrix, numpy matrix or array-like
            label assignment matrix, one row per sample

        Returns
        -------
        numpy.ndarray
            float64 array with one row per sample; samples without any
            assigned label receive a zero vector of length ``dimension``
        """
        empty_vector = np.zeros(shape=self.dimension)
        if sp.issparse(y):
            label_rows = _iterate_over_sparse_matrix(y)
        else:
            # Reduce each dense row to the indices of its active labels first,
            # so the emptiness check below looks at the assigned labels rather
            # than at the total number of label columns.
            label_rows = (
                [node for node, v in enumerate(row) if v > 0]
                for row in np.asarray(y)
            )

        vectors = self.embeddings_.vectors
        return np.array([
            self.aggregation_function([vectors[node] for node in row])
            if len(row) > 0 else empty_vector
            for row in label_rows
        ]).astype('float64')


def _iterate_over_sparse_matrix(y):
    """Yield, for every row of a sparse matrix, the column indices of its positive entries.

    The matrix is converted to CSR once, so any scipy sparse format is
    accepted; slicing ``indices`` by ``indptr`` only addresses columns in the
    CSR layout. Explicitly stored zeros are skipped, so the result matches a
    ``v > 0`` test on the equivalent dense row.

    Parameters
    ----------
    y : scipy sparse matrix
        matrix to iterate over, any format that provides ``tocsr()``

    Yields
    ------
    numpy.ndarray
        column indices of the positive entries of the current row
    """
    csr = y.tocsr()
    indptr, indices, data = csr.indptr, csr.indices, csr.data
    for r in range(csr.shape[0]):
        start, stop = indptr[r], indptr[r + 1]
        yield indices[start:stop][data[start:stop] > 0]