Source code for skmultilearn.cluster.graphtool

from __future__ import absolute_import
from __future__ import print_function

import graph_tool.all as gt
import numpy as np

from .base import LabelGraphClustererBase
from .helpers import _membership_to_list_of_communities, _overlapping_membership_to_list_of_communities

[docs]class StochasticBlockModel:
    """A Stochastic Blockmodel fit to Label Graph

    This contains a stochastic block model instance constructed for a block model variant specified in parameters.
    It can be fit to an instance of a graph and set of weights. More information on how to select parameters can be
    found in `the extensive introduction into Stochastic Block Models
    <https://graph-tool.skewed.de/static/doc/demos/inference/inference.html>`_ in graphtool documentation.

    Parameters
    ----------
    nested: boolean
        whether to build a nested Stochastic Block Model or the regular variant,
        will be automatically put under :code:`self.nested`.
    use_degree_correlation: boolean
        whether to correct for degree correlation in modeling, will be automatically
        put under :code:`self.use_degree_correlation`.
    allow_overlap: boolean
        whether to allow overlapping clusters or not, will be automatically
        put under :code:`self.allow_overlap`.
    weight_model: string or None
        decide whether to generate a weighted or unweighted graph,
        will be automatically put under :code:`self.weight_model`.

    Attributes
    ----------
    model_: graph_tool.inference.BlockState or its subclass
        an instance of the fitted model obtained from graph-tool
    """
    def __init__(self, nested, use_degree_correlation, allow_overlap, weight_model):
        self.nested = nested
        self.use_degree_correlation = use_degree_correlation
        self.allow_overlap = allow_overlap
        self.weight_model = weight_model
        self.model_ = None

[docs]    def fit_predict(self, graph, weights):
        """Fits model to a given graph and weights list

        Sets :code:`self.model_` to the state of graphtool's Stochastic Block Model the after fitting.

        Attributes
        ----------
        graph: graphtool.Graph
            the graph to fit the model to
        weights: graphtool.EdgePropertyMap<double>
            the property map: edge -> weight (double) to fit the model to, if weighted variant
            is selected

        Returns
        -------
        numpy.ndarray
            partition of labels, each sublist contains label indices
            related to label positions in :code:`y`
        """
        if self.weight_model:
            self.model_ = self._model_fit_function()(
                graph,
                deg_corr=self.use_degree_correlation,
                overlap=self.allow_overlap,
                state_args=dict(recs=[weights],
                                rec_types=[self.weight_model])
            )
        else:
            self.model_ = self._model_fit_function()(
                graph,
                deg_corr=self.use_degree_correlation,
                overlap=self.allow_overlap
            )
        return self._detect_communities()

    def _detect_communities(self):
        if self.nested:
            lowest_level = self.model_.get_levels()[0]
        else:
            lowest_level = self.model_

        number_of_communities = lowest_level.get_B()
        if self.allow_overlap:
            # the overlaps block returns
            # membership vector, and also edges vectors, we need just the membership here at the moment
            membership_vector = list(lowest_level.get_overlap_blocks()[0])
        else:
            membership_vector = list(lowest_level.get_blocks())

        if self.allow_overlap:
            return _overlapping_membership_to_list_of_communities(membership_vector, number_of_communities)

        return _membership_to_list_of_communities(membership_vector, number_of_communities)

    def _model_fit_function(self):
        if self.nested:
            return gt.minimize_nested_blockmodel_dl
        else:
            return gt.minimize_blockmodel_dl


[docs]class GraphToolLabelGraphClusterer(LabelGraphClustererBase):
    """Fits a Stochastic Block Model to the Label Graph and infers the communities

    This clusterer clusters the label space using by fitting a stochastic block
    model to the label network and inferring the community structure using graph-tool.
    The obtained community structure is returned as the label clustering. More information on the inference itself
    can be found in `the extensive introduction into Stochastic Block Models
    <https://graph-tool.skewed.de/static/doc/demos/inference/inference.html>`_ in graphtool documentation.

    Parameters
    ----------
    graph_builder: a GraphBuilderBase inherited transformer
        the graph builder to provide the adjacency matrix and weight map for the underlying graph
    model: StochasticBlockModel
        the desired stochastic block model variant to use


    Attributes
    ----------
    graph_ : graphtool.Graph
        object representing a label co-occurence graph
    weights_ : graphtool.EdgeProperty<double>
        edge weights defined by graph builder stored in a graphtool compatible format


    .. note ::

        This functionality is still undergoing research.


    .. note ::

        This clusterer is GPL-licenced and will taint your code with GPL restrictions.



    References
    ----------

    If you use this class please cite:

    .. code : latex

        article{peixoto_graph-tool_2014,
         title = {The graph-tool python library},
         url = {http://figshare.com/articles/graph_tool/1164194},
         doi = {10.6084/m9.figshare.1164194},
         urldate = {2014-09-10},
         journal = {figshare},
         author = {Peixoto, Tiago P.},
         year = {2014},
         keywords = {all, complex networks, graph, network, other}}


    Examples
    --------

    An example code for using this clusterer with a classifier looks like this:

    .. code-block:: python

        from sklearn.ensemble import RandomForestClassifier
        from skmultilearn.problem_transform import LabelPowerset
        from skmultilearn.cluster import IGraphLabelGraphClusterer, LabelCooccurrenceGraphBuilder
        from skmultilearn.ensemble import LabelSpacePartitioningClassifier

        # construct base forest classifier
        base_classifier = RandomForestClassifier(n_estimators=1000)

        # construct a graph builder that will include
        # label relations weighted by how many times they
        # co-occurred in the data, without self-edges
        graph_builder = LabelCooccurrenceGraphBuilder(
            weighted = True,
            include_self_edges = False
        )

        # select parameters for the model, we fit a flat,
        # non-degree correlated, partitioning model
        # which will use fit the normal distribution as the weights model
        model = StochasticBlockModel(
            nested=False,
            use_degree_correlation=True,
            allow_overlap=False,
            weight_model='real-normal'
        )

        # setup problem transformation approach with sparse matrices for random forest
        problem_transform_classifier = LabelPowerset(classifier=base_classifier,
            require_dense=[False, False])

        # setup the clusterer to use, we selected the fast greedy modularity-maximization approach
        clusterer = GraphToolLabelGraphClusterer(graph_builder=graph_builder, model=model)

        # setup the ensemble metaclassifier
        classifier = LabelSpacePartitioningClassifier(problem_transform_classifier, clusterer)

        # train
        classifier.fit(X_train, y_train)

        # predict
        predictions = classifier.predict(X_test)

    For more use cases see `the label relations exploration guide <../labelrelations.ipynb>`_.

    """

    def __init__(self, graph_builder, model):
        super(GraphToolLabelGraphClusterer, self).__init__(graph_builder)

        self.model = model
        self.graph_builder = graph_builder

[docs]    def fit_predict(self, X, y):
        """Performs clustering on y and returns list of label lists

        Builds a label graph using the provided graph builder's `transform` method
        on `y` and then detects communities using the selected `method`.

        Sets :code:`self.weights_` and :code:`self.graph_`.

        Parameters
        ----------
        X : None
            currently unused, left for scikit compatibility
        y : scipy.sparse
            label space of shape :code:`(n_samples, n_labels)`

        Returns
        -------
        arrray of arrays of label indexes (numpy.ndarray)
            label space division, each sublist represents labels that are in that community
        """
        self._build_graph_instance(y)
        clusters = self.model.fit_predict(self.graph_, weights=self.weights_)
        return np.array([community for community in clusters if len(community) > 0])

    def _build_graph_instance(self, y):
        edge_map = self.graph_builder.transform(y)

        g = gt.Graph(directed=False)
        g.add_vertex(y.shape[1])

        self.weights_ = g.new_edge_property('double')

        for edge, weight in edge_map.items():
            e = g.add_edge(edge[0], edge[1])
            self.weights_[e] = weight

        self.graph_ = g