Source code for skmultilearn.cluster.graphtool

from __future__ import absolute_import
from __future__ import print_function

import graph_tool.all as gt
import numpy as np

from .base import LabelGraphClustererBase
from .helpers import _membership_to_list_of_communities, _overlapping_membership_to_list_of_communities

[docs]class StochasticBlockModel: """A Stochastic Blockmodel fit to Label Graph This contains a stochastic block model instance constructed for a block model variant specified in parameters. It can be fit to an instance of a graph and set of weights. More information on how to select parameters can be found in `the extensive introduction into Stochastic Block Models <https://graph-tool.skewed.de/static/doc/demos/inference/inference.html>`_ in graphtool documentation. Parameters ---------- nested: boolean whether to build a nested Stochastic Block Model or the regular variant, will be automatically put under :code:`self.nested`. use_degree_correlation: boolean whether to correct for degree correlation in modeling, will be automatically put under :code:`self.use_degree_correlation`. allow_overlap: boolean whether to allow overlapping clusters or not, will be automatically put under :code:`self.allow_overlap`. weight_model: string or None decide whether to generate a weighted or unweighted graph, will be automatically put under :code:`self.weight_model`. Attributes ---------- model_: graph_tool.inference.BlockState or its subclass an instance of the fitted model obtained from graph-tool """ def __init__(self, nested, use_degree_correlation, allow_overlap, weight_model): self.nested = nested self.use_degree_correlation = use_degree_correlation self.allow_overlap = allow_overlap self.weight_model = weight_model self.model_ = None
[docs] def fit_predict(self, graph, weights): """Fits model to a given graph and weights list Sets :code:`self.model_` to the state of graphtool's Stochastic Block Model the after fitting. Attributes ---------- graph: graphtool.Graph the graph to fit the model to weights: graphtool.EdgePropertyMap<double> the property map: edge -> weight (double) to fit the model to, if weighted variant is selected Returns ------- numpy.ndarray partition of labels, each sublist contains label indices related to label positions in :code:`y` """ if self.weight_model: self.model_ = self._model_fit_function()( graph, deg_corr=self.use_degree_correlation, overlap=self.allow_overlap, state_args=dict(recs=[weights], rec_types=[self.weight_model]) ) else: self.model_ = self._model_fit_function()( graph, deg_corr=self.use_degree_correlation, overlap=self.allow_overlap ) return self._detect_communities()
def _detect_communities(self): if self.nested: lowest_level = self.model_.get_levels()[0] else: lowest_level = self.model_ number_of_communities = lowest_level.get_B() if self.allow_overlap: # the overlaps block returns # membership vector, and also edges vectors, we need just the membership here at the moment membership_vector = list(lowest_level.get_overlap_blocks()[0]) else: membership_vector = list(lowest_level.get_blocks()) if self.allow_overlap: return _overlapping_membership_to_list_of_communities(membership_vector, number_of_communities) return _membership_to_list_of_communities(membership_vector, number_of_communities) def _model_fit_function(self): if self.nested: return gt.minimize_nested_blockmodel_dl else: return gt.minimize_blockmodel_dl
[docs]class GraphToolLabelGraphClusterer(LabelGraphClustererBase): """Fits a Stochastic Block Model to the Label Graph and infers the communities This clusterer clusters the label space using by fitting a stochastic block model to the label network and inferring the community structure using graph-tool. The obtained community structure is returned as the label clustering. More information on the inference itself can be found in `the extensive introduction into Stochastic Block Models <https://graph-tool.skewed.de/static/doc/demos/inference/inference.html>`_ in graphtool documentation. Parameters ---------- graph_builder: a GraphBuilderBase inherited transformer the graph builder to provide the adjacency matrix and weight map for the underlying graph model: StochasticBlockModel the desired stochastic block model variant to use Attributes ---------- graph_ : graphtool.Graph object representing a label co-occurence graph weights_ : graphtool.EdgeProperty<double> edge weights defined by graph builder stored in a graphtool compatible format .. note :: This functionality is still undergoing research. .. note :: This clusterer is GPL-licenced and will taint your code with GPL restrictions. References ---------- If you use this class please cite: .. code : latex article{peixoto_graph-tool_2014, title = {The graph-tool python library}, url = {http://figshare.com/articles/graph_tool/1164194}, doi = {10.6084/m9.figshare.1164194}, urldate = {2014-09-10}, journal = {figshare}, author = {Peixoto, Tiago P.}, year = {2014}, keywords = {all, complex networks, graph, network, other}} Examples -------- An example code for using this clusterer with a classifier looks like this: .. code-block:: python from sklearn.ensemble import RandomForestClassifier from skmultilearn.problem_transform import LabelPowerset from skmultilearn.cluster import IGraphLabelGraphClusterer, LabelCooccurrenceGraphBuilder from skmultilearn.ensemble import LabelSpacePartitioningClassifier # construct base forest classifier base_classifier = RandomForestClassifier(n_estimators=1000) # construct a graph builder that will include # label relations weighted by how many times they # co-occurred in the data, without self-edges graph_builder = LabelCooccurrenceGraphBuilder( weighted = True, include_self_edges = False ) # select parameters for the model, we fit a flat, # non-degree correlated, partitioning model # which will use fit the normal distribution as the weights model model = StochasticBlockModel( nested=False, use_degree_correlation=True, allow_overlap=False, weight_model='real-normal' ) # setup problem transformation approach with sparse matrices for random forest problem_transform_classifier = LabelPowerset(classifier=base_classifier, require_dense=[False, False]) # setup the clusterer to use, we selected the fast greedy modularity-maximization approach clusterer = GraphToolLabelGraphClusterer(graph_builder=graph_builder, model=model) # setup the ensemble metaclassifier classifier = LabelSpacePartitioningClassifier(problem_transform_classifier, clusterer) # train classifier.fit(X_train, y_train) # predict predictions = classifier.predict(X_test) For more use cases see `the label relations exploration guide <../labelrelations.ipynb>`_. """ def __init__(self, graph_builder, model): super(GraphToolLabelGraphClusterer, self).__init__(graph_builder) self.model = model self.graph_builder = graph_builder
[docs] def fit_predict(self, X, y): """Performs clustering on y and returns list of label lists Builds a label graph using the provided graph builder's `transform` method on `y` and then detects communities using the selected `method`. Sets :code:`self.weights_` and :code:`self.graph_`. Parameters ---------- X : None currently unused, left for scikit compatibility y : scipy.sparse label space of shape :code:`(n_samples, n_labels)` Returns ------- arrray of arrays of label indexes (numpy.ndarray) label space division, each sublist represents labels that are in that community """ self._build_graph_instance(y) clusters = self.model.fit_predict(self.graph_, weights=self.weights_) return np.array([community for community in clusters if len(community) > 0])
def _build_graph_instance(self, y): edge_map = self.graph_builder.transform(y) g = gt.Graph(directed=False) g.add_vertex(y.shape[1]) self.weights_ = g.new_edge_property('double') for edge, weight in edge_map.items(): e = g.add_edge(edge[0], edge[1]) self.weights_[e] = weight self.graph_ = g