# Source code for skmultilearn.cluster.random

from __future__ import absolute_import

import random

import numpy as np

from .base import LabelSpaceClustererBase


class RandomLabelSpaceClusterer(LabelSpaceClustererBase):
    """Randomly divides the label space into equally-sized clusters

    This method divides the label space by drawing without replacement a desired number of
    equally sized subsets of label space, in a partitioning or overlapping scheme.

    Parameters
    ----------
    cluster_size : int
        desired size of a single cluster, will be automatically
        put under :code:`self.cluster_size`.
    cluster_count: int
        number of clusters to divide into, will be automatically
        put under :code:`self.cluster_count`.
    allow_overlap : bool
        whether to allow overlapping clusters or not, will be automatically
        put under :code:`self.allow_overlap`.

    Examples
    --------

    The following code performs random label space partitioning.

    .. code :: python

        from skmultilearn.cluster import RandomLabelSpaceClusterer

        # assume X,y contain the data, example y contains 5 labels
        cluster_count = 2
        cluster_size = y.shape[1]//cluster_count # == 2
        clr = RandomLabelSpaceClusterer(cluster_size, cluster_count, allow_overlap=False)
        clr.fit_predict(X,y)
        # Result:
        # array([list([0, 4]), list([2, 3]), list([1])], dtype=object)


    Note that the leftover labels that did not fit in `cluster_size` x `cluster_count` classifiers will be appended
    to an additional last cluster of size at most `cluster_size` - 1.

    You can also use this class to get a random division of the label space, even with multiple overlaps:

    .. code :: python

        from skmultilearn.cluster import RandomLabelSpaceClusterer

        cluster_size = 3
        cluster_count = 5
        clr = RandomLabelSpaceClusterer(cluster_size, cluster_count, allow_overlap=True)
        clr.fit_predict(X,y)

        # Result
        # array([[2, 1, 3],
        #        [3, 0, 4],
        #        [0, 2, 1],
        #        [2, 3, 4],
        #        [1, 4, 0],
        #        [3, 0, 2]])


    Note that you will never get the same label subset twice: two clusters are considered
    the same when they contain the same labels, regardless of the order in which the labels
    were drawn. In the overlapping scheme `cluster_count` + 1 clusters are drawn, or fewer
    when the label space does not contain that many distinct subsets of `cluster_size` labels.
    """

    def __init__(self, cluster_size, cluster_count, allow_overlap):
        super(RandomLabelSpaceClusterer, self).__init__()

        self.cluster_size = cluster_size
        self.cluster_count = cluster_count
        self.allow_overlap = allow_overlap

    def fit_predict(self, X, y):
        """Cluster the output space

        Parameters
        ----------
        X : currently unused, left for scikit compatibility
        y : scipy.sparse
            label space of shape :code:`(n_samples, n_labels)`

        Returns
        -------
        array of arrays of label indexes (numpy.ndarray)
            label space division, each sublist represents labels that are in that community.
            When every cluster has the same size this is a 2-D integer array, otherwise
            (a shorter leftover cluster exists) a 1-D object array of lists.

        Raises
        ------
        ValueError
            if the requested clusters cannot include every label, if
            :code:`cluster_size` or :code:`cluster_count` is negative, or if
            overlapping clusters larger than the label space are requested.
        """
        label_count = y.shape[1]

        if (self.cluster_count + 1) * self.cluster_size < label_count:
            raise ValueError("Cannot include all of {} labels in {} clusters of {} labels".format(
                label_count,
                self.cluster_count,
                self.cluster_size
            ))

        # Negative sizes can slip through the product check above (two negatives
        # multiply to a positive) and would otherwise never produce a full division.
        if self.cluster_size < 0 or self.cluster_count < 0:
            raise ValueError("cluster_size and cluster_count must be non-negative")

        if self.allow_overlap:
            label_sets = self._draw_overlapping(label_count)
        else:
            label_sets = self._draw_partition(label_count)

        return self._to_array(label_sets)

    def _draw_partition(self, label_count):
        """Draw disjoint clusters until every label is assigned to exactly one."""
        label_sets = []
        free_labels = list(range(label_count))

        while free_labels and len(label_sets) <= self.cluster_count:
            # in this case, we are unable to draw new labels, add all that remain
            if len(free_labels) < self.cluster_size:
                label_sets.append(free_labels)
                break

            label_set = random.sample(free_labels, self.cluster_size)
            free_labels = list(set(free_labels).difference(label_set))
            label_sets.append(label_set)

        return label_sets

    def _draw_overlapping(self, label_count):
        """Draw distinct, possibly overlapping clusters that together include every label."""
        if self.cluster_size > label_count:
            raise ValueError("Cannot draw clusters of {} labels from a label space of {} labels".format(
                self.cluster_size,
                label_count
            ))

        all_labels = list(range(label_count))
        # Never ask for more distinct subsets than exist, otherwise the rejection
        # sampling below could not terminate.
        target_count = self._capped_subset_count(
            label_count, self.cluster_size, self.cluster_count + 1
        )

        # make sure the final label set division includes all labels
        while True:
            label_sets = []
            seen = set()

            while len(label_sets) < target_count:
                label_set = random.sample(all_labels, self.cluster_size)
                # frozenset makes the uniqueness test independent of draw order
                key = frozenset(label_set)
                if key not in seen:
                    seen.add(key)
                    label_sets.append(label_set)

            if len(set().union(*seen)) == label_count:
                return label_sets

    @staticmethod
    def _capped_subset_count(label_count, subset_size, cap):
        """Return min(C(label_count, subset_size), cap) for 0 <= subset_size <= label_count."""
        subset_size = min(subset_size, label_count - subset_size)
        total = 1
        for i in range(1, subset_size + 1):
            # total equals C(label_count - subset_size + i, i) after this step, so the
            # division is exact and the sequence never decreases (early exit is safe).
            total = total * (label_count - subset_size + i) // i
            if total >= cap:
                return cap
        return min(total, cap)

    @staticmethod
    def _to_array(label_sets):
        """Convert the list of clusters to an ndarray, tolerating clusters of unequal size."""
        if len({len(label_set) for label_set in label_sets}) <= 1:
            return np.array(label_sets)

        # Ragged input: recent NumPy versions refuse to build it implicitly, so fill an
        # object array element by element.
        result = np.empty(len(label_sets), dtype=object)
        for index, label_set in enumerate(label_sets):
            result[index] = label_set
        return result