# Source code for skmultilearn.problem_transform.br

import copy
import numpy as np

from scipy.sparse import hstack, issparse, lil_matrix

from ..base.problem_transformation import ProblemTransformationBase
from ..base.base import MLClassifierBase


class BinaryRelevance(ProblemTransformationBase):
    """Performs classification per label

    Transforms a multi-label classification problem with L labels
    into L separate single-label binary classification problems
    using the same base classifier provided in the constructor. The
    prediction output is the union of all per-label classifiers.

    Parameters
    ----------
    classifier : :class:`~sklearn.base.BaseEstimator`
        scikit-learn compatible base classifier
    require_dense : [bool, bool], optional
        whether the base classifier requires dense representations
        for input features and classes/labels matrices in fit/predict.
        If value not provided, sparse representations are used if base classifier is
        an instance of :class:`~skmultilearn.base.MLClassifierBase` and dense otherwise.

    Attributes
    ----------
    model_count_ : int
        number of trained models, in this classifier equal to `n_labels`
    partition_ : List[int], shape=(`model_count_`,)
        list of label indexes (one per model), used to index the output space matrix,
        set in :meth:`_generate_partition` via :meth:`fit`
    classifiers_ : List[:class:`~sklearn.base.BaseEstimator`] of shape `model_count_`
        list of classifiers trained per partition, set in :meth:`fit`

    Notes
    -----
    .. note ::

        This is one of the most basic approaches to multi-label classification, it ignores relationships between labels.

    Examples
    --------
    An example use case for Binary Relevance classification
    with an :class:`sklearn.svm.SVC` base classifier which supports sparse input:


    .. code-block:: python

        from skmultilearn.problem_transform import BinaryRelevance
        from sklearn.svm import SVC

        # initialize Binary Relevance multi-label classifier
        # with an SVM classifier
        # SVC accepts a sparse X matrix but needs a dense label vector,
        # hence require_dense = [False, True]

        classifier = BinaryRelevance(
            classifier = SVC(),
            require_dense = [False, True]
        )

        # train
        classifier.fit(X_train, y_train)

        # predict
        predictions = classifier.predict(X_test)

    Another way to use this classifier is to select the best scenario from a set of single-label classifiers used
    with Binary Relevance; this can be done using cross validation grid search. In the example below, the model
    with the highest accuracy is selected from either a :class:`sklearn.naive_bayes.MultinomialNB` or
    :class:`sklearn.svm.SVC` base classifier, along with the best parameters for that base classifier.

    .. code-block:: python

        from skmultilearn.problem_transform import BinaryRelevance
        from sklearn.model_selection import GridSearchCV
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.svm import SVC

        parameters = [
            {
                'classifier': [MultinomialNB()],
                'classifier__alpha': [0.7, 1.0],
            },
            {
                'classifier': [SVC()],
                'classifier__kernel': ['rbf', 'linear'],
            },
        ]


        clf = GridSearchCV(BinaryRelevance(), parameters, scoring='accuracy')
        clf.fit(x, y)

        print(clf.best_params_, clf.best_score_)

        # result:
        #
        # {
        #   'classifier': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
        #   decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
        #   max_iter=-1, probability=False, random_state=None, shrinking=True,
        #   tol=0.001, verbose=False), 'classifier__kernel': 'linear'
        # } 0.17

    """

    def __init__(self, classifier=None, require_dense=None):
        # All configuration is handled by the problem-transformation base class.
        super(BinaryRelevance, self).__init__(classifier, require_dense)

    def _generate_partition(self, X, y):
        """Partitions the label space into singletons

        Sets `self.partition_` (list of label indexes, one per label column of `y`)
        and `self.model_count_` (equal to number of labels).

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            not used, only for API compatibility
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `int`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        """
        self.partition_ = list(range(y.shape[1]))
        self.model_count_ = y.shape[1]

    def fit(self, X, y):
        """Fits classifier to training data

        One deep copy of the base classifier is trained per label column of `y`.

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments

        Returns
        -------
        self
            fitted instance of self

        Notes
        -----
        .. note :: Input matrices are converted to sparse format internally if a numpy representation is passed
        """
        X = self._ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)
        y = self._ensure_output_format(
            y, sparse_format='csc', enforce_sparse=True)

        self.classifiers_ = []
        self._generate_partition(X, y)
        self._label_count = y.shape[1]

        # The feature matrix is identical for every label, so convert it to the
        # representation handed to the base classifier once instead of once per
        # model. NOTE(review): assumes `_ensure_input_format` is a pure
        # conversion and that base classifiers do not modify X in `fit`.
        X_base = self._ensure_input_format(X)

        for i in range(self.model_count_):
            # each label gets its own independent copy of the base classifier
            classifier = copy.deepcopy(self.classifier)
            y_subset = self._generate_data_subset(y, self.partition_[i], axis=1)
            # a single sparse label column is flattened into a 1-d dense vector
            if issparse(y_subset) and y_subset.ndim > 1 and y_subset.shape[1] == 1:
                y_subset = np.ravel(y_subset.toarray())
            classifier.fit(X_base, self._ensure_output_format(y_subset))
            self.classifiers_.append(classifier)

        return self

    def predict(self, X):
        """Predict labels for X

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix

        Returns
        -------
        :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        """
        # Convert the input once; it is the same for every per-label classifier.
        X_base = self._ensure_input_format(X)

        predictions = [
            self._ensure_multi_label_from_single_class(
                self.classifiers_[label].predict(X_base))
            for label in range(self.model_count_)
        ]

        # per-label results are stacked side by side, one column block per model
        return hstack(predictions)

    def predict_proba(self, X):
        """Predict probabilities of label assignments for X

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix

        Returns
        -------
        :mod:`scipy.sparse` matrix of `float in [0.0, 1.0]`, shape=(n_samples, n_labels)
            matrix with label assignment probabilities

        Notes
        -----
        .. note :: If a base classifier exposes a single-element ``classes_``
            and returns a single probability column (it saw only one class for
            its label during training), that column is used when the class is
            ``1``; otherwise the probability of the label is left at ``0``.
        """
        # plain sequences (lists of rows) have no `shape`; fall back to `len`
        n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
        result = lil_matrix((n_samples, self._label_count), dtype='float')

        # loop-invariant: the kind of base classifier does not change per label
        is_multilabel_base = isinstance(self.classifier, MLClassifierBase)
        if not is_multilabel_base:
            # convert the input once for all per-label base classifiers
            X_base = self._ensure_input_format(X)

        for label_assignment, classifier in zip(self.partition_, self.classifiers_):
            if is_multilabel_base:
                # the multilabel classifier should provide a (n_samples, n_labels) matrix
                # we just need to reorder it column wise
                result[:, label_assignment] = classifier.predict_proba(X)
                continue

            # a base classifier for binary relevance returns
            # n_samples x n_classes, where n_classes = [0, 1] - 1 is the probability of
            # the label being assigned
            proba = self._ensure_multi_label_from_single_class(
                classifier.predict_proba(X_base))

            classes = getattr(classifier, 'classes_', None)
            if proba.shape[1] == 1 and classes is not None and len(classes) == 1:
                # Only one class was seen while training this label, so there
                # is no column 1 to read. The lone column is P(label) only if
                # the seen class was 1; otherwise the probability stays 0.
                if classes[0] == 1:
                    result[:, label_assignment] = proba[:, 0]
            else:
                # probability that label is assigned
                result[:, label_assignment] = proba[:, 1]

        return result