Source code for skmultilearn.problem_transform.lp

from ..base.problem_transformation import ProblemTransformationBase
import numpy as np
from scipy import sparse


class LabelPowerset(ProblemTransformationBase):
    """Transform multi-label problem to a multi-class problem

    Label Powerset is a problem transformation approach to multi-label
    classification that transforms a multi-label problem to a multi-class
    problem with 1 multi-class classifier trained on all unique label
    combinations found in the training data.

    The method maps each combination to a unique combination id number, and
    performs multi-class classification using the `classifier` as multi-class
    classifier and combination ids as classes.

    Parameters
    ----------
    classifier : :class:`~sklearn.base.BaseEstimator`
        scikit-learn compatible base classifier
    require_dense : [bool, bool], optional
        whether the base classifier requires dense representations
        for input features and classes/labels matrices in fit/predict.
        If value not provided, sparse representations are used if base
        classifier is an instance of
        :class:`skmultilearn.base.MLClassifierBase` and dense otherwise.

    Attributes
    ----------
    unique_combinations_ : Dict[str, int]
        mapping from label combination as string to label combination id,
        set by :meth:`transform` via :meth:`fit`
    reverse_combinations_ : List[List[int]]
        label combination id ordered list to list of label indexes for a
        given combination, set by :meth:`transform` via :meth:`fit`

    Notes
    -----
    .. note ::

        `n_classes` in this document denotes the number of unique label
        combinations present in the training `y` passed to :meth:`fit`,
        in practice it is equal to :code:`len(self.unique_combinations_)`

    Examples
    --------
    An example use case for Label Powerset with an
    :class:`sklearn.ensemble.RandomForestClassifier` base classifier which
    supports sparse input:

    .. code-block:: python

        from skmultilearn.problem_transform import LabelPowerset
        from sklearn.ensemble import RandomForestClassifier

        # initialize LabelPowerset multi-label classifier with a RandomForest
        classifier = LabelPowerset(
            classifier = RandomForestClassifier(n_estimators=100),
            require_dense = [False, True]
        )

        # train
        classifier.fit(X_train, y_train)

        # predict
        predictions = classifier.predict(X_test)

    Another way to use this classifier is to select the best scenario from a
    set of multi-class classifiers used with Label Powerset, this can be done
    using cross validation grid search. In the example below, the model with
    highest accuracy results is selected from either a
    :class:`sklearn.ensemble.RandomForestClassifier` or
    :class:`sklearn.naive_bayes.MultinomialNB` base classifier, alongside with
    best parameters for that base classifier.

    .. code-block:: python

        from skmultilearn.problem_transform import LabelPowerset
        from sklearn.model_selection import GridSearchCV
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.ensemble import RandomForestClassifier

        parameters = [
            {
                'classifier': [MultinomialNB()],
                'classifier__alpha': [0.7, 1.0],
            },
            {
                'classifier': [RandomForestClassifier()],
                'classifier__criterion': ['gini', 'entropy'],
                'classifier__n_estimators': [10, 20, 50],
            },
        ]

        clf = GridSearchCV(LabelPowerset(), parameters, scoring='accuracy')
        clf.fit(x, y)

        print (clf.best_params_, clf.best_score_)

        # result
        # {
        #   'classifier': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        #             max_depth=None, max_features='auto', max_leaf_nodes=None,
        #             min_impurity_decrease=0.0, min_impurity_split=None,
        #             min_samples_leaf=1, min_samples_split=2,
        #             min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
        #             oob_score=False, random_state=None, verbose=0,
        #             warm_start=False), 'classifier__criterion': 'gini', 'classifier__n_estimators': 50
        # } 0.16
    """

    def __init__(self, classifier=None, require_dense=None):
        super(LabelPowerset, self).__init__(
            classifier=classifier, require_dense=require_dense)
        self._clean()

    def _clean(self):
        """Reset classifier internals before refitting"""
        self.unique_combinations_ = {}
        self.reverse_combinations_ = []
        self._label_count = None

    def fit(self, X, y):
        """Fits classifier to training data

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments

        Returns
        -------
        self
            fitted instance of self

        Notes
        -----
        .. note :: Input matrices are converted to sparse format internally
            if a numpy representation is passed
        """
        X = self._ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)

        # transform(y) also (re)builds the combination <-> id mappings that
        # predict / predict_proba rely on.
        self.classifier.fit(self._ensure_input_format(X), self.transform(y))

        return self

    def predict(self, X):
        """Predict labels for X

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix

        Returns
        -------
        :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        """
        # this will be an np.array of integers representing classes
        lp_prediction = self.classifier.predict(self._ensure_input_format(X))

        return self.inverse_transform(lp_prediction)

    def predict_proba(self, X):
        """Predict probabilities of label assignments for X

        The probability of a label is the sum of the probabilities of all
        label combinations that contain it.

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix

        Returns
        -------
        :mod:`scipy.sparse` matrix of `float in [0.0, 1.0]`, shape=(n_samples, n_labels)
            matrix with label assignment probabilities
        """
        lp_prediction = self.classifier.predict_proba(
            self._ensure_input_format(X))

        # Size the result from the prediction itself rather than X.shape so
        # that plain array_like inputs (e.g. nested lists) are supported.
        result = sparse.lil_matrix(
            (len(lp_prediction), self._label_count), dtype='float')

        for row, assignment in enumerate(lp_prediction):
            # column index of the base classifier output == combination id
            for combination_id, probability in enumerate(assignment):
                for label in self.reverse_combinations_[combination_id]:
                    result[row, label] += probability

        return result

    def transform(self, y):
        """Transform multi-label output space to multi-class

        Transforms a multi-label problem into a single-label multi-class
        problem where each label combination is a separate class.

        Calling this method resets and rebuilds ``unique_combinations_``,
        ``reverse_combinations_`` and the stored label count.

        Parameters
        -----------
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments

        Returns
        -------
        numpy.ndarray of `{0, ... , n_classes-1}`, shape=(n_samples,)
            a multi-class output space vector
        """
        y = self._ensure_output_format(
            y, sparse_format='lil', enforce_sparse=True)

        self._clean()
        self._label_count = y.shape[1]

        train_vector = []
        for labels_applied in y.rows:
            label_string = ",".join(map(str, labels_applied))

            if label_string not in self.unique_combinations_:
                # ids are handed out in order of first appearance
                self.unique_combinations_[label_string] = len(
                    self.reverse_combinations_)
                # Store a copy: the row lists belong to the LIL matrix, and
                # keeping a reference would let later mutation of that
                # matrix silently change the fitted mapping.
                self.reverse_combinations_.append(list(labels_applied))

            train_vector.append(self.unique_combinations_[label_string])

        return np.array(train_vector)

    def inverse_transform(self, y):
        """Transforms multi-class assignment to multi-label

        Maps every combination id back to the label combination it was
        assigned to in :meth:`transform`, producing a binary indicator
        matrix.

        Parameters
        -----------
        y : numpy.ndarray of `{0, ... , n_classes-1}`, shape=(n_samples,)
            vector of label combination ids

        Returns
        -------
        :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        """
        n_samples = len(y)
        result = sparse.lil_matrix((n_samples, self._label_count), dtype='i8')

        for row, assignment in enumerate(y):
            labels = self.reverse_combinations_[assignment]
            # The empty combination leaves the row all-zero; skip it so we
            # never issue an assignment with an empty index list.
            if len(labels) > 0:
                result[row, labels] = 1

        return result