from ..base.problem_transformation import ProblemTransformationBase
import numpy as np
from scipy import sparse
class LabelPowerset(ProblemTransformationBase):
"""Transform multi-label problem to a multi-class problem
Label Powerset is a problem transformation approach to multi-label
classification that transforms a multi-label problem to a multi-class
problem with 1 multi-class classifier trained on all unique label
combinations found in the training data.
The method maps each combination to a unique combination id number, and performs multi-class classification
using the `classifier` as multi-class classifier and combination ids as classes.
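
    For intuition, consider a toy binary indicator matrix (illustrative
    values only, not taken from any real dataset):

    .. code-block:: python

        y = [[1, 0, 1],   # label combination (0, 2) -> combination id 0
             [0, 1, 0],   # label combination (1,)   -> combination id 1
             [1, 0, 1]]   # label combination (0, 2) -> combination id 0 again

    Two unique combinations occur, so the base classifier is trained on a
    two-class problem with targets ``[0, 1, 0]``.
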
Parameters
----------
classifier : :class:`~sklearn.base.BaseEstimator`
scikit-learn compatible base classifier
    require_dense : [bool, bool], optional
        whether the base classifier requires dense representations
        for input features and classes/labels matrices in fit/predict.
        If not provided, sparse representations are used if the base
        classifier is an instance of
        :class:`skmultilearn.base.MLClassifierBase` and dense otherwise.

    Attributes
    ----------
    unique_combinations_ : Dict[str, int]
        mapping from a label combination (as a string) to its combination id,
        assigned by :meth:`transform` when called via :meth:`fit`
    reverse_combinations_ : List[List[int]]
        list, ordered by combination id, of the label indexes belonging to
        each combination, assigned by :meth:`transform` when called via
        :meth:`fit`

    Notes
    -----
    .. note ::

        `n_classes` in this document denotes the number of unique label
        combinations present in the training `y` passed to :meth:`fit`; in
        practice it is equal to :code:`len(self.unique_combinations_)`

    Examples
    --------
    An example use case for Label Powerset with an
    :class:`sklearn.ensemble.RandomForestClassifier` base classifier
    which supports sparse input:

    .. code-block:: python

        from skmultilearn.problem_transform import LabelPowerset
        from sklearn.ensemble import RandomForestClassifier

        # initialize LabelPowerset multi-label classifier with a RandomForest
        classifier = LabelPowerset(
            classifier=RandomForestClassifier(n_estimators=100),
            require_dense=[False, True]
        )

        # train
        classifier.fit(X_train, y_train)

        # predict
        predictions = classifier.predict(X_test)

    Another way to use this classifier is to select the best-performing base
    classifier from a set of multi-class classifiers used with Label
    Powerset; this can be done using cross-validation grid search. In the
    example below, the model with the highest accuracy is selected from
    either a :class:`sklearn.ensemble.RandomForestClassifier` or a
    :class:`sklearn.naive_bayes.MultinomialNB` base classifier, along with
    the best parameters for that base classifier.

    .. code-block:: python

        from skmultilearn.problem_transform import LabelPowerset
        from sklearn.model_selection import GridSearchCV
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.ensemble import RandomForestClassifier

        parameters = [
            {
                'classifier': [MultinomialNB()],
                'classifier__alpha': [0.7, 1.0],
            },
            {
                'classifier': [RandomForestClassifier()],
                'classifier__criterion': ['gini', 'entropy'],
                'classifier__n_estimators': [10, 20, 50],
            },
        ]

        clf = GridSearchCV(LabelPowerset(), parameters, scoring='accuracy')
        clf.fit(x, y)

        print(clf.best_params_, clf.best_score_)

        # result
        # {
        #     'classifier': RandomForestClassifier(bootstrap=True, class_weight=None,
        #         criterion='gini', max_depth=None, max_features='auto',
        #         max_leaf_nodes=None, min_impurity_decrease=0.0,
        #         min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
        #         min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
        #         oob_score=False, random_state=None, verbose=0, warm_start=False),
        #     'classifier__criterion': 'gini',
        #     'classifier__n_estimators': 50
        # } 0.16
"""

    def __init__(self, classifier=None, require_dense=None):
super(LabelPowerset, self).__init__(
classifier=classifier, require_dense=require_dense)
self._clean()

    def _clean(self):
"""Reset classifier internals before refitting"""
self.unique_combinations_ = {}
self.reverse_combinations_ = []
self._label_count = None

    def fit(self, X, y):
"""Fits classifier to training data
Parameters
----------
X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
input feature matrix
y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
binary indicator matrix with label assignments
Returns
-------
self
fitted instance of self
Notes
-----
.. note :: Input matrices are converted to sparse format internally if a numpy representation is passed
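
        Examples
        --------
        A minimal sketch with toy data (the base classifier and values here
        are illustrative, not prescribed by the library):

        .. code-block:: python

            import numpy as np
            from sklearn.ensemble import RandomForestClassifier

            clf = LabelPowerset(classifier=RandomForestClassifier(),
                                require_dense=[False, True])
            clf.fit(np.array([[0.0], [1.0], [2.0]]),
                    np.array([[1, 0], [0, 1], [1, 0]]))
            # clf.unique_combinations_ now maps each combination string to a
            # class id; here two unique combinations were observed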
"""
        # convert X to a sparse CSR representation up front; _ensure_input_format
        # then hands the base classifier the representation it requires
        X = self._ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)

        # transform(y) maps each row's label combination to a class id,
        # turning the multi-label problem into a multi-class one
        self.classifier.fit(self._ensure_input_format(X),
                            self.transform(y))

        return self

    def predict(self, X):
"""Predict labels for X
Parameters
----------
X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
input feature matrix
Returns
-------
:mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
binary indicator matrix with label assignments
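
        Examples
        --------
        A short usage sketch (assumes a fitted ``classifier`` and a feature
        matrix ``X_test``):

        .. code-block:: python

            predictions = classifier.predict(X_test)
            # the result is a sparse indicator matrix; densify to inspect it
            dense_predictions = predictions.toarray()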
"""
# this will be an np.array of integers representing classes
lp_prediction = self.classifier.predict(self._ensure_input_format(X))
return self.inverse_transform(lp_prediction)

    def predict_proba(self, X):
"""Predict probabilities of label assignments for X
Parameters
----------
X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
input feature matrix
Returns
-------
:mod:`scipy.sparse` matrix of `float in [0.0, 1.0]`, shape=(n_samples, n_labels)
matrix with label assignment probabilities
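
        Examples
        --------
        Each label's probability is the summed probability of all label
        combinations containing it. A short usage sketch (assumes a fitted
        ``classifier`` and a feature matrix ``X_test``):

        .. code-block:: python

            probabilities = classifier.predict_proba(X_test)
            # marginal probability that sample 0 is assigned label 3
            print(probabilities[0, 3])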
"""
        # probability distribution over label combinations (one column per
        # unique combination seen during fit)
        lp_prediction = self.classifier.predict_proba(
            self._ensure_input_format(X))

        result = sparse.lil_matrix(
            (X.shape[0], self._label_count), dtype='float')

        # a label's marginal probability is the sum of the probabilities of
        # every label combination that contains it
        for row in range(len(lp_prediction)):
            assignment = lp_prediction[row]
            for combination_id in range(len(assignment)):
                for label in self.reverse_combinations_[combination_id]:
                    result[row, label] += assignment[combination_id]

        return result