Source code for skmultilearn.adapt.mlknn

from builtins import range
from ..base import MLClassifierBase
from ..utils import get_matrix_in_format

from sklearn.neighbors import NearestNeighbors
import numpy as np
import scipy.sparse as sparse


class MLkNN(MLClassifierBase):
    """kNN classification method adapted for multi-label classification

    MLkNN uses k-NearestNeighbors to find the nearest examples to a test
    instance and applies Bayesian inference to select the assigned labels.

    Parameters
    ----------
    k : int
        number of neighbours of each input instance to take into account
    s : float (default is 1.0)
        the smoothing parameter
    ignore_first_neighbours : int (default is 0)
        ability to ignore the first N neighbours, useful for comparing
        with other classification software

    Attributes
    ----------
    knn_ : an instance of sklearn.NearestNeighbors
        the nearest neighbors single-label classifier used underneath

    .. note:: If you don't know what :code:`ignore_first_neighbours`
              does, the default is safe. Please see this `issue`_.

    .. _issue: https://github.com/scikit-multilearn/scikit-multilearn/issues/22

    References
    ----------

    If you use this classifier please cite the original paper introducing
    the method:

    .. code :: bibtex

        @article{zhang2007ml,
          title={ML-KNN: A lazy learning approach to multi-label learning},
          author={Zhang, Min-Ling and Zhou, Zhi-Hua},
          journal={Pattern recognition},
          volume={40},
          number={7},
          pages={2038--2048},
          year={2007},
          publisher={Elsevier}
        }

    Examples
    --------

    Here's a very simple example of using MLkNN with a fixed number of
    neighbors:

    .. code :: python

        from skmultilearn.adapt import MLkNN

        classifier = MLkNN(k=3)

        # train
        classifier.fit(X_train, y_train)

        # predict
        predictions = classifier.predict(X_test)

    You can also use :class:`~sklearn.model_selection.GridSearchCV` to find
    an optimal set of parameters:

    .. code :: python

        from skmultilearn.adapt import MLkNN
        from sklearn.model_selection import GridSearchCV

        parameters = {'k': range(1, 3), 's': [0.5, 0.7, 1.0]}
        score = 'f1_macro'

        clf = GridSearchCV(MLkNN(), parameters, scoring=score)
        clf.fit(X, y)

        print(clf.best_params_, clf.best_score_)

        # output: {'k': 1, 's': 0.5} 0.78988303374297597
    """

    def __init__(self, k=10, s=1.0, ignore_first_neighbours=0):
        """Initializes the classifier

        Parameters
        ----------
        k : int
            number of neighbours of each input instance to take into account
        s : float (default is 1.0)
            the smoothing parameter
        ignore_first_neighbours : int (default is 0)
            ability to ignore the first N neighbours, useful for comparing
            with other classification software

        Attributes
        ----------
        knn_ : an instance of sklearn.NearestNeighbors
            the nearest neighbors single-label classifier used underneath

        .. note:: If you don't know what :code:`ignore_first_neighbours`
                  does, the default is safe. Please see this `issue`_.

        .. _issue: https://github.com/scikit-multilearn/scikit-multilearn/issues/22
        """
        super(MLkNN, self).__init__()
        self.k = k  # number of neighbours
        self.s = s  # smoothing parameter
        self.ignore_first_neighbours = ignore_first_neighbours
        self.copyable_attrs = ['k', 's', 'ignore_first_neighbours']

    def _compute_prior(self, y):
        """Helper function to compute the prior probabilities

        Parameters
        ----------
        y : numpy.ndarray or scipy.sparse
            the training labels

        Returns
        -------
        numpy.ndarray
            the prior probability given true
        numpy.ndarray
            the prior probability given false
        """
        prior_prob_true = np.array((self.s + y.sum(axis=0)) / (self.s * 2 + self._num_instances))[0]
        prior_prob_false = 1 - prior_prob_true

        return (prior_prob_true, prior_prob_false)

    def _compute_cond(self, X, y):
        """Helper function to compute the posterior probabilities

        Parameters
        ----------
        X : numpy.ndarray or scipy.sparse
            input features, can be a dense or sparse matrix of size
            :code:`(n_samples, n_features)`
        y : numpy.ndarray or scipy.sparse
            binary indicator matrix with label assignments (values in {0, 1})

        Returns
        -------
        scipy.sparse
            the posterior probability given true
        scipy.sparse
            the posterior probability given false
        """
        self.knn_ = NearestNeighbors(n_neighbors=self.k).fit(X)
        # c[l, j]  counts training instances that carry label l and have
        # exactly j of their k neighbours carrying l; cn[l, j] counts the
        # instances without label l whose neighbourhoods look the same
        c = sparse.lil_matrix((self._num_labels, self.k + 1), dtype='i8')
        cn = sparse.lil_matrix((self._num_labels, self.k + 1), dtype='i8')

        label_info = get_matrix_in_format(y, 'dok')

        neighbors = [a[self.ignore_first_neighbours:] for a in
                     self.knn_.kneighbors(X, self.k + self.ignore_first_neighbours,
                                          return_distance=False)]

        for instance in range(self._num_instances):
            deltas = label_info[neighbors[instance], :].sum(axis=0)
            for label in range(self._num_labels):
                if label_info[instance, label] == 1:
                    c[label, deltas[0, label]] += 1
                else:
                    cn[label, deltas[0, label]] += 1

        c_sum = c.sum(axis=1)
        cn_sum = cn.sum(axis=1)

        cond_prob_true = sparse.lil_matrix((self._num_labels, self.k + 1), dtype='float')
        cond_prob_false = sparse.lil_matrix((self._num_labels, self.k + 1), dtype='float')
        for label in range(self._num_labels):
            for neighbor in range(self.k + 1):
                cond_prob_true[label, neighbor] = (self.s + c[label, neighbor]) / (
                        self.s * (self.k + 1) + c_sum[label, 0])
                cond_prob_false[label, neighbor] = (self.s + cn[label, neighbor]) / (
                        self.s * (self.k + 1) + cn_sum[label, 0])
        return cond_prob_true, cond_prob_false
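    # A worked sketch of the smoothed estimate above (illustrative numbers,
    # not from the original source): with k = 2 and s = 1.0, suppose label l
    # appears in 5 training instances, and for 3 of them exactly 1 of their
    # 2 neighbours also carries l, so c[l, 1] == 3 and c_sum[l, 0] == 5.
    # The smoothed conditional P(exactly 1 neighbour has l | instance has l) is
    #
    #     (s + c[l, 1]) / (s * (k + 1) + c_sum[l, 0])
    #   = (1 + 3) / (1 * 3 + 5)
    #   = 0.5
    #
    # which is exactly what the cond_prob_true loop computes per cell.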
    def fit(self, X, y):
        """Fit classifier with training data

        Parameters
        ----------
        X : numpy.ndarray or scipy.sparse
            input features, can be a dense or sparse matrix of size
            :code:`(n_samples, n_features)`
        y : numpy.ndarray or scipy.sparse
            binary indicator matrix with label assignments (values in {0, 1})

        Returns
        -------
        self
            fitted instance of self
        """
        self._label_cache = get_matrix_in_format(y, 'lil')
        self._num_instances = self._label_cache.shape[0]
        self._num_labels = self._label_cache.shape[1]
        # Computing the prior probabilities
        self._prior_prob_true, self._prior_prob_false = self._compute_prior(self._label_cache)
        # Computing the posterior probabilities
        self._cond_prob_true, self._cond_prob_false = self._compute_cond(X, self._label_cache)
        return self
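    # Illustrative arithmetic for the prior computed during fit() (made-up
    # numbers, not part of the original source): with s = 1.0 and 4 training
    # instances of which 1 carries label l, _compute_prior yields
    #
    #     P(H_l) = (s + 1) / (s * 2 + 4) = 2 / 6 ~= 0.33
    #
    # versus the unsmoothed 1 / 4 = 0.25, so the smoothing parameter s pulls
    # rare labels away from extreme priors.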
    def predict(self, X):
        """Predict labels for X

        Parameters
        ----------
        X : numpy.ndarray or scipy.sparse.csc_matrix
            input features of shape :code:`(n_samples, n_features)`

        Returns
        -------
        scipy.sparse matrix of int
            binary indicator matrix with label assignments with shape
            :code:`(n_samples, n_labels)`
        """
        result = sparse.lil_matrix((X.shape[0], self._num_labels), dtype='i8')
        neighbors = [a[self.ignore_first_neighbours:] for a in
                     self.knn_.kneighbors(X, self.k + self.ignore_first_neighbours,
                                          return_distance=False)]
        for instance in range(X.shape[0]):
            deltas = self._label_cache[neighbors[instance], :].sum(axis=0)
            for label in range(self._num_labels):
                p_true = self._prior_prob_true[label] * self._cond_prob_true[label, deltas[0, label]]
                p_false = self._prior_prob_false[label] * self._cond_prob_false[label, deltas[0, label]]
                result[instance, label] = int(p_true >= p_false)

        return result
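    # The comparison above is the maximum a posteriori rule from the ML-KNN
    # paper: for each label l, with E the event that deltas[0, l] of the k
    # neighbours carry l, the label is assigned iff
    #
    #     P(H_l) * P(E | H_l) >= P(not H_l) * P(E | not H_l)
    #
    # i.e. p_true >= p_false, using the priors and conditionals cached by fit().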
    def predict_proba(self, X):
        """Predict probabilities of label assignments for X

        Parameters
        ----------
        X : numpy.ndarray or scipy.sparse.csc_matrix
            input features of shape :code:`(n_samples, n_features)`

        Returns
        -------
        scipy.sparse matrix of float
            matrix with label assignment probabilities of shape
            :code:`(n_samples, n_labels)`
        """
        result = sparse.lil_matrix((X.shape[0], self._num_labels), dtype='float')
        neighbors = [a[self.ignore_first_neighbours:] for a in
                     self.knn_.kneighbors(X, self.k + self.ignore_first_neighbours,
                                          return_distance=False)]
        for instance in range(X.shape[0]):
            deltas = self._label_cache[neighbors[instance], :].sum(axis=0)
            for label in range(self._num_labels):
                p_true = self._prior_prob_true[label] * self._cond_prob_true[label, deltas[0, label]]
                result[instance, label] = p_true

        return result
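
if __name__ == "__main__":
    # Minimal usage sketch on synthetic data (illustrative only, not part of
    # the original module): shapes and values below are made up for the demo.
    rng = np.random.RandomState(42)
    X_train = rng.rand(20, 4)                       # 20 instances, 4 features
    y_train = (rng.rand(20, 3) > 0.5).astype(int)   # 3 labels, {0, 1} indicators
    X_test = rng.rand(5, 4)

    clf = MLkNN(k=3)
    clf.fit(X_train, y_train)

    print(clf.predict(X_test).toarray())            # binary label assignments
    print(clf.predict_proba(X_test).toarray())      # unnormalised label scores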